Literate version of PEXSI LDOS calculator

1 Introduction

This is a spin-polarized version of the SIESTA-PEXSI interface for the calculation of the LDOS, which is obtained from a "partial" DM computed in a restricted energy window. Module structure:

  MODULE m_pexsi_local_dos
#ifdef SIESTA__PEXSI
  private
  public :: pexsi_local_dos

  CONTAINS
  <<siesta-side-parent-routine>>
  <<pexsi-ldos-routine>>
#endif
  end module m_pexsi_local_dos

2 Parent routine with dispatch to Siesta post-processing

This routine defines the energy window (energy and broadening) and calls the PEXSI routine that computes the partial DM. That DM is then passed to the dhscf machinery, which computes the corresponding real-space charge: this is precisely the LDOS we need. The output file has the extension .LDSI.
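
For orientation, the relevant fdf options (read with fdf_get below) could be set in the input file along these lines. The keyword names are the ones used in the code; the values are purely illustrative:

  PEXSI.LDOS.Energy        -1.50 eV   # center of the window (default 0.0 Ry)
  PEXSI.LDOS.Broadening     0.05 eV   # broadening (default 0.01 Ry)
  PEXSI.LDOS.np-per-pole    16        # optional; defaults to PEXSI.np-per-pole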

subroutine pexsi_local_dos( )
  use m_energies

  use sparse_matrices
  USE siesta_options
  use siesta_geom
  use atomlist,       only: indxuo, indxua           
  use atomlist,       only: qtot, no_u, no_l
  use atomlist,       only: iphorb                   
  use atomlist,       only: datm, no_s, iaorb        
  use fdf
  use files,          only : slabel     
  use files,          only : filesOut_t ! type for output file names
  use parallel,       only:  SIESTA_worker
  use m_ntm
  use m_forces,       only: fa
  use m_spin,         only: nspin, qs, efs
  use m_dhscf,        only: dhscf

  implicit none

  integer   :: dummy_iscf = 1
  real(dp)  :: dummy_str(3,3), dummy_strl(3,3)  ! for dhscf call
  real(dp)  :: dummy_dipol(3)

  real(dp)  :: factor, g2max
  real(dp)  :: energy, broadening

  type(filesOut_t)     :: filesOut  ! blank output file names

  energy = fdf_get('PEXSI.LDOS.Energy',0.0_dp,"Ry")
  broadening = fdf_get('PEXSI.LDOS.Broadening',0.01_dp,"Ry")

  ! Note that we re-use Dscf, so it will be obliterated
  call get_LDOS_SI( no_u, no_l, nspin,  &
       maxnh, numh, listh, H, S,  &
       Dscf, energy, broadening)

  if (SIESTA_worker) then
     !Find the LDOS in the real space mesh
     filesOut%rho = trim(slabel) // '.LDSI'
     g2max = g2cut

     ! There is too much clutter here, because dhscf is
     ! a "kitchen-sink" routine that does too many things

     call dhscf( nspin, no_s, iaorb, iphorb, no_l, &
          no_u, na_u, na_s, isa, xa_last, indxua,  &
          ntm, 0, 0, 0, filesOut,                  &
          maxnh, numh, listhptr, listh, Dscf, Datm, maxnh, H, &
          Enaatm, Enascf, Uatm, Uscf, DUscf, DUext, Exc, Dxc, &
          dummy_dipol, dummy_str, fa, dummy_strl )
  endif

END subroutine pexsi_local_dos

3 PEXSI LDOS routine

3.1 Main structure

This is the main structure of the LDOS routine.

<<routine-header>>
<<routine-variables>>
!  --------  for serial compilation
<<define-communicators>>
<<re-distribute-matrices>>
<<set-options>>
<<prepare-plan>>
<<load-hs-matrices>>
<<factorization>>
<<compute-ldos>>
<<copy-to-siesta-side>>
<<clean-up>>

CONTAINS

<<support-routines>>

end subroutine get_LDOS_SI

3.2 Routine header

subroutine get_LDOS_SI( no_u, no_l, nspin_in,  &
     maxnh, numh, listh, H, S,  &
     LDOS_DM, energy, broadening)

<<used-modules>>

implicit          none

integer, intent(in)          :: maxnh, no_u, no_l, nspin_in
integer, intent(in), target  :: listh(maxnh), numh(no_l)
real(dp), intent(in), target :: H(maxnh,nspin_in), S(maxnh)
real(dp), intent(in)         :: energy, broadening
real(dp), intent(out)        :: LDOS_DM(maxnh,nspin_in)

3.2.1 Used modules

use precision, only  : dp
use fdf
use units,       only: eV, pi
use sys,         only: die
use m_mpi_utils, only: broadcast
use parallel, only   : SIESTA_worker, BlockSize
use parallel, only   : SIESTA_Group, SIESTA_Comm
use m_redist_spmatrix, only: aux_matrix, redistribute_spmatrix
use class_Distribution
use alloc,             only: re_alloc, de_alloc
use mpi_siesta
use iso_c_binding
use f_ppexsi_interface, only: f_ppexsi_options
use f_ppexsi_interface, only: f_ppexsi_plan_finalize
use f_ppexsi_interface, only: f_ppexsi_plan_initialize
use f_ppexsi_interface, only: f_ppexsi_selinv_complex_symmetric_matrix
use f_ppexsi_interface, only: f_ppexsi_load_real_symmetric_hs_matrix
use f_ppexsi_interface, only: f_ppexsi_set_default_options
use f_ppexsi_interface, &
      only: f_ppexsi_symbolic_factorize_complex_symmetric_matrix

3.3 Routine variables

The local variables for the routine must be declared in a single place for the compiler, but it is clearer to introduce them as they are needed. The routine-variables noweb-ref is used for this throughout the document.

integer        :: ih, i
integer        :: info
logical        :: write_ok
!------------
external         :: timer

3.4 Define communicators

World_Comm, a duplicate of Siesta's copy of MPI_Comm_World, is the global communicator.

integer          :: World_Comm, mpirank, ierr
!
integer   :: norbs
integer   :: nspin
!
! Our global communicator is a duplicate of the passed communicator
!
call MPI_Comm_Dup(true_MPI_Comm_World, World_Comm, ierr)
call mpi_comm_rank( World_Comm, mpirank, ierr )

call timer("pexsi-ldos", 1)  

if (SIESTA_worker) then
   ! rename some intent(in) variables, which are only
   ! defined for the Siesta subset
   norbs = no_u
   nspin = nspin_in
endif
!
call broadcast(norbs,comm=World_Comm)
call broadcast(nspin,comm=World_Comm)

Now we need to define the Siesta distribution object and the communicator and distribution object for the first team of PEXSI workers, for the purposes of re-distribution of the relevant matrices.

With spin, things are a bit more complicated. We need to make sure that the distributions are defined and known to all processors (via actual ranks) with respect to the same reference "bridge" communicator. For now, this is World_Comm.

integer :: PEXSI_Pole_Group, PEXSI_Spatial_Group, World_Group
integer, allocatable :: pexsi_pole_ranks_in_world(:)
integer  :: nworkers_SIESTA
integer, allocatable :: siesta_ranks_in_world(:)
integer :: PEXSI_Pole_Group_in_World
integer, allocatable :: PEXSI_Pole_ranks_in_World_Spin(:,:)
integer :: PEXSI_Pole_Comm, PEXSI_Spatial_Comm, PEXSI_Spin_Comm
integer :: numNodesTotal
integer :: npPerPole
logical  :: PEXSI_worker
!
type(Distribution)   :: dist1
type(Distribution), allocatable, target   :: dist2_spin(:)
type(Distribution), pointer :: dist2
integer  :: pbs, color, spatial_rank, spin_rank

Define the Siesta distribution. Note that this is known to all nodes.

call MPI_Comm_Group(World_Comm,World_Group, ierr)
call MPI_Group_Size(SIESTA_Group, nworkers_SIESTA, ierr)
allocate(siesta_ranks_in_world(nworkers_SIESTA))
call MPI_Group_translate_ranks( SIESTA_Group, nworkers_SIESTA, &
     (/ (i,i=0,nworkers_SIESTA-1) /), &
     World_Group, siesta_ranks_in_world, ierr )
call newDistribution(dist1,World_Comm,siesta_ranks_in_world, &
                     TYPE_BLOCK_CYCLIC,BlockSize,"bc dist")
deallocate(siesta_ranks_in_world)
call MPI_Barrier(World_Comm,ierr)

For (possibly) spin-polarized calculations we split the communicator. Note that the user can request a different number of processors per pole for the LDOS calculation than for the rest of the PEXSI solver.

call mpi_comm_size( World_Comm, numNodesTotal, ierr )

npPerPole  = fdf_get("PEXSI.np-per-pole",4)
npPerPole  = fdf_get("PEXSI.LDOS.np-per-pole",npPerPole)
if (nspin*npPerPole > numNodesTotal) &
     call die("PEXSI.np-per-pole is too big for MPI size")

! "Row" communicator for independent PEXSI operations on each spin
! The name refers to "spatial" degrees of freedom.
color = mod(mpirank,nspin)    ! {0,1} for nspin = 2, or {0} for nspin = 1
call MPI_Comm_Split(World_Comm, color, mpirank, PEXSI_Spatial_Comm, ierr)

! "Column" communicator for spin reductions
color = mpirank/nspin       
call MPI_Comm_Split(World_Comm, color, mpirank, PEXSI_Spin_Comm, ierr)

! Group and Communicator for first-pole team of PEXSI workers
!
call MPI_Comm_Group(PEXSI_Spatial_Comm, PEXSI_Spatial_Group, Ierr)
call MPI_Group_incl(PEXSI_Spatial_Group, npPerPole,   &
     (/ (i,i=0,npPerPole-1) /),&
     PEXSI_Pole_Group, Ierr)
call MPI_Comm_create(PEXSI_Spatial_Comm, PEXSI_Pole_Group,&
     PEXSI_Pole_Comm, Ierr)


call mpi_comm_rank( PEXSI_Spatial_Comm, spatial_rank, ierr )
call mpi_comm_rank( PEXSI_Spin_Comm, spin_rank, ierr )
PEXSI_worker = (spatial_rank < npPerPole)   ! Could be spin up or spin down

! PEXSI blocksize 
pbs = norbs/npPerPole

! Careful with this. For the purposes of matrix transfers,
! we need the ranks of the Pole group
! in the "bridge" communicator/group (World)

allocate(pexsi_pole_ranks_in_world(npPerPole))
! (World_Group was already obtained from World_Comm above)

call MPI_Group_translate_ranks( PEXSI_Pole_Group, npPerPole, &
     (/ (i,i=0,npPerPole-1) /), &
     World_Group, pexsi_pole_ranks_in_world, ierr )

! What we need is to include the actual world ranks
! in the distribution object
allocate (PEXSI_Pole_ranks_in_World_Spin(npPerPole,nspin))
call MPI_AllGather(pexsi_pole_ranks_in_world,npPerPole,MPI_integer,&
     PEXSI_Pole_Ranks_in_World_Spin(1,1),npPerPole, &
     MPI_integer,PEXSI_Spin_Comm,ierr)

! Create distributions known to all nodes
allocate(dist2_spin(nspin))
do ispin = 1, nspin
   call newDistribution(dist2_spin(ispin), World_Comm, &
        PEXSI_Pole_Ranks_in_World_Spin(:,ispin),  &
        TYPE_PEXSI, pbs, "px dist")
enddo
deallocate(pexsi_pole_ranks_in_world,PEXSI_Pole_Ranks_in_World_Spin)
call MPI_Barrier(World_Comm,ierr)
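
To make the communicator layout above concrete, here is a minimal stand-alone sketch (not part of the module) that simply replays the coloring formulas for a hypothetical run with 8 MPI ranks and nspin = 2:

program show_split_colors
  ! Replays the color assignments of the two MPI_Comm_Split calls above,
  ! without any MPI: spatial color = mod(rank,nspin), spin color = rank/nspin.
  implicit none
  integer, parameter :: nspin = 2, nranks = 8
  integer :: mpirank
  do mpirank = 0, nranks - 1
     write(*,"(a,i2,a,i2,a,i2)") "rank", mpirank, &
          "   spatial (spin-set) color:", mod(mpirank, nspin), &
          "   spin-comm color:", mpirank/nspin
  end do
end program show_split_colors

With these numbers, ranks 0, 2, 4, 6 form the spin-1 spatial communicator and ranks 1, 3, 5, 7 the spin-2 one, while the pairs {0,1}, {2,3}, ... form the spin communicators used for reductions across spin.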

3.5 Re-distribute matrices

This is slightly unseemly, but it works. The aux_matrix derived types are used to store and retrieve the matrices on either side. The code lives in external auxiliary modules.
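
For reference, the fields of aux_matrix that are actually used in this routine are sketched below. This is only a conceptual reconstruction inferred from the usage here: the real declarations live in m_redist_spmatrix, and the inner value-container type name (value_array_t) is hypothetical.

module aux_matrix_sketch
  ! Conceptual sketch only -- NOT the actual m_redist_spmatrix declarations.
  implicit none
  integer, parameter :: dp = kind(1.0d0)

  type :: value_array_t                       ! hypothetical container name
     real(dp), pointer :: data(:) => null()
  end type value_array_t

  type :: aux_matrix
     integer :: norbs = 0                      ! global number of orbitals
     integer :: no_l  = 0                      ! orbitals handled locally
     integer :: nnzl  = 0                      ! local number of nonzeros
     integer, pointer :: numcols(:) => null()  ! nonzeros per local orbital
     integer, pointer :: cols(:)    => null()  ! column indices, orbital after orbital
     type(value_array_t), pointer :: vals(:) => null()  ! one entry per matrix (S, H, DM, ...)
  end type aux_matrix
end module aux_matrix_sketch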

type(aux_matrix), allocatable, target :: m1_spin(:)
type(aux_matrix) :: m2
type(aux_matrix), pointer :: m1
integer :: nrows, nnz, nnzLocal, numColLocal
integer, pointer, dimension(:) ::  colptrLocal=> null(), rowindLocal=>null()
!
real(dp), pointer, dimension(:) :: &
        HnzvalLocal=>null(), SnzvalLocal=>null(),  &
        DMnzvalLocal => null()
!
integer :: ispin, pexsi_spin
pexsi_spin = spin_rank+1  ! {1,2}
! This is done serially on the Siesta side, each time
! filling in the structures in one PEXSI set

allocate(m1_spin(nspin))
do ispin = 1, nspin

   m1 => m1_spin(ispin)

   if (SIESTA_worker) then
      m1%norbs = norbs
      m1%no_l  = no_l
      m1%nnzl  = sum(numH(1:no_l))
      m1%numcols => numH
      m1%cols    => listH
      allocate(m1%vals(2))
      m1%vals(1)%data => S(:)
      m1%vals(2)%data => H(:,ispin)

   endif  ! SIESTA_worker

   call timer("redist_orbs_fwd", 1)

   ! Note that we cannot simply wrap this in a pexsi_spin test, as
   ! there are Siesta nodes in both spin sets.
   ! We must discriminate the PEXSI workers by the distribution info
   dist2 => dist2_spin(ispin)
   call redistribute_spmatrix(norbs,m1,dist1,m2,dist2,World_Comm)

   call timer("redist_orbs_fwd", 2)

   if (PEXSI_worker .and. (pexsi_spin == ispin) ) then

      nrows = m2%norbs          ! or simply 'norbs'
      numColLocal = m2%no_l
      nnzLocal    = m2%nnzl
      call MPI_AllReduce(nnzLocal,nnz,1,MPI_integer,MPI_sum,PEXSI_Pole_Comm,ierr)

      call re_alloc(colptrLocal,1,numColLocal+1,"colptrLocal","pexsi_ldos")
      colptrLocal(1) = 1
      do ih = 1,numColLocal
         colptrLocal(ih+1) = colptrLocal(ih) + m2%numcols(ih)
      enddo

      rowindLocal => m2%cols
      SnzvalLocal => m2%vals(1)%data
      HnzvalLocal => m2%vals(2)%data

      call re_alloc(DMnzvalLocal,1,nnzLocal,"DMnzvalLocal","pexsi_ldos")

      call memory_all("after transferring H+S for PEXSI-LDOS",PEXSI_Pole_Comm)

   endif ! PEXSI worker
enddo

! Make these available to all nodes.
! (Note that the values are those on process 0, which is in the spin=1 set.)
! In fact, they are only needed for calls to the PEXSI interface, so the
! broadcast could be over PEXSI_Spatial_Comm only.

call MPI_Bcast(nrows,1,MPI_integer,0,World_Comm,ierr)
call MPI_Bcast(nnz,1,MPI_integer,0,World_Comm,ierr)

call memory_all("after setting up H+S for PEXSI LDOS",World_comm)

3.6 Set options

We use the options interface to get a template with default values, and then fill in a few custom options based on fdf variables.

type(f_ppexsi_options) :: options
!
integer                :: isSIdentity
integer                :: verbosity
! -- 
  isSIdentity = 0
!
  call f_ppexsi_set_default_options( options )
  ! Ordering flag:
  !   1: Use METIS
  !   0: Use ParMETIS/PT-SCOTCH
  options%ordering = fdf_get("PEXSI.ordering",1)
  ! Number of processors for the symbolic factorization
  ! (only relevant for ParMETIS/PT-SCOTCH)
  options%npSymbFact = fdf_get("PEXSI.np-symbfact",1)
  options%verbosity = fdf_get("PEXSI.verbosity",1)

3.7 Prepare plan

Each spin set of PEXSI processors has its own plan. Only the first-pole group of nodes will do actual work here, but, as noted in the code below, the plan has to be initialized over the full spatial communicator.

integer(c_intptr_t)    :: plan
  integer :: numProcRow, numProcCol
  integer :: outputFileIndex
  call get_row_col(npPerPole,numProcRow,numProcCol)

  ! Set the outputFileIndex to be the pole index.
  ! Starting from PEXSI v0.8.0, the first processor for each pole outputs
  ! information

  if ( mod(mpirank, npPerPole) == 0 ) then
    outputFileIndex = mpirank / npPerPole
  else
    outputFileIndex = -1
  endif
!
! Note that even though we only use one pole's worth of processors, we
! still use the full spatial PEXSI communicator in the plan.
! Failing to do so leads to an error. This is not sufficiently documented.
!
  plan = f_ppexsi_plan_initialize(&
      PEXSI_Spatial_Comm,&
      numProcRow,&
      numProcCol,&
      outputFileIndex,&
      info) 

call check_info(info,"plan_initialize in LDOS")

3.8 Load H and S matrices

In this version H and S are symmetric. We associate them with the plan (I do not know exactly what happens behind the scenes; presumably no copy is made).

call f_ppexsi_load_real_symmetric_hs_matrix(&
      plan,&
      options,&
      nrows,&
      nnz,&
      nnzLocal,&
      numColLocal,&
      colptrLocal,&
      rowindLocal,&
      HnzvalLocal,&
      isSIdentity,&
      SnzvalLocal,&
      info) 

call check_info(info,"load_real_sym_hs_matrix in LDOS")

3.9 Factorization

This is a bit ambiguous, as we have loaded real "symmetric" matrices (actually H and S), but I believe that internally (and in the plan) the complex structures needed for H - zS are set up and filled in.

  call f_ppexsi_symbolic_factorize_complex_symmetric_matrix(&
       plan, &
       options,&
       info)
  call check_info(info,"factorize complex matrix in LDOS")

options%isSymbolicFactorize = 0 ! We do not need it anymore

3.10 Compute the LDOS

real(dp), pointer, dimension(:) :: AnzvalLocal => null()
real(dp), pointer, dimension(:) :: AinvnzvalLocal => null()
integer :: loc

Note that only the first-pole team does this.
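
In formulas, what is computed below is the energy-resolved ("partial") density matrix

   DM_LDOS = (1/pi) * Im [ (H - zS)^(-1) ],   with  z = energy + i*broadening,

which equals -(1/pi) Im G(z) for the Green's function G(z) = (zS - H)^(-1). The selected inversion returns the elements of the inverse only on the sparsity pattern of H and S, which is all that is needed for a Siesta-style density matrix.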

if (PEXSI_worker) then

   if(mpirank == 0) then
      write(6,"(a,f16.5,f10.5)") &
           'Calling PEXSI LDOS routine. Energy and broadening (eV) ', &
           energy/eV, broadening/eV
      write(6,"(a,i4)") &
           'Processors working on selected inversion: ', npPerPole
   endif

   call timer("pexsi-ldos-selinv", 1)

   ! Build AnzvalLocal as H - zS, where z = energy + i*broadening, and treat
   ! it as a one-dimensional real array with 2*nnzLocal interleaved (re,im) entries

   call re_alloc(AnzvalLocal,1,2*nnzLocal,"AnzvalLocal","pexsi_ldos")
   call re_alloc(AinvnzvalLocal,1,2*nnzLocal,"AinvnzvalLocal","pexsi_ldos")

   loc = 1
   do i = 1, nnzLocal
      AnzvalLocal(loc) = Hnzvallocal(i) - energy*Snzvallocal(i)
      AnzvalLocal(loc+1) =  - broadening*Snzvallocal(i)
      loc = loc + 2
   enddo

   call f_ppexsi_selinv_complex_symmetric_matrix(&
        plan,&
        options,&
        AnzvalLocal,&
        AinvnzvalLocal,&
        info) 

   call check_info(info,"selinv complex matrix in LDOS")

   ! Get DMnzvalLocal as 1/pi * Imag(Ainv...)

   loc = 1
   do i = 1, nnzLocal
      DMnzvalLocal(i) = (1.0_dp/pi) * AinvnzvalLocal(loc+1)
      loc = loc + 2
   enddo
   call de_alloc(AnzvalLocal,"AnzvalLocal","pexsi_ldos")
   call de_alloc(AinvnzvalLocal,"AinvnzvalLocal","pexsi_ldos")

   call timer("pexsi-ldos-selinv", 2)
   !
endif ! PEXSI_worker

3.11 Copy information to Siesta side

do ispin = 1, nspin

   m1 => m1_spin(ispin)

   if (PEXSI_worker .and. (pexsi_spin == ispin)) then
      ! Prepare m2 to transfer

      call de_alloc(colPtrLocal,"colPtrLocal","pexsi_ldos")

      call de_alloc(m2%vals(1)%data,"m2%vals(1)%data","pexsi_ldos")
      call de_alloc(m2%vals(2)%data,"m2%vals(2)%data","pexsi_ldos")

      deallocate(m2%vals)
      allocate(m2%vals(1))
      m2%vals(1)%data => DMnzvalLocal(1:nnzLocal)

   endif

   ! Prepare m1 to receive the results
   if (SIESTA_worker) then
      nullify(m1%vals(1)%data)    ! formerly pointing to S
      nullify(m1%vals(2)%data)    ! formerly pointing to H
      deallocate(m1%vals)
      nullify(m1%numcols)         ! formerly pointing to numH
      nullify(m1%cols)            ! formerly pointing to listH
   endif

   call timer("redist_orbs_bck", 1)
   dist2 => dist2_spin(ispin)
   call redistribute_spmatrix(norbs,m2,dist2,m1,dist1,World_Comm)
   call timer("redist_orbs_bck", 2)

   if (PEXSI_worker .and. (pexsi_spin == ispin)) then
      call de_alloc(DMnzvalLocal, "DMnzvalLocal", "pexsi_ldos")

      nullify(m2%vals(1)%data)    ! formerly pointing to DM
      deallocate(m2%vals)
      ! allocated in the direct transfer
      call de_alloc(m2%numcols,"m2%numcols","pexsi_ldos")
      call de_alloc(m2%cols,   "m2%cols",   "pexsi_ldos")
   endif

   if (SIESTA_worker) then

      LDOS_DM(:,ispin)  = m1%vals(1)%data(:)    
      ! Check no_l
      if (no_l /= m1%no_l) then
         call die("Mismatch in no_l")
      endif
      ! Check listH
      if (any(listH(:) /= m1%cols(:))) then
         call die("Mismatch in listH")
      endif

      call de_alloc(m1%vals(1)%data,"m1%vals(1)%data","pexsi_ldos")
      deallocate(m1%vals)
      ! allocated in the direct transfer
      call de_alloc(m1%numcols,"m1%numcols","pexsi_ldos") 
      call de_alloc(m1%cols,   "m1%cols",   "pexsi_ldos")

   endif
enddo
call timer("pexsi-ldos", 2)

3.12 Clean up

call delete(dist1)
do ispin = 1, nspin
   call delete(dist2_spin(ispin))
enddo
deallocate(dist2_spin)
deallocate(m1_spin)

call MPI_Comm_Free(PEXSI_Spatial_Comm, ierr)
call MPI_Comm_Free(PEXSI_Spin_Comm, ierr)
call MPI_Comm_Free(World_Comm, ierr)

! This communicator was created from a subgroup.
! As such, it is MPI_COMM_NULL for those processes
! not in the subgroup (non PEXSI_workers). Only
! defined communicators can be freed
if (PEXSI_worker) then
   call MPI_Comm_Free(PEXSI_Pole_Comm, ierr)
endif

! We finalize the plan here
call f_ppexsi_plan_finalize( plan, info )

call MPI_Group_Free(PEXSI_Spatial_Group, ierr)
call MPI_Group_Free(PEXSI_Pole_Group, ierr)
call MPI_Group_Free(World_Group, ierr)

3.13 Support routines

A couple of support routines follow.

<<get-row-col>>
<<check-info>>

3.13.1 Row and column partition of npPerPole

subroutine get_row_col(np,nrow,ncol)
integer, intent(in)  :: np
integer, intent(out) :: nrow, ncol
!
! Finds factors nrow and ncol such that nrow*ncol = np,
! with nrow and ncol as similar as possible and nrow >= ncol.
! For prime np, ncol = 1 and nrow = np.

ncol  = floor(sqrt(dble(np)))
do
  nrow = np/ncol
  if (nrow*ncol == np) exit
  ncol = ncol - 1
enddo
end subroutine get_row_col
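
For example, np = 12 gives nrow = 4 and ncol = 3, np = 16 gives nrow = ncol = 4, and a prime np = 7 ends up with nrow = 7 and ncol = 1.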

3.13.2 Error dispatcher

subroutine check_info(info,str)
integer, intent(in) :: info
character(len=*), intent(in) :: str

    if (mpirank == 0) then
       if (info /= 0) then
          write(6,*) trim(str) // " info : ", info
          call die("Error exit from " // trim(str) // " routine")
       endif
       call pxfflush(6)
    endif
end subroutine check_info

Author: Alberto Garcia

Created: 2016-09-14 Wed 11:32
