======================
SMITER Parallelisation
======================

:Author: Leon Kos

For large models, and therefore many triangles to calculate, most of the
computation time is spent in the ``powcal`` code.

.. highlight:: diff

Introducing MPI to powcal
-------------------------

The compiler is changed to support MPI when building ``powcal``::

    diff --git a/f95/Makefile.powcal b/f95/Makefile.powcal
    index 606e157..00783cf 100644
    --- a/f95/Makefile.powcal
    +++ b/f95/Makefile.powcal
    @@ -72,6 +72,8 @@ MODULES=$(SOURCES:.f90=.mod)

     OBJECTS=$(SOURCES:.f90=.o)

    +F90 = mpif90
    +
     $(PROG): $(OBJECTS)
             $(F90) $(F90FLAGS) -o $(PROG) $(OBJECTS) $(LIBS)

Building ``powcal`` and submitting it to the SLURM scheduler is then possible with

.. code-block:: bash

    $ make -f Makefile.powcal
    $ sbatch a.cmd
    Submitted batch job 528580
    $ cat a.cmd
    #!/bin/bash
    #SBATCH -N1 -n5                    # 5 tasks on 1 node
    #SBATCH --time=0:05:00             # time limit: 5 minutes
    #SBATCH --error=myJob.err          # standard error file
    #SBATCH --output=myJob.out         # standard output file
    #SBATCH --partition=bdw_fua_gwdbg  # partition name (for Gateway: bdw_fua_gw)
    ##SBATCH --qos=bdw_fua_gw          # no quality of service for debug queue
    hostname=$(hostname)
    echo "Hello from ${hostname}"
    rm Powcal_powcal.out Powcal_powx.vtk
    mpirun ${HOME}/smiter/f95/powcal Powcal
    $ squeue -u $USER
        JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
       528670 bdw_fua_g    a.cmd   g2kosl  R       0:19      1 r040c03s01

The main program needs MPI initialisation and, as a first step, disabling of
all sections that write results to files::

    diff --git a/f95/powcal.f90 b/f95/powcal.f90
    index 25cff21..32c625b 100644
    --- a/f95/powcal.f90
    +++ b/f95/powcal.f90
    @@ -62,6 +62,8 @@ program powcal_p
       use odes_m
       use stack_m

    +  use mpi
    +
       implicit none

     ! Local variables
    @@ -83,6 +85,9 @@ program powcal_p
       integer(ki4):: ifldspec !< field specification
       real(kr8):: zivac !< value of I in field file
       real(kr8):: zfac !< ratio of I in different field files
    +
    +  integer error
    +  call MPI_Init ( error )
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -205,6 +210,7 @@ program powcal_p
       call powcal_refine(powcal,gshadl,btree)
       call clock_stop(8)
     !--------------------------------------------------------------------------
    +if (.false.) then
     ! field line ends
       if (powcal%powres%flinends) then
          call gfile_close
    @@ -248,6 +254,7 @@ program powcal_p
       call poutfile_write(powcal,timestamp)
       call poutfile_close
       call clock_stop(30)
    +end if
     !--------------------------------------------------------------------------
     !! cleanup and closedown
     ! if (powcal%powres%n%nanalau==1) then
    @@ -264,5 +271,6 @@ program powcal_p
       call log_close
       call clock_delete
     !--------------------------------------------------------------------------
    +  call MPI_Finalize ( error )
     end program powcal_p

However, nearly all modules write into a ``.log`` file through the log
module. Since several processes cannot safely write to the same open file, we
separate the *logs* by appending the processor *rank* to the filename::

    diff --git a/f95/log_m.f90 b/f95/log_m.f90
    index dfc8bf4..e07e6eb 100644
    --- a/f95/log_m.f90
    +++ b/f95/log_m.f90
    @@ -2,6 +2,7 @@ module log_m

       use const_kind_m
       use date_time_m
    +  use mpi

       implicit none
       private
    @@ -59,6 +60,9 @@ subroutine log_init(fileroot,timestamp)
       logical :: unitused !< flag to test unit is available
       integer(ki4) :: ilen !< length of string

    +  integer rank, error
    +  character(5) :: log_id
    +
     !! initialise counters
       errorno=0
       seriouserrors=0
    @@ -74,6 +78,10 @@ subroutine log_init(fileroot,timestamp)
     ! and reset file root
       fileroot=logfile(1:ilen)

    +  call MPI_Comm_Rank(MPI_COMM_WORLD, rank, error)
    +  write(log_id, '(I5.5)') rank
    +  logfile = trim(logfile)//'-'//trim(log_id)
    +
     !! get file unit
       do i=99,1,-1
         inquire(i,opened=unitused)

This way we get one *log* file per processor. As these log files tend to be
quite large, it would be better for performance to replace all logging with
*dummy* subroutines, or to prepare preprocessing macros that discard the log
statements in all relevant modules.

The main loops that can be parallelised are in the ``powcal_m.f90`` module,
in ``subroutine powcal_move(self, gshadl, btree)``. We distribute the
computation with a *stride* equal to the number of processes::

    diff --git a/f95/powcal_m.f90 b/f95/powcal_m.f90
    index 47bbf44..586204d 100644
    --- a/f95/powcal_m.f90
    +++ b/f95/powcal_m.f90
    @@ -25,6 +25,7 @@ module powcal_m
       use termplane_m
       use edgprof_h
       use edgprof_m
    +  use mpi

       implicit none
       private
    @@ -561,6 +562,10 @@ subroutine powcal_move(self,gshadl,btree)
       character(*), parameter :: s_name='powcal_move' !< subroutine name
       type(powelt_t) :: zelt !< power element

    +  INTEGER rank, error, processes
    +  call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +
       ! check for axisymmetric
       if (self%powres%beq%n%vacfile=='null') then
          if (self%powres%beq%n%mrip/=0) then
    @@ -578,7 +583,8 @@ subroutine powcal_move(self,gshadl,btree)
     ! axisymmetric (no ripple field)
       field_type: select case (self%powres%beq%n%fldspec)
       case (1) ! should only be for caltype='local', no termplane
    -     do i=1,self%powres%npowe
    +     print *, 'MPIDEBUG 0:', self%powres%npowe
    +     do i=1+rank,self%powres%npowe,processes
             zelt%ie=i
             do j=imlevel,inlevel
               zelt%je=j
               call powelt_move(zelt,self,gshadl,btree)

Additionally, the other ``powelt_move[0-4]`` subroutines need the same
breakdown of the workload.

Collecting power elements from neighboring processes
-----------------------------------------------------

The ``powcal`` computation starts with ``powcal_move``, which follows a field
line from each triangle of the target until it either hits a shadowing
surface or misses it, in which case the triangle is *wetted*. This particle
tracing is the most expensive operation, and its end result is a label
marking each triangle as shadowed or not. The allocatable array
``powres%pow`` therefore needs to be collected from the neighboring processes
back into the rank 0 process, which can then continue the calculation
serially: in routine ``powcal_calc`` the power deposition of each *wetted*
element is calculated with routine ``powelt_dep`` and stored into the arrays
``powres%pmask``, ``powres%psista`` and ``powres%angle``. Afterwards,
statistics are computed for each element in routine ``powcal_stat`` with
``powelt_stat`` and ``powelt_avg`` and stored into the array ``powres%powa``.

The ``powelt_move*`` subroutines take the ``zelt`` index and fill ``self``,
a ``powcal_t`` data structure. The ``powres`` structure, described in the
:file:`powres_t.f90` file, contains *allocatable* arrays that need to be
correctly sent and received with MPI; the relevant type definitions are
listed below, after the sketch.
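
To make the strided decomposition and the collection step concrete, the
following is a minimal, self-contained sketch (not SMITER code): the array
``pow``, its size ``nelem`` and the "work" done on each element are
placeholders, but the ownership rule -- element ``i`` belongs to rank
``mod(i-1, nproc)`` -- and the send back to rank 0 mirror the scheme used in
``powcal_move``.

.. code-block:: fortran

    program stride_demo
      use mpi
      implicit none
      integer, parameter :: nelem = 10       ! placeholder for npowe
      real :: pow(nelem)
      integer :: rank, nproc, error, i, owner
      integer :: status(MPI_STATUS_SIZE)

      call MPI_Init(error)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
      call MPI_Comm_size(MPI_COMM_WORLD, nproc, error)

      pow = 0.0
      ! each rank computes only its own strided subset of the elements
      do i = 1 + rank, nelem, nproc
         pow(i) = real(i)**2                 ! placeholder for powelt_move work
      end do

      if (rank == 0) then
         ! rank 0 collects every element it does not own from its owner
         do i = 1, nelem
            owner = mod(i - 1, nproc)
            if (owner /= 0) then
               call MPI_Recv(pow(i), 1, MPI_REAL, owner, i, &
                    MPI_COMM_WORLD, status, error)
            end if
         end do
         print *, 'collected pow =', pow
      else
         ! the other ranks return their elements to rank 0, tagged by index
         do i = 1 + rank, nelem, nproc
            call MPI_Send(pow(i), 1, MPI_REAL, 0, i, MPI_COMM_WORLD, error)
         end do
      end if

      call MPI_Finalize(error)
    end program stride_demo

Sending the elements back one by one keeps the sketch short, but it produces
one message per element; the rest of this section deals with collecting the
results more efficiently.
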

.. code-block:: fortran

    type(powelt_t) :: zelt !< power element
    type(powcal_t), intent(inout) :: self !< powcal data structure

    !> data structure describing powelt
    type, public :: powelt_t
       integer(ki4) :: ie !< geometrical object
       integer(ki4) :: je !< sub-element object
    end type powelt_t

    !> type describing all power calculation data
    type, public :: powcal_t
       type(powres_t) :: powres !< powres
       type(edgprof_t) :: edgprof !< edgprof
       type(powlau_t) :: powlau !< powlau
       type(odes_t) :: odes !< odes
       type(pnumerics_t) :: n !< control parameters
    end type powcal_t

    !> type storing power results data
    type, public :: powres_t
       type(posveclis_t) :: vecb !< list of B-vectors
       type(posveclis_t) :: vecx !< list of Cartesian position vectors
       type(geobjlist_t) :: geobjl !< geobj list
       type(beq_t) :: beq !< beq (only part defined)
       integer(ki4) :: npowe !< number of powelts (ie. level 1 triangles)
       integer(ki4) :: npow !< number of entries in pow
       integer(ki4) :: npows !< number of entries in pows (normally use npowe)
       real(kr8) :: rblfac !< Lomas' factor \f$ \frac{1}{2 \pi \lambda_{mid} R_m B_{pm}} \f$
       real(kr8) :: fpfac !< Lomas' factor \f$ \frac{F P_{loss}}{2 \pi \lambda_{mid} R_m B_{pm}} \f$
       real(kr8) :: slfac !< factor \f$ \frac{\sigma}{2 \lambda_{mid}} \f$
       real(kr8) :: qfac !< quantising factor
       real(kr8), dimension(3) :: qfaca !< quantising factors
       real(kr4), dimension(:), allocatable :: pow !< power
       integer(ki4), dimension(:), allocatable :: pmask !< power mask
       real(kr4), dimension(:), allocatable :: pows !< power statistic
       real(kr4), dimension(:), allocatable :: powa !< power average statistic
       real(kr4), dimension(:), allocatable :: psista !< \f$ \psi \f$ at start of fieldline
       !-----------------
       real(kr4), dimension(:), allocatable :: angle
       real(kr4), dimension(:), allocatable :: zbdotnvec
       !-----------------
       type(prnumerics_t) :: n !< control parameters
       real(kr8) :: psimin !< minimum value of \f$ \psi \f$ for interpolation
       real(kr8) :: psimax !< maximum value of \f$ \psi \f$ for interpolation
       real(kr8) :: thetamin !< minimum value of \f$ \theta \f$ for interpolation
       real(kr8) :: thetamax !< maximum value of \f$ \theta \f$ for interpolation
       logical :: flincart !< plot switch for field line following
       logical :: flinx !< plot switch for field line following
       logical :: flinends !< gnuplot switch for field line ends
       logical :: flinptz !< plot switch for field line following in \f$ (\psi,\theta,\zeta) \f$
       logical :: flinm !< plot switch for field line following in \f$ (\psi,\theta,\zeta) \f$
       integer(ki4) :: nflends !< gnuplot file unit for field line ends
    end type powres_t

We can continue using the *strided* power element calculation with the
following update of the loop after ``powelt_move*``::

    diff --git a/f95/powcal_m.f90 b/f95/powcal_m.f90
    index 586204d..c40a975 100644
    --- a/f95/powcal_m.f90
    +++ b/f95/powcal_m.f90
    @@ -562,7 +562,9 @@ subroutine powcal_move(self,gshadl,btree)
       character(*), parameter :: s_name='powcal_move' !< subroutine name
       type(powelt_t) :: zelt !< power element

    -  INTEGER rank, error, processes
    +  INTEGER rank, error, processes, request, mpi_status(MPI_STATUS_SIZE)
    +  INTEGER expected_number_of_elements !< to receive in rank 0
    +  INTEGER k !< counter for elements to be sent
       call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
       call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    @@ -590,7 +592,46 @@ subroutine powcal_move(self,gshadl,btree)
             zelt%je=j
             call powelt_move(zelt,self,gshadl,btree)
          end do
    +     if (rank .eq. 0) then
    +        if (self%powres%npowe - i .ge. processes) then
    +           expected_number_of_elements = processes - 1
    +        else
    +           expected_number_of_elements = self%powres%npowe - i
    +        end if
    +        do k=1, expected_number_of_elements
    +           call MPI_IRecv(self%powres%pow(i+k), 1, MPI_REAL, &
    +           &    k, 111+i, MPI_COMM_WORLD, request, error)
    +        end do
    +     else
    +        call MPI_ISend(self%powres%pow(i), 1, MPI_REAL, &
    +        &    0, 111+i, MPI_COMM_WORLD, request, error)
    +     end if
    +     call MPI_Barrier(MPI_COMM_WORLD, error)
       end do
     case default ! should only be fldspec=3, all caltypes with termplane
       calcn_type: select case (self%n%caltype)
       case ('afws','local')

Without an ``MPI_Barrier`` at the end of each "swarm" of non-blocking
communication we receive the message *Exhausted 1048576 MQ irecv request
descriptors, which usually indicates a user program error or insufficient
request descriptors (PSM2_MQ_RECVREQS_MAX=1048576)*. The message indicates
that we must wait for the outstanding requests to complete at least once
every million messages before continuing.

The rank 0 process therefore collects the results and finally prepares the
output files when we enable the previously disabled write sections for
rank 0 only::

    diff --git a/f95/powcal.f90 b/f95/powcal.f90
    index 32c625b..980961e 100644
    --- a/f95/powcal.f90
    +++ b/f95/powcal.f90
    @@ -86,7 +86,7 @@ program powcal_p
       real(kr8):: zivac !< value of I in field file
       real(kr8):: zfac !< ratio of I in different field files

    -  integer error
    +  integer error, rank
       call MPI_Init ( error )
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -210,7 +210,9 @@ program powcal_p
       call powcal_refine(powcal,gshadl,btree)
       call clock_stop(8)
     !--------------------------------------------------------------------------
    -if (.false.) then
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +
    +if (rank .eq. 0) then
     ! field line ends
       if (powcal%powres%flinends) then
          call gfile_close

Timings for the NF55 benchmark case, reported as ``powcal evaluation time``
in the output, are the following:

========== =========== ===============
Processors run time(s) evaluation time
========== =========== ===============
1          184         172
2          104         90
4          62          47
8          42          26
16         29          16
32         26          14
64         26          14
========== =========== ===============

Disabling processor logging with::

    diff --git a/f95/log_m.f90 b/f95/log_m.f90
    index e07e6eb..d7c2aff 100644
    --- a/f95/log_m.f90
    +++ b/f95/log_m.f90
    @@ -78,6 +78,7 @@ subroutine log_init(fileroot,timestamp)
     ! and reset file root
       fileroot=logfile(1:ilen)

    +  return
       call MPI_Comm_Rank(MPI_COMM_WORLD, rank, error)
       write(log_id, '(I5.5)') rank
       logfile = trim(logfile)//'-'//trim(log_id)
    @@ -113,7 +114,7 @@ subroutine log_error(modname,subname,point,severity,errormessage)
       character(*), intent(in) :: errormessage !< error message

     !! local
    -
    +  return
       if(severity
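
In the spirit of the earlier suggestion to replace the logging with dummy
subroutines, a cleaner alternative to scattering ``return`` statements would
be a run-time switch that silences every rank except rank 0. A minimal
sketch of such a switch (a hypothetical helper module, not part of the
SMITER sources) could look like this:

.. code-block:: fortran

    !> hypothetical run-time switch for logging; the log_m routines would
    !> test log_enabled and return immediately when it is .false.
    module log_switch_m
      implicit none
      private
      logical, public :: log_enabled = .true.
      public :: log_switch_init
    contains
      !> keep logging only on rank 0 and silence all other ranks
      subroutine log_switch_init(rank)
        integer, intent(in) :: rank
        log_enabled = (rank == 0)
      end subroutine log_switch_init
    end module log_switch_m

Each logging routine, e.g. ``log_error``, would then start with
``if (.not. log_enabled) return`` instead of an unconditional ``return``.
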

Sending and receiving the power elements one by one generates a very large
number of small messages. Instead, the strided subset of the ``powres``
arrays owned by each rank can be described with an MPI derived datatype, so
that the whole contribution of a rank is transferred with a single message
per array. The corresponding changes to ``powcal_move`` include::

    +  integer :: num_arr_irecv !< num. of 1d arrays to IRecv
    +  integer :: displacements(2)
    +  integer, dimension(:,:), allocatable :: array_of_statuses
    +  integer, dimension(:), allocatable :: array_of_requests
    +  integer :: mpi_kr4_stride_type, mpi_ki4_stride_type
    +
       call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
       call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    +  displacements(1) = 0 !< just one INT or REAL, at the MPI_LB address
    +  displacements(2) = 4*processes !< stride of reals and ints of 4 bytes
    +
    +  call MPI_Type_struct(2, blocklengths, displacements, &
    +  &    mpi_kr4_stride_array_types, mpi_kr4_stride_type, error)
    +  call MPI_Type_commit(mpi_kr4_stride_type, error)
    +
    +  call MPI_Type_struct(2, blocklengths, displacements, &
    +  &    mpi_ki4_stride_array_types, mpi_ki4_stride_type, error)
    +  call MPI_Type_commit(mpi_ki4_stride_type, error)
    +
    +  allocate(array_of_statuses(MPI_STATUS_SIZE, num_arr_irecv*processes))
    +  allocate(array_of_requests(num_arr_irecv*processes))
    +
       ! check for axisymmetric
       if (self%powres%beq%n%vacfile=='null') then
          if (self%powres%beq%n%mrip/=0) then
    @@ -635,6 +656,9 @@ subroutine powcal_move(self,gshadl,btree)
          end do
       end if

    +  call MPI_Type_free(mpi_ki4_stride_type, error)
    +  call MPI_Type_free(mpi_kr4_stride_type, error)
    +
     end subroutine powcal_move
     !-------------------------------------------------------------
     !> coordinate calculation of power deposition (=powres_XX)

Before we start processing the ``powelt_move*`` routines in the strided way,
rank 0 posts the non-blocking receives. The expected strided block count
depends on the process rank and can differ because of padding, according to
the total number of triangles in ``self%powres%npowe``.

.. code-block:: fortran

    if (use_non_blocking_communication) then
       if (rank .eq. 0) then
          allocate(array_of_statuses(MPI_STATUS_SIZE, &
          &        num_arr_irecv*(processes-1)))
          allocate(array_of_requests(num_arr_irecv*(processes-1)))
          do i = 1, processes-1 ! i == rank from which we expect array
             strided_block_count = (self%powres%npowe-i)/processes
             if (mod(self%powres%npowe-i, processes) .gt. 0) &
                  strided_block_count = strided_block_count + 1
             call MPI_IRecv(self%powres%pow(i+1), strided_block_count, &
                  & mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, &
                  & array_of_requests((i-1)*num_arr_irecv + 1), error)
          end do
       end if
    end if

When the calculation on each process is finished, the results are sent
immediately with a single ``MPI_Send`` from the neighboring processes to the
rank 0 process, which waits for all requests to complete. The order in which
the arrays are received is unspecified.

.. code-block:: fortran

    if (use_non_blocking_communication) then
       if (rank .eq. 0) then
          call MPI_Waitall(num_arr_irecv*(processes-1), array_of_requests, &
               & array_of_statuses, error)
          deallocate(array_of_statuses, array_of_requests)
       else
          strided_block_count = (self%powres%npowe-rank)/processes
          if (mod(self%powres%npowe-rank, processes) .gt. 0) &
               strided_block_count = strided_block_count + 1
          call MPI_Send(self%powres%pow(rank+1), strided_block_count, &
               & mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, &
               & error)
       end if
    else ! use blocking communication
       if (rank .eq. 0) then
          do i=1, processes-1 ! i = rank
             strided_block_count = (self%powres%npowe-i)/processes
             if (mod(self%powres%npowe-i, processes) .gt. 0) &
                  strided_block_count = strided_block_count + 1
             call MPI_Recv(self%powres%pow(i+1), strided_block_count, &
                  mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, mpi_status, error)
             call MPI_Recv(self%powres%angle(i+1), strided_block_count, &
                  mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, mpi_status, error)
          end do
       else
          strided_block_count = (self%powres%npowe-rank)/processes
          if (mod(self%powres%npowe-rank, processes) .gt. 0) &
               strided_block_count = strided_block_count + 1
          call MPI_Send(self%powres%pow(rank+1), strided_block_count, &
               mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, error)
          call MPI_Send(self%powres%angle(rank+1), strided_block_count, &
               mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, error)
       end if
    end if
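
The ``strided_block_count`` computed above is simply the number of elements
a given rank owns under the strided decomposition; it differs between ranks
when ``npowe`` is not a multiple of ``processes``. A small worked example
(hypothetical sizes, not taken from the benchmark case):

.. code-block:: fortran

    program strided_count_demo
      ! npowe = 10 elements distributed over processes = 4 ranks:
      !   rank 0 owns elements 1, 5, 9   ->  strided_block_count = 3
      !   rank 1 owns elements 2, 6, 10  ->  strided_block_count = 3
      !   rank 2 owns elements 3, 7      ->  strided_block_count = 2
      !   rank 3 owns elements 4, 8      ->  strided_block_count = 2
      implicit none
      integer :: npowe = 10, processes = 4
      integer :: rank, strided_block_count

      do rank = 0, processes - 1
         ! same ceiling division as in the blocking and non-blocking branches
         strided_block_count = (npowe - rank)/processes
         if (mod(npowe - rank, processes) .gt. 0) &
              strided_block_count = strided_block_count + 1
         print *, 'rank', rank, 'owns', strided_block_count, 'elements'
      end do
    end program strided_count_demo

The loop over ``rank`` only mimics what each process computes for itself in
the real code.
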

Instead of posting the non-blocking receives at the beginning, we can simply
use blocking communication that collects the results from each process at
the end of the ``powelt_move*`` loops, when the
``use_non_blocking_communication`` logical parameter is set to ``.false.``.

.. _blocking_vs_non_blocking_times:

.. table:: Run and evaluation times with blocking communication compared with the non-blocking strategy.

    ========== =========== =============== ================ ===============
    Processors run time(s) evaluation time RT non-blocking  ET non-blocking
    ========== =========== =============== ================ ===============
    1          185         170             180 (181)        165 (168)
    2          100         86              98 (96)          83 (84)
    4          58          43              57 (55)          42 (42)
    8          37          21              36 (33)          22 (21)
    16         26          12.2            26 (24)          12 (10)
    32         21          6.9             21 (19)          6.5 (5.3)
    64         19          4.9             19 (16)          4.2 (2.7)
    ========== =========== =============== ================ ===============

Timings comparing the blocking and non-blocking strategies are shown in
:numref:`blocking_vs_non_blocking_times`. The times in parentheses next to
the non-blocking times are the *no communication* times taken from
:numref:`barrier_times`. The difference between non-blocking and blocking
communication is small, as expected. When we compare these times with the
initial solution (cf. :numref:`barrier_times`) we see that the speedup
achieved in the last column is now nearly linear. Larger cases are now
possible, provided enough computing resources are available.

Conditional compilation WITH_MPI
--------------------------------

If we want to retain the serial code and still be able to compile *with MPI*
from the same sources, conditional compilation needs to be introduced. The
industry standard for Fortran is to use the CPP preprocessor. The
preprocessor is enabled by renaming the ``%.f90`` file extensions to
uppercase ``%.F90`` in the sources and Makefiles. This means that all
sources need to be renamed, so that the Makefiles can keep a single set of
rules for conditional compilation. Since all SMITER codes use the
:file:`log_m.F90` module, the serial codes can no longer be linked without
MPI. This means that the serial codes need to receive at least calls to
``MPI_Init()`` and ``MPI_Finalize()`` when compiling with ``WITH_MPI``. For
example, in :file:`geoq.F90` (see :numref:`single_mpi`) we have added
conditional use of the MPI module and forced only the rank 0 process to run
the serial code, even if *geoq* is started in parallel.

.. _single_mpi:

.. code-block:: diff
    :caption: Adaptation to serial processing

    diff --git a/f95/geoq.F90 b/f95/geoq.F90
    index be9f749..e88d160 100644
    --- a/f95/geoq.F90
    +++ b/f95/geoq.F90
    @@ -1,4 +1,7 @@
     program geoq_p
    +#ifdef WITH_MPI
    +  use mpi
    +#endif

       use const_kind_m
       use const_numphys_h
    @@ -72,6 +75,13 @@ program geoq_p
       integer(ki4):: ifldspec !< field specification
       real(kr8):: zivac !< value of I in field file
       character(len=80) :: ibuf !< character workspace
    +#ifdef WITH_MPI
    +  integer error, rank
    +  call MPI_Init ( error )
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +  if (rank .eq. 0) then
    +#endif
    +
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -383,5 +393,8 @@ program geoq_p
       call log_close
       call clock_delete
     !--------------------------------------------------------------------------
    -
    +#ifdef WITH_MPI
    +  end if
    +  call MPI_Finalize ( error )
    +#endif
     end program geoq_p

With all these defines, the MPI code is compiled only when the ``WITH_MPI``
variable is defined at compile time. Because it only makes sense to define
``WITH_MPI`` when compiling with an MPI compiler, it is reasonable to modify
the configuration make-files so that the define is added when the compiler
name contains ``mpi``. For example, the following update is needed for the
default compiler configuration::

    diff --git a/config/config_gfortran_dbg.inc b/config/config_gfortran_dbg.inc
    index 4755f45..88a13fe 100644
    --- a/config/config_gfortran_dbg.inc
    +++ b/config/config_gfortran_dbg.inc
    @@ -5,11 +5,14 @@
     EXEDIR = ../exec

     # Fortran compiler
    -F90 = gfortran
    +F90 ?= gfortran
     FC = gfortran

     # Fortran compiler flags
     F90FLAGS = -g -fbacktrace -fbounds-check -fexceptions
    +ifeq ($(findstring mpi,$(F90)),mpi)
    +  F90FLAGS += -DWITH_MPI
    +endif

The Fortran 90 compiler configuration variable :envvar:`F90` is now
"deferred" and therefore overridable from the command line. We can replace
the :envvar:`F90` compiler variable with an environment variable by typing
on the command line

.. code-block:: bash

    $ env F90=mpif90 exec/install

.. note::

    Codes compiled with :command:`mpif90` can still run on a single
    processor without the need for the :command:`mpirun` command.
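
The same guard pattern, reduced to a minimal standalone file, can serve as a
quick reference (a sketch only; the file name ``demo.F90`` is hypothetical
and not part of the SMITER sources). Compiled with ``gfortran`` it stays
purely serial; compiled with ``mpif90``, which defines ``WITH_MPI`` through
the configuration above, only rank 0 executes the serial part.

.. code-block:: fortran

    ! demo.F90 -- hypothetical example of the WITH_MPI guard pattern
    program demo
    #ifdef WITH_MPI
      use mpi
    #endif
      implicit none
    #ifdef WITH_MPI
      integer :: error, rank
      call MPI_Init(error)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
      if (rank .eq. 0) then
    #endif
      print *, 'serial part of the code runs here'
    #ifdef WITH_MPI
      end if
      call MPI_Finalize(error)
    #endif
    end program demo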