======================
SMITER Parallelisation
======================

:Author: Leon Kos

For large models, and therefore many triangles to calculate, most of the
computation time is spent in the ``powcal`` code.

.. highlight:: diff

Introducing MPI to powcal
-------------------------

The compiler is changed to support MPI when building ``powcal``::

    diff --git a/f95/Makefile.powcal b/f95/Makefile.powcal
    index 606e157..00783cf 100644
    --- a/f95/Makefile.powcal
    +++ b/f95/Makefile.powcal
    @@ -72,6 +72,8 @@ MODULES=$(SOURCES:.f90=.mod)

     OBJECTS=$(SOURCES:.f90=.o)

    +F90 = mpif90
    +
     $(PROG): $(OBJECTS)
             $(F90) $(F90FLAGS) -o $(PROG) $(OBJECTS) $(LIBS)

Building ``powcal`` and submitting it to the SLURM scheduler is then possible with

.. code-block:: bash

    $ make -f Makefile.powcal
    $ sbatch a.cmd
    Submitted batch job 528580
    $ cat a.cmd
    #!/bin/bash
    #SBATCH -N1 -n5                    # 5 tasks on 1 node
    #SBATCH --time=0:05:00             # time limit: 5 minutes
    #SBATCH --error=myJob.err          # standard error file
    #SBATCH --output=myJob.out         # standard output file
    #SBATCH --partition=bdw_fua_gwdbg  # partition name (for Gateway: bdw_fua_gw)
    ##SBATCH --qos=bdw_fua_gw          # no quality of service for debug queue
    hostname=$(hostname)
    echo "Hello from ${hostname}"
    rm Powcal_powcal.out Powcal_powx.vtk
    mpirun ${HOME}/smiter/f95/powcal Powcal
    $ squeue -u $USER
        JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
       528670 bdw_fua_g    a.cmd   g2kosl  R       0:19      1 r040c03s01

The main program needs MPI initialisation and, as a first step, disabling of
all sections that write results to files::

    diff --git a/f95/powcal.f90 b/f95/powcal.f90
    index 25cff21..32c625b 100644
    --- a/f95/powcal.f90
    +++ b/f95/powcal.f90
    @@ -62,6 +62,8 @@ program powcal_p
       use odes_m
       use stack_m

    +  use mpi
    +
       implicit none

     ! Local variables
    @@ -83,6 +85,9 @@ program powcal_p
       integer(ki4):: ifldspec !< field specification
       real(kr8):: zivac !< value of I in field file
       real(kr8):: zfac !< ratio of I in different field files
    +
    +  integer error
    +  call MPI_Init ( error )
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -205,6 +210,7 @@ program powcal_p
       call powcal_refine(powcal,gshadl,btree)
       call clock_stop(8)
     !--------------------------------------------------------------------------
    +if (.false.) then
     ! field line ends
       if (powcal%powres%flinends) then
          call gfile_close
    @@ -248,6 +254,7 @@ program powcal_p
       call poutfile_write(powcal,timestamp)
       call poutfile_close
       call clock_stop(30)
    +end if
     !--------------------------------------------------------------------------
     !! cleanup and closedown
     ! if (powcal%powres%n%nanalau==1) then
    @@ -264,5 +271,6 @@ program powcal_p
       call log_close
       call clock_delete
     !--------------------------------------------------------------------------
    +  call MPI_Finalize ( error )
     end program powcal_p

However, nearly all modules write into a ``.log`` file through the log
module. Since several processes cannot safely write to the same open file, we
separate the *logs* by appending the processor *rank* to the filename::

    diff --git a/f95/log_m.f90 b/f95/log_m.f90
    index dfc8bf4..e07e6eb 100644
    --- a/f95/log_m.f90
    +++ b/f95/log_m.f90
    @@ -2,6 +2,7 @@ module log_m

       use const_kind_m
       use date_time_m
    +  use mpi

       implicit none
       private
    @@ -59,6 +60,9 @@ subroutine log_init(fileroot,timestamp)
       logical :: unitused !< flag to test unit is available
       integer(ki4) :: ilen !< length of string

    +  integer rank, error
    +  character(5) :: log_id
    +
     !! initialise counters
       errorno=0
       seriouserrors=0
    @@ -74,6 +78,10 @@ subroutine log_init(fileroot,timestamp)
     ! and reset file root
       fileroot=logfile(1:ilen)

    +  call MPI_Comm_Rank(MPI_COMM_WORLD, rank, error)
    +  write(log_id, '(I5.5)') rank
    +  logfile = trim(logfile)//'-'//trim(log_id)
    +
     !! get file unit
       do i=99,1,-1
         inquire(i,opened=unitused)

This way we get one *log* file per processor. As these log files tend to be
quite large, it would be better for performance to replace all logging with
*dummy* subroutines, or to prepare preprocessing macros that discard the log
statements in all relevant modules.

The main loops that can be parallelised are in the ``powcal_m.f90`` module,
in ``subroutine powcal_move(self, gshadl, btree)``. We distribute the
computation with a *stride* equal to the number of processes::

    diff --git a/f95/powcal_m.f90 b/f95/powcal_m.f90
    index 47bbf44..586204d 100644
    --- a/f95/powcal_m.f90
    +++ b/f95/powcal_m.f90
    @@ -25,6 +25,7 @@ module powcal_m
       use termplane_m
       use edgprof_h
       use edgprof_m
    +  use mpi

       implicit none
       private
    @@ -561,6 +562,10 @@ subroutine powcal_move(self,gshadl,btree)
       character(*), parameter :: s_name='powcal_move' !< subroutine name
       type(powelt_t) :: zelt !< power element

    +  INTEGER rank, error, processes
    +  call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +
       ! check for axisymmetric
       if (self%powres%beq%n%vacfile=='null') then
          if (self%powres%beq%n%mrip/=0) then
    @@ -578,7 +583,8 @@ subroutine powcal_move(self,gshadl,btree)
     ! axisymmetric (no ripple field)
       field_type: select case (self%powres%beq%n%fldspec)
       case (1) ! should only be for caltype='local', no termplane
    -     do i=1,self%powres%npowe
    +     print *, 'MPIDEBUG 0:', self%powres%npowe
    +     do i=1+rank,self%powres%npowe,processes
             zelt%ie=i
             do j=imlevel,inlevel
               zelt%je=j
               call powelt_move(zelt,self,gshadl,btree)

Additionally, the other ``powelt_move[0-4]`` subroutines need the same
breakdown of the workload.

Collecting power elements from neighboring processes
-----------------------------------------------------

The ``powcal`` computation starts with ``powcal_move``, which follows a field
line from each triangle of the target until it either hits a shadowing
surface or misses it, in which case the triangle is *wetted*. This particle
tracing is the most expensive operation, and its end result is a label
marking each triangle as shadowed or not. The allocatable array
``powres%pow`` therefore needs to be collected from the neighboring processes
back into the rank 0 process, which can then continue the calculation
serially: in routine ``powcal_calc`` the power deposition of each *wetted*
element is calculated with routine ``powelt_dep`` and stored into the arrays
``powres%pmask``, ``powres%psista`` and ``powres%angle``. Afterwards,
statistics are computed for each element in routine ``powcal_stat`` with
``powelt_stat`` and ``powelt_avg`` and stored into the array ``powres%powa``.

The ``powelt_move*`` subroutines take the ``zelt`` index and fill ``self``,
a ``powcal_t`` data structure. The ``powres`` structure, described in the
:file:`powres_t.f90` file, contains *allocatable* arrays that need to be
correctly sent and received with MPI; the relevant type definitions are
listed below, after the sketch.
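
To make the strided decomposition and the collection step concrete, the
following is a minimal, self-contained sketch (not SMITER code): the array
``pow``, its size ``nelem`` and the "work" done on each element are
placeholders, but the ownership rule -- element ``i`` belongs to rank
``mod(i-1, nproc)`` -- and the send back to rank 0 mirror the scheme used in
``powcal_move``.

.. code-block:: fortran

    program stride_demo
      use mpi
      implicit none
      integer, parameter :: nelem = 10       ! placeholder for npowe
      real :: pow(nelem)
      integer :: rank, nproc, error, i, owner
      integer :: status(MPI_STATUS_SIZE)

      call MPI_Init(error)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
      call MPI_Comm_size(MPI_COMM_WORLD, nproc, error)

      pow = 0.0
      ! each rank computes only its own strided subset of the elements
      do i = 1 + rank, nelem, nproc
         pow(i) = real(i)**2                 ! placeholder for powelt_move work
      end do

      if (rank == 0) then
         ! rank 0 collects every element it does not own from its owner
         do i = 1, nelem
            owner = mod(i - 1, nproc)
            if (owner /= 0) then
               call MPI_Recv(pow(i), 1, MPI_REAL, owner, i, &
                    MPI_COMM_WORLD, status, error)
            end if
         end do
         print *, 'collected pow =', pow
      else
         ! the other ranks return their elements to rank 0, tagged by index
         do i = 1 + rank, nelem, nproc
            call MPI_Send(pow(i), 1, MPI_REAL, 0, i, MPI_COMM_WORLD, error)
         end do
      end if

      call MPI_Finalize(error)
    end program stride_demo

Sending the elements back one by one keeps the sketch short, but it produces
one message per element; the rest of this section deals with collecting the
results more efficiently.
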

.. code-block:: fortran

    type(powelt_t) :: zelt !< power element
    type(powcal_t), intent(inout) :: self !< powcal data structure

    !> data structure describing powelt
    type, public :: powelt_t
       integer(ki4) :: ie !< geometrical object
       integer(ki4) :: je !< sub-element object
    end type powelt_t

    !> type describing all power calculation data
    type, public :: powcal_t
       type(powres_t) :: powres !< powres
       type(edgprof_t) :: edgprof !< edgprof
       type(powlau_t) :: powlau !< powlau
       type(odes_t) :: odes !< odes
       type(pnumerics_t) :: n !< control parameters
    end type powcal_t

    !> type storing power results data
    type, public :: powres_t
       type(posveclis_t) :: vecb !< list of B-vectors
       type(posveclis_t) :: vecx !< list of Cartesian position vectors
       type(geobjlist_t) :: geobjl !< geobj list
       type(beq_t) :: beq !< beq (only part defined)
       integer(ki4) :: npowe !< number of powelts (ie. level 1 triangles)
       integer(ki4) :: npow !< number of entries in pow
       integer(ki4) :: npows !< number of entries in pows (normally use npowe)
       real(kr8) :: rblfac !< Lomas' factor \f$ \frac{1}{2 \pi \lambda_{mid} R_m B_{pm}} \f$
       real(kr8) :: fpfac !< Lomas' factor \f$ \frac{F P_{loss}}{2 \pi \lambda_{mid} R_m B_{pm}} \f$
       real(kr8) :: slfac !< factor \f$ \frac{\sigma}{2 \lambda_{mid}} \f$
       real(kr8) :: qfac !< quantising factor
       real(kr8), dimension(3) :: qfaca !< quantising factors
       real(kr4), dimension(:), allocatable :: pow !< power
       integer(ki4), dimension(:), allocatable :: pmask !< power mask
       real(kr4), dimension(:), allocatable :: pows !< power statistic
       real(kr4), dimension(:), allocatable :: powa !< power average statistic
       real(kr4), dimension(:), allocatable :: psista !< \f$ \psi \f$ at start of fieldline
       !-----------------
       real(kr4), dimension(:), allocatable :: angle
       real(kr4), dimension(:), allocatable :: zbdotnvec
       !-----------------
       type(prnumerics_t) :: n !< control parameters
       real(kr8) :: psimin !< minimum value of \f$ \psi \f$ for interpolation
       real(kr8) :: psimax !< maximum value of \f$ \psi \f$ for interpolation
       real(kr8) :: thetamin !< minimum value of \f$ \theta \f$ for interpolation
       real(kr8) :: thetamax !< maximum value of \f$ \theta \f$ for interpolation
       logical :: flincart !< plot switch for field line following
       logical :: flinx !< plot switch for field line following
       logical :: flinends !< gnuplot switch for field line ends
       logical :: flinptz !< plot switch for field line following in \f$ (\psi,\theta,\zeta) \f$
       logical :: flinm !< plot switch for field line following in \f$ (\psi,\theta,\zeta) \f$
       integer(ki4) :: nflends !< gnuplot file unit for field line ends
    end type powres_t

We can continue using the *strided* power element calculation with the
following update of the loop after ``powelt_move*``::

    diff --git a/f95/powcal_m.f90 b/f95/powcal_m.f90
    index 586204d..c40a975 100644
    --- a/f95/powcal_m.f90
    +++ b/f95/powcal_m.f90
    @@ -562,7 +562,9 @@ subroutine powcal_move(self,gshadl,btree)
       character(*), parameter :: s_name='powcal_move' !< subroutine name
       type(powelt_t) :: zelt !< power element

    -  INTEGER rank, error, processes
    +  INTEGER rank, error, processes, request, mpi_status(MPI_STATUS_SIZE)
    +  INTEGER expected_number_of_elements !< to receive in rank 0
    +  INTEGER k !< counter for elements to be sent
       call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
       call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    @@ -590,7 +592,46 @@ subroutine powcal_move(self,gshadl,btree)
             zelt%je=j
             call powelt_move(zelt,self,gshadl,btree)
          end do
    +     if (rank .eq. 0) then
    +        if (self%powres%npowe - i .ge. processes) then
    +           expected_number_of_elements = processes - 1
    +        else
    +           expected_number_of_elements = self%powres%npowe - i
    +        end if
    +        do k=1, expected_number_of_elements
    +           call MPI_IRecv(self%powres%pow(i+k), 1, MPI_REAL, &
    +           &    k, 111+i, MPI_COMM_WORLD, request, error)
    +        end do
    +     else
    +        call MPI_ISend(self%powres%pow(i), 1, MPI_REAL, &
    +        &    0, 111+i, MPI_COMM_WORLD, request, error)
    +     end if
    +     call MPI_Barrier(MPI_COMM_WORLD, error)
       end do
     case default ! should only be fldspec=3, all caltypes with termplane
       calcn_type: select case (self%n%caltype)
       case ('afws','local')

Without an ``MPI_Barrier`` at the end of each "swarm" of non-blocking
communication we receive the message *Exhausted 1048576 MQ irecv request
descriptors, which usually indicates a user program error or insufficient
request descriptors (PSM2_MQ_RECVREQS_MAX=1048576)*. The message indicates
that we must wait for the outstanding requests to complete at least once
every million messages before continuing.

The rank 0 process therefore collects the results and finally prepares the
output files when we enable the previously disabled write sections for
rank 0 only::

    diff --git a/f95/powcal.f90 b/f95/powcal.f90
    index 32c625b..980961e 100644
    --- a/f95/powcal.f90
    +++ b/f95/powcal.f90
    @@ -86,7 +86,7 @@ program powcal_p
       real(kr8):: zivac !< value of I in field file
       real(kr8):: zfac !< ratio of I in different field files

    -  integer error
    +  integer error, rank
       call MPI_Init ( error )
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -210,7 +210,9 @@ program powcal_p
       call powcal_refine(powcal,gshadl,btree)
       call clock_stop(8)
     !--------------------------------------------------------------------------
    -if (.false.) then
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +
    +if (rank .eq. 0) then
     ! field line ends
       if (powcal%powres%flinends) then
          call gfile_close

Timings for the NF55 benchmark case, reported as ``powcal evaluation time``
in the output, are the following:

========== =========== ===============
Processors run time(s) evaluation time
========== =========== ===============
1          184         172
2          104         90
4          62          47
8          42          26
16         29          16
32         26          14
64         26          14
========== =========== ===============

Disabling processor logging with::

    diff --git a/f95/log_m.f90 b/f95/log_m.f90
    index e07e6eb..d7c2aff 100644
    --- a/f95/log_m.f90
    +++ b/f95/log_m.f90
    @@ -78,6 +78,7 @@ subroutine log_init(fileroot,timestamp)
     ! and reset file root
       fileroot=logfile(1:ilen)

    +  return
       call MPI_Comm_Rank(MPI_COMM_WORLD, rank, error)
       write(log_id, '(I5.5)') rank
       logfile = trim(logfile)//'-'//trim(log_id)
    @@ -113,7 +114,7 @@ subroutine log_error(modname,subname,point,severity,errormessage)
       character(*), intent(in) :: errormessage !< error message

     !! local
    -
    +  return
       if(severity
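
In the spirit of the earlier suggestion to replace the logging with dummy
subroutines, a cleaner alternative to scattering ``return`` statements would
be a run-time switch that silences every rank except rank 0. A minimal
sketch of such a switch (a hypothetical helper module, not part of the
SMITER sources) could look like this:

.. code-block:: fortran

    !> hypothetical run-time switch for logging; the log_m routines would
    !> test log_enabled and return immediately when it is .false.
    module log_switch_m
      implicit none
      private
      logical, public :: log_enabled = .true.
      public :: log_switch_init
    contains
      !> keep logging only on rank 0 and silence all other ranks
      subroutine log_switch_init(rank)
        integer, intent(in) :: rank
        log_enabled = (rank == 0)
      end subroutine log_switch_init
    end module log_switch_m

Each logging routine, e.g. ``log_error``, would then start with
``if (.not. log_enabled) return`` instead of an unconditional ``return``.
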

Sending and receiving the power elements one by one generates a very large
number of small messages. Instead, the strided subset of the ``powres``
arrays owned by each rank can be described with an MPI derived datatype, so
that the whole contribution of a rank is transferred with a single message
per array. The corresponding changes to ``powcal_move`` include::

    +  integer :: num_arr_irecv !< num. of 1d arrays to IRecv
    +  integer :: displacements(2)
    +  integer, dimension(:,:), allocatable :: array_of_statuses
    +  integer, dimension(:), allocatable :: array_of_requests
    +  integer :: mpi_kr4_stride_type, mpi_ki4_stride_type
    +
       call MPI_Comm_size (MPI_COMM_WORLD, processes, error)
       call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)

    +  displacements(1) = 0 !< just one INT or REAL, at the MPI_LB address
    +  displacements(2) = 4*processes !< stride of reals and ints of 4 bytes
    +
    +  call MPI_Type_struct(2, blocklengths, displacements, &
    +  &    mpi_kr4_stride_array_types, mpi_kr4_stride_type, error)
    +  call MPI_Type_commit(mpi_kr4_stride_type, error)
    +
    +  call MPI_Type_struct(2, blocklengths, displacements, &
    +  &    mpi_ki4_stride_array_types, mpi_ki4_stride_type, error)
    +  call MPI_Type_commit(mpi_ki4_stride_type, error)
    +
    +  allocate(array_of_statuses(MPI_STATUS_SIZE, num_arr_irecv*processes))
    +  allocate(array_of_requests(num_arr_irecv*processes))
    +
       ! check for axisymmetric
       if (self%powres%beq%n%vacfile=='null') then
          if (self%powres%beq%n%mrip/=0) then
    @@ -635,6 +656,9 @@ subroutine powcal_move(self,gshadl,btree)
          end do
       end if

    +  call MPI_Type_free(mpi_ki4_stride_type, error)
    +  call MPI_Type_free(mpi_kr4_stride_type, error)
    +
     end subroutine powcal_move
     !-------------------------------------------------------------
     !> coordinate calculation of power deposition (=powres_XX)

Before we start processing the ``powelt_move*`` routines in the strided way,
rank 0 posts the non-blocking receives. The expected strided block count
depends on the process rank and can differ because of padding, according to
the total number of triangles in ``self%powres%npowe``.

.. code-block:: fortran

    if (use_non_blocking_communication) then
       if (rank .eq. 0) then
          allocate(array_of_statuses(MPI_STATUS_SIZE, &
          &        num_arr_irecv*(processes-1)))
          allocate(array_of_requests(num_arr_irecv*(processes-1)))
          do i = 1, processes-1 ! i == rank from which we expect array
             strided_block_count = (self%powres%npowe-i)/processes
             if (mod(self%powres%npowe-i, processes) .gt. 0) &
                  strided_block_count = strided_block_count + 1
             call MPI_IRecv(self%powres%pow(i+1), strided_block_count, &
                  & mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, &
                  & array_of_requests((i-1)*num_arr_irecv + 1), error)
          end do
       end if
    end if

When the calculation on each process is finished, the results are sent
immediately with a single ``MPI_Send`` from the neighboring processes to the
rank 0 process, which waits for all requests to complete. The order in which
the arrays are received is unspecified.

.. code-block:: fortran

    if (use_non_blocking_communication) then
       if (rank .eq. 0) then
          call MPI_Waitall(num_arr_irecv*(processes-1), array_of_requests, &
               & array_of_statuses, error)
          deallocate(array_of_statuses, array_of_requests)
       else
          strided_block_count = (self%powres%npowe-rank)/processes
          if (mod(self%powres%npowe-rank, processes) .gt. 0) &
               strided_block_count = strided_block_count + 1
          call MPI_Send(self%powres%pow(rank+1), strided_block_count, &
               & mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, &
               & error)
       end if
    else ! use blocking communication
       if (rank .eq. 0) then
          do i=1, processes-1 ! i = rank
             strided_block_count = (self%powres%npowe-i)/processes
             if (mod(self%powres%npowe-i, processes) .gt. 0) &
                  strided_block_count = strided_block_count + 1
             call MPI_Recv(self%powres%pow(i+1), strided_block_count, &
                  mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, mpi_status, error)
             call MPI_Recv(self%powres%angle(i+1), strided_block_count, &
                  mpi_kr4_stride_type, i, 111, MPI_COMM_WORLD, mpi_status, error)
          end do
       else
          strided_block_count = (self%powres%npowe-rank)/processes
          if (mod(self%powres%npowe-rank, processes) .gt. 0) &
               strided_block_count = strided_block_count + 1
          call MPI_Send(self%powres%pow(rank+1), strided_block_count, &
               mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, error)
          call MPI_Send(self%powres%angle(rank+1), strided_block_count, &
               mpi_kr4_stride_type, 0, 111, MPI_COMM_WORLD, error)
       end if
    end if
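
The ``strided_block_count`` computed above is simply the number of elements
a given rank owns under the strided decomposition; it differs between ranks
when ``npowe`` is not a multiple of ``processes``. A small worked example
(hypothetical sizes, not taken from the benchmark case):

.. code-block:: fortran

    program strided_count_demo
      ! npowe = 10 elements distributed over processes = 4 ranks:
      !   rank 0 owns elements 1, 5, 9   ->  strided_block_count = 3
      !   rank 1 owns elements 2, 6, 10  ->  strided_block_count = 3
      !   rank 2 owns elements 3, 7      ->  strided_block_count = 2
      !   rank 3 owns elements 4, 8      ->  strided_block_count = 2
      implicit none
      integer :: npowe = 10, processes = 4
      integer :: rank, strided_block_count

      do rank = 0, processes - 1
         ! same ceiling division as in the blocking and non-blocking branches
         strided_block_count = (npowe - rank)/processes
         if (mod(npowe - rank, processes) .gt. 0) &
              strided_block_count = strided_block_count + 1
         print *, 'rank', rank, 'owns', strided_block_count, 'elements'
      end do
    end program strided_count_demo

The loop over ``rank`` only mimics what each process computes for itself in
the real code.
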

Instead of posting the non-blocking receives at the beginning, we can simply
use blocking communication that collects the results from each process at
the end of the ``powelt_move*`` loops, when the
``use_non_blocking_communication`` logical parameter is set to ``.false.``.

.. _blocking_vs_non_blocking_times:

.. table:: Run and evaluation times with blocking communication compared with the non-blocking strategy.

    ========== =========== =============== ================ ===============
    Processors run time(s) evaluation time RT non-blocking  ET non-blocking
    ========== =========== =============== ================ ===============
    1          185         170             180 (181)        165 (168)
    2          100         86              98 (96)          83 (84)
    4          58          43              57 (55)          42 (42)
    8          37          21              36 (33)          22 (21)
    16         26          12.2            26 (24)          12 (10)
    32         21          6.9             21 (19)          6.5 (5.3)
    64         19          4.9             19 (16)          4.2 (2.7)
    ========== =========== =============== ================ ===============

Timings comparing the blocking and non-blocking strategies are shown in
:numref:`blocking_vs_non_blocking_times`. The times in parentheses next to
the non-blocking times are the *no communication* times taken from
:numref:`barrier_times`. The difference between non-blocking and blocking
communication is small, as expected. When we compare these times with the
initial solution (cf. :numref:`barrier_times`) we see that the speedup
achieved in the last column is now nearly linear. Larger cases are now
possible, provided enough computing resources are available.

Conditional compilation WITH_MPI
--------------------------------

If we want to retain the serial code and still be able to compile *with MPI*
from the same sources, conditional compilation needs to be introduced. The
industry standard for Fortran is to use the CPP preprocessor. The
preprocessor is enabled by renaming the ``%.f90`` file extensions to
uppercase ``%.F90`` in the sources and Makefiles. This means that all
sources need to be renamed, so that the Makefiles can keep a single set of
rules for conditional compilation. Since all SMITER codes use the
:file:`log_m.F90` module, the serial codes can no longer be linked without
MPI. This means that the serial codes need to receive at least calls to
``MPI_Init()`` and ``MPI_Finalize()`` when compiling with ``WITH_MPI``. For
example, in :file:`geoq.F90` (see :numref:`single_mpi`) we have added
conditional use of the MPI module and forced only the rank 0 process to run
the serial code, even if *geoq* is started in parallel.

.. _single_mpi:

.. code-block:: diff
    :caption: Adaptation to serial processing

    diff --git a/f95/geoq.F90 b/f95/geoq.F90
    index be9f749..e88d160 100644
    --- a/f95/geoq.F90
    +++ b/f95/geoq.F90
    @@ -1,4 +1,7 @@
     program geoq_p
    +#ifdef WITH_MPI
    +  use mpi
    +#endif

       use const_kind_m
       use const_numphys_h
    @@ -72,6 +75,13 @@ program geoq_p
       integer(ki4):: ifldspec !< field specification
       real(kr8):: zivac !< value of I in field file
       character(len=80) :: ibuf !< character workspace
    +#ifdef WITH_MPI
    +  integer error, rank
    +  call MPI_Init ( error )
    +  call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
    +  if (rank .eq. 0) then
    +#endif
    +
     !--------------------------------------------------------------------------
     !! initialise timing
    @@ -383,5 +393,8 @@ program geoq_p
       call log_close
       call clock_delete
     !--------------------------------------------------------------------------
    -
    +#ifdef WITH_MPI
    +  end if
    +  call MPI_Finalize ( error )
    +#endif
     end program geoq_p

With all these defines, the MPI code is compiled only when the ``WITH_MPI``
variable is defined at compile time. Because it only makes sense to define
``WITH_MPI`` when compiling with an MPI compiler, it is reasonable to modify
the configuration make-files so that the define is added when the compiler
name contains ``mpi``. For example, the following update is needed for the
default compiler configuration::

    diff --git a/config/config_gfortran_dbg.inc b/config/config_gfortran_dbg.inc
    index 4755f45..88a13fe 100644
    --- a/config/config_gfortran_dbg.inc
    +++ b/config/config_gfortran_dbg.inc
    @@ -5,11 +5,14 @@
     EXEDIR = ../exec

     # Fortran compiler
    -F90 = gfortran
    +F90 ?= gfortran
     FC = gfortran

     # Fortran compiler flags
     F90FLAGS = -g -fbacktrace -fbounds-check -fexceptions
    +ifeq ($(findstring mpi,$(F90)),mpi)
    +  F90FLAGS += -DWITH_MPI
    +endif

The Fortran 90 compiler configuration variable :envvar:`F90` is now
"deferred" and therefore overridable from the command line. We can replace
the :envvar:`F90` compiler variable with an environment variable by typing
on the command line

.. code-block:: bash

    $ env F90=mpif90 exec/install

.. note::

    Codes compiled with :command:`mpif90` can still run on a single
    processor without the need for the :command:`mpirun` command.
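
The same guard pattern, reduced to a minimal standalone file, can serve as a
quick reference (a sketch only; the file name ``demo.F90`` is hypothetical
and not part of the SMITER sources). Compiled with ``gfortran`` it stays
purely serial; compiled with ``mpif90``, which defines ``WITH_MPI`` through
the configuration above, only rank 0 executes the serial part.

.. code-block:: fortran

    ! demo.F90 -- hypothetical example of the WITH_MPI guard pattern
    program demo
    #ifdef WITH_MPI
      use mpi
    #endif
      implicit none
    #ifdef WITH_MPI
      integer :: error, rank
      call MPI_Init(error)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, error)
      if (rank .eq. 0) then
    #endif
      print *, 'serial part of the code runs here'
    #ifdef WITH_MPI
      end if
      call MPI_Finalize(error)
    #endif
    end program demo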