Using Hardware Performance Monitor (HPM) Toolkit: A primer
last update: 01/07/2009

Hybrid OpenMP - MPI parallel example

This example uses the same exponential sum loop as the simple serial example and the OpenMP threaded example. The problem is now divided between two MPI processes, each of which employs two OpenMP threads to calculate its partial sum. Each MPI process does exactly half of the loop iterations, so the final sum is the sum of the partial results from both MPI processes.

Script for compiling source code, creating batch job, and running:

#! /bin/csh

# Part1: put the hybrid Fortran source code in file it.F
cat << 'EOF1' > it.F
      program main
! Hybrid MPI + OpenMP exponential-sum example instrumented with the
! HPM toolkit.  Written for a two-rank run: rank 1 sums the terms
! i = 1..500000 and sends its partial sum and loop time to rank 0;
! the other rank sums i = 500001..1000000, receives the message from
! rank 1 and prints the combined result.
      implicit none
      include 'mpif.h'
#include "/usr/include/f_hpm.h"
      integer thdID, omp_get_thread_num
      integer i,rank,error,tag,length,status(MPI_STATUS_SIZE)
! sum stays default real so the instrumented loop itself is unchanged.
      real sum
! buf is transferred as MPI_REAL8, so its elements must be 8-byte
! reals; a default-real buf(2) is only 8 bytes long while the message
! is 16 bytes.  rtc is the XL Fortran timer service function, which
! is documented as returning real(8), so elapsed is real(8) as well.
      real(8) buf(2), elapsed, rtc
      call mpi_init(error)
      call mpi_comm_rank(MPI_COMM_WORLD,rank,error)
      tag=0
! Number of elements in buf: (1) partial sum, (2) loop time.
      length=2

      if (rank .eq. 1) then
      print*, 'rank = ', rank
! Initialize hpmtoolkit (task id is rank+1):
      call f_hpminit(rank+1,'exp_sum_2_MPI_procs_2_thds')
      sum=0.0
      elapsed=rtc()
!$omp parallel private (thdID)
! Start instrumentation around compute loop, one section per thread:
      thdID = 1+omp_get_thread_num()
      call f_hpmtstart(thdID, "Partial Sum EXP")
!$omp do reduction(+:sum)
      do i=1,500000
         sum=sum+exp(.00000001*i)
      end do
! Stop instrumentation after compute loop:
      call f_hpmtstop(thdID)
!$omp end parallel
! Generate hardware analysis output file:
      call f_hpmterminate(rank+1)
      elapsed=rtc()-elapsed
! Pack the partial sum and loop time and send them to rank 0.
      buf(1)=sum
      buf(2)=elapsed
      call mpi_send(buf,length,MPI_REAL8,0,tag,MPI_COMM_WORLD,error)

      else
      print*, 'rank = ', rank
! Initialize hpmtoolkit (task id is rank+1):
      call f_hpminit(rank+1,'exp_sum_2_MPI_procs_2_thds')
      sum=0.0
      elapsed=rtc()
!$omp parallel private (thdID)
! Start instrumentation around compute loop, one section per thread:
      thdID = 1+omp_get_thread_num()
      call f_hpmtstart(thdID, "Partial Sum EXP")
!$omp do reduction(+:sum)
      do i=500001,1000000
         sum=sum+exp(.00000001*i)
      end do
! Stop instrumentation after compute loop:
      call f_hpmtstop(thdID)
!$omp end parallel
! Generate hardware analysis output file:
      call f_hpmterminate (rank+1)
      elapsed=rtc()-elapsed
      print*, 'got here'
      print*, 'length=', length
! Receive rank 1's partial sum and loop time, then report the totals.
      call mpi_recv(buf,length,MPI_REAL8,1,tag,MPI_COMM_WORLD,status,error)
      print*,'Sum=',sum+buf(1), 'Loop time=', elapsed+buf(2)

      end if
      call mpi_finalize(error)
      stop
      end
'EOF1'

# Part2: compile the it.F source code with hpm and pmapi libraries.
# The exit status is checked right after the compiler runs so that a
# failed build does not go on to submit a job for a missing ./it.
mpxlf95_r -I/usr/include -qarch=auto -qtune=auto -qsmp=omp -O3 -qstrict -oit it.F -L/usr/lib -lhpm_r -lpmapi -lm
if ($status != 0) then
    echo "Compilation of it.F failed; batch job not submitted."
    exit 1
endif

# Part3: create the batch job lsf.hybridjob
cat << 'EOF2' > lsf.hybridjob
#!/bin/csh
#
# LSF script to run a hybrid MPI/OpenMP code
#
#BSUB -x                         # exclusive use of node
#BSUB -n 2                       # use 2 processors
#BSUB -R "span[ptile=2]"         # run 2 tasks per host
#BSUB -o hyblsf.%J.out           # output filename
#BSUB -e hyblsf.%J.err           # error filename
#BSUB -J hyblsf.test             # job name
#BSUB -P xxxxxxxx                # your valid 8-digit project number
#BSUB -W 0:10                    # hh:mm wall clock time 
#BSUB -q regular                 # queue

# Number of OpenMP threads used by each MPI process
setenv OMP_NUM_THREADS 2
# Launch the MPI executable built by the driver script
mpirun.lsf ./it
exit
'EOF2'

# Part4: submit lsf.hybridjob to the batch queue specified by #BSUB -q.
# bsub normally returns as soon as the job is queued; -K makes it wait
# until the job has finished, so the executable ./it still exists when
# the job actually runs.
bsub -K < lsf.hybridjob

# Part5: cleanup (safe only because bsub -K has waited for the job)
rm -f it.F it lsf.hybridjob

Output on bluefire POWER6:

The first MPI process (MPI rank 1; labeled "process: 2" below because the program passes rank+1 to f_hpminit):

libhpm summary - running on POWER6 (bluefire)

Total execution time of instrumented code (wall time): 0.016698 seconds

 ########  Resource Usage Statistics  ########  

 Total amount of time in user mode            : 0.100610 seconds
 Total amount of time in system mode          : 0.030243 seconds
 Maximum resident set size                    : 35352 Kbytes
 Average shared memory use in text segment    : 1 Kbytes*sec
 Average unshared memory use in data segment  : 2477 Kbytes*sec
 Number of page faults without I/O activity   : 8983
 Number of page faults with I/O activity      : 24
 Number of times process was swapped out      : 0
 Number of times file system performed INPUT  : 0
 Number of times file system performed OUTPUT : 0
 Number of IPC messages sent                  : 0
 Number of IPC messages received              : 0
 Number of signals delivered                  : 0
 Number of voluntary context switches         : 95
 Number of involuntary context switches       : 3

 #######  End of Resource Statistics  ########

 Instrumented section: 1 - Label: Partial Sum EXP - process: 2
 file: it.F, lines: 23 <--> 29
  Count: 1
  Wall Clock Time: 0.015507 seconds
  Total time in user mode: 0.015373412627551 seconds

 Set: 1
 Counting duration: 0.015437242 seconds
  PM_FPU_1FLOP (FPU executed one flop instruction )          :         3000003
  PM_FPU_FMA (FPU executed multiply-add instruction)         :         2750000
  PM_FPU_FSQRT_FDIV (FPU executed FSQRT or FDIV instruction) :               0
  PM_CYC (Processor cycles)                                  :        72316533
  PM_RUN_INST_CMPL (Run instructions completed)              :        19609628
  PM_RUN_CYC (Run cycles)                                    :        72597572


  Utilization rate                                 :          99.139 %
  Flop                                             :           8.500 Mflop
  Flop rate (flops / WCT)                          :         548.140 Mflop/s
  Flops / user time                                :         552.903 Mflop/s
  FMA percentage                                   :          95.652 %


 Instrumented section: 2 - Label: Partial Sum EXP - process: 2
 file: it.F, lines: 23 <--> 29
  Count: 1
  Wall Clock Time: 0.015499 seconds
  Total time in user mode: 0.0153830153061224 seconds

 Set: 1
 Counting duration: 0.015428521 seconds
  PM_FPU_1FLOP (FPU executed one flop instruction )          :         3000001
  PM_FPU_FMA (FPU executed multiply-add instruction)         :         2750000
  PM_FPU_FSQRT_FDIV (FPU executed FSQRT or FDIV instruction) :               0
  PM_CYC (Processor cycles)                                  :        72361704
  PM_RUN_INST_CMPL (Run instructions completed)              :        20502860
  PM_RUN_CYC (Run cycles)                                    :        72560961


  Utilization rate                                 :          99.252 %
  Flop                                             :           8.500 Mflop
  Flop rate (flops / WCT)                          :         548.423 Mflop/s
  Flops / user time                                :         552.558 Mflop/s
  FMA percentage                                   :          95.652 %

The second MPI process (MPI rank 0; labeled "process: 1" below because the program passes rank+1 to f_hpminit):

libhpm summary - running on POWER6 (bluefire)

 Total execution time of instrumented code (wall time): 0.01658 seconds

 ########  Resource Usage Statistics  ########  

 Total amount of time in user mode            : 0.097866 seconds
 Total amount of time in system mode          : 0.028342 seconds
 Maximum resident set size                    : 35384 Kbytes
 Average shared memory use in text segment    : 1 Kbytes*sec
 Average unshared memory use in data segment  : 2241 Kbytes*sec
 Number of page faults without I/O activity   : 8849
 Number of page faults with I/O activity      : 5
 Number of times process was swapped out      : 0
 Number of times file system performed INPUT  : 0
 Number of times file system performed OUTPUT : 0
 Number of IPC messages sent                  : 0
 Number of IPC messages received              : 0
 Number of signals delivered                  : 0
 Number of voluntary context switches         : 95
 Number of involuntary context switches       : 6

 #######  End of Resource Statistics  ########

 Instrumented section: 1 - Label: Partial Sum EXP - process: 1
 file: it.F, lines: 48 <--> 54
  Count: 1
  Wall Clock Time: 0.015307 seconds
  Total time in user mode: 0.0151363465136054 seconds

 Set: 1
 Counting duration: 0.015226662 seconds
  PM_FPU_1FLOP (FPU executed one flop instruction )          :         3000003
  PM_FPU_FMA (FPU executed multiply-add instruction)         :         2750000
  PM_FPU_FSQRT_FDIV (FPU executed FSQRT or FDIV instruction) :               0
  PM_CYC (Processor cycles)                                  :        71201374
  PM_RUN_INST_CMPL (Run instructions completed)              :        19604442
  PM_RUN_CYC (Run cycles)                                    :        71608943


  Utilization rate                                 :          98.885 %
  Flop                                             :           8.500 Mflop
  Flop rate (flops / WCT)                          :         555.302 Mflop/s
  Flops / user time                                :         561.562 Mflop/s
  FMA percentage                                   :          95.652 %


 Instrumented section: 2 - Label: Partial Sum EXP - process: 1
 file: it.F, lines: 48 <--> 54
  Count: 1
  Wall Clock Time: 0.015256 seconds
  Total time in user mode: 0.0151644024234694 seconds

 Set: 1
 Counting duration: 0.015199242 seconds
  PM_FPU_1FLOP (FPU executed one flop instruction )          :         3000001
  PM_FPU_FMA (FPU executed multiply-add instruction)         :         2750000
  PM_FPU_FSQRT_FDIV (FPU executed FSQRT or FDIV instruction) :               0
  PM_CYC (Processor cycles)                                  :        71333349
  PM_RUN_INST_CMPL (Run instructions completed)              :        19654310
  PM_RUN_CYC (Run cycles)                                    :        71484032


  Utilization rate                                 :          99.400 %
  Flop                                             :           8.500 Mflop
  Flop rate (flops / WCT)                          :         557.158 Mflop/s
  Flops / user time                                :         560.523 Mflop/s
  FMA percentage                                   :          95.652 %

Next page | Table of contents - HPM Toolkit primer

If you have questions about this document, please contact via any of the methods shown on this page: CISL Customer Support.

© Copyright 2003-2009. University Corporation for Atmospheric Research (UCAR). All Rights Reserved.

Address of this page: http://www.cisl.ucar.edu/docs/ibm/hpm.toolkit/ex.hybrid.html