matrix-vector multiply program
      program matrix_vector_product
      implicit none
      include 'mpif.h'
      double precision mflops, time
      double precision a, b, lc, gc
      integer comm_row, comm_col, comm_cart
      integer ierror, nprocs, myproc, myrow, mycol
      integer iter, m, n, p, q, mp, nq, ndims
      integer i, j
      integer dims
      logical periods, hor_dims, ver_dims
      integer desca(8), descb(8), descgc(8), ctxt
      include 'mxvsizes'
      dimension a(mp,nq), b(nq), lc(mp), gc(mp)
      dimension dims(ndims), periods(ndims)
      dimension hor_dims(ndims), ver_dims(ndims)
      data (dims(i), i=1,2) / p, q /
      data (periods(i), i=1,2) /.false., .false. /
      data (hor_dims(i), i=1,2) /.false., .true. /
      data (ver_dims(i), i=1,2) /.true., .false. /
      call MPI_INIT (ierror)
      call MPI_COMM_SIZE (MPI_COMM_WORLD, nprocs, ierror)
      call MPI_COMM_RANK (MPI_COMM_WORLD, myproc, ierror)
      call MPI_DIMS_CREATE (nprocs, ndims, dims, ierror)
      call MPI_CART_CREATE (MPI_COMM_WORLD, ndims, dims, periods, .true.
     $     ,comm_cart, ierror)  
      call MPI_CART_SUB(comm_cart, hor_dims, comm_row, ierror)
      call MPI_CART_SUB(comm_cart, ver_dims, comm_col, ierror)
      call MPI_COMM_RANK (comm_row, mycol, ierror)
      call MPI_COMM_RANK (comm_col, myrow, ierror)
      do j = 1, nq
         do i= 1, mp
            a(i,j) = dble (myrow * mp + mycol * nq + i + j - 1)
         enddo
      enddo
      do j = 1, nq
         b(j) = 1.0d0
      enddo
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) time = MPI_WTIME ()
      do i = 1, iter
         call mxv (mp, nq, a, b, lc, gc, comm_row)
      enddo
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) then
         time = MPI_WTIME () - time
         mflops = 1.0d-6 * (2 * m * n - m) * iter / time
         write (6,*) 'mxv achieves ', mflops, ' Mflop/s'
         write (6,*) '(', iter, ' iterations in ', time, ' secs)'
      endif
      if (iter .eq. 1) call check_mxv (gc, mp, n, myrow)
C Initialize Communications 
      call blas_mxv (mp, nq, a, b, lc, gc, comm_row)
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) time = MPI_WTIME ()
      do i = 1, iter
         call blas_mxv (mp, nq, a, b, lc, gc, comm_row)
      enddo
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) then
         time = MPI_WTIME () - time
         mflops = 1.0d-6 * (2 * m * n - m) * iter / time
         write (6,*) 'blas_mxv achieves ', mflops, ' Mflop/s'
         write (6,*) '(', iter, ' iterations in ', time, ' secs)'
      endif
      if (iter .eq. 1) call check_mxv (gc, mp, n, myrow)
      call BLACS_GET (0, 0 , ctxt)
      call BLACS_GRIDINIT (ctxt, 'Row-major', dims(1), dims(2))
      call descinit (desca, m, n, mp, nq, 0, 0, ctxt, mp)
      call descinit (descb, 1, n, 1, nq, 0, 0, ctxt, 1)
      call descinit (descgc, m, 1, mp, 1, 0, 0, ctxt, mp)
C Initialize Communications 
      call pblas_mxv (m, n, a, desca, b, descb, gc, descgc)
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) time = MPI_WTIME ()
      do i = 1, iter
         call pblas_mxv (m, n, a, desca, b, descb, gc, descgc)
      enddo
      call MPI_BARRIER (MPI_COMM_WORLD, ierror)
      if (myproc .eq. 0) then
         time = MPI_WTIME () - time
         mflops = 1.0d-6 * (2 * m * n - m) * iter / time
         write (6,*) 'pblas_mxv achieves ', mflops, ' Mflop/s'
         write (6,*) '(', iter, ' iterations in ', time, ' secs)'
      endif
      if (iter .eq. 1) call check_mxv (gc, mp, n, myrow)
      call BLACS_EXIT (0)
      call MPI_FINALIZE (ierror)
      end