In this demo program we compute the dot product of two vectors x and y.
We use upc_forall to compute partial sums and then use a upc_lock to
protect the part where we reduce the individual partial sums to the total
dot product.
The other part of the demo is to show the difference between the
blocked and cyclic distributions.
The arrays x_cyc and y_cyc are declared with the default block size of 1
(cyclic), and the arrays x_blk and y_blk are declared with the [*]
block size (blocked). This does not change the total dot product, but it
does change the partial sums collected by each thread.
<dotproduct.c>=
//dotproduct.c -- simple dot product
//Intro: upc_forall, locks, cyclic vs blocked
#include <stdio.h>
#include <upc.h>
// Problem size: each thread contributes NperTHREAD elements, so the
// vectors hold NperTHREAD * THREADS elements in total.
#define NperTHREAD 100
#define SIZE (NperTHREAD * THREADS)
// Elements per thread, kept as a named block size (not referenced by the
// declarations below, which use the [*] layout qualifier instead).
#define BLOCK NperTHREAD

// Shared scalars that receive the reduced dot products.
shared float dot_cyc;
shared float dot_blk;

// No layout qualifier: default (cyclic) distribution of the elements.
shared float x_cyc[SIZE];
shared float y_cyc[SIZE];

// [*] layout qualifier: blocked distribution of the elements.
shared [*] float x_blk[SIZE];
shared [*] float y_blk[SIZE];

// Lock guarding the updates of dot_cyc and dot_blk; allocated in main().
upc_lock_t *dotlock;
// Computes the dot product of two vectors twice: once with the cyclic
// arrays (x_cyc, y_cyc) and once with the blocked arrays (x_blk, y_blk).
// Each thread accumulates a private partial sum over the elements selected
// for it by upc_forall, prints it, and then adds it to the shared total
// while holding dotlock. Thread 0 prints each total.
// Returns 0.
int main(void)
{
    int i;
    float mydot;

    // Collective allocation: every thread gets a pointer to the same lock.
    // The lock is created in the unlocked state, so no separate
    // initialization call is made (having every thread re-initialize the
    // same lock would also be a race).
    dotlock = upc_all_lock_alloc();

    if (MYTHREAD == 0) {
        dot_blk = dot_cyc = 0.0;
    }
    // Make sure the totals are zeroed before any thread adds to them.
    upc_barrier(0);

    // "affinity" is an int so it is (i mod THREADS)
    upc_forall (i = 0; i < SIZE; i++; i) {
        x_cyc[i] = (float) i;
        y_cyc[i] = x_cyc[i];
        x_blk[i] = (float) i;
        y_blk[i] = x_blk[i];
    }
    // All four arrays must be fully written before any thread reads them.
    upc_barrier(1);

    // ---- cyclic distribution ----
    mydot = 0.0;
    // "affinity" is found from affinity of x_cyc[i]
    upc_forall (i = 0; i < SIZE; i++; &x_cyc[i]) {
        mydot += x_cyc[i] * y_cyc[i];
    }
    printf ("Process %2d holds %g (cyclic)\n", MYTHREAD, mydot);

    // The read-modify-write of the shared total must not interleave
    // between threads, hence the lock.
    upc_lock(dotlock);
    dot_cyc = dot_cyc + mydot;
    upc_unlock(dotlock);

    // Wait until every thread has contributed before reading the total.
    upc_barrier(2);
    if (MYTHREAD == 0) {
        printf("Total (cyclic) is %g\n", dot_cyc);
    }
    upc_barrier(3);

    // ---- blocked distribution ----
    mydot = 0.0;
    // "affinity" is found from affinity of x_blk[i]
    upc_forall (i = 0; i < SIZE; i++; &x_blk[i]) {
        mydot += x_blk[i] * y_blk[i];
    }
    printf ("Process %2d holds %g (blocked)\n", MYTHREAD, mydot);

    upc_lock(dotlock);
    dot_blk = dot_blk + mydot;
    upc_unlock(dotlock);

    // Distinct barrier tag (the earlier reduction already used 2).
    upc_barrier(4);
    if (MYTHREAD == 0) {
        printf("Total (blocked) is %g\n", dot_blk);
        // Every thread released the lock before the barrier above, so a
        // single thread can now free it.
        upc_lock_free(dotlock);
    }

    return 0;
}