Mar 042011
 

Introduction to OpenMP

This is by no means meant to be an exhaustive, academic guide to parallel programming with openmp. It is meant for those people who already have a piece of code that can only be made to run faster by throwing more hardware at the problem.

OpenMP Application Program Interface
Summary of OpenMP 3.0 C/C++ Syntax

OpenMP is an Application Program Interface (API) that may be used to explicitly direct multi-threaded, shared memory parallelism. That is, whether it be single-socket, multicore or multi-socket, multicore, only a single motherboard is involved and there is zero network communication between different machines. There is a small caveat here – Intel have a product that is variant on OpenMP for distributed, or multi-motherboard configurations (“Cluster OpenMP”) but I will not talk about that here.

OpenMP_Shared_Memory

OpenMP uses a fork and join execution model. Execution begins in a single master thread, parallel regions are executed by a team; Execution returns to a single thread at the end of the parallel region. OpenMP is particularly suited for large array-based applications (loop paralelism – loop level and functional parallelism).
OpenMP_Fork_Join

It is easy to enable OpenMP; All you have to do is include the header file and link against the library.
OpenMP_Include
The OpenMP website lists the compilers from various vendors that implement the OpenMP API:
OpenMP Compilers

Key OpenMP Directives and API Methods

Below is a list of directives that exist for the OpenMP framework.
OpenMP_Directives

The fundamental construct in OpenMP is that of a “parallel region”. This is a region where N threads execute a region of code in parallel.
Example 1: HelloWorld

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
 
/*
 *  Main method
 *
 *  Get everybody to say hello.
 */
int main(int argc, char **argv)
{
    fprintf(stdout, "omp_get_num_threads() = %dn", omp_get_num_threads());
    fprintf(stdout, "omp_get_max_threads() = %dnn", omp_get_max_threads());
 
    #pragma omp parallel
    {
        fprintf(stdout, "Hello OpenMP! (from thread %d)n", omp_get_thread_num());
    }
 
    return 0;
}

icpc -O2 -xHost -openmp HelloOpenMP.cpp -o SayHello

g++ -fopenmp -o SayHello HelloWorldMP.cpp


omp_get_num_threads() = 1
omp_get_max_threads() = 4

Hello OpenMP! (from thread 0)
Hello OpenMP! (from thread 3)
Hello OpenMP! (from thread 2)
Hello OpenMP! (from thread 1)

Loop/Data Parallelism

OpenMP is particularly well suited for loop and data parallelisation. Simply split iteration space across multiply threads (assuming the iterations are independent of one another!).

Example 2: saxpy (Scalar Alpha X plus Y or z = a*x+y) OpenMP_SAXPY

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
 
 
#define RGSIZE 10000000
 
 
/*
 *  Manual saxpy
 *
 *  Specify all private and shared variables
 */
void manual_saxpy(double *z, double a, double *x, double *y, int n) 
{
    // How many processors do I have?
    int procCount = omp_get_num_procs();
    // How much work will each processor do?
    int chunk = n/procCount;
    // There is always that little bit left over
    int extra = n % procCount;
    int i, j, start, end, threadNum;
 
    #pragma omp parallel private(i, j, start, end, threadNum)
    {
        // Which processor am I? (Remember, all processors execute this individually)
        threadNum = omp_get_thread_num();
        // What is my starting index?
        start = threadNum*chunk;
        // What is the index I stop working on?
        end = start + chunk;
        // if we're the last thread, take on any extra cycles
        if (threadNum == procCount-1)
            end += extra;
 
        printf("Thread %d processing from %d to %dn",
            threadNum, start, end-1);
        for (i=start; i<end; i++) 
        {
            z[i] = a * x[i] + y[i];
        }
    }
}
 
 
/*
 * Automatic saxpy
 *
 * Let the compiler and directive inspect the data structures
 */
void automatic_saxpy(double *z, double a, double *x, double *y, int n)
{
    #pragma omp parallel for
    for (int i=0; i<n; i++)
    {
        z[i] = a * x[i] + y[i];
    }
}
 
 
/*
 *  Main method
 *
 *  Get everybody to say hello.
 */
int main(int argc, char **argv)
{
    // Initialise the data structures
    double* z = new double[RGSIZE];
    double a = 3.5;
    double *x = new double[RGSIZE];
    double *y = new double[RGSIZE];
 
    #pragma omp parallel
    {
        #pragma omp for nowait
        for (int i=0; i<RGSIZE; i++)
            x[i] = i;
 
        #pragma omp for nowait
        for (int j=0; j<RGSIZE; j++)
            y[j] = j/2.0;
    }
 
    double start, end;
    start = omp_get_wtime();
    automatic_saxpy(z, a, x, y, RGSIZE);
    end = omp_get_wtime();
    printf("automatic saxpy loop took %f secondsn", end-start);
 
    start = omp_get_wtime();
    manual_saxpy(z, a, x, y, RGSIZE);
    end = omp_get_wtime();
    printf("manual saxpy loop took %f secondsn", end-start);
 
    delete z;
    delete x;
    delete y;
 
    return 0;
}

icpc -O2 -xHost -vec-report2 -openmp -c Saxpy.cpp -o Saxpy.o


Saxpy.cpp(73) (col. 13): remark: LOOP WAS VECTORIZED.
Saxpy.cpp(79) (col. 13): remark: LOOP WAS VECTORIZED.
Saxpy.cpp(88) (col. 5): remark: loop was not vectorized: existence of vector dependence.
Saxpy.cpp(93) (col. 5): remark: loop was not vectorized: existence of vector dependence.
Saxpy.cpp(33) (col. 9): remark: LOOP WAS VECTORIZED.
Saxpy.cpp(49) (col. 5): remark: LOOP WAS VECTORIZED.

icpc -O2 -xHost -vec-report2 -openmp Saxpy.o -o Go


automatic saxpy loop took 0.052939 seconds
Thread 0 processing from 0 to 2499999
Thread 2 processing from 5000000 to 7499999
Thread 3 processing from 7500000 to 9999999
Thread 1 processing from 2500000 to 4999999
manual saxpy loop took 0.053051 seconds

Scheduling

Load balancing across threads is critical for performance. The default scheduling of OpenMP is implementation specific so you may need to use more advanced scheduling to fully utillise all your processors.

static – divide loop iterations evenly across available threads at the start. chunk_size defaults to iterations/threads
dynamic – hand each iteration off to an available thread on-demand, chunk_size defaults to 1
guided – iterations are assigned to threads in chunks with decreasing sizes
runtime – determined at runtime, or use value in OMP_SCHEDULE environment variable (if defined)

Scheduling with Mandelbrot

A GPU version of this code can be found: Here

Please note that the right hand border indicates different threads. The images below have been generated with four (4) threads. (cat /proc/cpuinfo : Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz)

Dynamic Scheduling (1) 2.924 seconds

Dynamic Scheduling (9) 2.93 seconds

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <omp.h>
#include <SDL.h>
 
// Bytes per pixel
#define BPP 4
#define DEPTH 32
 
int width, height;
unsigned int *image;
int *proc;
 
 
int colourmapID = 0;
int numberOfProcs;
int *proc_count;
unsigned int *colourMap;
 
 
////////////////////////////////////////////////////////////////////////
//		Timer Stuff
////////////////////////////////////////////////////////////////////////
double timer1, timer2;
 
double gettime() 
{
  	struct timeval t;
  	gettimeofday(&t,NULL);
  	return t.tv_sec+t.tv_usec*1e-6;
}
 
 
/*
 *  Paint the canvas
 *
 *  The canvas is located in "unsigned int *image" and need to copy this
 *  to the frame buffer.  SDL does this nicely
 */
void DrawScreen(SDL_Surface* screen)
{ 
    if(SDL_MUSTLOCK(screen)) 
    {
        if(SDL_LockSurface(screen) < 0) return;
    }
 
    // copy image to screen->pixels
    memcpy(screen->pixels, image, width*height*BPP);
 
    if(SDL_MUSTLOCK(screen)) 
        SDL_UnlockSurface(screen);
 
    SDL_Flip(screen);  
}
 
 
/*
 *  Paint the canvas black
 *
 *
 */
void PaintBlack(SDL_Surface* screen)
{
    for ( int i = 0; i<width*height; i++ )
    {
        image[i] = 0;
    }
    DrawScreen(screen);
}
 
 
/*
 *  Creates a colour bar on the right hand side of the image
 *  that indicates the which row, which thread was
 *  processing.
 */
void GetProcessorDetails()
{
    // reset proc count
    for (int i = 0; i<numberOfProcs; i++)
        proc_count[i] = 0;
    // Colour the rows the on the right by the processor id
    for (int row = 0; row<height; row++)
    {
        proc_count[proc[row]]++;
        for (int column = width-10; column<width; column++)
        {
            image[row*width + column] = colourMap[proc[row]];
        }
    }
 
    fprintf(stdout, "Processor Workload:n");
    for (int i = 0; i<numberOfProcs; i++)
    {
        fprintf(stdout, "threadID (%d) : %d rowsn", i, proc_count[i]);
    }
}
 
 
 
 
 
/*
 * Returns a color reflecting the value 
 * of the Mandelbrot set element at this position.
 *
 */
unsigned int MandelbrotColour(double yp, double xp, double y, double x, double size)
{ 
    // compute pixel position:
    double ypos = y + size * (yp - height / 2) / ((double)height);
    double xpos = x + size * (xp - width / 2) / ((double)width);
 
    // Reference: http://en.wikipedia.org/wiki/Mandelbrot_set
    y = ypos;
    x = xpos;
 
    double y2 = y * y;
    double x2 = x * x;
 
    int c = 1;
    const int MAXCOLOUR = 8191;
    while ((y2 + x2) <= 4 && c < MAXCOLOUR)
    {
        y = 2 * x * y + ypos;
        x = x2 - y2 + xpos;
        y2 = y * y;
        x2 = x * x;
        c++;
    }
 
    // Fractal colourmap
    unsigned char alpha = 255;
    unsigned char red = 255 - (c * 3 % 256);
    unsigned char green = 255 - (c * 7 % 256);
    unsigned char blue = 255 - (c * 13 % 256);
 
    // Create the correct colour
    unsigned int colour = (alpha << 24) + (blue << 16) + (green << 8) + red;
    return colour;
}
 
 
/*
 *  Perform a parallel Mandelbrot rendering with
 *  dynamic work scheduling among processors.
 *
 */
void Dynamic_DoMandleBrot(SDL_Surface *screen, int X, int Y, int key)
{
    double x = 0.0;
    double y = -0.5;
    double size = 2.0;    
    // Paint the screen black
    PaintBlack(screen);
 
    timer1 = gettime();
 
    #pragma omp parallel for schedule(dynamic, key)
    for (int row = 0; row < height; row++) 
    {
        for (int column = 0; column < width; column++)
        {
            image[row*width + column] = MandelbrotColour(row, column, x, y, size);
        }
        proc[row] = omp_get_thread_num();
    }
 
    timer2 = gettime();
    fprintf(stdout, "schedule(dynamic, %d): %lf secondsn",key, (timer2-timer1));
 
    GetProcessorDetails();         
    // Paint the results to the screen
    DrawScreen(screen);
}
 
 
/*
 *  Perform a parallel Mandelbrot rendering with 
 *  static workload among processors
 *
 */
void Static_DoMandleBrot(SDL_Surface *screen, int X, int Y)
{
    double x = 0.0;
    double y = -0.5;
    double size = 2.0;
 
    // Paint the canvas black
    PaintBlack(screen);
 
    timer1 = gettime();      
    #pragma omp parallel for schedule(static)
    for (int row = 0; row < height; row++) 
    {
        for (int column = 0; column < width; column++)
        {
            image[row*width + column] = MandelbrotColour(row, column, x, y, size);
        }
        proc[row] = omp_get_thread_num();
    }
 
    timer2 = gettime();
    fprintf(stdout, "schedule (static): %lf secondsn",(timer2-timer1));
 
    GetProcessorDetails();
    // Paint the results to the screen
    DrawScreen(screen);
}
 
 
/*
 *  Generate a colourmap to colour the individual 
 *  processors by.
 *
 */
void GenerateColourMap()
{
    colourMap = new unsigned int[numberOfProcs];
    unsigned char alpha = 255;
    unsigned char red, blue, green;
    int counter = 0;
 
    float increment = 1.0/((float)numberOfProcs-1.0);
    for (int i = 0; i<numberOfProcs; i++)
    {
        if (i<(numberOfProcs+1)/2)
        {
            red = 255;
            green = (unsigned char)((i*increment*2)*255.0);
            blue = (unsigned char)((i*increment*2)*255.0); 
        }
        else
        {
            red = (unsigned char)((1.0-i*increment*2)*255.0);
            green = (unsigned char)((1.0-i*increment*2)*255.0);
            blue = 255; 
        }     
        colourMap[counter] = (alpha << 24) + (blue << 16) + (green << 8) + red;
        counter++;
    }  
}
 
 
/*
 *  Main method
 *
 *  Get everybody to say hello.
 */
int main(int argc, char **argv)
{
    numberOfProcs = omp_get_max_threads();
    fprintf(stdout, "Number Of Processors = %dn", numberOfProcs);
    proc_count = new int[numberOfProcs];
    GenerateColourMap();
 
    // Set some stuff up to render the image
    width = 1024;
    height = 768;
    image = new unsigned int[width*height];
    proc = new int[height];
 
    SDL_Surface *screen;
    SDL_Event event;
    int done;
 
    // Can I get a video handle?
    if (SDL_Init(SDL_INIT_VIDEO) < 0 ) 
        return 1;
 
    // Define what I want the video handle to be
    if (!(screen = SDL_SetVideoMode(width, height, DEPTH, SDL_HWSURFACE)))
    {
        SDL_Quit();
        return 1;
    }
 
    // Wait for a keystroke 
    done = 0;
    while (!done && SDL_WaitEvent(&event)) 
    {
        switch (event.type) 
        {
            case SDL_KEYDOWN:
 
                if (event.key.keysym.sym == SDLK_s)
                {
                    Static_DoMandleBrot(screen, event.button.x, event.button.y);
                    break;
                }
                //http://wiki.libsdl.org/moin.cgi/SDLKeycodeLookup
                if (event.key.keysym.sym > 48 & event.key.keysym.sym < 58)
                {
                    Dynamic_DoMandleBrot(screen, event.button.x, event.button.y, event.key.keysym.sym-48);
                    break;
                }
                break;
            case SDL_QUIT:
                done = 1;
                break;
            default:
                break;
        }
    }
 
    SDL_Quit();
    return 0;
}
#!/bin/make
BINARY = Mandelbrot
 
INTEL = /opt/intel/composerxe-2011.0.084
SDL = /home/dwyer3/Public/SDL/1.3
 
CC = icpc
OPTS = -O2 -openmp
SRC = Mandelbrot.cpp
OBJS = $(SRC:.cpp=.o)
 
# Mark@Work
INC = -I$(SDL)/include/SDL
LIBS = -L$(SDL)/lib -lGL -lglut -lSDL
 
.SUFFIXES: .cpp .o
 
.cpp.o:
	$(CC) $(OPTS) $(INC) -c $< -o $@
 
$(BINARY): $(OBJS)
	$(CC) $(OPTS) $(OBJS) -o $(BINARY) $(LIBS)
	@echo Binary created!!
 
clean:
	set nonomatch; rm -f $(BINARY) *.o

Navigation: Press the ‘s’ key to generate the Mandelbrot with static scheduling, press ‘1’,’2′,…’9′ to generate with dynamic scheduling.

To be continued …

 Leave a Reply

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>

(required)

(required)

Human Conf Test * Time limit is exhausted. Please reload CAPTCHA.