|  Rosella
      Machine Intelligence & Data Mining | |||
| Two Golden Rules for GPU ProgrammingFor certain applications, GPU can unleash huge performance. But GPU calling has huge base and synchronization latency. So GPU will perform poorly for most applications. Basically GPUs are good for large array computations. Here are two golden rules for GPU programming; 
 The first rule minimizes global memory write/read latency. The second rule minimizes base and synchronization latency. If you can organize your program observing these two golden rules, you can gain huge gpu performance. Otherwise performance will be poor! For OpenCL installation, Install OpenCL on Debian and Ubuntu and Armbian for Orange Pi 5. For Nvidia Cuda GPU programming guides, please read Cuda Programing Guides with Example Code. Why GPU can outperform CPU?There are three main reasons that GPU can outperform multicore CPU, resulting in dozens to tens of thousands times of speedup. 
 On Orange Pi 5, on a single thread, a 25 million parameter YOLO-like computer vision model takes 35 seconds to complete. On 8 CPU threads, it takes 13 seconds. On GPU, it takes only 0.677 second! (On Nvidia Quadro T1000 896 core GPU, it only takes 0.06 second. 10 times faster than OPi5.) Important GPU Programing TechniquesGPU programing is basically array processing. To achieve maximum speedup, observe the followings; 
 High Performance OpenCL Program ExampleFollowing the two golden rules, we will describe how to write efficient OpenCL GPU programs. You should be able to write your OpenCL applications by copy-paste-change the following codes. First, include the following lines in your C/C++ program. The "#define" line is to set target OpenCL version. "120" means version 1.2.0. The "#include" is to set OpenCL header files. (Note that crimson letters are to be replaced with your contents. For documentation of OpenCL API functions, find from search engines with API function name as search keyword. ) 
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 120
#endif
#include <CL/cl.h>
#include <stdio.h>
[Variables] int verbose = 1; // to print error messages; // selected GPU device, context and command queue; cl_platform_id devicePlatform = NULL; cl_device_id deviceId = NULL; cl_context context = NULL; cl_command_queue commandQueue = NULL; // recommended local work group size; int maxWorkUnits = 16; // program and kernels; cl_program program = NULL; cl_kernel kernelArrayADD = NULL; cl_kernel kernelArraySUM = NULL; 
[Selecting GPU Device] 
int selectDevice(char *devicename) {
	int i, j;
	cl_int ret;
	cl_uint numPlatforms;
	cl_uint numDevices;
	// get platform IDs;
	ret = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clGetPlatformIDs() : %d\n", ret);
		return ret;
	}
	cl_platform_id platforms[numPlatforms];
	ret = clGetPlatformIDs(numPlatforms, platforms, NULL);
	size_t maxWItemSize3D[3];
	char local_dev_buf[250];
	cl_device_id devices[20]; // maximum number of GPU devices, say, 20;
	// search named device or the first GPU device if not specified;
    	for (i = 0; i < numPlatforms; i++) {
		ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
		if (CL_SUCCESS != ret) {
			continue;
		}
 		ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 
                	numDevices, devices, NULL);
		if (CL_SUCCESS != ret) {
			continue;
		}
       		for (j=0; j < numDevices; j++) {
            		ret = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 
				sizeof(local_dev_buf), local_dev_buf, NULL);
			if (CL_SUCCESS != ret) {
 				if (verbose) printf("Error clGetDeviceInfo() : %d\n", ret);
				return ret;
			}
			if (devicename==NULL || strcmp(devicename, local_dev_buf) == 0) {
         			ret = clGetDeviceInfo(devices[j], 
                  			CL_DEVICE_MAX_WORK_ITEM_SIZES, 
                   		 	sizeof(maxWItemSize3D), &maxWItemSize3D, NULL);
				// set prefered local group size==maxWorkUnits;
				maxWorkUnits = (int)maxWItemSize3D[2];
				deviceId = devices[j];
				devicePlatform = platforms[i];
				delete [] platforms;
				return 0;
			}
		}
	}
	if (verbose) {
		if (devicename==NULL) {
			printf("Error device not found.\n");
		} else {
			printf("Error device not found: %s\n", devicename);
		}
	}
	return -1;
}
On Windows and MacOS, you can find GPU device names from CMSR ML Studio. From the "File" menu, select "GPU Information". On Linux, the command "clinfo" will show GPU device names. Alternatively, use OpenCL Device Names. 
[Creating Context and Command Queue] The "#if ... #endif" block covers deprecated API starting from OpenCL 3.0.0. 
int createConextAndCommandQueue() {
	int ret;
	// create context;
	cl_context_properties props[3];
	props[0] = CL_CONTEXT_PLATFORM;
	props[1] = (cl_context_properties)devicePlatform;
	props[2] = 0;
	context = clCreateContext(props, 1, &deviceId, NULL, NULL, &ret);
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clCreateContext() : %d\n", ret);
		return ret;
	}
	// create command queue;
#if CL_TARGET_OPENCL_VERSION >= 300
	commandQueue = clCreateCommandQueueWithProperties(context, deviceId, NULL, &ret);
#else
	commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret);
#endif
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clCreateCommandQueueWithProperties() : %d\n", ret);
		return ret;
		return ret;
	}
	return 0;
}
[Compiling GPU Shading Program] 
int prepareProgramAndShadersWithData(char *programsource) {
	// calculate program source length;
	int leng = 0;
	while (true) {
		if (programsource[leng]==0) {
			break;
		}
		leng++;
	}
	size_t src_size = leng;
	int ret;
	// create and build program;
	program = clCreateProgramWithSource(context,
			1, (const char**)&programsource, &src_size, &ret);
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clCreateProgramWithSource() : %d\n", ret);
		return ret;
	}
	ret = (int)clBuildProgram(program, 1, &deviceId, NULL, NULL, NULL);	
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clBuildProgram() : %d\n", ret);
		return ret;
	}
	// create first kernel;
	kernelArrayADD = clCreateKernel(program, "ArrayADD", &ret);
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clCreateKernel() : %d\n", ret);
		return ret;
	}
	// create second kernel;
	kernelArraySUM = clCreateKernel(program, "ArraySUM", &ret);
	if (CL_SUCCESS != ret) {
 		if (verbose) printf("Error clCreateKernel() : %d\n", ret);
		return ret;
	}
	return 0;
}
[Shading Program] 
// compute element-wise sums of INPUT1 and INPUT2;
__kernel void ArrayADD ( 
	int  wid,
	int  hgt,
	__global float *INPUT1,
	__global float *INPUT2,
	__global float *OUTPUT
) {
	int totalwork = wid * hgt;
	int peIndex = get_global_id(0); // get process element index ID
	if (peIndex >= totalwork) { // to filter extras
		return;
	}
	OUTPUT[peIndex] = INPUT1[peIndex] + INPUT2[peIndex];
}
// compute row-wise sums of INPUT;
__kernel void ArraySUM (
	int  wid,
	int  hgt,
	__global float *INPUT,
	__global float *OUTPUT
) {
	int totalwork = hgt;
	int peIndex = get_global_id(0); // get process element index ID
	if (peIndex >= totalwork) { // to filter extras
		return;
	}
	int i, startindex;
	float sum = 0.0f;
	startindex = wid * peIndex;
	for (i = 0; i < wid; i++) { // compute the sum of the row;
		sum += INPUT[startindex + i];
	}
	OUTPUT[peIndex] = sum;
}
Fine Grained Data Parallel Computing: Notice that kernel functions compute (only) one output value. This means that each GPU core computes one output value! Consecutive GPU cores write to consecutive array elements, meaning consecutive memory locations without overlapping. This is to maximize GPU write efficiency. If you have multiple times output values than GPU cores, you might add loops into the above code to compute multiple output values. However our tests show that such method is slower than computing one output value. 
[Creating GPU Memory Blocks and Data Transfer] int ret = 0; int wid = 100; int hgt = 100; // input value arrays float input1[wid*hgt]; float input2[wid*hgt]; // populate input arrays. // create GPU memory array blocks with initial input data. // this will copy input1 and input2 data into GPU memory blocks. cl_mem memINPUT1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (wid*hgt), input1, &ret); cl_mem memINPUT2 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (wid*hgt), input2, &ret); // create GPU memory blocks for output values cl_mem memOUTPUT1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * (wid*hgt), NULL, &ret); cl_mem memOUTPUT2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * hgt, NULL, &ret); 
[Enqueueing GPU Jobs] int k = 0; cl_kernel kernel; size_t global_work_size; size_t local_work_size; int localunits, globalunits, totalworkunits; // enqueue first command; kernel = kernelArrayADD; k = 0; // set kernel parameter arguments; clSetKernelArg(kernel, k++, sizeof(cl_int), &wid); clSetKernelArg(kernel, k++, sizeof(cl_int), &hgt); clSetKernelArg(kernel, k++, sizeof(cl_mem), &memINPUT1); clSetKernelArg(kernel, k++, sizeof(cl_mem), &memINPUT2); clSetKernelArg(kernel, k++, sizeof(cl_mem), &memOUTPUT1); // Ideally, the value of "maxWorkUnits" should be the core size of each GPU computing unit. // However OpenCL does not support API to get this value. So it will be set to // the lowest dimension maximum size, when you select a device. // If you know the core size of each GPU computing unit, you can overide this value // as follows. For example, 16 is the core size of each computing unit for Orange Pi 5. // maxWorkUnits = 16; // determine local and global work sizes; totalworkunits = wid * hgt; // for each array element; localunits = maxWorkUnits; if (totalworkunits <= localunits) { localunits = totalworkunits; globalunits = totalworkunits; } else { globalunits = (((totalworkunits - 1) / localunits) + 1) * localunits; } global_work_size = globalunits; local_work_size = localunits; cl_int ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (CL_SUCCESS != ret) { printf("Enqueue failed: %d\n", ret); } // the following is to notify GPU to start processing. // This is optional and needed only after the first enqueue! clFlush(commandQueue); // enqueue second command; kernel = kernelArraySUM; k = 0; // set kernel parameter arguments; clSetKernelArg(kernel, k++, sizeof(cl_int), &wid); clSetKernelArg(kernel, k++, sizeof(cl_int), &hgt); clSetKernelArg(kernel, k++, sizeof(cl_mem), &memOUTPUT1); // output of previous command clSetKernelArg(kernel, k++, sizeof(cl_mem), &memOUTPUT2); // determine local and global work sizes; totalworkunits = hgt; // for each row; localunits = maxWorkUnits; if (totalworkunits <= localunits) { localunits = totalworkunits; globalunits = totalworkunits; } else { globalunits = (((totalworkunits - 1) / localunits) + 1) * localunits; } global_work_size = globalunits; local_work_size = localunits; cl_int ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (CL_SUCCESS != ret) { printf("Enqueue failed: %d\n", ret); } 
[Retrieving Final Result Values] float output[hgt]; // to retrieve value here! clFinish(commandQueue); // wait until both commands are completed; clEnqueueReadBuffer(commandQueue, memOUTPUT2, CL_TRUE, 0, sizeof(cl_float) * hgt, output, 0, NULL, NULL); 
[Repeating Execution] clEnqueueWriteBuffer(commandQueue, memINPUT1, CL_TRUE, 0, sizeof(cl_float) * (wid * hgt), input1, 0, NULL, NULL); clEnqueueWriteBuffer(commandQueue, memINPUT2, CL_TRUE, 0, sizeof(cl_float) * (wid * hgt), input2, 0, NULL, NULL); 
[Releasing Resources] // release GPU memory blocks; clReleaseMemObject(memINPUT1); clReleaseMemObject(memINPUT2); clReleaseMemObject(memOUTPUT1); clReleaseMemObject(memOUTPUT2); // release kernels, program, queue, and context; clReleaseKernel(kernelArrayADD); clReleaseKernel(kernelArraySUM); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); clReleaseContext(context); | |||
