Rosella Machine Intelligence & Data Mining
Two Golden Rules for GPU Programming

For certain applications, a GPU can unleash huge performance. But GPU calls have huge base and synchronization latency, so a GPU will perform poorly for most applications. Basically, GPUs are good for large array computations. Here are the two golden rules for GPU programming:

1. Fine-grained data parallel computing: each processing element computes a single output value, with consecutive processing elements writing to consecutive memory locations.
2. Do as much work as possible per GPU call: batch kernel jobs and minimize host-GPU data transfers and synchronization points.
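For illustration, here is a minimal pair of hypothetical kernels (ScaleCoalesced and ScaleStrided are illustrative names, not part of the example program below) showing what the first rule looks like in practice:

// coalesced: thread i writes element i, so adjacent threads touch
// adjacent memory and the hardware can combine writes into few transactions;
extern "C" __global__ void ScaleCoalesced(int n, float *in, float *out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = 2.0f * in[i];
}

// strided: thread i writes element i*stride, so adjacent threads touch
// distant memory and each write may become its own transaction;
extern "C" __global__ void ScaleStrided(int n, int stride, float *in, float *out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i * stride < n) out[i * stride] = 2.0f * in[i * stride];
}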
The first rule minimizes global memory write/read latency. The second rule minimizes base and synchronization latency. If you organize your program observing these two golden rules, you can gain huge GPU performance. Otherwise, performance will be poor! For OpenCL programming, see OpenCL GPU Programming Guide with Code.

Why can GPU outperform CPU?

There are three main reasons that a GPU can outperform a multicore CPU, resulting in speedups from dozens to tens of thousands of times.
Cuda Program Example

Following the two golden rules, we will describe how to write efficient Cuda GPU programs. You should be able to write your own Cuda applications by copying, pasting, and changing the following code. First, include the following line in your C/C++ program to set the Cuda header files. (Note that placeholder names are to be replaced with your own contents. For documentation of Cuda API functions, search the web with the API function name as the keyword.)

#include <cuda.h>
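The listings below repeat the same error-handling pattern after every API call. For brevity in your own code, the calls could be wrapped in a small helper macro; CHECK_CU is a hypothetical name and is not used in the listings below:

#include <stdio.h>
#include <cuda.h>

// print a readable message and return from the enclosing function on any Cuda driver API error;
#define CHECK_CU(call) \
    do { \
        CUresult err_ = (call); \
        if (err_ != CUDA_SUCCESS) { \
            const char *msg_ = NULL; \
            cuGetErrorString(err_, &msg_); \
            printf("Error %s : %d (%s)\n", #call, (int)err_, msg_ ? msg_ : "unknown"); \
            return (int)err_; \
        } \
    } while (0)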
[Variables]

bool verbose = true;             // to print error messages;

// selected GPU device, context and command queue;
CUdevice device;
CUcontext context;
// CUstream stream;              // the default stream (= command queue in OpenCL) will be used;

// recommended local work group size;
int maxWorkUnits = 16;

// program and kernels;
CUmodule program = NULL;
CUfunction kernelArrayADD = NULL;
CUfunction kernelArraySUM = NULL;
[Selecting GPU Device]

int selectDevice(int deviceno) {
    CUresult ret;
    cuInit(0);
    ret = cuDeviceGet(&device, deviceno);
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuDeviceGet() : %d\n", ret);
        return (int)ret;
    }
    // query a conservative work group size (the maximum Z block dimension, typically 64);
    ret = cuDeviceGetAttribute(&maxWorkUnits, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device);
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuDeviceGetAttribute() : %d\n", ret);
        return (int)ret;
    }
    if (verbose) printf("maxWorkUnits = %d\n", maxWorkUnits);
    return 0;
}
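If you do not know the device ordinal in advance, you can first enumerate the available devices. A minimal sketch (listDevices is a hypothetical helper) using cuDeviceGetCount() and cuDeviceGetName():

int listDevices() {
    int count = 0;
    char name[256];
    CUdevice dev;
    cuInit(0);
    if (CUDA_SUCCESS != cuDeviceGetCount(&count)) return -1;
    for (int i = 0; i < count; i++) {
        if (CUDA_SUCCESS != cuDeviceGet(&dev, i)) continue;
        if (CUDA_SUCCESS != cuDeviceGetName(name, sizeof(name), dev)) continue;
        printf("device %d : %s\n", i, name);    // print "ordinal : name" for each GPU;
    }
    return count;
}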
[Creating Context]

int createContext() {
    CUresult ret = cuCtxCreate(&context, 0, device);
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuCtxCreate() : %d\n", ret);
        return (int)ret;
    }
    return 0;
}
[Compiling GPU Shading Program]

int prepareProgramAndShadersWithData(char *ptxsource) {
    CUresult ret;
    ret = cuModuleLoadDataEx(&program, ptxsource, 0, 0, 0);
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuModuleLoadDataEx() : %d\n", ret);
        return ret;
    }
    ret = cuModuleGetFunction(&kernelArrayADD, program, "ArrayADD");
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuModuleGetFunction() : %d\n", ret);
        return ret;
    }
    ret = cuModuleGetFunction(&kernelArraySUM, program, "ArraySUM");
    if (CUDA_SUCCESS != ret) {
        if (verbose) printf("Error cuModuleGetFunction() : %d\n", ret);
        return ret;
    }
    return 0;
}
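cuModuleLoadDataEx() expects the PTX text as a null-terminated string in host memory. A minimal sketch for reading a .ptx file from disk (loadPtx is a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>

// read a whole .ptx file into a null-terminated, malloc'ed buffer;
char *loadPtx(const char *filename) {
    FILE *f = fopen(filename, "rb");
    if (f == NULL) return NULL;
    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);
    char *buf = (char *)malloc(size + 1);
    if (buf != NULL && fread(buf, 1, size, f) == (size_t)size) {
        buf[size] = '\0';                // the terminator marks the end of the PTX text;
    } else {
        free(buf);
        buf = NULL;
    }
    fclose(f);
    return buf;
}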
[Shading Program and PTX Source Creation]

// Cuda has no get_global_id(); this helper emulates OpenCL's for 1-D launches,
// so that the kernel bodies below stay identical to their OpenCL versions;
__device__ int get_global_id(int dim) {
    return blockIdx.x * blockDim.x + threadIdx.x;
}

// compute element-wise sums of INPUT1 and INPUT2;
extern "C" __global__ void ArrayADD(int wid, int hgt, float *INPUT1, float *INPUT2, float *OUTPUT) {
    int totalwork = wid * hgt;
    int peIndex = get_global_id(0);      // get process element index ID
    if (peIndex >= totalwork) {          // to filter extras
        return;
    }
    OUTPUT[peIndex] = INPUT1[peIndex] + INPUT2[peIndex];
}

// compute row-wise sums of INPUT;
extern "C" __global__ void ArraySUM(int wid, int hgt, float *INPUT, float *OUTPUT) {
    int totalwork = hgt;
    int peIndex = get_global_id(0);      // get process element index ID
    if (peIndex >= totalwork) {          // to filter extras
        return;
    }
    int i, startindex;
    float sum = 0.0f;
    startindex = wid * peIndex;
    for (i = 0; i < wid; i++) {          // compute the sum of the row;
        sum += INPUT[startindex + i];
    }
    OUTPUT[peIndex] = sum;
}

Fine-Grained Data Parallel Computing: Notice that each kernel function computes (only) one output value. This means that each GPU core computes one output value! Consecutive GPU cores write to consecutive array elements, that is, consecutive memory locations without overlapping. This maximizes GPU write efficiency. If you have many more output values than GPU cores, you might add loops to the above code so that each thread computes multiple output values. However, our tests show that such a method is slower than computing one output value per thread. Notice that except for the kernel function headers, the body source code is the same as OpenCL! Once your GPU program is ready, compile it into PTX assembly source code with the following OS command:

nvcc YourGPUprogramSource -ptx -o YourOutputName.ptx
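For reference, the loop-based variant mentioned above (which our tests found slower) would look roughly like this grid-stride sketch; ArrayADDLooped is a hypothetical name:

// each thread computes every (gridDim.x * blockDim.x)-th element,
// which keeps writes coalesced while covering more work per thread;
extern "C" __global__ void ArrayADDLooped(int wid, int hgt, float *INPUT1, float *INPUT2, float *OUTPUT) {
    int totalwork = wid * hgt;
    int stride = gridDim.x * blockDim.x;     // total number of launched threads;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < totalwork; i += stride) {
        OUTPUT[i] = INPUT1[i] + INPUT2[i];
    }
}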
[Creating GPU Memory Blocks and Data Transfer]

CUdeviceptr gpumemory;
int ret = 0;
int wid = 100;
int hgt = 100;

// input value arrays
float input1[wid*hgt];
float input2[wid*hgt];

// populate input arrays.

// create GPU memory array blocks with initial input data.
// this will copy input1 and input2 data into GPU memory blocks.
cuMemAlloc(&gpumemory, sizeof(float) * wid*hgt);
cuMemcpyHtoD(gpumemory, input1, sizeof(float) * wid*hgt);    // copy data to GPU
CUdeviceptr memINPUT1 = gpumemory;

cuMemAlloc(&gpumemory, sizeof(float) * wid*hgt);
cuMemcpyHtoD(gpumemory, input2, sizeof(float) * wid*hgt);    // copy data to GPU
CUdeviceptr memINPUT2 = gpumemory;

// create GPU memory blocks for output values
cuMemAlloc(&gpumemory, sizeof(float) * wid*hgt);             // ArrayADD writes wid*hgt elements;
CUdeviceptr memOUTPUT1 = gpumemory;

cuMemAlloc(&gpumemory, sizeof(float) * hgt);                 // ArraySUM writes hgt row sums;
CUdeviceptr memOUTPUT2 = gpumemory;
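Transfer latency can often be reduced further with page-locked (pinned) host buffers, which let the GPU DMA engine copy directly. A minimal sketch of the idea, assuming the same wid, hgt, and memINPUT1 as above:

float *pinned1 = NULL;
// page-locked host memory typically speeds up cuMemcpyHtoD()/cuMemcpyDtoH();
cuMemAllocHost((void **)&pinned1, sizeof(float) * wid * hgt);
// ... fill pinned1 with input data ...
cuMemcpyHtoD(memINPUT1, pinned1, sizeof(float) * wid * hgt);
cuMemFreeHost(pinned1);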
[Enqueueing GPU Jobs]

int localunits, globalunits, totalworkunits;

// set GPU kernel parameter value arrays;
void **pointerADD = new void*[5] {
    new int[1] {wid},
    new int[1] {hgt},
    new CUdeviceptr[1] {memINPUT1},
    new CUdeviceptr[1] {memINPUT2},
    new CUdeviceptr[1] {memOUTPUT1}
};
void **pointerSUM = new void*[4] {
    new int[1] {wid},
    new int[1] {hgt},
    new CUdeviceptr[1] {memOUTPUT1},
    new CUdeviceptr[1] {memOUTPUT2}
};

// enqueue the first ADD command;
totalworkunits = wid * hgt;              // for each array element;
localunits = maxWorkUnits;
if (totalworkunits <= localunits) {
    localunits = totalworkunits;
    globalunits = 1;                     // notice difference from OpenCL
} else {                                 // notice difference from OpenCL
    globalunits = (((totalworkunits - 1) / localunits) + 1);
}
ret = cuLaunchKernel(kernelArrayADD,
    globalunits, 1, 1,                   // grid dimension
    localunits, 1, 1,                    // block dimension
    0, NULL,                             // shared memory size and stream
    pointerADD, NULL                     // kernel- and extra parameters
);
if (CUDA_SUCCESS != ret) {
    if (verbose) printf("Error cuLaunchKernel() : %d\n", ret);
}

// enqueue the second SUM command;
totalworkunits = hgt;                    // for each row;
localunits = maxWorkUnits;
if (totalworkunits <= localunits) {
    localunits = totalworkunits;
    globalunits = 1;                     // notice difference from OpenCL
} else {                                 // notice difference from OpenCL
    globalunits = (((totalworkunits - 1) / localunits) + 1);
}
ret = cuLaunchKernel(kernelArraySUM,
    globalunits, 1, 1,                   // grid dimension
    localunits, 1, 1,                    // block dimension
    0, NULL,                             // shared memory size and stream
    pointerSUM, NULL                     // kernel- and extra parameters
);
if (CUDA_SUCCESS != ret) {
    if (verbose) printf("Error cuLaunchKernel() : %d\n", ret);
}
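The globalunits expression is just a ceiling division; it rounds totalworkunits / localunits up so that enough blocks are launched to cover every work item, and the peIndex check in the kernels filters the extras. Written as a hypothetical helper:

// smallest number of blocks of size localunits covering totalworkunits;
static int divideRoundingUp(int totalworkunits, int localunits) {
    return (totalworkunits + localunits - 1) / localunits;
}
// e.g. divideRoundingUp(10000, 16) == 625 and divideRoundingUp(10001, 16) == 626;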
[Retrieving Final Result Values]

float output[hgt];                       // to retrieve values here!
cuCtxSynchronize();                      // wait until both enqueued kernels finish;
cuMemcpyDtoH(output, memOUTPUT2, sizeof(float) * hgt);
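To see how much of the run time is base and synchronization latency versus kernel work, the enqueued jobs can be timed with Cuda events. A minimal sketch using cuEventRecord() and cuEventElapsedTime():

CUevent start, stop;
float ms = 0.0f;
cuEventCreate(&start, CU_EVENT_DEFAULT);
cuEventCreate(&stop, CU_EVENT_DEFAULT);
cuEventRecord(start, NULL);              // record on the default stream;
// ... enqueue the ADD and SUM kernels here, as above ...
cuEventRecord(stop, NULL);
cuEventSynchronize(stop);                // wait for the work between the events;
cuEventElapsedTime(&ms, start, stop);    // elapsed GPU time in milliseconds;
if (verbose) printf("GPU time = %f ms\n", ms);
cuEventDestroy(start);
cuEventDestroy(stop);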
[Repeating Execution]

// to run again with new input data, reuse the GPU memory blocks:
// copy the new inputs and enqueue the GPU jobs again as above;
cuMemcpyHtoD(memINPUT1, input1, sizeof(float) * (wid * hgt));
cuMemcpyHtoD(memINPUT2, input2, sizeof(float) * (wid * hgt));
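Putting the pieces together, a repeated run might look like the following sketch; numRuns and the refill step are hypothetical. Note that each iteration pays only one synchronization, in line with the second golden rule:

int numRuns = 10;                        // hypothetical number of repetitions;
for (int run = 0; run < numRuns; run++) {
    // ... refill input1/input2 with this run's data ...
    cuMemcpyHtoD(memINPUT1, input1, sizeof(float) * (wid * hgt));
    cuMemcpyHtoD(memINPUT2, input2, sizeof(float) * (wid * hgt));
    // enqueue both kernels back-to-back without synchronizing in between;
    cuLaunchKernel(kernelArrayADD, (wid*hgt + maxWorkUnits - 1) / maxWorkUnits, 1, 1,
                   maxWorkUnits, 1, 1, 0, NULL, pointerADD, NULL);
    cuLaunchKernel(kernelArraySUM, (hgt + maxWorkUnits - 1) / maxWorkUnits, 1, 1,
                   maxWorkUnits, 1, 1, 0, NULL, pointerSUM, NULL);
    cuCtxSynchronize();                  // one synchronization per iteration;
    cuMemcpyDtoH(output, memOUTPUT2, sizeof(float) * hgt);
    // ... consume output ...
}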
[Releasing Resources]

// release GPU memory blocks;
cuMemFree(memINPUT1);
cuMemFree(memINPUT2);
cuMemFree(memOUTPUT1);
cuMemFree(memOUTPUT2);

// release program and context;
cuModuleUnload(program);
cuCtxDestroy(context);