OpenCL: Difference between revisions

Language
Watch
View source

@@ Line 1: / Line 1: @@
 __FORCETOC__
-==Installing==
+==Installation==
 ===Windows===
-If you're using an Nvidia GPU, install the Nvidia Cuda Toolkit.
+If you're using an NVIDIA GPU, install the [https://developer.nvidia.com/cuda-toolkit CUDA Toolkit].
 ===Linux===
 https://gist.github.com/Brainiarc7/dc80b023af5b4e0d02b33923de7ba1ed
+<pre>
+sudo apt install ocl-icd-opencl-dev opencl-headers
+sudo apt install opencl-c-headers opencl-clhpp-headers
+</pre>
-===Usage===
+==Getting Started==
+===Compiling===
+OpenCL kernels are compiled at runtime. All you have to do is link OpenCL when compiling your program and include your kernels in your program. For <code>gcc</code> just add flag <code>-lOpenCL</code>
+===C===
 See https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/
+{{hidden | C example |
+vector_add_kernel.cl
+<syntaxhighlight lang="c">
+__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    int i = get_global_id(0);
+    // Do the operation
+    C[i] = A[i] + B[i];
+}
+</syntaxhighlight>
+<syntaxhighlight lang="c">
+#include <stdio.h>
+#include <stdlib.h>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+#define MAX_SOURCE_SIZE (0x100000)
+int main(void) {
+    // Create the two input vectors
+    int i;
+    const int LIST_SIZE = 1024;
+    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
+    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
+    for(i = 0; i < LIST_SIZE; i++) {
+        A[i] = i;
+        B[i] = LIST_SIZE - i;
+    }
+    // Load the kernel source code into the array source_str
+    FILE *fp;
+    char *source_str;
+    size_t source_size;
+    fp = fopen("vector_add_kernel.cl", "r");
+    if (!fp) {
+        fprintf(stderr, "Failed to load kernel.\n");
+        exit(1);
+    }
+    source_str = (char*)malloc(MAX_SOURCE_SIZE);
+    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
+    fclose( fp );
+    // Get platform and device information
+    cl_platform_id platform_id = NULL;
+    cl_device_id device_id = NULL;
+    cl_uint ret_num_devices;
+    cl_uint ret_num_platforms;
+    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
+    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
+            &device_id, &ret_num_devices);
+    // Create an OpenCL context
+    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
+    // Create a command queue
+    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
+    // Create memory buffers on the device for each vector
+    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
+            LIST_SIZE * sizeof(int), NULL, &ret);
+    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
+            LIST_SIZE * sizeof(int), NULL, &ret);
+    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+            LIST_SIZE * sizeof(int), NULL, &ret);
+    // Copy the lists A and B to their respective memory buffers
+    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
+            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
+    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
+            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
+    // Create a program from the kernel source
+    cl_program program = clCreateProgramWithSource(context, 1,
+            (const char **)&source_str, (const size_t *)&source_size, &ret);
+    // Build the program
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+    // Create the OpenCL kernel
+    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
+    // Set the arguments of the kernel
+    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
+    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
+    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
+    // Execute the OpenCL kernel on the list
+    size_t global_item_size = LIST_SIZE; // Process the entire lists
+    size_t local_item_size = 64; // Divide work items into groups of 64
+    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+            &global_item_size, &local_item_size, 0, NULL, NULL);
+    // Read the memory buffer C on the device to the local variable C
+    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
+    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
+            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
+    // Display the result to the screen
+    for(i = 0; i < LIST_SIZE; i++)
+        printf("%d + %d = %d\n", A[i], B[i], C[i]);
+    // Clean up
+    ret = clFlush(command_queue);
+    ret = clFinish(command_queue);
+    ret = clReleaseKernel(kernel);
+    ret = clReleaseProgram(program);
+    ret = clReleaseMemObject(a_mem_obj);
+    ret = clReleaseMemObject(b_mem_obj);
+    ret = clReleaseMemObject(c_mem_obj);
+    ret = clReleaseCommandQueue(command_queue);
+    ret = clReleaseContext(context);
+    free(A);
+    free(B);
+    free(C);
+    return 0;
+}
+</syntaxhighlight>
+}}
+===C++===
+[https://github.khronos.org/OpenCL-CLHPP/index.html#intro C++ Bindings]<br>
+While you can use the C bindings in your C++ application, Khronos also provides a set of C++ bindings in <code>CL/cl.hpp</code> (or <code>CL/cl2.hpp</code>) which are much easier to use alongside std containers such as <code>std::vector</code>.
+When using C++ bindings, you also do not need to worry about releasing buffers since these are reference-counted.
+{{hidden | C++ example |
+<syntaxhighlight lang="cpp">
+#include <CL/cl.hpp>
+#include <fstream>
+#include <iostream>
+int main(void) {
+  int ret = 0;
+  // Create the two input vectors
+  const int LIST_SIZE = 1024;
+  std::vector<int> A(LIST_SIZE);
+  std::vector<int> B(LIST_SIZE);
+  for (int i = 0; i < LIST_SIZE; i++) {
+    A[i] = i;
+    B[i] = LIST_SIZE - i;
+  }
+  // Load the kernel source code into the string source_str
+  std::string source_str;
+  {
+    std::ifstream file("vector_add_kernel.cl");
+    file.seekg(0, std::ios::end);
+    source_str.resize(file.tellg());
+    file.seekg(0, std::ios::beg);
+    file.read(&source_str[0], source_str.size());
+  }
+  // Get platform and device information
+  std::vector<cl::Platform> platforms;
+  ret = cl::Platform::get(&platforms);
+  std::vector<cl::Device> devices;
+  ret = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  // Create an OpenCL context
+  cl::Context context(devices[0], NULL, NULL, NULL, &ret);
+  // Create a command queue
+  cl::CommandQueue command_queue(context, devices[0], 0UL, &ret);
+  // Create memory buffers on the device for each vector
+  cl::Buffer a_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
+  cl::Buffer b_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
+  cl::Buffer c_mem_obj(context, CL_MEM_READ_WRITE, LIST_SIZE * sizeof(int));
+  // Copy the lists A and B to their respective memory buffers
+  ret = cl::copy(command_queue, A.begin(), A.end(), a_mem_obj);
+  ret = cl::copy(command_queue, B.begin(), B.end(), b_mem_obj);
+  // Create a program from the kernel source
+  cl::Program program(context, source_str);
+  // Build the program
+  ret = program.build(std::vector<cl::Device>{devices[0]});
+  if (ret != CL_SUCCESS) {
+    std::cerr << "Error building program" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // Create the OpenCL kernel
+  cl::Kernel kernel(program, "vector_add", &ret);
+  if (ret != CL_SUCCESS) {
+    std::cerr << "Error creating kernel" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // Set the arguments of the kernel
+  ret = kernel.setArg(0, sizeof(cl_mem), &a_mem_obj());
+  ret = kernel.setArg(1, sizeof(cl_mem), &b_mem_obj());
+  ret = kernel.setArg(2, sizeof(cl_mem), &c_mem_obj());
+  // Execute the OpenCL kernel on the list
+  cl::NDRange global_item_size(LIST_SIZE); // Process the entire lists
+  cl::NDRange local_item_size(64); // Divide work items into groups of 64
+  ret = command_queue.enqueueNDRangeKernel(kernel, 0, global_item_size,
+                                           local_item_size, NULL, NULL);
+  if (ret != CL_SUCCESS) {
+    std::cerr << "Error starting kernel" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // Read the memory buffer C on the device to the local variable C
+  std::vector<int> C(LIST_SIZE);
+  ret = cl::copy(command_queue, c_mem_obj, C.begin(), C.end());
+  if (ret != CL_SUCCESS) {
+    std::cerr << "Error copying C from gpu to memory " << ret << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // Display the result to the screen
+  for (int i = 0; i < LIST_SIZE; i++)
+    printf("%d + %d = %d\n", A[i], B[i], C[i]);
+  return 0;
+}
+</syntaxhighlight>
+}}
+===Python===
+See [https://documen.tician.de/pyopencl/index.html pyopencl].
+===Julia===
+See [https://github.com/JuliaGPU/OpenCL.jl OpenCL.jl].
+==Usage==
+===Scalar Types===
+[https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/scalarDataTypes.html OpenCL 1.2 Scalar Data Types]<br>
+While all OpenCL devices support single-precision floats, not all support double-precision doubles.<br>
+===Vector Types===
+[https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/dataTypes.html OpenCL Data Types]<br>
+[https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/vectorDataTypes.html OpenCL 1.2 Vector Data Types]<br>
+Just like glsl, OpenCL supports vector types such
+<syntaxhighlight lang="c">float3 my_vec = (float3)(1.0);</syntaxhighlight>
+where its elements are accessed using x,y,z as <code>my_vec.x</code>.<br>
+To convert between vector types, use <code>convert_T()</code><br>
+;Notes
+* 3-component data types are aligned to 4 components. I.e. an array of <code>uchar3</code> with 4 elements will be equivalent to an array of <code>uchar4</code> with 4 elements.
+==OpenGL Interop==
+Setting up OpenCL/OpenGL interop is fairly complicated and very hard to debug.
+You will also need to manage synchronizing OpenGL/OpenCL so they do not access the same memory at the same time.
+If you can, just use OpenGL compute shaders rather than OpenCL to simplify your life.
+===Textures===
+See [https://software.intel.com/content/www/us/en/develop/articles/opencl-and-opengl-interoperability-tutorial.html OpenCL™ and OpenGL* Interoperability Tutoria].
+In C++, you can use [https://github.khronos.org/OpenCL-CLHPP/classcl_1_1_image_g_l.html <code>cl::ImageGL</code>] to access textures in OpenGL.
+Note that <code>cl::Image</code> and <code>cl::Buffer</code> are not the same thing. Interchanging them will result in <code>CL_INVALID_MEM_OBJECT</code> errors or similar.
+I recommend writing to a separate buffer and copying to images.
+See [https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clCreateFromGLTexture.html clCreateFromGLTexture] to get a list of compatible pixel formats.
+If in doubt, use <code>GL_RGBA8</code> which is the most likely format to be supported.
+===Buffers===
+[https://web.engr.oregonstate.edu/~mjb/cs575/Handouts/opencl.opengl.vbo.1pp.pdf Oregon State VBO Interop]
+[https://github.khronos.org/OpenCL-CLHPP/classcl_1_1_buffer_g_l.html cl::BufferGL]
+==Advanced Topics==
+====Local Memory v. Global Memory====
+[[Category:Programming languages]]
+[[Category:GPU Programming languages]]

Retrieved from "https://wiki.davidl.me/view/OpenCL"

Languages

This page is not available in other languages.

David's Wiki

Privacy policy
About David's Wiki
Disclaimers
Desktop