OpenCL

Installation

Windows

If you're using an NVIDIA GPU, install the CUDA Toolkit.

Linux

https://gist.github.com/Brainiarc7/dc80b023af5b4e0d02b33923de7ba1ed

sudo apt install ocl-icd-opencl-dev opencl-headers
sudo apt install opencl-c-headers opencl-clhpp-headers

Getting Started

Compiling

OpenCL kernels are compiled at runtime. All you have to do is link OpenCL when compiling your program and include your kernels in your program. For gcc just add flag -lOpenCL

C

See https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/

C example

vector_add_kernel.cl

__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
 
    // Get the index of the current element to be processed
    int i = get_global_id(0);
 
    // Do the operation
    C[i] = A[i] + B[i];
}

#include <stdio.h>
#include <stdlib.h>
 
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
 
#define MAX_SOURCE_SIZE (0x100000)
 
int main(void) {
    // Create the two input vectors
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }
 
    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
 
    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
 
    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL; <br />
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, 
            &device_id, &ret_num_devices);
 
    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
 
    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
 
    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);
 
    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
 
    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
 
    // Execute the OpenCL kernel on the list
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 64; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
 
    // Read the memory buffer C on the device to the local variable C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
 
    // Display the result to the screen
    for(i = 0; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
 
    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}

C++

C++ Bindings
While you can use the C bindings in your C++ application, Khronos also provides a set of C++ bindings in CL/cl.hpp (or CL/cl2.hpp) which are much easier to use alongside std containers such as std::vector.
When using C++ bindings, you also do not need to worry about releasing buffers since these are reference-counted.

C++ example

#include <CL/cl.hpp>
#include <fstream>
#include <iostream>

int main(void) {
  int ret = 0;
  // Create the two input vectors
  const int LIST_SIZE = 1024;
  std::vector<int> A(LIST_SIZE);
  std::vector<int> B(LIST_SIZE);
  for (int i = 0; i < LIST_SIZE; i++) {
    A[i] = i;
    B[i] = LIST_SIZE - i;
  }

  // Load the kernel source code into the string source_str
  std::string source_str;
  {
    std::ifstream file("vector_add_kernel.cl");
    file.seekg(0, std::ios::end);
    source_str.resize(file.tellg());
    file.seekg(0, std::ios::beg);
    file.read(&source_str[0], source_str.size());
  }

  // Get platform and device information
  std::vector<cl::Platform> platforms;
  ret = cl::Platform::get(&platforms);

  std::vector<cl::Device> devices;
  ret = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);

  // Create an OpenCL context
  cl::Context context(devices[0], NULL, NULL, NULL, &ret);

  // Create a command queue
  cl::CommandQueue command_queue(context, devices[0], 0UL, &ret);

  // Create memory buffers on the device for each vector
  cl::Buffer a_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
  cl::Buffer b_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
  cl::Buffer c_mem_obj(context, CL_MEM_READ_WRITE, LIST_SIZE * sizeof(int));

  // Copy the lists A and B to their respective memory buffers
  ret = cl::copy(command_queue, A.begin(), A.end(), a_mem_obj);
  ret = cl::copy(command_queue, B.begin(), B.end(), b_mem_obj);

  // Create a program from the kernel source
  cl::Program program(context, source_str);

  // Build the program
  ret = program.build(std::vector<cl::Device>{devices[0]});
  if (ret != CL_SUCCESS) {
    std::cerr << "Error building program" << std::endl;
    exit(EXIT_FAILURE);
  }

  // Create the OpenCL kernel
  cl::Kernel kernel(program, "vector_add", &ret);
  if (ret != CL_SUCCESS) {
    std::cerr << "Error creating kernel" << std::endl;
    exit(EXIT_FAILURE);
  }

  // Set the arguments of the kernel
  ret = kernel.setArg(0, sizeof(cl_mem), &a_mem_obj());
  ret = kernel.setArg(1, sizeof(cl_mem), &b_mem_obj());
  ret = kernel.setArg(2, sizeof(cl_mem), &c_mem_obj());

  // Execute the OpenCL kernel on the list
  cl::NDRange global_item_size(LIST_SIZE); // Process the entire lists
  cl::NDRange local_item_size(64); // Divide work items into groups of 64
  ret = command_queue.enqueueNDRangeKernel(kernel, 0, global_item_size,
                                           local_item_size, NULL, NULL);
  if (ret != CL_SUCCESS) {
    std::cerr << "Error starting kernel" << std::endl;
    exit(EXIT_FAILURE);
  }

  // Read the memory buffer C on the device to the local variable C
  std::vector<int> C(LIST_SIZE);
  ret = cl::copy(command_queue, c_mem_obj, C.begin(), C.end());
  if (ret != CL_SUCCESS) {
    std::cerr << "Error copying C from gpu to memory " << ret << std::endl;
    exit(EXIT_FAILURE);
  }

  // Display the result to the screen
  for (int i = 0; i < LIST_SIZE; i++)
    printf("%d + %d = %d\n", A[i], B[i], C[i]);

  return 0;
}

Python

See pyopencl.

Julia

See OpenCL.jl.

Usage

Scalar Types

OpenCL 1.2 Scalar Data Types
While all OpenCL devices support single-precision floats, not all support double-precision doubles.

Vector Types

OpenCL Data Types
OpenCL 1.2 Vector Data Types
Just like glsl, OpenCL supports vector types such

float3 my_vec = (float3)(1.0);

where its elements are accessed using x,y,z as my_vec.x.
To convert between vector types, use convert_T()

Notes

3-component data types are aligned to 4 components. I.e. an array of uchar3 with 4 elements will be equivalent to an array of uchar4 with 4 elements.

OpenGL Interop

Setting up OpenCL/OpenGL interop is fairly complicated and very hard to debug.
You will also need to manage synchronizing OpenGL/OpenCL so they do not access the same memory at the same time.
If you can, just use OpenGL compute shaders rather than OpenCL to simplify your life.

Textures

See OpenCL™ and OpenGL* Interoperability Tutoria.

In C++, you can use cl::ImageGL to access textures in OpenGL.
Note that cl::Image and cl::Buffer are not the same thing. Interchanging them will result in CL_INVALID_MEM_OBJECT errors or similar. I recommend writing to a separate buffer and copying to images.

See clCreateFromGLTexture to get a list of compatible pixel formats.
If in doubt, use GL_RGBA8 which is the most likely format to be supported.

Buffers

Oregon State VBO Interop
cl::BufferGL

Advanced Topics

Local Memory v. Global Memory