OpenCL: Difference between revisions
No edit summary |
|||
Line 290: | Line 290: | ||
==Advanced Topics== | ==Advanced Topics== | ||
====Local Memory v. Global Memory==== | ====Local Memory v. Global Memory==== | ||
[[Category:Programming languages]] |
Revision as of 16:45, 31 January 2022
Installation
Windows
If you're using an NVIDIA GPU, install the CUDA Toolkit.
Linux
https://gist.github.com/Brainiarc7/dc80b023af5b4e0d02b33923de7ba1ed
sudo apt install ocl-icd-opencl-dev opencl-headers sudo apt install opencl-c-headers opencl-clhpp-headers
Getting Started
Compiling
OpenCL kernels are compiled at runtime. All you have to do is link OpenCL when compiling your program and include your kernels in your program. For gcc
just add flag -lOpenCL
C
See https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/
vector_add_kernel.cl
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
#include <stdio.h>
#include <stdlib.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 1024;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL; <br />
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
C++
C++ Bindings
While you can use the C bindings in your C++ application, Khronos also provides a set of C++ bindings in CL/cl.hpp
(or CL/cl2.hpp
) which are much easier to use alongside std containers such as std::vector
.
When using C++ bindings, you also do not need to worry about releasing buffers since these are reference-counted.
#include <CL/cl.hpp>
#include <fstream>
#include <iostream>
int main(void) {
int ret = 0;
// Create the two input vectors
const int LIST_SIZE = 1024;
std::vector<int> A(LIST_SIZE);
std::vector<int> B(LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the string source_str
std::string source_str;
{
std::ifstream file("vector_add_kernel.cl");
file.seekg(0, std::ios::end);
source_str.resize(file.tellg());
file.seekg(0, std::ios::beg);
file.read(&source_str[0], source_str.size());
}
// Get platform and device information
std::vector<cl::Platform> platforms;
ret = cl::Platform::get(&platforms);
std::vector<cl::Device> devices;
ret = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
// Create an OpenCL context
cl::Context context(devices[0], NULL, NULL, NULL, &ret);
// Create a command queue
cl::CommandQueue command_queue(context, devices[0], 0UL, &ret);
// Create memory buffers on the device for each vector
cl::Buffer a_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
cl::Buffer b_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
cl::Buffer c_mem_obj(context, CL_MEM_READ_WRITE, LIST_SIZE * sizeof(int));
// Copy the lists A and B to their respective memory buffers
ret = cl::copy(command_queue, A.begin(), A.end(), a_mem_obj);
ret = cl::copy(command_queue, B.begin(), B.end(), b_mem_obj);
// Create a program from the kernel source
cl::Program program(context, source_str);
// Build the program
ret = program.build(std::vector<cl::Device>{devices[0]});
if (ret != CL_SUCCESS) {
std::cerr << "Error building program" << std::endl;
exit(EXIT_FAILURE);
}
// Create the OpenCL kernel
cl::Kernel kernel(program, "vector_add", &ret);
if (ret != CL_SUCCESS) {
std::cerr << "Error creating kernel" << std::endl;
exit(EXIT_FAILURE);
}
// Set the arguments of the kernel
ret = kernel.setArg(0, sizeof(cl_mem), &a_mem_obj());
ret = kernel.setArg(1, sizeof(cl_mem), &b_mem_obj());
ret = kernel.setArg(2, sizeof(cl_mem), &c_mem_obj());
// Execute the OpenCL kernel on the list
cl::NDRange global_item_size(LIST_SIZE); // Process the entire lists
cl::NDRange local_item_size(64); // Divide work items into groups of 64
ret = command_queue.enqueueNDRangeKernel(kernel, 0, global_item_size,
local_item_size, NULL, NULL);
if (ret != CL_SUCCESS) {
std::cerr << "Error starting kernel" << std::endl;
exit(EXIT_FAILURE);
}
// Read the memory buffer C on the device to the local variable C
std::vector<int> C(LIST_SIZE);
ret = cl::copy(command_queue, c_mem_obj, C.begin(), C.end());
if (ret != CL_SUCCESS) {
std::cerr << "Error copying C from gpu to memory " << ret << std::endl;
exit(EXIT_FAILURE);
}
// Display the result to the screen
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
return 0;
}
Python
See pyopencl.
Julia
See OpenCL.jl.
Usage
Scalar Types
OpenCL 1.2 Scalar Data Types
While all OpenCL devices support single-precision floats, not all support double-precision doubles.
Vector Types
OpenCL Data Types
OpenCL 1.2 Vector Data Types
Just like glsl, OpenCL supports vector types such
float3 my_vec = (float3)(1.0);
where its elements are accessed using x,y,z as my_vec.x
.
To convert between vector types, use convert_T()
- Notes
- 3-component data types are aligned to 4 components. I.e. an array of
uchar3
with 4 elements will be equivalent to an array ofuchar4
with 4 elements.
OpenGL Interop
Setting up OpenCL/OpenGL interop is fairly complicated and very hard to debug.
You will also need to manage synchronizing OpenGL/OpenCL so they do not access the same memory at the same time.
If you can, just use OpenGL compute shaders rather than OpenCL to simplify your life.
Textures
See OpenCL™ and OpenGL* Interoperability Tutoria.
In C++, you can use cl::ImageGL
to access textures in OpenGL.
Note that cl::Image
and cl::Buffer
are not the same thing. Interchanging them will result in CL_INVALID_MEM_OBJECT
errors or similar.
I recommend writing to a separate buffer and copying to images.
See clCreateFromGLTexture to get a list of compatible pixel formats.
If in doubt, use GL_RGBA8
which is the most likely format to be supported.
Buffers
Oregon State VBO Interop
cl::BufferGL