OpenCL: Difference between revisions
Line 246: | Line 246: | ||
==Usage== | ==Usage== | ||
===Types=== | ===Scalar Types=== | ||
[https://www.khronos.org/registry/OpenCL/sdk/1. | [https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/scalarDataTypes.html OpenCL 1.2 Scalar Data Types]<br> | ||
While all OpenCL devices support single-precision floats, not all support double-precision doubles.<br> | While all OpenCL devices support single-precision floats, not all support double-precision doubles.<br> | ||
===Vector Types=== | |||
[https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/dataTypes.html OpenCL Data Types]<br> | |||
[https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/vectorDataTypes.html OpenCL 1.2 Vector Data Types]<br> | |||
Just like glsl, OpenCL supports vector types such | Just like glsl, OpenCL supports vector types such | ||
<syntaxhighlight lang="c">float3 my_vec = (float3)(1.0);</syntaxhighlight> | <syntaxhighlight lang="c">float3 my_vec = (float3)(1.0);</syntaxhighlight> | ||
where its elements are accessed using x,y,z as <code>my_vec.x</code>.<br> | where its elements are accessed using x,y,z as <code>my_vec.x</code>.<br> | ||
To convert between vector types, use <code>convert_T()</code><br> | To convert between vector types, use <code>convert_T()</code><br> | ||
;Notes | |||
* 3-component data types are aligned to 4 components. I.e. an array of <code>uchar3</code> with 4 elements will be equivalent to an array of <code>uchar4</code> with 4 elements. | |||
==Advanced Topics== | ==Advanced Topics== | ||
====Local Memory v. Global Memory==== | ====Local Memory v. Global Memory==== |
Revision as of 13:13, 30 March 2020
Installation
Windows
If you're using an NVIDIA GPU, install the CUDA Toolkit.
Linux
https://gist.github.com/Brainiarc7/dc80b023af5b4e0d02b33923de7ba1ed
Getting Started
Compiling
OpenCL kernels are compiled at runtime. All you have to do is link OpenCL when compiling your program and include your kernels in your program. For gcc
just add flag -lOpenCL
C
See https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/
vector_add_kernel.cl
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
#include <stdio.h>
#include <stdlib.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 1024;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL; <br />
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
C++
C++ Bindings
While you can use the C bindings in your C++ application, Khronos also provides a set of C++ bindings in CL/cl.hpp
(or CL/cl2.hpp
) which are much easier to use alongside std containers such as std::vector
. When using C++ bindings, you also do not need to worry about releasing buffers since these are reference-counted.
#include <CL/cl.hpp>
#include <fstream>
#include <iostream>
int main(void) {
int ret = 0;
// Create the two input vectors
const int LIST_SIZE = 1024;
std::vector<int> A(LIST_SIZE);
std::vector<int> B(LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the string source_str
std::string source_str;
{
std::ifstream file("vector_add_kernel.cl");
file.seekg(0, std::ios::end);
source_str.resize(file.tellg());
file.seekg(0, std::ios::beg);
file.read(&source_str[0], source_str.size());
}
// Get platform and device information
std::vector<cl::Platform> platforms;
ret = cl::Platform::get(&platforms);
std::vector<cl::Device> devices;
ret = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
// Create an OpenCL context
cl::Context context(devices[0], NULL, NULL, NULL, &ret);
// Create a command queue
cl::CommandQueue command_queue(context, devices[0], 0UL, &ret);
// Create memory buffers on the device for each vector
cl::Buffer a_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
cl::Buffer b_mem_obj(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int));
cl::Buffer c_mem_obj(context, CL_MEM_READ_WRITE, LIST_SIZE * sizeof(int));
// Copy the lists A and B to their respective memory buffers
ret = cl::copy(command_queue, A.begin(), A.end(), a_mem_obj);
ret = cl::copy(command_queue, B.begin(), B.end(), b_mem_obj);
// Create a program from the kernel source
cl::Program program(context, source_str);
// Build the program
ret = program.build(std::vector<cl::Device>{devices[0]});
if (ret != CL_SUCCESS) {
std::cerr << "Error building program" << std::endl;
exit(EXIT_FAILURE);
}
// Create the OpenCL kernel
cl::Kernel kernel(program, "vector_add", &ret);
if (ret != CL_SUCCESS) {
std::cerr << "Error creating kernel" << std::endl;
exit(EXIT_FAILURE);
}
// Set the arguments of the kernel
ret = kernel.setArg(0, sizeof(cl_mem), &a_mem_obj());
ret = kernel.setArg(1, sizeof(cl_mem), &b_mem_obj());
ret = kernel.setArg(2, sizeof(cl_mem), &c_mem_obj());
// Execute the OpenCL kernel on the list
cl::NDRange global_item_size(LIST_SIZE); // Process the entire lists
cl::NDRange local_item_size(64); // Divide work items into groups of 64
ret = command_queue.enqueueNDRangeKernel(kernel, 0, global_item_size,
local_item_size, NULL, NULL);
if (ret != CL_SUCCESS) {
std::cerr << "Error starting kernel" << std::endl;
exit(EXIT_FAILURE);
}
// Read the memory buffer C on the device to the local variable C
std::vector<int> C(LIST_SIZE);
ret = cl::copy(command_queue, c_mem_obj, C.begin(), C.end());
if (ret != CL_SUCCESS) {
std::cerr << "Error copying C from gpu to memory " << ret << std::endl;
exit(EXIT_FAILURE);
}
// Display the result to the screen
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
return 0;
}
Julia
See OpenCL.jl.
Usage
Scalar Types
OpenCL 1.2 Scalar Data Types
While all OpenCL devices support single-precision floats, not all support double-precision doubles.
Vector Types
OpenCL Data Types
OpenCL 1.2 Vector Data Types
Just like glsl, OpenCL supports vector types such
float3 my_vec = (float3)(1.0);
where its elements are accessed using x,y,z as my_vec.x
.
To convert between vector types, use convert_T()
- Notes
- 3-component data types are aligned to 4 components. I.e. an array of
uchar3
with 4 elements will be equivalent to an array ofuchar4
with 4 elements.