How I built an OpenCL binary for the Nexus 4

It turns out that the Nexus 4 does, in fact, have the OpenCL library installed. It also turns out that, with a little work, it’s possible to compile and run OpenCL code directly on the Nexus 4. Curious how to make it happen? Follow me…

First of all, make sure you have the Android NDK installed. I built a standalone toolchain (see including the Clang 3.1 compiler.

You’ll need the OpenCL headers. You can get those from the Khronos OpenCL API registry. I fetched all of the Version 1.2 headers, including the C++ bindings. You’ll need to patch a few lines to make it work with the NDK, mostly stuff related to SSE2 instructions and OpenGL. (If you have an OpenGL SDK, you can probably point it in the right direction, but I’m sticking to pure OpenCL for now.) The following should work:

--- CL-orig/cl.hpp	2013-02-12 03:30:29.000000000 -0800
+++ CL-patched/cl.hpp	2013-02-28 16:26:36.000000000 -0800
@@ -172,7 +172,7 @@
 #include <OpenCL/opencl.h>
 #include <libkern/OSAtomic.h>
 #else
-#include <GL/gl.h>
+//#include <GL/gl.h>
 #include <CL/opencl.h>
 #endif // !__APPLE__

@@ -212,8 +212,8 @@
 #if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
 #include <alloca.h>

-#include <emmintrin.h>
-#include <xmmintrin.h>
+//#include <emmintrin.h>
+//#include <xmmintrin.h>
 #endif // linux

 #include <cstring>
@@ -1035,7 +1035,11 @@
 #endif // !_WIN32
     }

+#ifdef __SSE2__
     inline void fence() { _mm_mfence(); }
+#else
+    inline void fence() { } // NOOP
+#endif
 }; // namespace detail

Next, go fetch libOpenCL.so from the phone:

adb pull /system/lib/libOpenCL.so

We now have an include/CL directory, and a lib directory.

Here’s my little sample OpenCL program. I’ll be compiling it using the C++ bindings, just to verify that it works all the way through, but you’ll notice that this is using just straight C.:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <GLES2/gl2.h>
#include <CL/cl.hpp>

static const size_t kDataSize = 1E6;
static const float kEpsilon = 0.0001f;

const char *KernelSource =                 "\n" \
	"__kernel void square(                    \n" \
	"   __global const float* input,          \n" \
	"   __global float* output,               \n" \
	"   const unsigned int count)             \n" \
	"{                                        \n" \
	"   int i = get_global_id(0);             \n" \
	"   if(i < count) {                       \n" \
	"       float temp = input[i];            \n" \
	"       output[i] = temp * temp;          \n" \
	"   }                                     \n" \
	"}                                        \n" \
	"\n";

int main()
{
	cl_int err;

	size_t global;
	size_t local;

	cl_platform_id cpPlatform;
	cl_device_id device_id;
	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;

	err = clGetPlatformIDs(1, &cpPlatform, NULL);
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to find a platform: %d\n", err);
		return EXIT_FAILURE;
	}

	err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to create a device group: %d\n", err);
		return EXIT_FAILURE;
	}

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (!context) {
		fprintf(stderr, "Error: Failed to create a compute context: %d\n", err);
		return EXIT_FAILURE;
	}

	commands = clCreateCommandQueue(context, device_id, 0, &err);
	if (!commands) {
		fprintf(stderr, "Error: Failed to create a command queue: %d\n", err);
		return EXIT_FAILURE;
	}

	program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
	if (!program) {
		fprintf(stderr, "Error: Failed to create a compute program: %d\n", err);
		return EXIT_FAILURE;
	}

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		fprintf(stderr, "Error: Failed to create a program executable: %d\n", err);
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		fprintf(stderr, "%s\n", buffer);
		return EXIT_FAILURE;
	}

	kernel = clCreateKernel(program, "square", &err);
	if (!kernel || err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to create a compute kernel: %d\n", err);
		return EXIT_FAILURE;
	}

	float *data = (float *)malloc(kDataSize * sizeof(float));
	if (data == NULL) {
		fprintf(stderr, "Error: could not allocate %u bytes of host memory for data\n", kDataSize * sizeof(float));
		return EXIT_FAILURE;
	}
	float *results = (float *)malloc(kDataSize * sizeof(float));
	if (results == NULL) {
		fprintf(stderr, "Error: could not allocate %u bytes of host memory for results\n", kDataSize * sizeof(float));
		return EXIT_FAILURE;
	}
	size_t correct;
	cl_mem input;
	cl_mem output;

	size_t count = kDataSize;
	size_t i = 0;
	for(; i < count; i++) {
		data[i] = rand() / (float)RAND_MAX;
	}

	input = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
	output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
	if (!input || !output) {
		fprintf(stderr, "Error: Failed to allocate device memory\n");
		return EXIT_FAILURE;
	}

	err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to write to source array: %d\n", err);
		return EXIT_FAILURE;
	}

	err = 0;
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
	err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to set kernel arguments: %d\n", err);
		return EXIT_FAILURE;
	}

	err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to retrieve kernel work group info: %d\n", err);
		return EXIT_FAILURE;
	}

	global = count;
	local /= 2; /* TODO look into why retrieved value doesn't just work */
	err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
	if (err) {
		fprintf(stderr, "Error: Failed to execute kernel: %d\n", err);
		return EXIT_FAILURE;
	}

	err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Error: Failed to read output array: %d\n", err);
		return EXIT_FAILURE;
	}
	clFinish(commands);

	correct = 0;
	for(i = 0; i < count; i++) {
		float expected = data[i] * data[i];
		if (fabs(results[i] - expected) < kEpsilon) {
			++correct;
		}
	}

	printf("Computed %u / %u correct values\n", correct, count);

	free(data);
	free(results);

	clReleaseMemObject(input);
	clReleaseMemObject(output);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}

Ok. Almost done. Now comes the magical compiler invocation. This is a debug build, assuming that your standalone toolchain is in /opt/android-ndk-standalone, and the headers and libraries are in /opt/nexus4-opencl. I admit that I’m not sure all of these options are necessary. You may want to play with them to see if you can come up with something better.:

/opt/android-ndk-standalone/bin/clang31++ -target armv7-none-linux-androideabi -O0 -g -DANDROID \
-DUSE_STD_NAMESPACE -include ctype.h -include unistd.h -fno-rtti -Wno-psabi -march=armv7-a \
-mfloat-abi=softfp -mfpu=neon -fpic -fsigned-char -D__ARM_ARCH_5__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5E__ \
-D__ARM_ARCH_5TE__ -D__ARM_ARCH_7__ -D__ARM_ARCH_7A__ -fdata-sections -funwind-tables -fstack-protector \
-ffunction-sections -W -Wall -Werror=address -Werror=sequence-point -Wformat -Wundef -Winit-self \
-Wpointer-arith -Wshadow -Wno-narrowing -fdiagnostics-show-option -fPIC -Wno-attributes \
-Wno-strict-prototypes -Wno-missing-prototypes -Wno-missing-declarations -fno-strict-aliasing \
-fno-omit-frame-pointer -I/opt/nexus4-opencl/include -march=armv7-a -Wl,--fix-cortex-a8 -lOpenCL \
-L/opt/nexus4-opencl/lib -o squares squares.cpp

Now you’ll have a squares binary sitting in your working directory. You’ll need to put it somewhere on the phone that can execute binaries. If you’re just building a library for your app, you should be fine adding this to your usual Android.mk with the include and library directories added. Otherwise, to run this at the command line, you’ll need to have a rooted phone. Assuming you do, you can put this anywhere in the /data directory. Then, shell into your phone and run it:

./squares
Computed 1000000 / 1000000 correct values

And that does it!

One thought on “How I built an OpenCL binary for the Nexus 4

  1. Pingback: Computational physics on the smartphone GPU | Trespassing allowed: from the classical to the quantum world (and back again)

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>