c++ - Problem with OpenCL kernel recompile slowing down program and possible memory issues because of that -


I'm new to OpenCL, and I'm running OS X 10.6 with an NVIDIA 330 graphics card. I'm working on a cloth simulation in C++, and I've managed to write a kernel that compiles and runs. The problem is that it's running slower than it did on the CPU without OpenCL. I believe the reason is that every time I call the update() method to do the calculations, I'm setting up the context and device and recompiling the kernel from source.

To solve this, I tried encapsulating the various OpenCL types needed in the cloth simulation class so that they are stored there, and I created initcl() to set these values. I then created runcl() to execute the kernel. Strangely, this gives me a memory problem when I separate the OpenCL code into two methods. It works fine if initcl() and runcl() are both combined into one method, though, which is why I'm a little stuck.

The program compiles and runs, but I then get a SIGABRT or EXC_BAD_ACCESS at the point marked in the runcl() code. When I get the SIGABRT, the error is CL_INVALID_COMMAND_QUEUE, but I can't work out for the life of me why this happens only when I split the code into two methods. The SIGABRT is expected when the assertion fails, but at other times I get a bad memory access error when trying to write to the buffer.

Also, if anyone can tell me a better way (or the right way) to do this, or if JIT recompiling isn't what's slowing my code down, I'd be grateful, because I've been staring at this for far too long!

thanks,

jon

The initialisation of the OpenCL variables is done by this code:

int vpesimulationcloth::initcl(){    // find cpu cl device, fallback    err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null);    assert(err == cl_success);     // find gpu cl device, want // if there no gpu device cl capable, fall cpu   err = clgetdeviceids(null, cl_device_type_gpu, 1, &device, null); if (err != cl_success) err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null); assert(device);  // information returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clgetdeviceinfo(device, cl_device_vendor, sizeof(vendor_name),                  vendor_name, &returned_size); err |= clgetdeviceinfo(device, cl_device_name, sizeof(device_name),                   device_name, &returned_size); assert(err == cl_success); //printf("connecting %s %s...\n", vendor_name, device_name);  // create context perform our calculation  // specified device  context = clcreatecontext(0, 1, &device, null, null, &err); assert(err == cl_success);  // , command queue context cmd_queue = clcreatecommandqueue(context, device, 0, null);  // load program source disk // kernel/program should in resource directory const char * filename = "clothsimkernel.cl"; char *program_source = load_program_source(filename);   program[0] = clcreateprogramwithsource(context, 1, (const char**)&program_source,                              null, &err); if (!program[0]) {    printf("error: failed create compute program!\n");    return exit_failure; } assert(err == cl_success);  err = clbuildprogram(program[0], 0, null, null, null, null); if (err != cl_success) {    char build[2048];    clgetprogrambuildinfo(program[0], device, cl_program_build_log, 2048, build, null);    printf("build log:\n%s\n",build);    if (err == cl_build_program_failure) {       printf("cl_build_program_failure\n");    } } if (err != cl_success) {    cout<<geterrordesc(err)<<endl; } assert(err == cl_success); //writebinaries(); // create kernel "objects" want use in example file  kernel[0] 
= clcreatekernel(program[0], "clothsimulation", &err);  } 

The method that executes the kernel is this code:

int vpesimulationcloth::runcl(){  // find gpu cl device, want // if there no gpu device cl capable, fall cpu err = clgetdeviceids(null, cl_device_type_gpu, 1, &device, null); if (err != cl_success) err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null); assert(device);  // information returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clgetdeviceinfo(device, cl_device_vendor, sizeof(vendor_name),                  vendor_name, &returned_size); err |= clgetdeviceinfo(device, cl_device_name, sizeof(device_name),                   device_name, &returned_size); assert(err == cl_success); //printf("connecting %s %s...\n", vendor_name, device_name);  // create context perform our calculation  // specified device   //cmd_queue = clcreatecommandqueue(context, device, 0, null); //memory allocation cl_mem nowpos_mem, prevpos_mem, rforce_mem, mass_mem, passive_mem,    canmove_mem,numpart_mem, theforces_mem, numforces_mem, drag_mem, answerpos_mem;  // allocate memory on device hold our data , store results buffer_size = sizeof(float4) * numparts;  // input arrays  //------------------------------------ // error occurs nowpos_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, nowpos_mem, cl_true, 0, buffer_size,                     (void*)nowpos, 0, null, null); if (err != cl_success) {   cout<<geterrordesc(err)<<endl; } assert(err == cl_success); //------------------------------------ prevpos_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, prevpos_mem, cl_true, 0, buffer_size,                     (void*)prevpos, 0, null, null); assert(err == cl_success); rforce_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, rforce_mem, cl_true, 0, buffer_size,                     (void*)rforce, 0, null, null); assert(err == cl_success); mass_mem = 
clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, mass_mem, cl_true, 0, buffer_size,                     (void*)mass, 0, null, null); assert(err == cl_success); answerpos_mem = clcreatebuffer(context, cl_mem_read_write, buffer_size, null, null); //uint buffer buffer_size = sizeof(uint) * numparts; passive_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, passive_mem, cl_true, 0, buffer_size,                     (void*)passive, 0, null, null); assert(err == cl_success); canmove_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, canmove_mem, cl_true, 0, buffer_size,                     (void*)canmove, 0, null, null); assert(err == cl_success);  buffer_size = sizeof(float4) * numforces; theforces_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, theforces_mem, cl_true, 0, buffer_size,                     (void*)theforces, 0, null, null); assert(err == cl_success);  //drag float buffer_size = sizeof(float); drag_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err |= clenqueuewritebuffer(cmd_queue, drag_mem, cl_true, 0, buffer_size,                     (void*)drag, 0, null, null); assert(err == cl_success);  // setup arguments our kernel err  = clsetkernelarg(kernel[0],  0, sizeof(cl_mem), &nowpos_mem); err |= clsetkernelarg(kernel[0],  1, sizeof(cl_mem), &prevpos_mem); err |= clsetkernelarg(kernel[0],  2, sizeof(cl_mem), &rforce_mem); err |= clsetkernelarg(kernel[0],  3, sizeof(cl_mem), &mass_mem); err |= clsetkernelarg(kernel[0],  4, sizeof(cl_mem), &passive_mem); err |= clsetkernelarg(kernel[0],  5, sizeof(cl_mem), &canmove_mem); err |= clsetkernelarg(kernel[0],  6, sizeof(cl_mem), &numparts); err |= clsetkernelarg(kernel[0],  7, sizeof(cl_mem), &theforces_mem); err |= clsetkernelarg(kernel[0],  8, 
sizeof(cl_mem), &numforces); err |= clsetkernelarg(kernel[0],  9, sizeof(cl_mem), &drag_mem); err |= clsetkernelarg(kernel[0],  10, sizeof(cl_mem), &answerpos_mem); if (err != cl_success) {    cout<<geterrordesc(err)<<endl; } assert(err == cl_success); // run calculation enqueuing , forcing  // command queue complete task size_t global_work_size = numparts; size_t local_work_size = global_work_size/8; err = clenqueuendrangekernel(cmd_queue, kernel[0], 1, null,                       &global_work_size, &local_work_size, 0, null, null); if (err != cl_success) {    cout<<geterrordesc(err)<<endl; }  assert(err == cl_success); //clfinish(cmd_queue);  // once finished read results answer  // array results array //reset buffer first buffer_size = sizeof(float4) * numparts; err = clenqueuereadbuffer(cmd_queue, answerpos_mem, cl_true, 0, buffer_size,                     answerpos, 0, null, null); if (err != cl_success) {    cout<<geterrordesc(err)<<endl; }   //cl mem clreleasememobject(nowpos_mem); clreleasememobject(prevpos_mem); clreleasememobject(rforce_mem); clreleasememobject(mass_mem); clreleasememobject(passive_mem); clreleasememobject(canmove_mem); clreleasememobject(theforces_mem); clreleasememobject(drag_mem); clreleasememobject(answerpos_mem); clreleasecommandqueue(cmd_queue); clreleasecontext(context); assert(err == cl_success); return err;  } 

Problem solved! At the bottom of the runcl() method I was "freeing" the CL types. I thought I was only freeing the cl_mem objects, but on closer inspection I was also freeing the context etc. An obvious and annoying mistake :).

Thanks to andrew.brownsword on the Khronos forums for spotting that one.


Comments

Popular posts from this blog

asp.net - repeatedly call AddImageUrl(url) to assemble pdf document -

java - Android recognize cell phone with keyboard or not? -

iphone - How would you achieve a LED Scrolling effect? -