c++ - Problem with OpenCL kernel recompile slowing down program and possible memory issues because of that -
I'm new to OpenCL; I'm running OS X 10.6 with an NVIDIA 330 graphics card. I'm working on a cloth simulation in C++, and I've managed to write a kernel that compiles and runs. The problem is that it's running slower than it did on the CPU without OpenCL. I believe the reason is that every time I call the update() method to do the calculations, I'm setting up the context and device and recompiling the kernel from source.
To solve this, I tried encapsulating the various OpenCL types needed by the cloth simulation in the class so that I could store them there, and created an initcl() method to set these values up. I then created a runcl() method to execute the kernel. Strangely, this gives me a memory problem when I separate the OpenCL stuff into two methods. It works fine if initcl() and runcl() are both combined into one method, though, which is why I'm a little stuck.
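The program compiles and runs, but I then get a SIGABRT or EXC_BAD_ACCESS at the point marked in the runcl() code. When I get the SIGABRT, the error is CL_INVALID_COMMAND_QUEUE, but I can't work out for the life of me why this happens only when I split the code into two methods. The SIGABRT occurs when the assertion fails, which is expected, but at other times I just get a bad memory access error when trying to write to the buffer.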
Also, if anyone can tell me a better way (or the right way) of doing this, or if the JIT recompiling isn't what's slowing my code down, I'd be grateful, because I've been staring at this for far too long!
thanks,
jon
the initialisation of opencl variables code:
int vpesimulationcloth::initcl(){ // find cpu cl device, fallback err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null); assert(err == cl_success); // find gpu cl device, want // if there no gpu device cl capable, fall cpu err = clgetdeviceids(null, cl_device_type_gpu, 1, &device, null); if (err != cl_success) err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null); assert(device); // information returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clgetdeviceinfo(device, cl_device_vendor, sizeof(vendor_name), vendor_name, &returned_size); err |= clgetdeviceinfo(device, cl_device_name, sizeof(device_name), device_name, &returned_size); assert(err == cl_success); //printf("connecting %s %s...\n", vendor_name, device_name); // create context perform our calculation // specified device context = clcreatecontext(0, 1, &device, null, null, &err); assert(err == cl_success); // , command queue context cmd_queue = clcreatecommandqueue(context, device, 0, null); // load program source disk // kernel/program should in resource directory const char * filename = "clothsimkernel.cl"; char *program_source = load_program_source(filename); program[0] = clcreateprogramwithsource(context, 1, (const char**)&program_source, null, &err); if (!program[0]) { printf("error: failed create compute program!\n"); return exit_failure; } assert(err == cl_success); err = clbuildprogram(program[0], 0, null, null, null, null); if (err != cl_success) { char build[2048]; clgetprogrambuildinfo(program[0], device, cl_program_build_log, 2048, build, null); printf("build log:\n%s\n",build); if (err == cl_build_program_failure) { printf("cl_build_program_failure\n"); } } if (err != cl_success) { cout<<geterrordesc(err)<<endl; } assert(err == cl_success); //writebinaries(); // create kernel "objects" want use in example file kernel[0] = clcreatekernel(program[0], "clothsimulation", &err); }
the method execute kernel code:
int vpesimulationcloth::runcl(){ // find gpu cl device, want // if there no gpu device cl capable, fall cpu err = clgetdeviceids(null, cl_device_type_gpu, 1, &device, null); if (err != cl_success) err = clgetdeviceids(null, cl_device_type_cpu, 1, &device, null); assert(device); // information returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clgetdeviceinfo(device, cl_device_vendor, sizeof(vendor_name), vendor_name, &returned_size); err |= clgetdeviceinfo(device, cl_device_name, sizeof(device_name), device_name, &returned_size); assert(err == cl_success); //printf("connecting %s %s...\n", vendor_name, device_name); // create context perform our calculation // specified device //cmd_queue = clcreatecommandqueue(context, device, 0, null); //memory allocation cl_mem nowpos_mem, prevpos_mem, rforce_mem, mass_mem, passive_mem, canmove_mem,numpart_mem, theforces_mem, numforces_mem, drag_mem, answerpos_mem; // allocate memory on device hold our data , store results buffer_size = sizeof(float4) * numparts; // input arrays //------------------------------------ // error occurs nowpos_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, nowpos_mem, cl_true, 0, buffer_size, (void*)nowpos, 0, null, null); if (err != cl_success) { cout<<geterrordesc(err)<<endl; } assert(err == cl_success); //------------------------------------ prevpos_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, prevpos_mem, cl_true, 0, buffer_size, (void*)prevpos, 0, null, null); assert(err == cl_success); rforce_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, rforce_mem, cl_true, 0, buffer_size, (void*)rforce, 0, null, null); assert(err == cl_success); mass_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, mass_mem, 
cl_true, 0, buffer_size, (void*)mass, 0, null, null); assert(err == cl_success); answerpos_mem = clcreatebuffer(context, cl_mem_read_write, buffer_size, null, null); //uint buffer buffer_size = sizeof(uint) * numparts; passive_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, passive_mem, cl_true, 0, buffer_size, (void*)passive, 0, null, null); assert(err == cl_success); canmove_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, canmove_mem, cl_true, 0, buffer_size, (void*)canmove, 0, null, null); assert(err == cl_success); buffer_size = sizeof(float4) * numforces; theforces_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err = clenqueuewritebuffer(cmd_queue, theforces_mem, cl_true, 0, buffer_size, (void*)theforces, 0, null, null); assert(err == cl_success); //drag float buffer_size = sizeof(float); drag_mem = clcreatebuffer(context, cl_mem_read_only, buffer_size, null, null); err |= clenqueuewritebuffer(cmd_queue, drag_mem, cl_true, 0, buffer_size, (void*)drag, 0, null, null); assert(err == cl_success); // setup arguments our kernel err = clsetkernelarg(kernel[0], 0, sizeof(cl_mem), &nowpos_mem); err |= clsetkernelarg(kernel[0], 1, sizeof(cl_mem), &prevpos_mem); err |= clsetkernelarg(kernel[0], 2, sizeof(cl_mem), &rforce_mem); err |= clsetkernelarg(kernel[0], 3, sizeof(cl_mem), &mass_mem); err |= clsetkernelarg(kernel[0], 4, sizeof(cl_mem), &passive_mem); err |= clsetkernelarg(kernel[0], 5, sizeof(cl_mem), &canmove_mem); err |= clsetkernelarg(kernel[0], 6, sizeof(cl_mem), &numparts); err |= clsetkernelarg(kernel[0], 7, sizeof(cl_mem), &theforces_mem); err |= clsetkernelarg(kernel[0], 8, sizeof(cl_mem), &numforces); err |= clsetkernelarg(kernel[0], 9, sizeof(cl_mem), &drag_mem); err |= clsetkernelarg(kernel[0], 10, sizeof(cl_mem), &answerpos_mem); if (err != cl_success) { cout<<geterrordesc(err)<<endl; } assert(err == 
cl_success); // run calculation enqueuing , forcing // command queue complete task size_t global_work_size = numparts; size_t local_work_size = global_work_size/8; err = clenqueuendrangekernel(cmd_queue, kernel[0], 1, null, &global_work_size, &local_work_size, 0, null, null); if (err != cl_success) { cout<<geterrordesc(err)<<endl; } assert(err == cl_success); //clfinish(cmd_queue); // once finished read results answer // array results array //reset buffer first buffer_size = sizeof(float4) * numparts; err = clenqueuereadbuffer(cmd_queue, answerpos_mem, cl_true, 0, buffer_size, answerpos, 0, null, null); if (err != cl_success) { cout<<geterrordesc(err)<<endl; } //cl mem clreleasememobject(nowpos_mem); clreleasememobject(prevpos_mem); clreleasememobject(rforce_mem); clreleasememobject(mass_mem); clreleasememobject(passive_mem); clreleasememobject(canmove_mem); clreleasememobject(theforces_mem); clreleasememobject(drag_mem); clreleasememobject(answerpos_mem); clreleasecommandqueue(cmd_queue); clreleasecontext(context); assert(err == cl_success); return err; }
Problem solved! At the bottom of the runcl() method I was "freeing" the CL types. I thought I was only freeing the cl_mem objects, but on closer inspection I was also freeing the context and the command queue. An obvious and annoying mistake :).
Thanks to andrew.brownsword on the Khronos forums for spotting that one.
Comments
Post a Comment