__shared__ variable behaving oddly in CUDA
Consider the code below, which runs in under 9 us on a K20:
__global__ void histogram( unsigned char *inputpointer, int *outputpointer)
{
    __shared__ unsigned char localdispersedhistogram[256][32];
    __shared__ unsigned int parthist[256];

    int tx = threadIdx.x;
    int pixeloffset = (blockIdx.x * blockDim.x) + threadIdx.x;

    uint8_t val = inputpointer[pixeloffset];
    uint8_t data = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localdispersedhistogram[tx][tx % 32] = 0;
    __syncthreads();

    turn_on(localdispersedhistogram[tx][data], position);
    __syncthreads();

    parthist[tx] = 0;
    int k = 0;
    for (int i = 0; i < 256; i++)
    {
        k++;
    }
}
Now the code below takes 72 us because of the shared-variable access:
__global__ void histogram( unsigned char *inputpointer, int *outputpointer)
{
    __shared__ unsigned char localdispersedhistogram[256][32];
    __shared__ unsigned int parthist[256];

    int tx = threadIdx.x;
    int pixeloffset = (blockIdx.x * blockDim.x) + threadIdx.x;

    uint8_t val = inputpointer[pixeloffset];
    uint8_t data = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localdispersedhistogram[tx][tx % 32] = 0;
    __syncthreads();

    turn_on(localdispersedhistogram[tx][data], position);
    __syncthreads();

    parthist[tx] = 0;
    for (int i = 0; i < 256; i++)
    {
        parthist[tx]++;
    }
}
Why should shared access make such a huge difference? I understand that shared access is more expensive than register access, but in the above code the line
turn_on(localdispersedhistogram[tx][data], position);
is also using a shared variable, so how come manipulating localdispersedhistogram takes less time, while the parthist access takes a crazy amount of time?
help.
Update: apologies, here are the missing details.
My kernel configuration is <<< 256, 256 >>>.
Full code:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <stdint.h> #include <conio.h> #define size_of_output_array 256 * 256 * 256 #define size_of_input_array 256 * 256 #define turn_on(data,position) (data|=(1<<(position))) __global__ void histogram( unsigned char *inputpointer, int *outputpointer) { #if 1 __shared__ unsigned char localdispersedhistogram[ 256 ] [ 32 ]; __shared__ long long parthist[ 256 ] ; int ; int tx = threadidx.x; int pixeloffset = (blockidx.x * blockdim.x) + threadidx.x; uint8_t val = inputpointer[ pixeloffset ]; uint8_t data = val/ 8 ; uint8_t position = val % 8 ; ( int j = 0 ; j < 32 ; j++) { localdispersedhistogram[ tx ] [ j ] = 0; } __syncthreads(); turn_on( localdispersedhistogram [ tx ] [ data ] , position ); __syncthreads(); //parthist[ tx ] = 0; int sum = 0 ; ( int = 0 ; < 256 ; ++ ) { sum += (localdispersedhistogram [ ] [ tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ; } parthist[ tx ] = sum; atomicadd( &outputpointer[ tx ] , parthist[ tx ] ); #endif } int main() { #if 1 printf(" code name, sad buddy 17 "); unsigned char *inputpointer = (unsigned char * ) malloc (size_of_input_array); ( int = 0 ; < size_of_input_array ; ++ ) { int t = rand() % 256 ; //int t = 0; inputpointer [ ] = t; } unsigned char *device_inputpointer; int *device_outputpointer; cudamalloc((void**)&device_inputpointer, size_of_input_array); cudamemcpy( device_inputpointer, inputpointer , size_of_input_array, cudamemcpyhosttodevice ); cudamalloc((void**)&device_outputpointer, 256 * sizeof ( int ) ); cudamemset(device_outputpointer,0,256 * sizeof ( int ) ); histogram <<< 256 , 256 >>> ( device_inputpointer , device_outputpointer ); unsigned int *output = ( unsigned int * )malloc ( 256 * sizeof( int )); cudamemcpy( output, device_outputpointer , 256 * sizeof( int ), cudamemcpydevicetohost ); unsigned int cpuhist [ 256 ] ; unsigned int gpuhist [ 256 ] ; ( int = 0 ; < 256 ;i ++ ) { cpuhist[ ] = 0; gpuhist [ ] = 0; //printf( " %d " , inputpointer[ ]); } ( int = 0 ; < size_of_input_array ; i++ ) { cpuhist[ inputpointer [ ] ] ++; } int flag = 0 ; ( int = 0 ; < 256 ;i ++ ) { printf(" %d gpuhist %d cpuhist\n" , output[ ] , cpuhist[i]); if (output[ ] != cpuhist[i] ) { flag = 1 ; } } printf("\n\n======================\n\n"); if ( flag ) { printf("test case fail "); } else { printf("test case pass"); } printf("\n\n======================\n\n"); cudadevicereset(); #endif getch(); return 0; }
Since you haven't posted both complete cases for comparison, I'm inferring the two cases based on your first posting and the update.
When you have code like this:
int sum = 0;
int k = 0;
for (int i = 0; i < 256; i++)
{
    k++;
}
parthist[tx] = sum;
atomicAdd(&outputpointer[tx], parthist[tx]);
(or if the k variable is replaced with sum, it does not matter) the compiler can figure out what will end up in parthist[tx] without running any of the previous code. Therefore it can optimize that previous code out (i.e. delete it) and still produce the same result, and so it does. The code that actually executes is therefore much shorter, and you get the ~9 us timing result.
On the other hand, when you have the code you posted:
int sum = 0;
for (int i = 0; i < 256; i++)
{
    sum += (localdispersedhistogram[i][tx / 8] & (1 << (tx % 8))) >> (tx % 8);
}
parthist[tx] = sum;
atomicAdd(&outputpointer[tx], parthist[tx]);
then this code depends on the preceding code to determine its result, so the compiler cannot optimize it out.
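To illustrate the effect in isolation, here is a minimal sketch (not your kernel; the kernel names and the in/sink pointers are made up for the example). The loop body is identical in both kernels, but in the first one its result is never observed, so the compiler is free to delete the loop as dead code, while in the second one the result is stored to global memory and the loop must actually run:

__global__ void dead_loop(const int *in, int *sink)
{
    int sum = 0;
    for (int i = 0; i < 256; i++)
        sum += in[i];               // sum is never used afterwards, so
                                    // the whole loop is dead code
    sink[threadIdx.x] = 0;          // store does not depend on sum
}

__global__ void live_loop(const int *in, int *sink)
{
    int sum = 0;
    for (int i = 0; i < 256; i++)
        sum += in[i];               // result depends on runtime data and
                                    // feeds a global store, so it stays
    sink[threadIdx.x] = sum;        // store keeps the loop alive
}

Timing the two launches should show the first kernel running markedly faster, even though the source differs only in what gets stored at the end.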
You can get additional confirmation of this by compiling with device optimizations disabled (nvcc -G ...), or else by using cuobjdump -sass mycode to dump out the generated assembly code in each case; you will discover a major difference in the kernel code at the assembly level, due to compiler optimization.
Whenever relatively small changes are made to code and huge changes in execution timing occur, you should suspect a compiler optimization side-effect.
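If you want to reproduce the kind of measurement quoted above (~9 us vs ~72 us), a minimal sketch of host-side timing with CUDA events might look like the following; kernel_under_test is just a placeholder so the sketch compiles on its own, and you would launch the kernel you actually want to measure instead:

#include <stdio.h>
#include "cuda_runtime.h"

// Placeholder kernel; substitute the kernel you want to time.
__global__ void kernel_under_test(int *out)
{
    out[threadIdx.x] = threadIdx.x;
}

int main()
{
    int *d_out;
    cudaMalloc((void**)&d_out, 256 * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    kernel_under_test <<< 256, 256 >>> (d_out);  // launch under test
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);                  // wait for the kernel to finish

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);      // elapsed time in milliseconds
    printf("kernel time: %.1f us\n", ms * 1000.0f);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    cudaDeviceReset();
    return 0;
}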