__shared__ variable behaving oddly in CUDA


Consider the code below, which runs in under 9us on a K20:

__global__ void histogram( unsigned char *inputpointer, int *outputpointer)
{
    __shared__ unsigned char localdispersedhistogram[256][32];
    __shared__ unsigned int parthist[256];

    int tx = threadIdx.x;
    int pixeloffset = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint8_t val = inputpointer[pixeloffset];

    uint8_t data     = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localdispersedhistogram[tx][tx % 32] = 0;

    __syncthreads();

    turn_on( localdispersedhistogram[tx][data], position );

    __syncthreads();

    parthist[tx] = 0;

    int k = 0;
    for (int i = 0; i < 256; i++) {
        k++;
    }
}

Now the code below takes 72us, just from the access to the shared variable:

__global__ void histogram( unsigned char *inputpointer, int *outputpointer)
{
    __shared__ unsigned char localdispersedhistogram[256][32];
    __shared__ unsigned int parthist[256];

    int tx = threadIdx.x;
    int pixeloffset = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint8_t val = inputpointer[pixeloffset];

    uint8_t data     = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localdispersedhistogram[tx][tx % 32] = 0;

    __syncthreads();

    turn_on( localdispersedhistogram[tx][data], position );

    __syncthreads();

    parthist[tx] = 0;

    for (int i = 0; i < 256; i++) {
        parthist[tx]++;
    }
}

Why should shared access make such a huge difference? I understand that shared access is more expensive than register access, but in the code above, the line

turn_on( localdispersedhistogram[tx][data], position );

is also using a shared variable, so how come the manipulation of localdispersedhistogram takes so little time while the parthist access takes a crazy amount of time?

Please help.

Update: my apologies, here are the details:

My kernel configuration is <<< 256, 256 >>>.

Full code:

#include "cuda_runtime.h" #include "device_launch_parameters.h"  #include <stdio.h> #include <stdlib.h> #include <string.h> #include <stdint.h> #include <conio.h>   #define size_of_output_array 256 * 256 * 256 #define size_of_input_array 256 * 256   #define turn_on(data,position) (data|=(1<<(position)))   __global__ void histogram( unsigned char *inputpointer, int *outputpointer) {     #if 1        __shared__ unsigned char localdispersedhistogram[ 256  ] [ 32 ];     __shared__ long long parthist[ 256 ] ;      int ;     int tx = threadidx.x;     int pixeloffset = (blockidx.x * blockdim.x) +  threadidx.x;     uint8_t val = inputpointer[ pixeloffset ];      uint8_t  data = val/ 8 ;     uint8_t  position = val % 8 ;       ( int j =  0 ; j < 32 ; j++) {         localdispersedhistogram[ tx ] [ j ] = 0;     }      __syncthreads();      turn_on( localdispersedhistogram [  tx ] [  data ] , position );       __syncthreads();      //parthist[ tx ] = 0;      int sum = 0 ;       ( int = 0 ; < 256 ; ++ ) {         sum += (localdispersedhistogram [  ] [  tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ;     }     parthist[ tx ] = sum;      atomicadd( &outputpointer[ tx ] , parthist[ tx ]  );      #endif  }    int main() {      #if 1     printf(" code name, sad buddy 17 ");      unsigned char *inputpointer = (unsigned char * ) malloc (size_of_input_array);      ( int = 0 ; < size_of_input_array ; ++ ) {          int t = rand() % 256 ;          //int t = 0;          inputpointer [ ]   = t;      }      unsigned char *device_inputpointer;     int  *device_outputpointer;      cudamalloc((void**)&device_inputpointer, size_of_input_array);     cudamemcpy( device_inputpointer,  inputpointer , size_of_input_array, cudamemcpyhosttodevice );     cudamalloc((void**)&device_outputpointer, 256 * sizeof ( int ) );     cudamemset(device_outputpointer,0,256 * sizeof ( int ) );      histogram <<< 256 , 256 >>>  ( device_inputpointer , device_outputpointer  );       unsigned int *output = ( unsigned int * )malloc ( 256 * sizeof( int ));      cudamemcpy( output,  device_outputpointer , 256 * sizeof( int ), cudamemcpydevicetohost );          unsigned int cpuhist [ 256 ]  ;     unsigned int gpuhist [ 256 ]  ;      ( int = 0 ; < 256 ;i ++ ) {         cpuhist[ ] = 0;         gpuhist [ ] = 0;          //printf( " %d " ,  inputpointer[ ]);     }       ( int = 0 ; < size_of_input_array ; i++ ) {         cpuhist[ inputpointer [ ] ] ++;     }          int flag = 0 ;     ( int = 0 ; < 256 ;i ++ ) {         printf(" %d  gpuhist  %d cpuhist\n" , output[ ] , cpuhist[i]);          if (output[ ] != cpuhist[i]  ) {             flag = 1 ;         }     }      printf("\n\n======================\n\n");      if ( flag ) {         printf("test case fail ");     }     else {         printf("test case pass");     }      printf("\n\n======================\n\n");       cudadevicereset();      #endif     getch();     return 0; } 

Since you haven't posted both complete cases for comparison, I'm inferring the two cases based on your first posting and your update.

When you have code like this:

int sum = 0;

int k = 0;
for (int i = 0; i < 256; i++) {
    k++;
}

parthist[tx] = sum;

atomicAdd( &outputpointer[tx], parthist[tx] );

(or even if the k variable is replaced with sum, it doesn't matter), the compiler can figure out what will end up in parthist[tx] without actually running any of the preceding code. It can therefore optimize that code out (i.e. delete it) and still produce the same result, and so it does. The code execution is therefore much shorter, and you get the ~9us timing result.
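
One quick way to confirm this (a hypothetical experiment, not part of the posted code) is to declare the dummy counter volatile in the "fast" kernel. The compiler must then actually perform every increment, so it can neither delete nor constant-fold the loop, and the fast case should lose most of its timing advantage:

volatile int k = 0;              // volatile: every k++ must actually be performed
for (int i = 0; i < 256; i++) {
    k++;                         // the loop can no longer be deleted or folded away
}
parthist[tx] = sum;              // still stores 0, but the loop above now really executes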

On the other hand, with the code you posted:

int sum = 0;

for (int i = 0; i < 256; i++) {
    sum += (localdispersedhistogram[i][tx / 8] & (1 << (tx % 8))) >> (tx % 8);
}
parthist[tx] = sum;

atomicAdd( &outputpointer[tx], parthist[tx] );

the final result depends on the preceding code to determine its value, and so the compiler cannot optimize it out.

You can get additional confirmation of this by compiling with no optimization (nvcc -G ...), or else by using cuobjdump -sass mycode to dump out the generated assembly code in each case; you will discover a major difference in the kernel code at the assembly level, due to the compiler optimization.

Whenever relatively small changes are made to code and huge changes in execution timing occur, you should suspect a compiler optimization side-effect.
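
As a side note, here is a minimal sketch (not from the original post) of timing the kernel with CUDA events, which is a common way to obtain figures like the 9us/72us numbers compared above; it reuses the kernel launch and device pointers from the question's main():

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
histogram <<< 256, 256 >>> ( device_inputpointer, device_outputpointer );
cudaEventRecord(stop);

cudaEventSynchronize(stop);               // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);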

