Lesson of 4 March 2011 (part b)

// g++ -O3 -msse2 mask.cc TimeMeter.cc
// Elapsed 8:9 ms

#include <iostream>
#include <iomanip>
#include <cstdlib> // near equivalent to <stdlib.h>  for exit, rand, ...
#include <cmath>   // near equivalent to <math.h>    for sin, cos, log, ..

#include <pthread.h>

#include "TimeMeter.hh"
//#include "pstdint.h"
#include <stdint.h>

using namespace std ;

// Generic pseudo-random value generator.
// Draws two values from rand(), converts each one to U and returns their
// difference (for unsigned U the subtraction wraps around modulo 2^bits).
template <typename U>
inline
U random() {
  return static_cast<U>( rand() ) - static_cast<U>( rand() ) ;
}

/*
//
//  +-----------+
//  | 1 | 2 | 3 |
//  +-----------+
//  | 4 | 5 | 6 | x mat
//  +-----------+
//  | 7 | 8 | 9 |
//  +-----------+
//
//
//
*/

// Apply a 3x3 mask, in place, to the interior of a row-major matrix.
//
//   matrix  pointer to element (0,0); row i starts at matrix + i*dimCol
//   dimCol  allocated length of a row (stride between consecutive rows)
//   Nrow    number of rows to process
//   Ncol    number of columns to process (Ncol <= dimCol is expected)
//   mask    3x3 weights; mask[1][1] multiplies the centre element
//
// Every element (i,j) with 1 <= i < Nrow-1 and 1 <= j < Ncol-1 is replaced by
// the weighted sum of its 3x3 neighbourhood, computed from the ORIGINAL
// values.  The first/last row and the first/last column are left untouched.
// Arithmetic is done in UNSIGNED, so it wraps on overflow.
//
// Two row buffers of Ncol elements are taken from the stack with alloca(),
// so Ncol must be small enough for the available stack space.
template <typename UNSIGNED>
void
applyMask( UNSIGNED * matrix, unsigned dimCol, unsigned Nrow, unsigned Ncol, UNSIGNED mask[3][3] ) {
  // without at least a 3x3 area there is no interior element; returning here
  // also avoids the unsigned wrap-around of Nrow-1 / Ncol-1 below
  if ( Nrow < 3 || Ncol < 3 ) return ;

  // prevRow : original (unfiltered) content of row i-1
  // currRow : original content of row i, saved before it is overwritten
  UNSIGNED * prevRow = (UNSIGNED *)alloca( 2 * Ncol * sizeof(UNSIGNED) ) ;
  UNSIGNED * currRow = prevRow + Ncol ;

  // save the first row
  std::copy( matrix, matrix + Ncol, prevRow ) ;

  // loop over the interior rows
  for ( unsigned i = 1 ; i < Nrow-1 ; ++i ) {
    UNSIGNED * ri  = matrix + i*dimCol ; // row i   (overwritten below)
    UNSIGNED * ri1 = ri + dimCol ;       // row i+1 (still unmodified)
    std::copy( ri, ri + Ncol, currRow ) ;
    for ( unsigned j = 1 ; j < Ncol-1 ; ++j ) {
      ri[j] = mask[0][0] * prevRow[j-1] + mask[0][1] * prevRow[j] + mask[0][2] * prevRow[j+1] +
              mask[1][0] * currRow[j-1] + mask[1][1] * currRow[j] + mask[1][2] * currRow[j+1] +
              mask[2][0] * ri1[j-1]     + mask[2][1] * ri1[j]     + mask[2][2] * ri1[j+1] ;
    }
    // the saved row i becomes the "previous row" of the next iteration;
    // swapping the two pointers avoids copying the buffer
    UNSIGNED * tmp = prevRow ;
    prevRow = currRow ;
    currRow = tmp ;
  }
}

// Element type of the matrix and of the mask used in main().
// Exactly one typedef is active; comment/uncomment to try another width.
//typedef uint8_t BITS ;
//typedef uint16_t BITS ;
typedef uint32_t BITS ;
//typedef uint64_t BITS ;

int
main(int argc, char *argv) {

  TimeMeter tm ;

  unsigned const dimCol = 2000 ;
  unsigned const Nrow   = 1000 ;
  unsigned const Ncol   = 1000 ;

  BITS * matrix = new BITS[dimCol*Nrow] ;
  BITS mask[3][3] = {  1,   2,   4,
                       8,  16,  32,
                      64, 128, 256  } ;

  for ( unsigned i = 0 ; i < dimCol*Nrow ; ++i ) matrix[i] = random<BITS>() ;
  
  tm . start() ;
  
  for ( unsigned i = 0 ; i < 10 ; ++i ) applyMask<BITS>( matrix, dimCol, Nrow, Ncol, mask ) ;
  double elapsed = tm . milliseconds() ;
  cout << "Elapsed " << elapsed/10 << "ms\n" ;

  //pthread_exit(NULL) ;
  return 0 ;
}

#ifndef IMAGE_CONVOLUTION_HH
#define IMAGE_CONVOLUTION_HH

// Table-driven filter applied in place to a bit-packed image.
//
//   matrix       image data; column i starts at matrix + i*dimRowBlock
//   dimRowBlock  distance, in UNSIGNED elements, between two consecutive columns
//   numRowBlock  number of UNSIGNED elements of each column that are processed
//   numCols      number of columns; only columns 1 .. numCols-2 are rewritten
//   compressedTableConvolution
//                bit-packed lookup table (32 one-bit entries per uint32_t)
//
// Only the declaration is given here; the definition and its explicit
// instantiations are expected in a separate source file.
template <typename UNSIGNED>
void
imageConvolution( UNSIGNED *     matrix,
                  unsigned       dimRowBlock,
                  unsigned       numRowBlock,
                  unsigned       numCols,
                  uint32_t const compressedTableConvolution[] ) ;

#endif

#include <iostream>
#include <iomanip>

#include "pstdint.h"
//#include <stdint.h>

using namespace std ;

/*
//
//  Given the bits
//  A = x x x x x x x
//  B = x x x x x x x
//  C = x x x x x x x
//
//  extract 3 x 3 bit mask and ...
*/

// Read one entry of the bit-packed table.
// Entry number `tn` is stored in bit (tn mod 32) of word (tn div 32);
// the function returns true when that bit is set.
// The template parameter does not take part in the computation.
template <typename UNSIGNED>
inline
bool
useTable( uint32_t const compressedTableConvolution[], unsigned tn ) {
  unsigned const wordIndex = tn / 32u ;
  unsigned const bitIndex  = tn % 32u ;
  uint32_t const selector  = static_cast<uint32_t>(1) << bitIndex ;
  return ( compressedTableConvolution[wordIndex] & selector ) != 0 ;
}

// Filter one word of the middle row B using the rows A (before) and C (after).
//
// For every bit position p a 9-bit table index is built:
//
//   bits 0..2 : row A, positions p-1, p, p+1
//   bits 3..5 : row B, positions p-1, p, p+1
//   bits 6..8 : row C, positions p-1, p, p+1
//
// and bit p of the result is set or cleared according to the table entry.
//
// Position -1 of a word is the most significant bit of its left neighbour
// (AL, BL, CL); position numBits is the least significant bit of its right
// neighbour (AR, BR, CR).
//
//   do_left_special   when true, bit 0 is computed using AL/BL/CL;
//                     when false, bit 0 of B is returned unchanged
//   do_right_special  when true, the top bit is computed using AR/BR/CR;
//                     when false, the top bit of B is returned unchanged
//
// Returns the filtered word.
template <typename UNSIGNED>
inline
UNSIGNED
convolution( bool do_left_special,
             bool do_right_special,
             UNSIGNED AL, UNSIGNED A, UNSIGNED AR,
             UNSIGNED BL, UNSIGNED B, UNSIGNED BR,
             UNSIGNED CL, UNSIGNED C, UNSIGNED CR,
             uint32_t const compressedTableConvolution[] ) {

  unsigned const numBits = sizeof(UNSIGNED) * CHAR_BIT ;
  UNSIGNED const mask3bit = 0x07 ;
  UNSIGNED const mask2bit = 0x03 ;
  UNSIGNED const mask1bit = 0x01 ;

  UNSIGNED RES = B ; // bits that are not recomputed keep the value of B
  unsigned tn ;

  // special case first bit: the left neighbour is the top bit of AL/BL/CL
  UNSIGNED bit = 0x01 ;
  if ( do_left_special ) {
    tn = ((A & mask2bit)<<1) | (AL>>(numBits-1)) |
         ((B & mask2bit)<<4) | ((BL>>(numBits-1))<<3) |
         ((C & mask2bit)<<7) | ((CL>>(numBits-1))<<6) ;
    if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
    else                                                     RES &= ~bit ;
  }

  // interior bits: the three lowest bits of A, B, C are the neighbourhood of
  // bit i; shifting right by one moves on to the next position
  bit = 0x02 ;
  for ( unsigned i = 1 ; i < numBits - 1 ; ++i ) {
    tn = (A & mask3bit) | ((B & mask3bit)<<3) | ((C & mask3bit)<<6) ;
    if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
    else                                                     RES &= ~bit ;
    A   >>= 1 ;
    B   >>= 1 ;
    C   >>= 1 ;
    bit <<= 1 ;
  }

  // special case last bit: here A, B, C have been shifted numBits-2 times, so
  // their two lowest bits are positions numBits-2 and numBits-1, and `bit`
  // selects the top bit; the right neighbour is bit 0 of AR/BR/CR
  if ( do_right_special ) {
    tn = (A & mask2bit)      | ((AR & mask1bit)<<2) |
         ((B & mask2bit)<<3) | ((BR & mask1bit)<<5) |
         ((C & mask2bit)<<6) | ((CR & mask1bit)<<8) ;
    if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
    else                                                     RES &= ~bit ;
  }

  return RES ;
}

/*
//
// Do convolution for 3 column (or rows)
//
*/

// Filter a whole column (or row) stored as N consecutive words.
//
//   N     number of words in each of the three vectors
//   Avec  previous column (read only)
//   Bvec  column to filter, overwritten with the result
//   Cvec  next column (read only)
//   compressedTableConvolution  bit-packed lookup table
//
// Each word of Bvec is replaced by the single-word convolution of itself with
// its neighbours.  Because Bvec is overwritten while the loop advances, the
// original value of the word just processed is kept in Bsaved and used as the
// left neighbour of the next word.
// The very first bit of the vector has no left neighbour and the very last
// bit has no right neighbour: these two bits are left unchanged.
template <typename UNSIGNED>
inline
void
convolution( unsigned const N,
             UNSIGNED const Avec[],
             UNSIGNED       Bvec[],
             UNSIGNED const Cvec[],
             uint32_t const compressedTableConvolution[] ) {

  // nothing to do for an empty vector (also avoids the wrap-around of N-1)
  if ( N == 0 ) return ;

  // a single word has neither a left nor a right neighbour word
  if ( N == 1 ) {
    Bvec[0] = convolution<UNSIGNED>( false, false,
                                     0, Avec[0], 0,
                                     0, Bvec[0], 0,
                                     0, Cvec[0], 0,
                                     compressedTableConvolution ) ;
    return ;
  }

  // first word: no left neighbour
  UNSIGNED Bsaved = Bvec[0] ;
  Bvec[0] = convolution<UNSIGNED>( false, true,
                                   0, Avec[0], Avec[1],
                                   0, Bvec[0], Bvec[1],
                                   0, Cvec[0], Cvec[1],
                                   compressedTableConvolution ) ;

  // interior words: neighbours on both sides
  for ( unsigned i = 1 ; i < N-1 ; ++i ) {
    UNSIGNED Bsaved1 = Bvec[i] ; // original value, needed by the next word
    Bvec[i] = convolution<UNSIGNED>( true, true,
                                     Avec[i-1], Avec[i], Avec[i+1],
                                     Bsaved,    Bvec[i], Bvec[i+1],
                                     Cvec[i-1], Cvec[i], Cvec[i+1],
                                     compressedTableConvolution ) ;
    Bsaved = Bsaved1 ;
  }

  // last word: no right neighbour
  Bvec[N-1] = convolution<UNSIGNED>( true, false,
                                     Avec[N-2], Avec[N-1], 0,
                                     Bsaved,    Bvec[N-1], 0,
                                     Cvec[N-2], Cvec[N-1], 0,
                                     compressedTableConvolution ) ;
}

// Table-driven filter applied in place to a bit-packed image stored by columns.
//
//   matrix       image data; column i starts at matrix + i*dimRowBlock
//   dimRowBlock  distance, in UNSIGNED elements, between two consecutive columns
//   numRowBlock  number of UNSIGNED elements of each column that are processed
//   numCols      number of columns
//   compressedTableConvolution  bit-packed lookup table
//
// Columns 1 .. numCols-2 are rewritten; column 0 and column numCols-1 are only
// read.  Each column is filtered against the ORIGINAL content of the previous
// column, which is why a copy of it is kept before it gets overwritten.
//
// The two column buffers are taken from the stack with alloca(), so
// numRowBlock must be small enough for the available stack space.
template <typename UNSIGNED>
void
imageConvolution( UNSIGNED *     matrix,
                  unsigned       dimRowBlock,
                  unsigned       numRowBlock,
                  unsigned       numCols,
                  uint32_t const compressedTableConvolution[] ) {

  // fewer than 3 columns means no interior column; returning here also avoids
  // the unsigned wrap-around of numCols-1 in the loop below
  if ( numCols < 3 || numRowBlock == 0 ) return ;

  // csaved  : original content of column i-1
  // csaved1 : original content of column i, saved before it is overwritten
  UNSIGNED * csaved  = (UNSIGNED *)alloca( 2 * numRowBlock * sizeof(UNSIGNED) ) ;
  UNSIGNED * csaved1 = csaved + numRowBlock ;

  // save the first column (column 0): it is the left neighbour of column 1
  std::copy( matrix, matrix + numRowBlock, csaved ) ;

  // loop over the interior columns
  for ( unsigned i = 1 ; i < numCols-1 ; ++i ) {
    UNSIGNED * ci   = matrix + i*dimRowBlock ; // column i   (overwritten)
    UNSIGNED * cip1 = ci + dimRowBlock ;       // column i+1 (still unmodified)

    // save column i before filtering it
    std::copy( ci, ci + numRowBlock, csaved1 ) ;
    convolution<UNSIGNED>( numRowBlock, csaved, ci, cip1, compressedTableConvolution ) ;

    // the saved column i becomes the "previous column" of the next iteration;
    // swapping the two pointers avoids copying the buffer
    UNSIGNED * tmp = csaved ;
    csaved  = csaved1 ;
    csaved1 = tmp ;
  }
}

// Explicit instantiations of imageConvolution for the four fixed-width
// unsigned types.  The header only declares the template, so code that
// includes it has to link against one of these instantiations.
template void imageConvolution<uint8_t> ( uint8_t  *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
template void imageConvolution<uint16_t>( uint16_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
template void imageConvolution<uint32_t>( uint32_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
template void imageConvolution<uint64_t>( uint64_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;

// eof imageConvolution.cc

// g++ -O3 maskBit.cc imageConvolution.cc TimeMeter.cc 
// g++ -O3 -msse2 -funroll-loops maskBit.cc imageConvolution.cc TimeMeter.cc
// 0.21 ms

#include <iostream>
#include <iomanip>
#include <cstdlib> // near equivalent to <stdlib.h>  for exit, rand, ...
#include <cmath>   // near equivalent to <math.h>    for sin, cos, log, ..

#include <pthread.h>

#include "imageConvolution.hh"
#include "TimeMeter.hh"
#include "pstdint.h"
//#include <stdint.h>

using namespace std ;

// Generic pseudo-random value generator.
// Draws two values from rand(), converts each one to U and returns their
// difference (for unsigned U the subtraction wraps around modulo 2^bits).
template <typename U>
inline
U random() {
  return static_cast<U>( rand() ) - static_cast<U>( rand() ) ;
}

// Bit-packed lookup table handed to imageConvolution:
// 16 words x 32 bits = 512 one-bit entries.
// Every word is 0xFFFFFFFF, i.e. all 512 entries are set.
uint32_t compressedTableConvolution[16] = {
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
} ;

// Word type used to store the bit-packed matrix in main().
// Exactly one typedef is active; comment/uncomment to try another width.
//typedef uint8_t BITS ;
//typedef uint16_t BITS ;
//typedef uint32_t BITS ;
typedef uint64_t BITS ;

// Number of iterations of the timed loop in main(); the printed time is the
// total divided by this value.
#define REPEAT 100000

int
main(int argc, char *argv) {

  TimeMeter tm ;

  unsigned const numBits = sizeof(BITS) * CHAR_BIT ;

  unsigned const dimRowBlock = 4000/numBits ;
  unsigned const numRowBlock = 2000/numBits ;
  unsigned const numCols     = 2000 ;

  BITS * matrix = new BITS[dimRowBlock*numCols] ;

  for ( unsigned i = 0 ; i < dimRowBlock*numCols ; ++i ) matrix[i] = random<BITS>() ;
  
  tm . start() ;
  for ( unsigned i = 0 ; i < REPEAT ; ++i )
    imageConvolution<BITS>( matrix, dimRowBlock, numRowBlock, numCols, compressedTableConvolution ) ;

  double elapsed = tm . milliseconds() ;
  cout << "Elapsed " << elapsed/REPEAT << "ms\n" ;

  //pthread_exit(NULL) ;
  return 0 ;
}

// g++ -O3 -funroll-loops maskBitThread.cc imageConvolution.cc TimeMeter.cc
// g++ -O3 -msse2 -funroll-loops maskBitThread.cc imageConvolution.cc TimeMeter.cc
// 0.1 ms

#include <iostream>
#include <iomanip>
#include <cstdlib> // near equivalent to <stdlib.h>  for exit, rand, ...
#include <cmath>   // near equivalent to <math.h>    for sin, cos, log, ..

#include <pthread.h>

#include "imageConvolution.hh"
#include "TimeMeter.hh"
#include "pstdint.h"
//#include <stdint.h>

using namespace std ;

// Generic pseudo-random value generator.
// Draws two values from rand(), converts each one to U and returns their
// difference (for unsigned U the subtraction wraps around modulo 2^bits).
template <typename U>
inline
U random() {
  return static_cast<U>( rand() ) - static_cast<U>( rand() ) ;
}

// Bit-packed lookup table handed to the convolution threads:
// 16 words x 32 bits = 512 one-bit entries.
// Every word is 0xFFFFFFFF, i.e. all 512 entries are set.
uint32_t compressedTableConvolution[16] = {
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
  0xFFFFFFFF,  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
} ;

// Word type used to store the bit-packed matrix.
// Exactly one typedef is active; comment/uncomment to try another width.
//typedef uint8_t BITS ;
//typedef uint16_t BITS ;
//typedef uint32_t BITS ;
typedef uint64_t BITS ;

// Argument bundle handed to a worker thread through the void* parameter of
// the thread entry point.  The members mirror, in order, the parameters of
// imageConvolution<BITS>.
struct applyMask_data {
  BITS *           matrix ;                     // first word of the sub-image to filter
  unsigned         dimRowBlock ;                // words between two consecutive columns
  unsigned         numRowBlock ;                // words of each column to process
  unsigned         numCols ;                    // number of columns of the sub-image
  uint32_t const * compressedTableConvolution ; // bit-packed lookup table
} ;

// Thread entry point with the signature required by pthread_create.
// `args` must point to an applyMask_data that stays alive until the thread
// has been joined; its members are forwarded to imageConvolution<BITS>.
// Always returns NULL: returning from the start routine has the same effect
// as calling pthread_exit with the returned value.
extern "C"
void *
imageConvolution( void * args ) {
  applyMask_data const * data = static_cast<applyMask_data const *>(args) ;
  imageConvolution<BITS>( data -> matrix,
                          data -> dimRowBlock,
                          data -> numRowBlock,
                          data -> numCols,
                          data -> compressedTableConvolution ) ;
  return NULL ;
}

// Number of iterations of the timed loop in main(); the printed time is the
// total divided by this value.
#define REPEAT 100000

int
main(int argc, char *argv) {

  pthread_t thread[2] ;

  TimeMeter tm ;

  unsigned const numBits = sizeof(BITS) * CHAR_BIT ;

  unsigned const dimRowBlock = 2000/numBits ;
  unsigned const numRowBlock = 1000/numBits ;
  unsigned const numCols     = 1000 ;

  BITS * matrix = new BITS[dimRowBlock*numCols] ;

  for ( unsigned i = 0 ; i < dimRowBlock*numCols ; ++i ) matrix[i] = random<BITS>() ;

  double elapsed1 = 0 ;
  double elapsed2 = 0 ;
  double elapsed3 = 0 ;

  tm . start() ;
  for ( unsigned i = 0 ; i < REPEAT ; ++i ) {
 
    unsigned numCols2 = numCols/2 ;
    applyMask_data data0 = { matrix, dimRowBlock, numRowBlock, numCols2, compressedTableConvolution } ;
    applyMask_data data1 = { matrix + dimRowBlock * numCols2, dimRowBlock, numRowBlock, numCols-numCols2, compressedTableConvolution } ;
    if ( pthread_create(&thread[0], NULL, imageConvolution, (void *) &data0) ||
         pthread_create(&thread[1], NULL, imageConvolution, (void *) &data1) ) { 
      cerr << "Error while creating thread\n" ; 
      exit(1);
    }
    if ( pthread_join(thread[0], NULL) ) { cerr << "Error while joining thread\n" ; exit(1); }
    if ( pthread_join(thread[1], NULL) ) { cerr << "Error while joining thread\n" ; exit(1); }
  }

  double elapsed = tm . milliseconds() ;
  cout << "Elapsed  " << elapsed/REPEAT << "ms\n" ;

  //pthread_exit(NULL) ;
  return 0 ;
}

Lesson of 4 March 2011 (part b)