Lesson of 4 March 2011 (part b)¶
Bit matrix convolution
A simple bit-matrix convolution example
File mask.cc
1// g++ -O3 -sse2 mask.cc TimeMeter.cc
2// Elapsed 8:9 ms
3
4#include <iostream>
5#include <iomanip>
6#include <cstdlib> // near equivalent to <stdlib.h> for exit, rand, ...
7#include <cmath> // near equivalent to <math.h> for sin, cos, log, ..
8
9#include <pthread.h>
10
11#include "TimeMeter.hh"
12//#include "pstdint.h"
13#include <stdint.h>
14
15using namespace std ;
16
17// general random numer generator
18template <typename U>
19inline
20U random() { return (U)rand()-(U)rand() ; }
21
22/*
23//
24// +-----------+
25// | 1 | 2 | 3 |
26// +-----------+
27// | 4 | 5 | 6 | x mat
28// +-----------+
29// | 7 | 8 | 9 |
30// +-----------+
31//
32//
33//
34*/
35
36template <typename UNSIGNED>
37void
38applyMask( UNSIGNED * matrix, unsigned dimCol, unsigned Nrow, unsigned Ncol, UNSIGNED mask[3][3] ) {
39 UNSIGNED * r0 = (UNSIGNED *)alloca( 2 * Ncol * sizeof(UNSIGNED) ) ;
40 UNSIGNED * r1 = r0 + Nrow ;
41 //UNSIGNED r0[Ncol] ;
42 //UNSIGNED r1[Ncol] ;
43
44 // copy first column
45 std::copy( matrix, matrix + Nrow, r0 ) ;
46
47 // loop by column
48 for ( unsigned i = 1 ; i < Nrow-1 ; ++i ) {
49 UNSIGNED * ri = matrix + i*dimCol ;
50 UNSIGNED * ri1 = ri + dimCol ;
51 std::copy( ri, ri + Ncol, r1 ) ;
52 for ( unsigned j = 1 ; j < Ncol-1 ; ++j ) {
53 ri[j] = mask[0][0] * r0[j-1] + mask[0][1] * r1[j] + mask[0][2] * r1[j+1] +
54 mask[1][0] * r1[j-1] + mask[1][1] * r1[j] + mask[1][2] * r1[j+1] +
55 mask[2][0] * ri1[j-1] + mask[2][1] * ri1[j] + mask[2][2] * ri1[j+1] ;
56 }
57 std::copy( r1, r1 + Ncol, r0 ) ;
58 }
59}
60
61//typedef uint8_t BITS ;
62//typedef uint16_t BITS ;
63typedef uint32_t BITS ;
64//typedef uint64_t BITS ;
65
66int
67main(int argc, char *argv) {
68
69 TimeMeter tm ;
70
71 unsigned const dimCol = 2000 ;
72 unsigned const Nrow = 1000 ;
73 unsigned const Ncol = 1000 ;
74
75 BITS * matrix = new BITS[dimCol*Nrow] ;
76 BITS mask[3][3] = { 1, 2, 4,
77 8, 16, 32,
78 64, 128, 256 } ;
79
80 for ( unsigned i = 0 ; i < dimCol*Nrow ; ++i ) matrix[i] = random<BITS>() ;
81
82 tm . start() ;
83
84 for ( unsigned i = 0 ; i < 10 ; ++i ) applyMask<BITS>( matrix, dimCol, Nrow, Ncol, mask ) ;
85 double elapsed = tm . milliseconds() ;
86 cout << "Elapsed " << elapsed/10 << "ms\n" ;
87
88 //pthread_exit(NULL) ;
89 return 0 ;
90}
Fast bit-matrix convolution example, header file
File imageConvolution.hh
1#ifndef IMAGE_CONVOLUTION_HH
2#define IMAGE_CONVOLUTION_HH
3
4template <typename UNSIGNED>
5void
6imageConvolution( UNSIGNED * matrix,
7 unsigned dimRowBlock,
8 unsigned numRowBlock,
9 unsigned numCols,
10 uint32_t const compressedTableConvolution[] ) ;
11
12#endif
Fast bit-matrix convolution example, implementation
File imageConvolution.cc
1#include <iostream>
2#include <iomanip>
3
4#include "pstdint.h"
5//#include <stdint.h>
6
7using namespace std ;
8
9/*
10//
11// Given the bits
12// A = x x x x x x x
13// B = x x x x x x x
14// C = x x x x x x x
15//
16// extract 3 x 3 bit mask and ...
17*/
18
19template <typename UNSIGNED>
20inline
21bool
22useTable( uint32_t const compressedTableConvolution[], unsigned tn ) {
23 return (compressedTableConvolution[tn>>5]>>(tn&0x1F)) & 0x01 ;
24}
25
26template <typename UNSIGNED>
27inline
28UNSIGNED
29convolution( bool do_left_special,
30 bool do_right_special,
31 UNSIGNED AL, UNSIGNED A, UNSIGNED AR,
32 UNSIGNED BL, UNSIGNED B, UNSIGNED BR,
33 UNSIGNED CL, UNSIGNED C, UNSIGNED CR,
34 uint32_t const compressedTableConvolution[] ) {
35
36 unsigned const numBits = sizeof(UNSIGNED) * CHAR_BIT ;
37 static UNSIGNED const mask3bit = 0x07 ;
38 static UNSIGNED const mask2bit = 0x03 ;
39
40 UNSIGNED RES = B ;
41 // special case first bit
42 unsigned tn ;
43
44 UNSIGNED bit = 0x01 ; // Starting bit
45 if ( do_left_special ) {
46 tn = ((A & mask2bit)<<1)|(AL>>(numBits-1)) |
47 ((B & mask2bit)<<4)|((BL>>(numBits-4))&0x08) |
48 ((C & mask2bit)<<7)|((CL>>(numBits-7))&0x20) ;
49 if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
50 else RES &= ~bit ;
51 }
52
53 bit = 0x02 ;
54 for ( unsigned i = 1 ; i < numBits - 1 ; ++i ) {
55 tn = (A & mask3bit) | ((B & mask3bit)<<3) | ((C & mask3bit)<<6) ;
56 if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
57 else RES &= ~bit ;
58 A >>= 1 ;
59 B >>= 1 ;
60 C >>= 1 ;
61 bit <<= 1 ;
62 }
63
64 // special case last bit
65 if ( do_right_special ) {
66 tn = ((A & mask2bit)<<1)|(AR&0x01) |
67 ((B & mask2bit)<<4)|((BR<<3)&0x08) |
68 ((C & mask2bit)<<7)|((CR<<7)&0x20) ;
69 if ( useTable<UNSIGNED>(compressedTableConvolution,tn) ) RES |= bit ;
70 else RES &= ~bit ;
71 }
72}
73
74/*
75//
76// Do convolution for 3 column (or rows)
77//
78*/
79
80template <typename UNSIGNED>
81inline
82void
83convolution( unsigned const N,
84 UNSIGNED const Avec[],
85 UNSIGNED Bvec[],
86 UNSIGNED const Cvec[],
87 uint32_t const compressedTableConvolution[] ) {
88
89 UNSIGNED Bsaved = Bvec[0] ;
90 Bvec[0] = convolution<UNSIGNED>( false, true,
91 0, Avec[0], Avec[1],
92 0, Bvec[0], Bvec[1],
93 0, Cvec[0], Cvec[1],
94 compressedTableConvolution ) ;
95 for ( unsigned i = 1 ; i < N-1 ; ++i ) {
96 UNSIGNED Bsaved1 = Bvec[i] ;
97 Bvec[i] = convolution<UNSIGNED>( true, true,
98 Avec[i-1], Avec[i], Avec[i+1],
99 Bsaved, Bvec[i], Bvec[i+1],
100 Cvec[i-1], Cvec[i], Cvec[i+1],
101 compressedTableConvolution ) ;
102 Bsaved = Bsaved1 ;
103 }
104 Bvec[N-1] = convolution<UNSIGNED>( true, false,
105 Avec[N-2], Avec[N-1], 0,
106 Bsaved, Bvec[N-1], 0,
107 Cvec[N-2], Cvec[N-1], 0,
108 compressedTableConvolution ) ;
109}
110
111template <typename UNSIGNED>
112void
113imageConvolution( UNSIGNED * matrix,
114 unsigned dimRowBlock,
115 unsigned numRowBlock,
116 unsigned numCols,
117 uint32_t const compressedTableConvolution[] ) {
118
119 UNSIGNED * csaved = (UNSIGNED *)alloca( 2 * numRowBlock * sizeof(UNSIGNED) ) ;
120 UNSIGNED * csaved1 = csaved + numRowBlock ;
121
122 // copy first column
123 std::copy( matrix + dimRowBlock, matrix + dimRowBlock + numRowBlock, csaved ) ;
124
125 // loop by column
126 for ( unsigned i = 1 ; i < numCols-1 ; ++i ) {
127 UNSIGNED * cim1 = matrix + (i-1)*dimRowBlock ;
128 UNSIGNED * ci = matrix + i*dimRowBlock ;
129 UNSIGNED * cip1 = matrix + (i+1)*dimRowBlock ;
130
131 // save row c1
132 std::copy( ci, ci + numRowBlock, csaved1 ) ;
133 convolution<UNSIGNED>( numRowBlock, csaved, ci, cip1, compressedTableConvolution ) ;
134 std::copy( csaved1, csaved1 + numRowBlock, csaved ) ;
135 }
136}
137
138template void imageConvolution<uint8_t> ( uint8_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
139template void imageConvolution<uint16_t>( uint16_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
140template void imageConvolution<uint32_t>( uint32_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
141template void imageConvolution<uint64_t>( uint64_t *, unsigned, unsigned numRowBlock, unsigned numCols, uint32_t const [] ) ;
142
143// eof imageConvolution.cc
Test without thread
File maskBit.cc
1// g++ -O3 maskBit.cc imageConvolution.cc TimeMeter.cc
2// g++ -O3 -sse2 -funroll-loops maskBit.cc imageConvolution.cc TimeMeter.cc
3// 0.21 ms
4
5#include <iostream>
6#include <iomanip>
7#include <cstdlib> // near equivalent to <stdlib.h> for exit, rand, ...
8#include <cmath> // near equivalent to <math.h> for sin, cos, log, ..
9
10#include <pthread.h>
11
12#include "imageConvolution.hh"
13#include "TimeMeter.hh"
14#include "pstdint.h"
15//#include <stdint.h>
16
17using namespace std ;
18
19// general random numer generator
20template <typename U>
21inline
22U random() { return (U)rand()-(U)rand() ; }
23
24uint32_t compressedTableConvolution[16] = {
25 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
26 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
27 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
28 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
29} ;
30
31//typedef uint8_t BITS ;
32//typedef uint16_t BITS ;
33//typedef uint32_t BITS ;
34typedef uint64_t BITS ;
35
36#define REPEAT 100000
37
38int
39main(int argc, char *argv) {
40
41 TimeMeter tm ;
42
43 unsigned const numBits = sizeof(BITS) * CHAR_BIT ;
44
45 unsigned const dimRowBlock = 4000/numBits ;
46 unsigned const numRowBlock = 2000/numBits ;
47 unsigned const numCols = 2000 ;
48
49 BITS * matrix = new BITS[dimRowBlock*numCols] ;
50
51 for ( unsigned i = 0 ; i < dimRowBlock*numCols ; ++i ) matrix[i] = random<BITS>() ;
52
53 tm . start() ;
54 for ( unsigned i = 0 ; i < REPEAT ; ++i )
55 imageConvolution<BITS>( matrix, dimRowBlock, numRowBlock, numCols, compressedTableConvolution ) ;
56
57 double elapsed = tm . milliseconds() ;
58 cout << "Elapsed " << elapsed/REPEAT << "ms\n" ;
59
60 //pthread_exit(NULL) ;
61 return 0 ;
62}
Test with thread
File maskBitThread.cc
1// g++ -O3 -funroll-loops maskBitThread.cc imageConvolution.cc TimeMeter.cc
2// g++ -O3 -sse2 -funroll-loops maskBitThread.cc imageConvolution.cc TimeMeter.cc
3// 0.1 ms
4
5#include <iostream>
6#include <iomanip>
7#include <cstdlib> // near equivalent to <stdlib.h> for exit, rand, ...
8#include <cmath> // near equivalent to <math.h> for sin, cos, log, ..
9
10#include <pthread.h>
11
12#include "imageConvolution.hh"
13#include "TimeMeter.hh"
14#include "pstdint.h"
15//#include <stdint.h>
16
17using namespace std ;
18
19// general random numer generator
20template <typename U>
21inline
22U random() { return (U)rand()-(U)rand() ; }
23
24uint32_t compressedTableConvolution[16] = {
25 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
26 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
27 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
28 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
29} ;
30
31//typedef uint8_t BITS ;
32//typedef uint16_t BITS ;
33//typedef uint32_t BITS ;
34typedef uint64_t BITS ;
35
36typedef struct {
37 BITS * matrix ;
38 unsigned dimRowBlock ;
39 unsigned numRowBlock ;
40 unsigned numCols ;
41 uint32_t const * compressedTableConvolution ;
42} applyMask_data ;
43
44extern "C"
45void *
46imageConvolution( void * args ) {
47 applyMask_data * data = static_cast<applyMask_data*>(args) ;
48 imageConvolution<BITS>( data -> matrix,
49 data -> dimRowBlock,
50 data -> numRowBlock,
51 data -> numCols,
52 data -> compressedTableConvolution ) ;
53 pthread_exit(NULL) ;
54}
55
56#define REPEAT 100000
57
58int
59main(int argc, char *argv) {
60
61 pthread_t thread[2] ;
62
63 TimeMeter tm ;
64
65 unsigned const numBits = sizeof(BITS) * CHAR_BIT ;
66
67 unsigned const dimRowBlock = 2000/numBits ;
68 unsigned const numRowBlock = 1000/numBits ;
69 unsigned const numCols = 1000 ;
70
71 BITS * matrix = new BITS[dimRowBlock*numCols] ;
72
73 for ( unsigned i = 0 ; i < dimRowBlock*numCols ; ++i ) matrix[i] = random<BITS>() ;
74
75 double elapsed1 = 0 ;
76 double elapsed2 = 0 ;
77 double elapsed3 = 0 ;
78
79 tm . start() ;
80 for ( unsigned i = 0 ; i < REPEAT ; ++i ) {
81
82 unsigned numCols2 = numCols/2 ;
83 applyMask_data data0 = { matrix, dimRowBlock, numRowBlock, numCols2, compressedTableConvolution } ;
84 applyMask_data data1 = { matrix + dimRowBlock * numCols2, dimRowBlock, numRowBlock, numCols-numCols2, compressedTableConvolution } ;
85 if ( pthread_create(&thread[0], NULL, imageConvolution, (void *) &data0) ||
86 pthread_create(&thread[1], NULL, imageConvolution, (void *) &data1) ) {
87 cerr << "Error while creating thread\n" ;
88 exit(1);
89 }
90 if ( pthread_join(thread[0], NULL) ) { cerr << "Error while joining thread\n" ; exit(1); }
91 if ( pthread_join(thread[1], NULL) ) { cerr << "Error while joining thread\n" ; exit(1); }
92 }
93
94 double elapsed = tm . milliseconds() ;
95 cout << "Elapsed " << elapsed/REPEAT << "ms\n" ;
96
97 //pthread_exit(NULL) ;
98 return 0 ;
99}