Lesson of 25 February 2011¶
Various matrix-matrix multiplication routine
The header file
File mm.hh
1/*
2 Matrix Matrix multiplication routine
3 */
4
5
6template <typename Type>
7void
8mm_standard( Type const A[], unsigned ldA,
9 Type const B[], unsigned ldB,
10 Type C[], unsigned ldC,
11 unsigned n, // number of the rows of A
12 unsigned p, // number of the columns of A and rows of B
13 unsigned m // number of the columns of B
14 // the results is a matrix C of n rows and m columns
15
16 ) ;
17
18template <typename Type>
19void
20mm_tiling( Type const A[], unsigned ldA,
21 Type const B[], unsigned ldB,
22 Type C[], unsigned ldC,
23 unsigned n, // number of the rows of A
24 unsigned p, // number of the columns of A and rows of B
25 unsigned m, // number of the columns of B
26 // the results is a matrix C of n rows and m columns
27 unsigned BK // block size
28 ) ;
29
30template <typename Type>
31void
32mm_recurr( Type const A[], unsigned ldA,
33 Type const B[], unsigned ldB,
34 Type C[], unsigned ldC,
35 unsigned n, // number of the rows of A
36 unsigned p, // number of the columns of A and rows of B
37 unsigned m ); // number of the columns of B
38 // the results is a matrix C of n rows and m columns
Standard routine
File mm_standard.cc
1/*
2 Matrix Matrix multiplication routine
3
4 g++ -O3 -funroll-loops -sse2 -sse3 -sse3 -ssse3 -sse4.1 mm_check.cc mm_standard.cc TimeMeter.cc
5
6 5 x 5 average time 0.01
7 10 x 10 average time 0.004
8 20 x 20 average time 0.014
9 40 x 40 average time 0.099
10 80 x 80 average time 0.789
11 160 x 160 average time 8.526
12 320 x 320 average time 70.61
13 640 x 640 average time 981.838
14 1280 x 1280 average time 10938.1
15 2560 x 2560 average time 624531
16 Time Ratio [ 10/5] = 0.4
17 Time Ratio [ 20/10] = 3.5
18 Time Ratio [ 40/20] = 7.07143
19 Time Ratio [ 80/40] = 7.9697
20 Time Ratio [ 160/80] = 10.8061
21 Time Ratio [ 320/160] = 8.28173
22 Time Ratio [ 640/320] = 13.9051
23 Time Ratio [ 1280/640] = 11.1404
24 Time Ratio [ 2560/1280] = 57.0969
25*/
26
27#include "mm.hh"
28#include "pstdint.h"
29
30#define A(I,J) A[(I)+(J)*ldA]
31#define B(I,J) B[(I)+(J)*ldB]
32#define C(I,J) C[(I)+(J)*ldC]
33
34template <typename Type>
35void
36mm_standard( Type const A[], unsigned ldA,
37 Type const B[], unsigned ldB,
38 Type C[], unsigned ldC,
39 unsigned n,
40 unsigned p,
41 unsigned m ) {
42 for ( unsigned i = 0 ; i < n ; ++i )
43 for ( unsigned j = 0 ; j < m ; ++j ) {
44 C(i,j) = 0 ;
45 for ( unsigned k = 0 ; k < p ; ++k )
46 C(i,j) += A(i,k) * B(k,j) ;
47 }
48}
49
50#define EXPLICIT_INSTANTIATE(T) \
51template void mm_standard<T>( T const [], unsigned, \
52 T const [], unsigned, \
53 T [], unsigned, \
54 unsigned, unsigned, unsigned ) ;
55
56EXPLICIT_INSTANTIATE(float) ;
57EXPLICIT_INSTANTIATE(double) ;
58EXPLICIT_INSTANTIATE(int32_t) ;
59EXPLICIT_INSTANTIATE(int64_t) ;
Using tiling
File mm_tiling.cc
1/*
2 Matrix Matrix multiplication routine
3
4 g++ -O3 mm_check.cc mm_standard.cc TimeMeter.cc
5 g++ -O3 -funroll-loops -sse2 -sse3 -sse3 -ssse3 -sse4.1 mm_check.cc mm_tiling.cc TimeMeter.cc
6
7 5 x 5 average time 0.01
8 10 x 10 average time 0.004
9 20 x 20 average time 0.014
10 40 x 40 average time 0.099
11 80 x 80 average time 0.789
12 160 x 160 average time 8.526
13 320 x 320 average time 70.61
14 640 x 640 average time 981.838
15 1280 x 1280 average time 10938.1
16 2560 x 2560 average time 624531
17 Time Ratio [ 10/5] = 0.4
18 Time Ratio [ 20/10] = 3.5
19 Time Ratio [ 40/20] = 7.07143
20 Time Ratio [ 80/40] = 7.9697
21 Time Ratio [ 160/80] = 10.8061
22 Time Ratio [ 320/160] = 8.28173
23 Time Ratio [ 640/320] = 13.9051
24 Time Ratio [ 1280/640] = 11.1404
25 Time Ratio [ 2560/1280] = 57.0969
26
27 */
28
29#include "mm.hh"
30#include <algorithm>
31
32#define A(I,J) A[(I)+(J)*ldA]
33#define B(I,J) B[(I)+(J)*ldB]
34#define C(I,J) C[(I)+(J)*ldC]
35
36template <typename Type>
37inline
38void
39mm_tiling_addTo( Type const A[], unsigned ldA,
40 Type const B[], unsigned ldB,
41 Type C[], unsigned ldC,
42 unsigned n,
43 unsigned p,
44 unsigned m ) {
45 for ( unsigned i = 0 ; i < n ; ++i )
46 for ( unsigned j = 0 ; j < m ; ++j ) {
47 Type tmp = 0 ;
48 for ( unsigned k = 0 ; k < p ; ++k ) tmp += A(i,k) * B(k,j) ;
49 C(i,j) += tmp ;
50 }
51}
52
53template <typename Type>
54void
55mm_tiling( Type const A[], unsigned ldA,
56 Type const B[], unsigned ldB,
57 Type C[], unsigned ldC,
58 unsigned n,
59 unsigned p,
60 unsigned m,
61 unsigned BK ) {
62 /*
63 // BK
64 // +--+--+--+--+
65 // | | | | |
66 // +--+--+--+--+
67 // | | | | |
68 // +--+--+--+--+
69 // | | | | |
70 // +--+--+--+--+
71 // | | | | |
72 // +--+--+--+--+
73 // | | | | |
74 // +--+--+--+--+
75 */
76 for ( unsigned i = 0 ; i < n ; ++i )
77 for ( unsigned j = 0 ; j < m ; ++j )
78 C(i,j) = 0 ;
79
80 for ( unsigned jj = 0 ; jj < m ; jj += BK ) {
81 unsigned column = std::min(BK,m-jj) ;
82 for ( unsigned ii = 0 ; ii < n ; ii += BK ) {
83 unsigned row = std::min(BK,n-ii) ;
84 for ( unsigned kk = 0 ; kk < p ; kk += BK ) {
85 unsigned rc = std::min(BK,p-kk) ;
86 // multiply block A(ii,kk) * B(kk,jj)
87 mm_tiling_addTo( &A(ii,kk), ldA,
88 &B(kk,jj), ldB,
89 &C(ii,jj), ldC,
90 row, rc, column ) ;
91 }
92 }
93 }
94
95}
96
97#define EXPLICIT_INSTANTIATE(T) \
98template void mm_tiling<T>( T const [], unsigned, \
99 T const [], unsigned, \
100 T [], unsigned, \
101 unsigned, unsigned, unsigned, unsigned ) ;
102
103EXPLICIT_INSTANTIATE(float) ;
104EXPLICIT_INSTANTIATE(double) ;
105EXPLICIT_INSTANTIATE(int32_t) ;
106EXPLICIT_INSTANTIATE(int64_t) ;
Using recurrence
File mm_recurr.cc
1/*
2 Matrix Matrix multiplication routine
3
4 */
5
6#include "mm.hh"
7#include <algorithm>
8
9#define A(I,J) A[(I)+(J)*ldA]
10#define B(I,J) B[(I)+(J)*ldB]
11#define C(I,J) C[(I)+(J)*ldC]
12
13template <typename Type>
14inline
15void
16mm_base_addto4x4( Type const A[], unsigned ldA,
17 Type const B[], unsigned ldB,
18 Type C[], unsigned ldC ) {
19 for ( unsigned i = 0 ; i < 4 ; ++i )
20 for ( unsigned j = 0 ; j < 4 ; ++j ) {
21 Type tmp = 0 ;
22 for ( unsigned k = 0 ; k < 4 ; ++k ) tmp += A(i,k) * B(k,j) ;
23 C(i,j) += tmp ;
24 }
25}
26
27template <typename Type>
28inline
29void
30mm_base_addto8x8( Type const A[], unsigned ldA,
31 Type const B[], unsigned ldB,
32 Type C[], unsigned ldC ) {
33 for ( unsigned i = 0 ; i < 8 ; i += 4 )
34 for ( unsigned j = 0 ; j < 8 ; j += 4 )
35 for ( unsigned k = 0 ; k < 8 ; k += 4 )
36 mm_base_addto4x4( &A(i,k), ldA, &B(k,j), ldB, &C(i,j), ldC ) ;
37}
38
39template <typename Type>
40inline
41void
42mm_base_addto16x16( Type const A[], unsigned ldA,
43 Type const B[], unsigned ldB,
44 Type C[], unsigned ldC ) {
45 for ( unsigned i = 0 ; i < 16 ; i += 8 )
46 for ( unsigned j = 0 ; j < 16 ; j += 8 )
47 for ( unsigned k = 0 ; k < 16 ; k += 8 )
48 mm_base_addto8x8( &A(i,k), ldA, &B(k,j), ldB, &C(i,j), ldC ) ;
49}
50
51template <typename Type>
52inline
53void
54mm_classin_addto( Type const A[], unsigned ldA,
55 Type const B[], unsigned ldB,
56 Type C[], unsigned ldC,
57 unsigned n,
58 unsigned p,
59 unsigned m ) {
60 for ( unsigned i = 0 ; i < n ; ++i )
61 for ( unsigned j = 0 ; j < m ; ++j ) {
62 Type tmp = 0 ;
63 for ( unsigned k = 0 ; k < p ; ++k ) tmp += A(i,k) * B(k,j) ;
64 C(i,j) += tmp ;
65 }
66}
67
68template <typename Type>
69inline
70void
71mm_base_addto( Type const A[], unsigned ldA,
72 Type const B[], unsigned ldB,
73 Type C[], unsigned ldC,
74 unsigned n,
75 unsigned p,
76 unsigned m ) {
77 unsigned const NB = 16 ;
78 for ( unsigned jj = 0 ; jj < m ; jj += NB ) {
79 unsigned column = std::min(NB,m-jj) ;
80 for ( unsigned ii = 0 ; ii < n ; ii += NB ) {
81 unsigned row = std::min(NB,n-ii) ;
82 for ( unsigned kk = 0 ; kk < p ; kk += NB ) {
83 unsigned rc = std::min(NB,p-kk) ;
84 if ( row == NB && rc == NB && column == NB )
85 mm_base_addto16x16( &A(ii,kk), ldA,
86 &B(kk,jj), ldB,
87 &C(ii,jj), ldC ) ;
88 else
89 mm_classin_addto( &A(ii,kk), ldA,
90 &B(kk,jj), ldB,
91 &C(ii,jj), ldC,
92 row, rc, column ) ;
93 }
94 }
95 }
96}
97
98//! C += A*B
99template <typename Type>
100void
101mm_recurr_addto( Type const A[], unsigned ldA,
102 Type const B[], unsigned ldB,
103 Type C[], unsigned ldC,
104 unsigned n,
105 unsigned p,
106 unsigned m ) {
107
108 unsigned const NB = 400 ;
109 if ( n <= NB && p <= NB && m <= NB ) {
110 mm_base_addto( A, ldA, B, ldB, C, ldC, n, p, m ) ;
111 return ;
112 }
113
114 /*
115 // splitting tipo 1A
116 // +-------+ +-------+ +-------+
117 // | | | | | |
118 // | C11 | | A11 | | B11 |
119 // | | | | | |
120 // +-------+ += +-------+ +-------+
121 // | | | |
122 // | C21 | | A21 |
123 // | | | |
124 // +-------+ +-------+
125 //
126 // splitting tipo 1B
127 // +-------+ +-------+-------+ +-------+
128 // | | | | | | |
129 // | C11 | | A11 | A12 | | B11 |
130 // | | | | | | |
131 // +-------+ += +-------+-------+ +-------+
132 // | | | | | | |
133 // | C21 | | A21 | A22 | | B21 |
134 // | | | | | | |
135 // +-------+ +-------+-------+ +-------+
136 //
137 // splitting tipo 2A
138 // +-------+-------+ +-------+ +-------+-------+
139 // | | | | | | | |
140 // | C11 | C12 | += | A11 | | B11 | B12 |
141 // | | | | | | | |
142 // +-------+-------+ +-------+ +-------+-------+
143 //
144 // splitting tipo 2B
145 // +-------+-------+ +-------+-------+ +-------+-------+
146 // | | | | | | | | |
147 // | C11 | C12 | += | A11 | A12 | | B11 | B12 |
148 // | | | | | | | | |
149 // +-------+-------+ +-------+-------+ +-------+-------+
150 // | | |
151 // | B21 | B22 |
152 // | | |
153 // +-------+-------+
154 //
155 // splitting tipo 3A
156 // +------+-------+ +-------+ +-------+-------+
157 // | | | | | | | |
158 // | C11 | C12 | | A11 | | B11 | B12 |
159 // | | | | | | | |
160 // +------+-------+ += +-------+ +-------+-------+
161 // | | | | |
162 // | C21 | C22 | | A21 |
163 // | | | | |
164 // +------+-------+ +-------+
165 //
166 // splitting tipo 3B
167 // +-------+-------+ +-------+-------+ +-------+-------+
168 // | | | | | | | | |
169 // | C11 | C12 | | A11 | A12 | | B11 | B12 |
170 // | | | | | | | | |
171 // +-------+-------+ += +-------+-------+ +-------+-------+
172 // | | | | | | | | |
173 // | C21 | C22 | | A21 | A22 | | B21 | B22 |
174 // | | | | | | | | |
175 // +-------+-------+ +-------+-------+ +-------+-------+
176 //
177 */
178
179 unsigned n2 = n/2 ;
180 unsigned m2 = m/2 ;
181 unsigned p2 = p/2 ;
182
183 // n x p
184 Type const * A11 = A ;
185 Type const * A12 = A + ldA * p2 ;
186 Type const * A21 = A + n2 ;
187 Type const * A22 = A12 + n2 ;
188
189 // p x m
190 Type const * B11 = B ;
191 Type const * B12 = B + ldB * m2 ;
192 Type const * B21 = B + p2 ;
193 Type const * B22 = B12 + p2 ;
194
195 // n x m
196 Type * C11 = C ;
197 Type * C12 = C + ldC * m2 ;
198 Type * C21 = C + n2 ;
199 Type * C22 = C12 + n2 ;
200
201 // matrix C is n x m
202 if ( 2*n > 3*m ) { // matrix is tall
203 if ( 2*n > 3*p ) { // splitting of type 1A
204 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n2, p, m ) ;
205 mm_recurr_addto( A21, ldA, B11, ldB, C21, ldC, n-n2, p, m ) ;
206 } else { // splitting of type 1B
207 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n2, p2, m ) ;
208 mm_recurr_addto( A12, ldA, B21, ldB, C11, ldC, n2, p-p2, m ) ;
209
210 mm_recurr_addto( A21, ldA, B11, ldB, C21, ldC, n-n2, p2, m ) ;
211 mm_recurr_addto( A22, ldA, B21, ldB, C21, ldC, n-n2, p-p2, m ) ;
212 }
213 } else if ( 3*n < 2*m ) { // matrix is flat
214 if ( 2*p > 3*n ) { // splitting of type 2B
215 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n, p2, m2 ) ;
216 mm_recurr_addto( A12, ldA, B21, ldB, C11, ldC, n, p-p2, m2 ) ;
217
218 mm_recurr_addto( A11, ldA, B12, ldB, C12, ldC, n, p2, m-m2 ) ;
219 mm_recurr_addto( A12, ldA, B22, ldB, C12, ldC, n, p-p2, m-m2 ) ;
220 } else { // splitting of type 2A
221 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n, p, m2 ) ;
222 mm_recurr_addto( A11, ldA, B12, ldB, C12, ldC, n, p, m-m2 ) ;
223 }
224 } else {
225 if ( 2*n > 3*p ) { // splitting of type 3A
226 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n2, p, m2 ) ;
227 mm_recurr_addto( A11, ldA, B12, ldB, C12, ldC, n2, p, m-m2 ) ;
228
229 mm_recurr_addto( A21, ldA, B11, ldB, C21, ldC, n-n2, p, m2 ) ;
230 mm_recurr_addto( A21, ldA, B12, ldB, C22, ldC, n-n2, p, m-m2 ) ;
231 } else { // splitting of type 3B
232 mm_recurr_addto( A11, ldA, B11, ldB, C11, ldC, n2, p2, m2 ) ;
233 mm_recurr_addto( A12, ldA, B21, ldB, C11, ldC, n2, p-p2, m2 ) ;
234
235 mm_recurr_addto( A11, ldA, B12, ldB, C12, ldC, n2, p2, m-m2 ) ;
236 mm_recurr_addto( A12, ldA, B22, ldB, C12, ldC, n2, p-p2, m-m2 ) ;
237
238 mm_recurr_addto( A21, ldA, B11, ldB, C21, ldC, n-n2, p2, m2 ) ;
239 mm_recurr_addto( A22, ldA, B21, ldB, C21, ldC, n-n2, p-p2, m2 ) ;
240
241 mm_recurr_addto( A21, ldA, B12, ldB, C22, ldC, n-n2, p2, m-m2 ) ;
242 mm_recurr_addto( A22, ldA, B22, ldB, C22, ldC, n-n2, p-p2, m-m2 ) ;
243 }
244
245 }
246}
247
248////////////////////////////////////////////////////////////////////////////////
249
250template <typename Type>
251void
252mm_recurr( Type const A[], unsigned ldA,
253 Type const B[], unsigned ldB,
254 Type C[], unsigned ldC,
255 unsigned n,
256 unsigned p,
257 unsigned m ) {
258
259 for ( unsigned i = 0 ; i < n ; ++i )
260 for ( unsigned j = 0 ; j < m ; ++j )
261 C(i,j) = 0 ;
262
263 mm_recurr_addto( A, ldA, B, ldB, C, ldC, n, p, m ) ;
264}
265
266#define EXPLICIT_INSTANTIATE(T) \
267template void mm_recurr<T>( T const [], unsigned, \
268 T const [], unsigned, \
269 T [], unsigned, \
270 unsigned, unsigned, unsigned ) ; \
271template void mm_recurr_addto<T>( T const [], unsigned, \
272 T const [], unsigned, \
273 T [], unsigned, \
274 unsigned, unsigned, unsigned )
275
276EXPLICIT_INSTANTIATE(float) ;
277EXPLICIT_INSTANTIATE(double) ;
278EXPLICIT_INSTANTIATE(int32_t) ;
279EXPLICIT_INSTANTIATE(int64_t) ;
Check routine
File mm_check.cc
1/*
2 Matrix Matrix multiplication routine
3
4 g++ -O3 -funroll-loops mm_check.cc mm_standard.cc mm_tiling.cc mm_recurr.cc TimeMeter.cc -lblas
5 g++ -O3 -funroll-loops -sse2 -sse3 -sse3 -ssse3 -sse4.1 mm_check.cc mm_standard.cc mm_tiling.cc mm_recurr.cc TimeMeter.cc -framework Accelerate
6
7time using dgemm
8
95 x 5 average time 0.0076ms
1010 x 10 average time 0.0012ms
1120 x 20 average time 0.0046ms
1240 x 40 average time 0.032ms
1380 x 80 average time 0.303ms
14160 x 160 average time 9.2734ms
15320 x 320 average time 9.183ms
16640 x 640 average time 66.8882ms
171280 x 1280 average time 458.291ms
182560 x 2560 average time 3958.36ms
19Time Ratio [ 10/5] = 0.157895
20Time Ratio [ 20/10] = 3.83333
21Time Ratio [ 40/20] = 6.95652
22Time Ratio [ 80/40] = 9.46875
23Time Ratio [ 160/80] = 30.6053
24Time Ratio [ 320/160] = 0.990252
25Time Ratio [ 640/320] = 7.28392
26Time Ratio [ 1280/640] = 6.8516
27Time Ratio [ 2560/1280] = 8.63722
28
29 */
30
31#include <iostream>
32#include <iomanip>
33#include <cstdlib> // near equivalent to <stdlib.h> for exit, rand, ...
34#include <cmath> // near equivalent to <math.h> for sin, cos, log, ..
35
36#include <map>
37
38#include "mm.hh"
39#include "TimeMeter.hh"
40#include "pstdint.h"
41
42#define A(I,J) A[(I)+(J)*ldA]
43#define B(I,J) B[(I)+(J)*ldB]
44#define C(I,J) C[(I)+(J)*ldC]
45
46using namespace std ;
47
48// general random numer generator
49template <typename U>
50inline
51U random() { return (U)rand()-(U)rand() ; }
52
53// store average time using map of STL
54static map<unsigned,double> averageTime ;
55
56//
57// Fill with random value matric A (n x m)
58//
59template <typename Type>
60static
61void
62fillRandom( Type A[], unsigned ldA, unsigned n, unsigned m ) {
63 for ( unsigned i = 0 ; i < n ; ++i )
64 for ( unsigned j = 0 ; j < m ; ++j )
65 A(i,j) = random() ;
66}
67
68typedef double valueType ;
69//typedef int64_t valueType ;
70
71#define F77NAME(A) A##_
72
73extern "C" {
74 void
75 F77NAME(dgemm)( char const transA[],
76 char const transB[],
77 int const & M,
78 int const & N,
79 int const & K,
80 double const & ALPHA,
81 double const A[], int const & ldA,
82 double const B[], int const & ldB,
83 double const & beta,
84 double C[], int const & ldC ) ;
85/* Purpose
86 * =======
87 *
88 * DGEMM performs one of the matrix-matrix operations
89 *
90 * C := alpha*op( A )*op( B ) + beta*C,
91 *
92 * where op( X ) is one of
93 *
94 * op( X ) = X or op( X ) = X',
95 *
96 * alpha and beta are scalars, and A, B and C are matrices, with op( A )
97 * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
98 */
99};
100
101int
102main() {
103
104 TimeMeter tm ;
105 unsigned nruns = 5 ;
106 unsigned minsz = 5 ;
107 unsigned maxsz = 3000 ;
108
109 valueType * A = new valueType[ maxsz * maxsz ] ;
110 valueType * B = new valueType[ maxsz * maxsz ] ;
111 valueType * C = new valueType[ maxsz * maxsz ] ;
112
113 for ( unsigned sz = minsz ; sz < maxsz ; sz *= 2 ) {
114 unsigned ldA = sz ;
115 unsigned ldB = sz ;
116 unsigned ldC = sz ;
117 fillRandom( A, ldA, sz, sz ) ;
118 fillRandom( B, ldB, sz, sz ) ;
119
120 tm . start() ;
121 for ( unsigned k = 0 ; k < nruns ; ++k )
122 mm_recurr( A, ldA, B, ldB, C, ldC, sz, sz, sz ) ;
123 //mm_tiling( A, ldA, B, ldB, C, ldC, sz, sz, sz, 80 ) ;
124 //mm_standard( A, ldA, B, ldB, C, ldC, sz, sz, sz ) ;
125 //F77NAME(dgemm)( "N", "N", sz, sz, sz, 1, A, ldA, B, ldB, 0, C, ldC ) ;
126
127 double elapsed = tm . milliseconds() ;
128 averageTime[sz] = elapsed / nruns ;
129 cout << sz << " x " << sz << " average time " << averageTime[sz] << "ms\n" ;
130 }
131
132 for ( unsigned sz = 2*minsz ; sz < maxsz ; sz *= 2 )
133 cout << "Time Ratio [ " << sz << "/" << sz/2 << "] = "
134 << averageTime[sz] / averageTime[sz/2] << '\n' ;
135
136}