| | #include <stdio.h> |
| | #include <assert.h> |
| | #include <stdlib.h> |
| | #include <time.h> |
| |
|
| | const int N = 1 << 14; |
| | const int M = 1 << 14; |
| |
|
| | void mul_mat_vec_f32_0( |
| | const float * src0, |
| | const float * src1, |
| | float * dst, |
| | unsigned nrows, |
| | unsigned ncols) { |
| | for (unsigned i = 0; i < nrows; i++) { |
| | float sum = 0.0f; |
| | for (unsigned j = 0; j < ncols; j++) { |
| | sum += src0[i*ncols + j]*src1[j]; |
| | } |
| | dst[i] = sum; |
| | } |
| | } |
| | #if defined(_MSC_VER) |
| | typedef float __declspec(align(32)) afloat; |
| | #else |
| | typedef float afloat __attribute__((__aligned__(32))); |
| | #endif |
| | void mul_mat_vec_f32_1( |
| | const afloat *restrict src0, |
| | const afloat *restrict src1, |
| | afloat *restrict dst, |
| | unsigned nrows, |
| | unsigned ncols) { |
| | for (unsigned i = 0; i < nrows; i++) { |
| | const afloat * restrict row = src0 + i*ncols; |
| | const afloat * restrict col = src1; |
| |
|
| | float sum = 0.0f; |
| |
|
| | for (unsigned j = 0; j < ncols; j++) { |
| | sum += *row++ * *col++; |
| | } |
| |
|
| | dst[i] = sum; |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | } |
| | } |
| |
|
| | void mul_mat_vec_f32_2( |
| | const void * src0, |
| | const void * src1, |
| | void * dst, |
| | unsigned nrows, |
| | unsigned ncols) { |
| | void * d = dst; |
| | for (unsigned i = 0; i < nrows; i++) { |
| | float sum = 0.0f; |
| |
|
| | const char * row = (const char*)src0 + i*ncols*sizeof(float); |
| | const char * col = (const char*)src1; |
| | for (unsigned j = 0; j < ncols; j++) { |
| | sum += (*(float *)row) * (*(float *)col); |
| | row += sizeof(float); |
| | col += sizeof(float); |
| | } |
| | *(float *)d = sum; |
| | d = (char*)d + sizeof(float); |
| | } |
| | } |
| |
|
| | #if defined(_MSC_VER) |
| | void* aligned_alloc(size_t alignment, size_t size) { |
| | return _aligned_malloc(size, alignment); |
| | } |
| | #endif |
| |
|
| | int main(int argc, const char ** argv) { |
| | |
| | |
| | |
| |
|
| | afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M)); |
| | afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M)); |
| | afloat * dst = (float *)(aligned_alloc(32, sizeof(float)*N)); |
| |
|
| | for (int i = 0; i < N*M; i++) { |
| | src0[i] = (afloat)i; |
| | } |
| |
|
| | for (int i = 0; i < M; i++) { |
| | src1[i] = (afloat)i; |
| | } |
| |
|
| | const int nIter = 10; |
| |
|
| | const clock_t start = clock(); |
| |
|
| | double sum = 0.0f; |
| | for (int i = 0; i < nIter; i++) { |
| | |
| | mul_mat_vec_f32_1(src0, src1, dst, N, M); |
| | |
| | for (int i = 0; i < N; i++) { |
| | sum += dst[i]; |
| | } |
| | } |
| |
|
| | { |
| | const clock_t end = clock(); |
| | printf("%s: elapsed ticks: %ld\n", __func__, end - start); |
| | } |
| |
|
| | printf("%f\n", sum); |
| |
|
| | return 0; |
| | } |
| |
|