|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <config.h> |
|
|
|
|
|
#include "cksum.h" |
|
|
|
|
|
#include <stdio.h> |
|
|
#include <sys/types.h> |
|
|
#include <x86intrin.h> |
|
|
#include "system.h" |
|
|
|
|
|
|
|
|
#define BUFLEN (1 << 16) |
|
|
|
|
|
bool |
|
|
cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) |
|
|
{ |
|
|
__m256i buf[BUFLEN / sizeof (__m256i)]; |
|
|
uint_fast32_t crc = 0; |
|
|
uintmax_t length = 0; |
|
|
size_t bytes_read; |
|
|
__m256i single_mult_constant; |
|
|
__m256i four_mult_constant; |
|
|
__m256i shuffle_constant; |
|
|
|
|
|
if (!fp || !crc_out || !length_out) |
|
|
return false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7, |
|
|
0x569700E5, 0x75BE46B7); |
|
|
four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB, |
|
|
0x10BD4D7C, 0x567FDDEB); |
|
|
|
|
|
|
|
|
shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8, |
|
|
9, 10, 11, 12, 13, 14, 15, |
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, |
|
|
9, 10, 11, 12, 13, 14, 15); |
|
|
while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) |
|
|
{ |
|
|
__m256i data; |
|
|
__m256i data2; |
|
|
__m256i data3; |
|
|
__m256i data4; |
|
|
__m256i data5; |
|
|
__m256i data6; |
|
|
__m256i data7; |
|
|
__m256i data8; |
|
|
__m256i fold_data; |
|
|
__m256i xor_crc; |
|
|
|
|
|
__m256i *datap; |
|
|
|
|
|
if (length + bytes_read < length) |
|
|
{ |
|
|
errno = EOVERFLOW; |
|
|
return false; |
|
|
} |
|
|
length += bytes_read; |
|
|
|
|
|
datap = (__m256i *)buf; |
|
|
|
|
|
|
|
|
if (bytes_read >= 16 * 8 * 2) |
|
|
{ |
|
|
data = _mm256_loadu_si256 (datap); |
|
|
data = _mm256_shuffle_epi8 (data, shuffle_constant); |
|
|
|
|
|
|
|
|
xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); |
|
|
crc = 0; |
|
|
data = _mm256_xor_si256 (data, xor_crc); |
|
|
data3 = _mm256_loadu_si256 (datap + 1); |
|
|
data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); |
|
|
data5 = _mm256_loadu_si256 (datap + 2); |
|
|
data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); |
|
|
data7 = _mm256_loadu_si256 (datap + 3); |
|
|
data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); |
|
|
|
|
|
while (bytes_read >= 16 * 8 * 2) |
|
|
{ |
|
|
datap += 4; |
|
|
|
|
|
|
|
|
data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant, |
|
|
0x00); |
|
|
data = _mm256_clmulepi64_epi128 (data, four_mult_constant, |
|
|
0x11); |
|
|
data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, |
|
|
0x00); |
|
|
data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, |
|
|
0x11); |
|
|
data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, |
|
|
0x00); |
|
|
data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, |
|
|
0x11); |
|
|
data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, |
|
|
0x00); |
|
|
data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, |
|
|
0x11); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data = _mm256_xor_si256 (data, data2); |
|
|
data2 = _mm256_loadu_si256 (datap); |
|
|
data2 = _mm256_shuffle_epi8 (data2, shuffle_constant); |
|
|
data = _mm256_xor_si256 (data, data2); |
|
|
|
|
|
data3 = _mm256_xor_si256 (data3, data4); |
|
|
data4 = _mm256_loadu_si256 (datap + 1); |
|
|
data4 = _mm256_shuffle_epi8 (data4, shuffle_constant); |
|
|
data3 = _mm256_xor_si256 (data3, data4); |
|
|
|
|
|
data5 = _mm256_xor_si256 (data5, data6); |
|
|
data6 = _mm256_loadu_si256 (datap + 2); |
|
|
data6 = _mm256_shuffle_epi8 (data6, shuffle_constant); |
|
|
data5 = _mm256_xor_si256 (data5, data6); |
|
|
|
|
|
data7 = _mm256_xor_si256 (data7, data8); |
|
|
data8 = _mm256_loadu_si256 (datap + 3); |
|
|
data8 = _mm256_shuffle_epi8 (data8, shuffle_constant); |
|
|
data7 = _mm256_xor_si256 (data7, data8); |
|
|
|
|
|
bytes_read -= (16 * 4 * 2); |
|
|
} |
|
|
|
|
|
|
|
|
data = _mm256_shuffle_epi8 (data, shuffle_constant); |
|
|
_mm256_storeu_si256 (datap, data); |
|
|
data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); |
|
|
_mm256_storeu_si256 (datap + 1, data3); |
|
|
data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); |
|
|
_mm256_storeu_si256 (datap + 2, data5); |
|
|
data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); |
|
|
_mm256_storeu_si256 (datap + 3, data7); |
|
|
} |
|
|
|
|
|
|
|
|
if (bytes_read >= 64) |
|
|
{ |
|
|
data = _mm256_loadu_si256 (datap); |
|
|
data = _mm256_shuffle_epi8 (data, shuffle_constant); |
|
|
xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); |
|
|
crc = 0; |
|
|
data = _mm256_xor_si256 (data, xor_crc); |
|
|
while (bytes_read >= 64) |
|
|
{ |
|
|
datap++; |
|
|
|
|
|
data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant, |
|
|
0x00); |
|
|
data = _mm256_clmulepi64_epi128 (data, single_mult_constant, |
|
|
0x11); |
|
|
fold_data = _mm256_loadu_si256 (datap); |
|
|
fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant); |
|
|
data = _mm256_xor_si256 (data, data2); |
|
|
data = _mm256_xor_si256 (data, fold_data); |
|
|
bytes_read -= 32; |
|
|
} |
|
|
data = _mm256_shuffle_epi8 (data, shuffle_constant); |
|
|
_mm256_storeu_si256 (datap, data); |
|
|
} |
|
|
|
|
|
|
|
|
unsigned char *cp = (unsigned char *)datap; |
|
|
while (bytes_read--) |
|
|
crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; |
|
|
if (feof (fp)) |
|
|
break; |
|
|
} |
|
|
|
|
|
*crc_out = crc; |
|
|
*length_out = length; |
|
|
|
|
|
return !ferror (fp); |
|
|
} |
|
|
|