|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <config.h> |
|
|
|
|
|
#include "cksum.h" |
|
|
|
|
|
#include <stdio.h> |
|
|
#include <sys/types.h> |
|
|
#include <x86intrin.h> |
|
|
#include "system.h" |
|
|
|
|
|
|
|
|
#define BUFLEN (1 << 16) |
|
|
|
|
|
|
|
|
|
|
|
bool |
|
|
cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) |
|
|
{ |
|
|
__m128i buf[BUFLEN / sizeof (__m128i)]; |
|
|
uint_fast32_t crc = 0; |
|
|
uintmax_t length = 0; |
|
|
size_t bytes_read; |
|
|
__m128i single_mult_constant; |
|
|
__m128i four_mult_constant; |
|
|
__m128i shuffle_constant; |
|
|
|
|
|
if (!fp || !crc_out || !length_out) |
|
|
return false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605); |
|
|
four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11); |
|
|
|
|
|
|
|
|
shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8, |
|
|
9, 10, 11, 12, 13, 14, 15); |
|
|
|
|
|
while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) |
|
|
{ |
|
|
__m128i *datap; |
|
|
__m128i data; |
|
|
__m128i data2; |
|
|
__m128i data3; |
|
|
__m128i data4; |
|
|
__m128i data5; |
|
|
__m128i data6; |
|
|
__m128i data7; |
|
|
__m128i data8; |
|
|
__m128i fold_data; |
|
|
__m128i xor_crc; |
|
|
|
|
|
if (length + bytes_read < length) |
|
|
{ |
|
|
errno = EOVERFLOW; |
|
|
return false; |
|
|
} |
|
|
length += bytes_read; |
|
|
|
|
|
datap = (__m128i *)buf; |
|
|
|
|
|
|
|
|
if (bytes_read >= 16 * 8) |
|
|
{ |
|
|
data = _mm_loadu_si128 (datap); |
|
|
data = _mm_shuffle_epi8 (data, shuffle_constant); |
|
|
|
|
|
|
|
|
xor_crc = _mm_set_epi32 (crc, 0, 0, 0); |
|
|
crc = 0; |
|
|
data = _mm_xor_si128 (data, xor_crc); |
|
|
data3 = _mm_loadu_si128 (datap + 1); |
|
|
data3 = _mm_shuffle_epi8 (data3, shuffle_constant); |
|
|
data5 = _mm_loadu_si128 (datap + 2); |
|
|
data5 = _mm_shuffle_epi8 (data5, shuffle_constant); |
|
|
data7 = _mm_loadu_si128 (datap + 3); |
|
|
data7 = _mm_shuffle_epi8 (data7, shuffle_constant); |
|
|
|
|
|
|
|
|
while (bytes_read >= 16 * 8) |
|
|
{ |
|
|
datap += 4; |
|
|
|
|
|
|
|
|
data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00); |
|
|
data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11); |
|
|
data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00); |
|
|
data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11); |
|
|
data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00); |
|
|
data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11); |
|
|
data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00); |
|
|
data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data = _mm_xor_si128 (data, data2); |
|
|
data2 = _mm_loadu_si128 (datap); |
|
|
data2 = _mm_shuffle_epi8 (data2, shuffle_constant); |
|
|
data = _mm_xor_si128 (data, data2); |
|
|
|
|
|
data3 = _mm_xor_si128 (data3, data4); |
|
|
data4 = _mm_loadu_si128 (datap + 1); |
|
|
data4 = _mm_shuffle_epi8 (data4, shuffle_constant); |
|
|
data3 = _mm_xor_si128 (data3, data4); |
|
|
|
|
|
data5 = _mm_xor_si128 (data5, data6); |
|
|
data6 = _mm_loadu_si128 (datap + 2); |
|
|
data6 = _mm_shuffle_epi8 (data6, shuffle_constant); |
|
|
data5 = _mm_xor_si128 (data5, data6); |
|
|
|
|
|
data7 = _mm_xor_si128 (data7, data8); |
|
|
data8 = _mm_loadu_si128 (datap + 3); |
|
|
data8 = _mm_shuffle_epi8 (data8, shuffle_constant); |
|
|
data7 = _mm_xor_si128 (data7, data8); |
|
|
|
|
|
bytes_read -= (16 * 4); |
|
|
} |
|
|
|
|
|
|
|
|
data = _mm_shuffle_epi8 (data, shuffle_constant); |
|
|
_mm_storeu_si128 (datap, data); |
|
|
data3 = _mm_shuffle_epi8 (data3, shuffle_constant); |
|
|
_mm_storeu_si128 (datap + 1, data3); |
|
|
data5 = _mm_shuffle_epi8 (data5, shuffle_constant); |
|
|
_mm_storeu_si128 (datap + 2, data5); |
|
|
data7 = _mm_shuffle_epi8 (data7, shuffle_constant); |
|
|
_mm_storeu_si128 (datap + 3, data7); |
|
|
} |
|
|
|
|
|
|
|
|
if (bytes_read >= 32) |
|
|
{ |
|
|
data = _mm_loadu_si128 (datap); |
|
|
data = _mm_shuffle_epi8 (data, shuffle_constant); |
|
|
xor_crc = _mm_set_epi32 (crc, 0, 0, 0); |
|
|
crc = 0; |
|
|
data = _mm_xor_si128 (data, xor_crc); |
|
|
while (bytes_read >= 32) |
|
|
{ |
|
|
datap++; |
|
|
|
|
|
data2 = _mm_clmulepi64_si128 (data, single_mult_constant, 0x00); |
|
|
data = _mm_clmulepi64_si128 (data, single_mult_constant, 0x11); |
|
|
fold_data = _mm_loadu_si128 (datap); |
|
|
fold_data = _mm_shuffle_epi8 (fold_data, shuffle_constant); |
|
|
data = _mm_xor_si128 (data, data2); |
|
|
data = _mm_xor_si128 (data, fold_data); |
|
|
bytes_read -= 16; |
|
|
} |
|
|
data = _mm_shuffle_epi8 (data, shuffle_constant); |
|
|
_mm_storeu_si128 (datap, data); |
|
|
} |
|
|
|
|
|
|
|
|
unsigned char *cp = (unsigned char *)datap; |
|
|
while (bytes_read--) |
|
|
crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; |
|
|
if (feof (fp)) |
|
|
break; |
|
|
} |
|
|
|
|
|
*crc_out = crc; |
|
|
*length_out = length; |
|
|
|
|
|
return !ferror (fp); |
|
|
} |
|
|
|