// LeanQuant's picture
// Add files using upload-large-folder tool
// 9debd43 verified
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-34097967
// Cuda compilation tools, release 12.4, V12.4.131
// Based on NVVM 7.0.1
//
.version 8.4
.target sm_52
.address_size 64
// .globl huffman_decode
.const .align 1 .b8 shared_mem_constants[1280] = {120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 112, 112, 111, 0, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 104, 104, 103, 0, 122, 122, 105, 105, 121, 121, 121, 121, 121, 121, 
121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 99, 99, 99, 99, 99, 99, 99, 99, 98, 98, 98, 98, 97, 97, 124, 0, 91, 91, 92, 92, 125, 125, 126, 126, 0, 81, 84, 85, 87, 88, 89, 90, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 
93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 32, 32, 0, 32, 32, 32, 32, 31, 31, 28, 27, 26, 25, 23, 22, 21, 20, 19, 18, 16, 15, 15, 13, 12, 11, 10, 10, 8, 7, 6, 5, 4, 3, 3, 2, 2, 3, 10, 15, 17, 24, 31, 31};
.extern .shared .align 16 .b8 shared_mem[];
.visible .entry huffman_decode(
.param .u64 huffman_decode_param_0,
.param .u64 huffman_decode_param_1,
.param .u64 huffman_decode_param_2,
.param .u64 huffman_decode_param_3,
.param .u64 huffman_decode_param_4,
.param .u32 huffman_decode_param_5,
.param .u32 huffman_decode_param_6,
.param .u32 huffman_decode_param_7
)
{
.reg .pred %p<33>;
.reg .b16 %rs<48>;
.reg .b32 %r<253>;
.reg .b64 %rd<102>;
ld.param.u64 %rd26, [huffman_decode_param_0];
ld.param.u64 %rd22, [huffman_decode_param_1];
ld.param.u64 %rd23, [huffman_decode_param_2];
ld.param.u64 %rd24, [huffman_decode_param_3];
ld.param.u64 %rd25, [huffman_decode_param_4];
ld.param.u32 %r65, [huffman_decode_param_5];
ld.param.u32 %r66, [huffman_decode_param_6];
ld.param.u32 %r67, [huffman_decode_param_7];
cvta.to.global.u64 %rd1, %rd26;
mov.u32 %r1, %ntid.x;
mul.lo.s32 %r2, %r1, %r65;
mov.u32 %r68, shared_mem;
add.s32 %r3, %r68, %r2;
shl.b32 %r69, %r1, 2;
add.s32 %r4, %r2, %r69;
mov.u32 %r5, %ctaid.x;
mov.u32 %r252, %tid.x;
mad.lo.s32 %r7, %r5, %r1, %r252;
mul.lo.s32 %r8, %r252, %r65;
setp.gt.s32 %p1, %r252, 1279;
@%p1 bra $L__BB0_3;
mov.u32 %r235, %r252;
$L__BB0_2:
cvt.s64.s32 %rd27, %r235;
mov.u64 %rd28, shared_mem_constants;
add.s64 %rd29, %rd28, %rd27;
ld.const.u8 %rs19, [%rd29];
add.s32 %r71, %r68, %r235;
st.volatile.shared.u8 [%r71], %rs19;
add.s32 %r235, %r235, %r1;
setp.lt.s32 %p2, %r235, 1280;
@%p2 bra $L__BB0_2;
$L__BB0_3:
add.s32 %r11, %r1, -1;
setp.eq.s32 %p3, %r252, %r11;
add.s32 %r72, %r65, 4;
selp.b32 %r12, %r72, %r65, %p3;
mul.lo.s32 %r13, %r7, %r65;
sub.s32 %r73, %r66, %r13;
min.s32 %r14, %r12, %r73;
setp.gt.s32 %p4, %r14, 0;
@%p4 bra $L__BB0_4;
bra.uni $L__BB0_10;
$L__BB0_4:
not.b32 %r75, %r66;
add.s32 %r76, %r13, %r75;
not.b32 %r77, %r12;
max.s32 %r78, %r76, %r77;
mov.u32 %r79, -2;
sub.s32 %r80, %r79, %r78;
and.b32 %r240, %r14, 3;
setp.lt.u32 %p5, %r80, 3;
mov.u32 %r238, 0;
@%p5 bra $L__BB0_7;
sub.s32 %r237, %r14, %r240;
mov.u32 %r238, 0;
$L__BB0_6:
add.s32 %r82, %r238, %r13;
cvt.s64.s32 %rd30, %r82;
add.s64 %rd31, %rd1, %rd30;
ld.global.nc.u8 %rs20, [%rd31];
add.s32 %r83, %r8, %r238;
add.s32 %r85, %r68, %r83;
st.volatile.shared.u8 [%r85+1280], %rs20;
ld.global.nc.u8 %rs21, [%rd31+1];
st.volatile.shared.u8 [%r85+1281], %rs21;
ld.global.nc.u8 %rs22, [%rd31+2];
st.volatile.shared.u8 [%r85+1282], %rs22;
ld.global.nc.u8 %rs23, [%rd31+3];
st.volatile.shared.u8 [%r85+1283], %rs23;
add.s32 %r238, %r238, 4;
add.s32 %r237, %r237, -4;
setp.ne.s32 %p6, %r237, 0;
@%p6 bra $L__BB0_6;
$L__BB0_7:
setp.eq.s32 %p7, %r240, 0;
@%p7 bra $L__BB0_10;
add.s32 %r86, %r238, %r8;
add.s32 %r88, %r68, %r86;
add.s32 %r239, %r88, 1280;
add.s32 %r89, %r238, %r13;
cvt.s64.s32 %rd32, %r89;
add.s64 %rd97, %rd1, %rd32;
$L__BB0_9:
.pragma "nounroll";
ld.global.nc.u8 %rs24, [%rd97];
st.volatile.shared.u8 [%r239], %rs24;
add.s32 %r239, %r239, 1;
add.s64 %rd97, %rd97, 1;
add.s32 %r240, %r240, -1;
setp.ne.s32 %p8, %r240, 0;
@%p8 bra $L__BB0_9;
$L__BB0_10:
add.s32 %r27, %r3, 1284;
bar.sync 0;
mul.lo.s32 %r93, %r7, 5;
shr.s32 %r94, %r93, 31;
shr.u32 %r95, %r94, 29;
add.s32 %r96, %r93, %r95;
shr.s32 %r97, %r96, 3;
cvt.s64.s32 %rd33, %r97;
cvta.to.global.u64 %rd34, %rd24;
add.s64 %rd35, %rd34, %rd33;
ld.global.nc.u8 %rs25, [%rd35+1];
cvt.u32.u16 %r98, %rs25;
and.b32 %r99, %r98, 255;
ld.global.nc.u8 %rs26, [%rd35];
cvt.u32.u16 %r100, %rs26;
prmt.b32 %r249, %r100, %r99, 30212;
mov.u32 %r241, 0;
and.b32 %r101, %r96, -8;
sub.s32 %r102, %r101, %r93;
add.s32 %r103, %r102, 11;
shr.u32 %r104, %r249, %r103;
cvt.u64.u32 %rd36, %r104;
cvt.u16.u32 %rs27, %r104;
and.b16 %rs45, %rs27, 31;
add.s32 %r29, %r68, %r8;
ld.volatile.shared.u8 %rd37, [%r29+1280];
shl.b64 %rd38, %rd37, 56;
ld.volatile.shared.u8 %rd39, [%r29+1281];
shl.b64 %rd40, %rd39, 48;
or.b64 %rd41, %rd40, %rd38;
ld.volatile.shared.u8 %rd42, [%r29+1282];
shl.b64 %rd43, %rd42, 40;
or.b64 %rd44, %rd41, %rd43;
ld.volatile.shared.u8 %rd45, [%r29+1283];
shl.b64 %rd46, %rd45, 32;
or.b64 %rd47, %rd44, %rd46;
ld.volatile.shared.u8 %r105, [%r29+1284];
mul.wide.u32 %rd48, %r105, 16777216;
or.b64 %rd49, %rd47, %rd48;
ld.volatile.shared.u8 %r106, [%r29+1285];
mul.wide.u32 %rd50, %r106, 65536;
ld.volatile.shared.u8 %r107, [%r29+1286];
mul.wide.u32 %rd51, %r107, 256;
or.b64 %rd52, %rd49, %rd50;
ld.volatile.shared.u8 %rd53, [%r29+1287];
or.b64 %rd54, %rd52, %rd51;
or.b64 %rd55, %rd54, %rd53;
add.s32 %r251, %r8, 8;
and.b64 %rd5, %rd36, 31;
and.b32 %r108, %r104, 31;
shl.b64 %rd98, %rd55, %r108;
mov.u32 %r109, -8;
sub.s32 %r31, %r109, %r8;
cvta.to.global.u64 %rd7, %rd25;
cvta.to.global.u64 %rd8, %rd22;
cvta.to.global.u64 %rd9, %rd23;
add.s32 %r32, %r68, %r4;
mov.u16 %rs43, %rs45;
mov.u32 %r243, %r251;
bra.uni $L__BB0_11;
$L__BB0_47:
add.s32 %r224, %r68, %r243;
ld.volatile.shared.u8 %r225, [%r224+1280];
shl.b32 %r226, %r225, 24;
ld.volatile.shared.u8 %r227, [%r224+1281];
shl.b32 %r228, %r227, 16;
or.b32 %r229, %r228, %r226;
ld.volatile.shared.u8 %rs42, [%r224+1282];
mul.wide.u16 %r230, %rs42, 256;
or.b32 %r231, %r229, %r230;
ld.volatile.shared.u8 %r232, [%r224+1283];
or.b32 %r249, %r231, %r232;
add.s32 %r243, %r243, 4;
cvt.u64.u32 %rd92, %r249;
cvt.u64.u16 %rd93, %rs43;
and.b64 %rd94, %rd93, 255;
add.s64 %rd95, %rd94, 4294967264;
cvt.u32.u64 %r233, %rd95;
shl.b64 %rd96, %rd92, %r233;
or.b64 %rd98, %rd96, %rd98;
add.s16 %rs43, %rs43, -32;
mov.u32 %r241, %r36;
$L__BB0_11:
shr.u64 %rd56, %rd98, 56;
cvt.u32.u64 %r110, %rd56;
add.s32 %r112, %r68, %r110;
ld.volatile.shared.u8 %rs44, [%r112];
setp.ne.s16 %p9, %rs44, 0;
@%p9 bra $L__BB0_15;
shr.u64 %rd57, %rd98, 48;
cvt.u32.u64 %r113, %rd57;
and.b32 %r114, %r113, 255;
add.s32 %r116, %r68, %r114;
ld.volatile.shared.u8 %rs44, [%r116+256];
setp.ne.s16 %p10, %rs44, 0;
@%p10 bra $L__BB0_15;
shr.u64 %rd58, %rd98, 40;
cvt.u32.u64 %r117, %rd58;
and.b32 %r118, %r117, 255;
add.s32 %r120, %r68, %r118;
ld.volatile.shared.u8 %rs44, [%r120+512];
setp.ne.s16 %p11, %rs44, 0;
@%p11 bra $L__BB0_15;
shr.u64 %rd59, %rd98, 32;
cvt.u32.u64 %r121, %rd59;
and.b32 %r122, %r121, 255;
add.s32 %r124, %r68, %r122;
ld.volatile.shared.u8 %rs44, [%r124+768];
$L__BB0_15:
add.s32 %r36, %r241, 1;
cvt.u32.u16 %r125, %rs44;
and.b32 %r126, %r125, 255;
add.s32 %r128, %r68, %r126;
ld.volatile.shared.u8 %rs28, [%r128+1024];
cvt.u32.u16 %r129, %rs28;
and.b32 %r130, %r129, 255;
shl.b64 %rd98, %rd98, %r130;
add.s16 %rs43, %rs28, %rs43;
and.b16 %rs29, %rs43, 248;
shr.u16 %rs30, %rs29, 3;
cvt.u32.u16 %r131, %rs30;
add.s32 %r132, %r31, %r243;
add.s32 %r133, %r132, %r131;
setp.lt.u32 %p12, %r133, %r65;
@%p12 bra $L__BB0_46;
bra.uni $L__BB0_16;
$L__BB0_46:
and.b16 %rs41, %rs43, 255;
setp.lt.u16 %p32, %rs41, 32;
mov.u32 %r241, %r36;
@%p32 bra $L__BB0_11;
bra.uni $L__BB0_47;
$L__BB0_16:
setp.eq.s32 %p13, %r252, 0;
shl.b32 %r134, %r252, 2;
add.s32 %r37, %r3, %r134;
add.s32 %r38, %r3, %r69;
@%p13 bra $L__BB0_18;
add.s32 %r234, %r241, 1;
st.volatile.shared.u32 [%r37+1284], %r234;
bra.uni $L__BB0_19;
$L__BB0_18:
mul.wide.u32 %rd60, %r5, 4;
add.s64 %rd61, %rd9, %rd60;
ld.global.nc.u32 %r136, [%rd61];
st.volatile.shared.u32 [%r38+1284], %r136;
ld.volatile.shared.u32 %r137, [%r38+1284];
add.s32 %r138, %r137, %r36;
st.volatile.shared.u32 [%r3+1284], %r138;
$L__BB0_19:
bar.sync 0;
setp.lt.u32 %p14, %r1, 2;
@%p14 bra $L__BB0_24;
add.s32 %r39, %r252, 1;
mov.u32 %r244, 2;
$L__BB0_21:
rem.u32 %r140, %r39, %r244;
setp.ne.s32 %p15, %r140, 0;
@%p15 bra $L__BB0_23;
shr.u32 %r141, %r244, 1;
sub.s32 %r142, %r252, %r141;
shl.b32 %r143, %r142, 2;
add.s32 %r144, %r27, %r143;
ld.volatile.shared.u32 %r145, [%r37+1284];
ld.volatile.shared.u32 %r146, [%r144];
add.s32 %r147, %r145, %r146;
st.volatile.shared.u32 [%r37+1284], %r147;
$L__BB0_23:
bar.sync 0;
shl.b32 %r244, %r244, 1;
setp.le.u32 %p16, %r244, %r1;
@%p16 bra $L__BB0_21;
$L__BB0_24:
setp.ne.s32 %p17, %r252, 0;
@%p17 bra $L__BB0_26;
mov.u32 %r148, 0;
st.volatile.shared.u32 [%r38+1280], %r148;
$L__BB0_26:
bar.sync 0;
setp.lt.s32 %p18, %r1, 2;
@%p18 bra $L__BB0_31;
add.s32 %r42, %r252, 1;
mov.u32 %r245, %r1;
$L__BB0_28:
rem.u32 %r149, %r42, %r245;
setp.eq.s32 %p19, %r149, 0;
@%p19 bra $L__BB0_29;
bra.uni $L__BB0_30;
$L__BB0_29:
shr.u32 %r150, %r245, 1;
sub.s32 %r151, %r252, %r150;
shl.b32 %r152, %r151, 2;
add.s32 %r153, %r27, %r152;
ld.volatile.shared.u32 %r154, [%r37+1284];
ld.volatile.shared.u32 %r155, [%r153];
add.s32 %r156, %r154, %r155;
st.volatile.shared.u32 [%r37+1284], %r156;
ld.volatile.shared.u32 %r157, [%r153];
ld.volatile.shared.u32 %r158, [%r37+1284];
sub.s32 %r159, %r158, %r157;
st.volatile.shared.u32 [%r153], %r159;
$L__BB0_30:
shr.u32 %r44, %r245, 1;
bar.sync 0;
setp.gt.u32 %p20, %r245, 3;
mov.u32 %r245, %r44;
@%p20 bra $L__BB0_28;
$L__BB0_31:
@%p3 bra $L__BB0_32;
bra.uni $L__BB0_33;
$L__BB0_32:
ld.volatile.shared.u32 %r160, [%r38+1284];
st.volatile.shared.u32 [%r3+1284], %r160;
ld.volatile.shared.u32 %r161, [%r37+1284];
add.s32 %r162, %r161, %r36;
st.volatile.shared.u32 [%r38+1284], %r162;
$L__BB0_33:
bar.sync 0;
ld.volatile.shared.u32 %r163, [%r37+1284];
cvt.u64.u32 %rd12, %r163;
ld.volatile.shared.u32 %r45, [%r3+1284];
add.s32 %r46, %r163, %r36;
ld.volatile.shared.u8 %rd62, [%r29+1280];
shl.b64 %rd63, %rd62, 56;
ld.volatile.shared.u8 %rd64, [%r29+1281];
shl.b64 %rd65, %rd64, 48;
or.b64 %rd66, %rd65, %rd63;
ld.volatile.shared.u8 %rd67, [%r29+1282];
shl.b64 %rd68, %rd67, 40;
or.b64 %rd69, %rd66, %rd68;
ld.volatile.shared.u8 %rd70, [%r29+1283];
shl.b64 %rd71, %rd70, 32;
or.b64 %rd72, %rd69, %rd71;
ld.volatile.shared.u8 %r164, [%r29+1284];
mul.wide.u32 %rd73, %r164, 16777216;
ld.volatile.shared.u8 %r165, [%r29+1285];
mul.wide.u32 %rd74, %r165, 65536;
ld.volatile.shared.u8 %r166, [%r29+1286];
mul.wide.u32 %rd75, %r166, 256;
ld.volatile.shared.u8 %rd76, [%r29+1287];
or.b64 %rd77, %rd72, %rd73;
or.b64 %rd78, %rd77, %rd74;
or.b64 %rd79, %rd78, %rd75;
or.b64 %rd80, %rd79, %rd76;
cvt.u32.u64 %r167, %rd5;
shl.b64 %rd100, %rd80, %r167;
setp.ge.u32 %p22, %r163, %r67;
@%p22 bra $L__BB0_43;
cvt.u32.u64 %r168, %rd12;
add.s64 %rd99, %rd8, %rd12;
shl.b32 %r169, %r168, 1;
add.s32 %r170, %r4, %r169;
add.s32 %r171, %r170, 1288;
shl.b32 %r172, %r45, 1;
sub.s32 %r173, %r171, %r172;
add.s32 %r247, %r68, %r173;
add.s32 %r246, %r168, 1;
bra.uni $L__BB0_35;
$L__BB0_42:
add.s64 %rd99, %rd99, 1;
add.s32 %r247, %r247, 2;
add.s32 %r246, %r246, 1;
$L__BB0_35:
shr.u64 %rd81, %rd100, 56;
cvt.u32.u64 %r175, %rd81;
add.s32 %r177, %r68, %r175;
ld.volatile.shared.u8 %rs46, [%r177];
setp.ne.s16 %p23, %rs46, 0;
@%p23 bra $L__BB0_39;
shr.u64 %rd82, %rd100, 48;
cvt.u32.u64 %r178, %rd82;
and.b32 %r179, %r178, 255;
add.s32 %r181, %r68, %r179;
ld.volatile.shared.u8 %rs46, [%r181+256];
setp.ne.s16 %p24, %rs46, 0;
@%p24 bra $L__BB0_39;
shr.u64 %rd83, %rd100, 40;
cvt.u32.u64 %r182, %rd83;
and.b32 %r183, %r182, 255;
add.s32 %r185, %r68, %r183;
ld.volatile.shared.u8 %rs46, [%r185+512];
setp.ne.s16 %p25, %rs46, 0;
@%p25 bra $L__BB0_39;
shr.u64 %rd84, %rd100, 32;
cvt.u32.u64 %r186, %rd84;
and.b32 %r187, %r186, 255;
add.s32 %r189, %r68, %r187;
ld.volatile.shared.u8 %rs46, [%r189+768];
$L__BB0_39:
ld.global.nc.u8 %rs31, [%rd99];
and.b16 %rs32, %rs31, 128;
and.b16 %rs33, %rs46, 254;
shr.u16 %rs34, %rs33, 1;
or.b16 %rs35, %rs32, %rs34;
mul.wide.u16 %r190, %rs35, 256;
and.b16 %rs36, %rs31, 127;
cvt.u32.u16 %r191, %rs46;
cvt.u32.u16 %r192, %rs36;
bfi.b32 %r193, %r191, %r192, 7, 9;
and.b32 %r194, %r193, 255;
and.b32 %r195, %r249, -65536;
or.b32 %r196, %r195, %r194;
or.b32 %r249, %r196, %r190;
st.volatile.shared.u16 [%r247], %r249;
setp.ge.u32 %p26, %r246, %r67;
setp.ge.u32 %p27, %r246, %r46;
or.pred %p28, %p26, %p27;
@%p28 bra $L__BB0_43;
and.b32 %r198, %r191, 255;
add.s32 %r200, %r68, %r198;
ld.volatile.shared.u8 %rs37, [%r200+1024];
cvt.u32.u16 %r201, %rs37;
and.b32 %r202, %r201, 255;
shl.b64 %rd100, %rd100, %r202;
add.s16 %rs45, %rs37, %rs45;
and.b16 %rs38, %rs45, 255;
setp.lt.u16 %p29, %rs38, 32;
@%p29 bra $L__BB0_42;
add.s32 %r204, %r68, %r251;
ld.volatile.shared.u8 %r205, [%r204+1280];
shl.b32 %r206, %r205, 24;
ld.volatile.shared.u8 %r207, [%r204+1281];
shl.b32 %r208, %r207, 16;
or.b32 %r209, %r208, %r206;
ld.volatile.shared.u8 %rs39, [%r204+1282];
mul.wide.u16 %r210, %rs39, 256;
or.b32 %r211, %r209, %r210;
ld.volatile.shared.u8 %r212, [%r204+1283];
or.b32 %r249, %r211, %r212;
add.s32 %r251, %r251, 4;
cvt.u64.u32 %rd85, %r249;
cvt.u64.u16 %rd86, %rs45;
and.b64 %rd87, %rd86, 255;
add.s64 %rd88, %rd87, 4294967264;
cvt.u32.u64 %r213, %rd88;
shl.b64 %rd89, %rd85, %r213;
or.b64 %rd100, %rd89, %rd100;
add.s16 %rs45, %rs45, -32;
bra.uni $L__BB0_42;
$L__BB0_43:
bar.sync 0;
ld.volatile.shared.u32 %r214, [%r38+1284];
sub.s32 %r215, %r214, %r45;
sub.s32 %r60, %r67, %r45;
min.u32 %r216, %r215, %r60;
setp.ge.u32 %p30, %r252, %r216;
@%p30 bra $L__BB0_45;
$L__BB0_44:
shl.b32 %r217, %r252, 1;
add.s32 %r218, %r32, %r217;
ld.volatile.shared.u16 %rs40, [%r218+1288];
add.s32 %r219, %r252, %r45;
mul.wide.u32 %rd90, %r219, 2;
add.s64 %rd91, %rd7, %rd90;
st.global.u16 [%rd91], %rs40;
ld.volatile.shared.u32 %r220, [%r38+1284];
sub.s32 %r221, %r220, %r45;
min.u32 %r222, %r221, %r60;
add.s32 %r252, %r252, %r1;
setp.lt.u32 %p31, %r252, %r222;
@%p31 bra $L__BB0_44;
$L__BB0_45:
ret;
}