| // |
| // Generated by NVIDIA NVVM Compiler |
| // |
| // Compiler Build ID: CL-34097967 |
| // Cuda compilation tools, release 12.4, V12.4.131 |
| // Based on NVVM 7.0.1 |
| // |
|
|
| .version 8.4 |
| .target sm_52 |
| .address_size 64 |
|
|
| // .globl huffman_decode |
| .const .align 1 .b8 shared_mem_constants[1280] = {120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 112, 112, 111, 0, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 104, 104, 103, 0, 122, 122, 105, 105, 121, 121, 121, 121, 121, 121, 
121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 99, 99, 99, 99, 99, 99, 99, 99, 98, 98, 98, 98, 97, 97, 124, 0, 91, 91, 92, 92, 125, 125, 126, 126, 0, 81, 84, 85, 87, 88, 89, 90, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 
93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 32, 32, 0, 32, 32, 32, 32, 31, 31, 28, 27, 26, 25, 23, 22, 21, 20, 19, 18, 16, 15, 15, 13, 12, 11, 10, 10, 8, 7, 6, 5, 4, 3, 3, 2, 2, 3, 10, 15, 17, 24, 31, 31}; |
// Unsized extern shared array; its byte size is not fixed anywhere in this file.
// huffman_decode addresses it purely by byte offset (see the layout notes on the kernel):
// offsets 0..1279 receive a copy of shared_mem_constants, input bytes start at offset 1280.
| .extern .shared .align 16 .b8 shared_mem[]; |
|
|
// ---------------------------------------------------------------------------------------
// huffman_decode -- compiler-generated kernel (NVVM output; edit the CUDA source, not this).
//
// Parameters, described only by how the code below uses them:
//   param_0 (u64) : pointer to input bytes; thread g = ctaid.x*ntid.x + tid.x reads from
//                   byte offset g * param_5.
//   param_1 (u64) : pointer to a byte array, read once per produced output element at the
//                   element's index; bit 7 and bits 0..6 are merged into the 16-bit output.
//   param_2 (u64) : pointer to a u32 array indexed by ctaid.x; the loaded value seeds the
//                   block's running element count (presumably the block's first output
//                   index -- TODO confirm against the host code).
//   param_3 (u64) : pointer to a byte array from which a 5-bit field at bit position 5*g is
//                   extracted (presumably the bit offset of the first codeword in the
//                   thread's chunk -- TODO confirm).
//   param_4 (u64) : pointer to the u16 output array.
//   param_5 (u32) : input bytes per thread.
//   param_6 (u32) : bound on the input byte offset (bytes left = param_6 - g*param_5).
//   param_7 (u32) : exclusive upper bound on output element indices.
//
// Shared-memory byte offsets used below (N = ntid.x, C = param_5):
//   [0, 1280)               copy of shared_mem_constants: four 256-entry lookup stages at
//                           0/256/512/768 and a per-symbol shift-amount table at 1024.
//   [1280 + t*C ...)        input bytes staged by thread t.
//   N*C + 1284 + 4*t        u32 slot of thread t (counts, later scan results).
//   N*C + 4*N + 1284        one extra u32 slot (block seed, later the block's end index).
//   N*C + 4*N + 1288 + 2*i  u16 staging slot for the block's i-th output element.
//
// NOTE(review): the two strided scan loops only pair slots correctly when ntid.x is a
// power of two -- looks like a launch-time assumption; confirm with the host code.
// ---------------------------------------------------------------------------------------
| .visible .entry huffman_decode( |
| .param .u64 huffman_decode_param_0, |
| .param .u64 huffman_decode_param_1, |
| .param .u64 huffman_decode_param_2, |
| .param .u64 huffman_decode_param_3, |
| .param .u64 huffman_decode_param_4, |
| .param .u32 huffman_decode_param_5, |
| .param .u32 huffman_decode_param_6, |
| .param .u32 huffman_decode_param_7 |
| ) |
| { |
| .reg .pred %p<33>; |
| .reg .b16 %rs<48>; |
| .reg .b32 %r<253>; |
| .reg .b64 %rd<102>; |
|
|
|
|
// Load parameters. %r65 = param_5, %r66 = param_6, %r67 = param_7.
| ld.param.u64 %rd26, [huffman_decode_param_0]; |
| ld.param.u64 %rd22, [huffman_decode_param_1]; |
| ld.param.u64 %rd23, [huffman_decode_param_2]; |
| ld.param.u64 %rd24, [huffman_decode_param_3]; |
| ld.param.u64 %rd25, [huffman_decode_param_4]; |
| ld.param.u32 %r65, [huffman_decode_param_5]; |
| ld.param.u32 %r66, [huffman_decode_param_6]; |
| ld.param.u32 %r67, [huffman_decode_param_7]; |
// Index setup: %r1 = ntid.x, %r5 = ctaid.x, %r252 = tid.x, %r7 = global thread index,
// %r2 = ntid.x*param_5, %r3 = shared_mem + %r2, %r4 = %r2 + 4*ntid.x, %r8 = tid.x*param_5.
| cvta.to.global.u64 %rd1, %rd26; |
| mov.u32 %r1, %ntid.x; |
| mul.lo.s32 %r2, %r1, %r65; |
| mov.u32 %r68, shared_mem; |
| add.s32 %r3, %r68, %r2; |
| shl.b32 %r69, %r1, 2; |
| add.s32 %r4, %r2, %r69; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r252, %tid.x; |
| mad.lo.s32 %r7, %r5, %r1, %r252; |
| mul.lo.s32 %r8, %r252, %r65; |
// Threads whose tid.x exceeds 1279 have no table byte to copy.
| setp.gt.s32 %p1, %r252, 1279; |
| @%p1 bra $L__BB0_3; |
|
|
| mov.u32 %r235, %r252; |
|
|
// Copy shared_mem_constants[i] -> shared_mem[i] for i = tid.x, tid.x + ntid.x, ... < 1280.
| $L__BB0_2: |
| cvt.s64.s32 %rd27, %r235; |
| mov.u64 %rd28, shared_mem_constants; |
| add.s64 %rd29, %rd28, %rd27; |
| ld.const.u8 %rs19, [%rd29]; |
| add.s32 %r71, %r68, %r235; |
| st.volatile.shared.u8 [%r71], %rs19; |
| add.s32 %r235, %r235, %r1; |
| setp.lt.s32 %p2, %r235, 1280; |
| @%p2 bra $L__BB0_2; |
|
|
// %p3 = (tid.x == ntid.x - 1). That last thread is allotted param_5 + 4 bytes, every other
// thread param_5. %r13 = global thread index * param_5 is the thread's input offset and
// %r14 = min(allotment, param_6 - %r13) (signed) is the number of bytes it stages.
| $L__BB0_3: |
| add.s32 %r11, %r1, -1; |
| setp.eq.s32 %p3, %r252, %r11; |
| add.s32 %r72, %r65, 4; |
| selp.b32 %r12, %r72, %r65, %p3; |
| mul.lo.s32 %r13, %r7, %r65; |
| sub.s32 %r73, %r66, %r13; |
| min.s32 %r14, %r12, %r73; |
| setp.gt.s32 %p4, %r14, 0; |
| @%p4 bra $L__BB0_4; |
| bra.uni $L__BB0_10; |
|
|
// %r80 = -2 - max(~(param_6 - %r13), ~%r12), which equals %r14 - 1. If that is below 3
// (unsigned) the 4x-unrolled copy is skipped. %r240 = %r14 & 3 is the leftover byte count.
| $L__BB0_4: |
| not.b32 %r75, %r66; |
| add.s32 %r76, %r13, %r75; |
| not.b32 %r77, %r12; |
| max.s32 %r78, %r76, %r77; |
| mov.u32 %r79, -2; |
| sub.s32 %r80, %r79, %r78; |
| and.b32 %r240, %r14, 3; |
| setp.lt.u32 %p5, %r80, 3; |
| mov.u32 %r238, 0; |
| @%p5 bra $L__BB0_7; |
|
|
| sub.s32 %r237, %r14, %r240; |
| mov.u32 %r238, 0; |
|
|
// Unrolled staging loop: four bytes per iteration from param_0[%r13 + k] to
// shared_mem[1280 + %r8 + k]; %r238 is k, %r237 counts the bytes still to copy here.
| $L__BB0_6: |
| add.s32 %r82, %r238, %r13; |
| cvt.s64.s32 %rd30, %r82; |
| add.s64 %rd31, %rd1, %rd30; |
| ld.global.nc.u8 %rs20, [%rd31]; |
| add.s32 %r83, %r8, %r238; |
| add.s32 %r85, %r68, %r83; |
| st.volatile.shared.u8 [%r85+1280], %rs20; |
| ld.global.nc.u8 %rs21, [%rd31+1]; |
| st.volatile.shared.u8 [%r85+1281], %rs21; |
| ld.global.nc.u8 %rs22, [%rd31+2]; |
| st.volatile.shared.u8 [%r85+1282], %rs22; |
| ld.global.nc.u8 %rs23, [%rd31+3]; |
| st.volatile.shared.u8 [%r85+1283], %rs23; |
| add.s32 %r238, %r238, 4; |
| add.s32 %r237, %r237, -4; |
| setp.ne.s32 %p6, %r237, 0; |
| @%p6 bra $L__BB0_6; |
|
|
// Remainder: copy the last %r240 (0..3) bytes one at a time.
| $L__BB0_7: |
| setp.eq.s32 %p7, %r240, 0; |
| @%p7 bra $L__BB0_10; |
|
|
| add.s32 %r86, %r238, %r8; |
| add.s32 %r88, %r68, %r86; |
| add.s32 %r239, %r88, 1280; |
| add.s32 %r89, %r238, %r13; |
| cvt.s64.s32 %rd32, %r89; |
| add.s64 %rd97, %rd1, %rd32; |
|
|
| $L__BB0_9: |
| .pragma "nounroll"; |
| ld.global.nc.u8 %rs24, [%rd97]; |
| st.volatile.shared.u8 [%r239], %rs24; |
| add.s32 %r239, %r239, 1; |
| add.s64 %rd97, %rd97, 1; |
| add.s32 %r240, %r240, -1; |
| setp.ne.s32 %p8, %r240, 0; |
| @%p8 bra $L__BB0_9; |
|
|
// %r27 = address of the first per-thread u32 slot. The barrier makes the table copy and
// all staged input bytes visible to the whole block before any thread reads them.
| $L__BB0_10: |
| add.s32 %r27, %r3, 1284; |
| bar.sync 0; |
// Extract this thread's 5-bit field from param_3: %r93 = 5*g is the bit position,
// %r97 = %r93 / 8 (signed, rounded toward zero) the byte index. The prmt selector 30212
// (0x7604) forms (byte[%r97] << 8) | byte[%r97 + 1]; shifting right by 11 - (%r93 mod 8)
// and masking with 31 leaves the field in %rs45 / %rd5.
| mul.lo.s32 %r93, %r7, 5; |
| shr.s32 %r94, %r93, 31; |
| shr.u32 %r95, %r94, 29; |
| add.s32 %r96, %r93, %r95; |
| shr.s32 %r97, %r96, 3; |
| cvt.s64.s32 %rd33, %r97; |
| cvta.to.global.u64 %rd34, %rd24; |
| add.s64 %rd35, %rd34, %rd33; |
| ld.global.nc.u8 %rs25, [%rd35+1]; |
| cvt.u32.u16 %r98, %rs25; |
| and.b32 %r99, %r98, 255; |
| ld.global.nc.u8 %rs26, [%rd35]; |
| cvt.u32.u16 %r100, %rs26; |
| prmt.b32 %r249, %r100, %r99, 30212; |
| mov.u32 %r241, 0; |
| and.b32 %r101, %r96, -8; |
| sub.s32 %r102, %r101, %r93; |
| add.s32 %r103, %r102, 11; |
| shr.u32 %r104, %r249, %r103; |
| cvt.u64.u32 %rd36, %r104; |
| cvt.u16.u32 %rs27, %r104; |
| and.b16 %rs45, %rs27, 31; |
// Build a 64-bit bit window from the thread's first 8 staged bytes, most significant byte
// first (%r29 = shared_mem + tid.x*param_5), then shift it left by the 5-bit field.
| add.s32 %r29, %r68, %r8; |
| ld.volatile.shared.u8 %rd37, [%r29+1280]; |
| shl.b64 %rd38, %rd37, 56; |
| ld.volatile.shared.u8 %rd39, [%r29+1281]; |
| shl.b64 %rd40, %rd39, 48; |
| or.b64 %rd41, %rd40, %rd38; |
| ld.volatile.shared.u8 %rd42, [%r29+1282]; |
| shl.b64 %rd43, %rd42, 40; |
| or.b64 %rd44, %rd41, %rd43; |
| ld.volatile.shared.u8 %rd45, [%r29+1283]; |
| shl.b64 %rd46, %rd45, 32; |
| or.b64 %rd47, %rd44, %rd46; |
| ld.volatile.shared.u8 %r105, [%r29+1284]; |
| mul.wide.u32 %rd48, %r105, 16777216; |
| or.b64 %rd49, %rd47, %rd48; |
| ld.volatile.shared.u8 %r106, [%r29+1285]; |
| mul.wide.u32 %rd50, %r106, 65536; |
| ld.volatile.shared.u8 %r107, [%r29+1286]; |
| mul.wide.u32 %rd51, %r107, 256; |
| or.b64 %rd52, %rd49, %rd50; |
| ld.volatile.shared.u8 %rd53, [%r29+1287]; |
| or.b64 %rd54, %rd52, %rd51; |
| or.b64 %rd55, %rd54, %rd53; |
// %r251 / %r243 = offset (relative to shared_mem + 1280) of the next 4 bytes to refill
// from; %r251 is kept for the second pass, %r243 is consumed by the first.
// %rs43 = bit counter for the first pass, starting at the 5-bit field value.
| add.s32 %r251, %r8, 8; |
| and.b64 %rd5, %rd36, 31; |
| and.b32 %r108, %r104, 31; |
| shl.b64 %rd98, %rd55, %r108; |
| mov.u32 %r109, -8; |
| sub.s32 %r31, %r109, %r8; |
| cvta.to.global.u64 %rd7, %rd25; |
| cvta.to.global.u64 %rd8, %rd22; |
| cvta.to.global.u64 %rd9, %rd23; |
| add.s32 %r32, %r68, %r4; |
| mov.u16 %rs43, %rs45; |
| mov.u32 %r243, %r251; |
| bra.uni $L__BB0_11; |
|
|
// ---- Pass 1 (counting): refill step. Reached when the bit counter is >= 32. Reads the
// next 4 staged bytes most-significant first, ORs them into the window shifted left by
// (counter - 32), advances %r243 by 4 and subtracts 32 from the counter.
| $L__BB0_47: |
| add.s32 %r224, %r68, %r243; |
| ld.volatile.shared.u8 %r225, [%r224+1280]; |
| shl.b32 %r226, %r225, 24; |
| ld.volatile.shared.u8 %r227, [%r224+1281]; |
| shl.b32 %r228, %r227, 16; |
| or.b32 %r229, %r228, %r226; |
| ld.volatile.shared.u8 %rs42, [%r224+1282]; |
| mul.wide.u16 %r230, %rs42, 256; |
| or.b32 %r231, %r229, %r230; |
| ld.volatile.shared.u8 %r232, [%r224+1283]; |
| or.b32 %r249, %r231, %r232; |
| add.s32 %r243, %r243, 4; |
| cvt.u64.u32 %rd92, %r249; |
| cvt.u64.u16 %rd93, %rs43; |
| and.b64 %rd94, %rd93, 255; |
| add.s64 %rd95, %rd94, 4294967264; |
| cvt.u32.u64 %r233, %rd95; |
| shl.b64 %rd96, %rd92, %r233; |
| or.b64 %rd98, %rd96, %rd98; |
| add.s16 %rs43, %rs43, -32; |
| mov.u32 %r241, %r36; |
|
|
// ---- Pass 1: table lookup. Index stage 0 with window bits 63..56; a zero entry falls
// through to the stage at +256 (bits 55..48), then +512 (bits 47..40), then +768
// (bits 39..32). The entry found ends up in %rs44.
| $L__BB0_11: |
| shr.u64 %rd56, %rd98, 56; |
| cvt.u32.u64 %r110, %rd56; |
| add.s32 %r112, %r68, %r110; |
| ld.volatile.shared.u8 %rs44, [%r112]; |
| setp.ne.s16 %p9, %rs44, 0; |
| @%p9 bra $L__BB0_15; |
|
|
| shr.u64 %rd57, %rd98, 48; |
| cvt.u32.u64 %r113, %rd57; |
| and.b32 %r114, %r113, 255; |
| add.s32 %r116, %r68, %r114; |
| ld.volatile.shared.u8 %rs44, [%r116+256]; |
| setp.ne.s16 %p10, %rs44, 0; |
| @%p10 bra $L__BB0_15; |
|
|
| shr.u64 %rd58, %rd98, 40; |
| cvt.u32.u64 %r117, %rd58; |
| and.b32 %r118, %r117, 255; |
| add.s32 %r120, %r68, %r118; |
| ld.volatile.shared.u8 %rs44, [%r120+512]; |
| setp.ne.s16 %p11, %rs44, 0; |
| @%p11 bra $L__BB0_15; |
|
|
| shr.u64 %rd59, %rd98, 32; |
| cvt.u32.u64 %r121, %rd59; |
| and.b32 %r122, %r121, 255; |
| add.s32 %r124, %r68, %r122; |
| ld.volatile.shared.u8 %rs44, [%r124+768]; |
|
|
// %r36 = entries found so far (%r241 + 1). The byte at shared_mem[1024 + entry] is the
// shift amount: the window is shifted left by it and it is added to the bit counter.
// %r133 = (bytes refilled so far) + ((counter & 0xF8) >> 3); the loop continues only
// while that is below param_5 (unsigned), i.e. while still inside this thread's chunk.
| $L__BB0_15: |
| add.s32 %r36, %r241, 1; |
| cvt.u32.u16 %r125, %rs44; |
| and.b32 %r126, %r125, 255; |
| add.s32 %r128, %r68, %r126; |
| ld.volatile.shared.u8 %rs28, [%r128+1024]; |
| cvt.u32.u16 %r129, %rs28; |
| and.b32 %r130, %r129, 255; |
| shl.b64 %rd98, %rd98, %r130; |
| add.s16 %rs43, %rs28, %rs43; |
| and.b16 %rs29, %rs43, 248; |
| shr.u16 %rs30, %rs29, 3; |
| cvt.u32.u16 %r131, %rs30; |
| add.s32 %r132, %r31, %r243; |
| add.s32 %r133, %r132, %r131; |
| setp.lt.u32 %p12, %r133, %r65; |
| @%p12 bra $L__BB0_46; |
| bra.uni $L__BB0_16; |
|
|
// Still inside the chunk: refill first if the bit counter reached 32, else look up again.
| $L__BB0_46: |
| and.b16 %rs41, %rs43, 255; |
| setp.lt.u16 %p32, %rs41, 32; |
| mov.u32 %r241, %r36; |
| @%p32 bra $L__BB0_11; |
| bra.uni $L__BB0_47; |
|
|
// ---- Publish the per-thread count. %r37 + 1284 = this thread's u32 slot,
// %r38 + 1284 = the extra slot one past the last thread's slot.
| $L__BB0_16: |
| setp.eq.s32 %p13, %r252, 0; |
| shl.b32 %r134, %r252, 2; |
| add.s32 %r37, %r3, %r134; |
| add.s32 %r38, %r3, %r69; |
| @%p13 bra $L__BB0_18; |
|
|
// tid.x != 0: store the count (%r241 + 1, same value as %r36) in this thread's slot.
| add.s32 %r234, %r241, 1; |
| st.volatile.shared.u32 [%r37+1284], %r234; |
| bra.uni $L__BB0_19; |
|
|
// tid.x == 0: load param_2[ctaid.x], park it in the extra slot, and store
// (that value + own count) in slot 0.
| $L__BB0_18: |
| mul.wide.u32 %rd60, %r5, 4; |
| add.s64 %rd61, %rd9, %rd60; |
| ld.global.nc.u32 %r136, [%rd61]; |
| st.volatile.shared.u32 [%r38+1284], %r136; |
| ld.volatile.shared.u32 %r137, [%r38+1284]; |
| add.s32 %r138, %r137, %r36; |
| st.volatile.shared.u32 [%r3+1284], %r138; |
|
|
| $L__BB0_19: |
| bar.sync 0; |
| setp.lt.u32 %p14, %r1, 2; |
| @%p14 bra $L__BB0_24; |
|
|
| add.s32 %r39, %r252, 1; |
| mov.u32 %r244, 2; |
|
|
// ---- Upward sweep over the slots: for stride %r244 = 2, 4, ... <= ntid.x, every thread
// with (tid.x + 1) % stride == 0 adds slot[tid.x - stride/2] into its own slot.
// One barrier per stride.
| $L__BB0_21: |
| rem.u32 %r140, %r39, %r244; |
| setp.ne.s32 %p15, %r140, 0; |
| @%p15 bra $L__BB0_23; |
|
|
| shr.u32 %r141, %r244, 1; |
| sub.s32 %r142, %r252, %r141; |
| shl.b32 %r143, %r142, 2; |
| add.s32 %r144, %r27, %r143; |
| ld.volatile.shared.u32 %r145, [%r37+1284]; |
| ld.volatile.shared.u32 %r146, [%r144]; |
| add.s32 %r147, %r145, %r146; |
| st.volatile.shared.u32 [%r37+1284], %r147; |
|
|
| $L__BB0_23: |
| bar.sync 0; |
| shl.b32 %r244, %r244, 1; |
| setp.le.u32 %p16, %r244, %r1; |
| @%p16 bra $L__BB0_21; |
|
|
| $L__BB0_24: |
| setp.ne.s32 %p17, %r252, 0; |
| @%p17 bra $L__BB0_26; |
|
|
// Thread 0 zeroes the last thread's slot (%r38 + 1280 == slot[ntid.x - 1]).
| mov.u32 %r148, 0; |
| st.volatile.shared.u32 [%r38+1280], %r148; |
|
|
| $L__BB0_26: |
| bar.sync 0; |
// NOTE(review): this guard compares ntid.x as signed while the matching guard before the
// upward sweep uses an unsigned compare.
| setp.lt.s32 %p18, %r1, 2; |
| @%p18 bra $L__BB0_31; |
|
|
| add.s32 %r42, %r252, 1; |
| mov.u32 %r245, %r1; |
|
|
// ---- Downward sweep: for stride %r245 = ntid.x, ntid.x/2, ... down to 2, every thread
// with (tid.x + 1) % stride == 0 pairs its slot with slot[tid.x - stride/2].
| $L__BB0_28: |
| rem.u32 %r149, %r42, %r245; |
| setp.eq.s32 %p19, %r149, 0; |
| @%p19 bra $L__BB0_29; |
| bra.uni $L__BB0_30; |
|
|
// own += partner, then partner = own(new) - partner(old), i.e. the partner slot receives
// the value this thread's slot held before the addition.
| $L__BB0_29: |
| shr.u32 %r150, %r245, 1; |
| sub.s32 %r151, %r252, %r150; |
| shl.b32 %r152, %r151, 2; |
| add.s32 %r153, %r27, %r152; |
| ld.volatile.shared.u32 %r154, [%r37+1284]; |
| ld.volatile.shared.u32 %r155, [%r153]; |
| add.s32 %r156, %r154, %r155; |
| st.volatile.shared.u32 [%r37+1284], %r156; |
| ld.volatile.shared.u32 %r157, [%r153]; |
| ld.volatile.shared.u32 %r158, [%r37+1284]; |
| sub.s32 %r159, %r158, %r157; |
| st.volatile.shared.u32 [%r153], %r159; |
|
|
// Halve the stride; keep looping while the stride just processed was greater than 3.
| $L__BB0_30: |
| shr.u32 %r44, %r245, 1; |
| bar.sync 0; |
| setp.gt.u32 %p20, %r245, 3; |
| mov.u32 %r245, %r44; |
| @%p20 bra $L__BB0_28; |
|
|
| $L__BB0_31: |
| @%p3 bra $L__BB0_32; |
| bra.uni $L__BB0_33; |
|
|
// Last thread only (%p3): copy the parked param_2 value from the extra slot into slot 0,
// then overwrite the extra slot with (own slot + own count).
| $L__BB0_32: |
| ld.volatile.shared.u32 %r160, [%r38+1284]; |
| st.volatile.shared.u32 [%r3+1284], %r160; |
| ld.volatile.shared.u32 %r161, [%r37+1284]; |
| add.s32 %r162, %r161, %r36; |
| st.volatile.shared.u32 [%r38+1284], %r162; |
|
|
// ---- Pass 2 setup. %r163 = this thread's slot value (its first output index),
// %r45 = slot 0, %r46 = %r163 + own count (one past its last output index).
// The bit window is rebuilt from the same 8 staged bytes and the same 5-bit shift.
| $L__BB0_33: |
| bar.sync 0; |
| ld.volatile.shared.u32 %r163, [%r37+1284]; |
| cvt.u64.u32 %rd12, %r163; |
| ld.volatile.shared.u32 %r45, [%r3+1284]; |
| add.s32 %r46, %r163, %r36; |
| ld.volatile.shared.u8 %rd62, [%r29+1280]; |
| shl.b64 %rd63, %rd62, 56; |
| ld.volatile.shared.u8 %rd64, [%r29+1281]; |
| shl.b64 %rd65, %rd64, 48; |
| or.b64 %rd66, %rd65, %rd63; |
| ld.volatile.shared.u8 %rd67, [%r29+1282]; |
| shl.b64 %rd68, %rd67, 40; |
| or.b64 %rd69, %rd66, %rd68; |
| ld.volatile.shared.u8 %rd70, [%r29+1283]; |
| shl.b64 %rd71, %rd70, 32; |
| or.b64 %rd72, %rd69, %rd71; |
| ld.volatile.shared.u8 %r164, [%r29+1284]; |
| mul.wide.u32 %rd73, %r164, 16777216; |
| ld.volatile.shared.u8 %r165, [%r29+1285]; |
| mul.wide.u32 %rd74, %r165, 65536; |
| ld.volatile.shared.u8 %r166, [%r29+1286]; |
| mul.wide.u32 %rd75, %r166, 256; |
| ld.volatile.shared.u8 %rd76, [%r29+1287]; |
| or.b64 %rd77, %rd72, %rd73; |
| or.b64 %rd78, %rd77, %rd74; |
| or.b64 %rd79, %rd78, %rd75; |
| or.b64 %rd80, %rd79, %rd76; |
| cvt.u32.u64 %r167, %rd5; |
| shl.b64 %rd100, %rd80, %r167; |
// Threads whose first output index is already >= param_7 produce nothing.
| setp.ge.u32 %p22, %r163, %r67; |
| @%p22 bra $L__BB0_43; |
|
|
// %rd99 = &param_1[%r163]; %r247 = address of staging slot (%r163 - %r45);
// %r246 = index of the element after the one about to be written.
| cvt.u32.u64 %r168, %rd12; |
| add.s64 %rd99, %rd8, %rd12; |
| shl.b32 %r169, %r168, 1; |
| add.s32 %r170, %r4, %r169; |
| add.s32 %r171, %r170, 1288; |
| shl.b32 %r172, %r45, 1; |
| sub.s32 %r173, %r171, %r172; |
| add.s32 %r247, %r68, %r173; |
| add.s32 %r246, %r168, 1; |
| bra.uni $L__BB0_35; |
|
|
// Advance to the next output element: param_1 pointer, staging address, index.
| $L__BB0_42: |
| add.s64 %rd99, %rd99, 1; |
| add.s32 %r247, %r247, 2; |
| add.s32 %r246, %r246, 1; |
|
|
// ---- Pass 2: same four-stage lookup as pass 1; the entry ends up in %rs46.
| $L__BB0_35: |
| shr.u64 %rd81, %rd100, 56; |
| cvt.u32.u64 %r175, %rd81; |
| add.s32 %r177, %r68, %r175; |
| ld.volatile.shared.u8 %rs46, [%r177]; |
| setp.ne.s16 %p23, %rs46, 0; |
| @%p23 bra $L__BB0_39; |
|
|
| shr.u64 %rd82, %rd100, 48; |
| cvt.u32.u64 %r178, %rd82; |
| and.b32 %r179, %r178, 255; |
| add.s32 %r181, %r68, %r179; |
| ld.volatile.shared.u8 %rs46, [%r181+256]; |
| setp.ne.s16 %p24, %rs46, 0; |
| @%p24 bra $L__BB0_39; |
|
|
| shr.u64 %rd83, %rd100, 40; |
| cvt.u32.u64 %r182, %rd83; |
| and.b32 %r183, %r182, 255; |
| add.s32 %r185, %r68, %r183; |
| ld.volatile.shared.u8 %rs46, [%r185+512]; |
| setp.ne.s16 %p25, %rs46, 0; |
| @%p25 bra $L__BB0_39; |
|
|
| shr.u64 %rd84, %rd100, 32; |
| cvt.u32.u64 %r186, %rd84; |
| and.b32 %r187, %r186, 255; |
| add.s32 %r189, %r68, %r187; |
| ld.volatile.shared.u8 %rs46, [%r189+768]; |
|
|
// Compose the 16-bit output from the table entry e (%rs46) and the param_1 byte b (%rs31):
//   high byte = (b & 0x80) | ((e & 0xFE) >> 1)
//   low  byte = ((e & 1) << 7) | (b & 0x7F)      (the bfi + and 255)
// i.e. bit 15 = b bit 7, bits 14..7 = e, bits 6..0 = b bits 6..0.
// Only the low 16 bits of %r249 reach memory (st.u16); its upper half is carried along.
| $L__BB0_39: |
| ld.global.nc.u8 %rs31, [%rd99]; |
| and.b16 %rs32, %rs31, 128; |
| and.b16 %rs33, %rs46, 254; |
| shr.u16 %rs34, %rs33, 1; |
| or.b16 %rs35, %rs32, %rs34; |
| mul.wide.u16 %r190, %rs35, 256; |
| and.b16 %rs36, %rs31, 127; |
| cvt.u32.u16 %r191, %rs46; |
| cvt.u32.u16 %r192, %rs36; |
| bfi.b32 %r193, %r191, %r192, 7, 9; |
| and.b32 %r194, %r193, 255; |
| and.b32 %r195, %r249, -65536; |
| or.b32 %r196, %r195, %r194; |
| or.b32 %r249, %r196, %r190; |
| st.volatile.shared.u16 [%r247], %r249; |
// Stop once the next index reaches param_7 or this thread's end index %r46.
| setp.ge.u32 %p26, %r246, %r67; |
| setp.ge.u32 %p27, %r246, %r46; |
| or.pred %p28, %p26, %p27; |
| @%p28 bra $L__BB0_43; |
|
|
// Consume the entry: shift the window left by shared_mem[1024 + e] and add that amount to
// the bit counter %rs45 (which started as the 5-bit field). Refill only at >= 32.
| and.b32 %r198, %r191, 255; |
| add.s32 %r200, %r68, %r198; |
| ld.volatile.shared.u8 %rs37, [%r200+1024]; |
| cvt.u32.u16 %r201, %rs37; |
| and.b32 %r202, %r201, 255; |
| shl.b64 %rd100, %rd100, %r202; |
| add.s16 %rs45, %rs37, %rs45; |
| and.b16 %rs38, %rs45, 255; |
| setp.lt.u16 %p29, %rs38, 32; |
| @%p29 bra $L__BB0_42; |
|
|
// Refill: same scheme as pass 1, reading from offset %r251 and advancing it by 4.
| add.s32 %r204, %r68, %r251; |
| ld.volatile.shared.u8 %r205, [%r204+1280]; |
| shl.b32 %r206, %r205, 24; |
| ld.volatile.shared.u8 %r207, [%r204+1281]; |
| shl.b32 %r208, %r207, 16; |
| or.b32 %r209, %r208, %r206; |
| ld.volatile.shared.u8 %rs39, [%r204+1282]; |
| mul.wide.u16 %r210, %rs39, 256; |
| or.b32 %r211, %r209, %r210; |
| ld.volatile.shared.u8 %r212, [%r204+1283]; |
| or.b32 %r249, %r211, %r212; |
| add.s32 %r251, %r251, 4; |
| cvt.u64.u32 %rd85, %r249; |
| cvt.u64.u16 %rd86, %rs45; |
| and.b64 %rd87, %rd86, 255; |
| add.s64 %rd88, %rd87, 4294967264; |
| cvt.u32.u64 %r213, %rd88; |
| shl.b64 %rd89, %rd85, %r213; |
| or.b64 %rd100, %rd89, %rd100; |
| add.s16 %rs45, %rs45, -32; |
| bra.uni $L__BB0_42; |
|
|
// ---- Write-out. After the barrier every thread copies staged u16 elements to param_4.
// Element count = min(extra slot - %r45, param_7 - %r45), both unsigned; element i goes
// to param_4[i + %r45] for i = tid.x, tid.x + ntid.x, ...
| $L__BB0_43: |
| bar.sync 0; |
| ld.volatile.shared.u32 %r214, [%r38+1284]; |
| sub.s32 %r215, %r214, %r45; |
| sub.s32 %r60, %r67, %r45; |
| min.u32 %r216, %r215, %r60; |
| setp.ge.u32 %p30, %r252, %r216; |
| @%p30 bra $L__BB0_45; |
|
|
// The bound is re-read from shared memory on every iteration because the load is volatile.
| $L__BB0_44: |
| shl.b32 %r217, %r252, 1; |
| add.s32 %r218, %r32, %r217; |
| ld.volatile.shared.u16 %rs40, [%r218+1288]; |
| add.s32 %r219, %r252, %r45; |
| mul.wide.u32 %rd90, %r219, 2; |
| add.s64 %rd91, %rd7, %rd90; |
| st.global.u16 [%rd91], %rs40; |
| ld.volatile.shared.u32 %r220, [%r38+1284]; |
| sub.s32 %r221, %r220, %r45; |
| min.u32 %r222, %r221, %r60; |
| add.s32 %r252, %r252, %r1; |
| setp.lt.u32 %p31, %r252, %r222; |
| @%p31 bra $L__BB0_44; |
|
|
| $L__BB0_45: |
| ret; |
|
|
| } |
|
|
|
|