Add files using upload-large-folder tool
Browse files- .gitignore +2 -0
- config.json +3 -0
- counter_original.pkl +3 -0
- decode.ptx +539 -0
- layer_0.pkl +3 -0
- layer_1.pkl +3 -0
- layer_10.pkl +3 -0
- layer_11.pkl +3 -0
- layer_12.pkl +3 -0
- layer_13.pkl +3 -0
- layer_14.pkl +3 -0
- layer_15.pkl +3 -0
- layer_16.pkl +3 -0
- layer_17.pkl +3 -0
- layer_18.pkl +3 -0
- layer_19.pkl +3 -0
- layer_2.pkl +3 -0
- layer_20.pkl +3 -0
- layer_21.pkl +3 -0
- layer_22.pkl +3 -0
- layer_23.pkl +3 -0
- layer_24.pkl +3 -0
- layer_25.pkl +3 -0
- layer_26.pkl +3 -0
- layer_27.pkl +3 -0
- layer_28.pkl +3 -0
- layer_29.pkl +3 -0
- layer_3.pkl +3 -0
- layer_30.pkl +3 -0
- layer_31.pkl +3 -0
- layer_32.pkl +3 -0
- layer_33.pkl +3 -0
- layer_34.pkl +3 -0
- layer_35.pkl +3 -0
- layer_36.pkl +3 -0
- layer_37.pkl +3 -0
- layer_38.pkl +3 -0
- layer_39.pkl +3 -0
- layer_4.pkl +3 -0
- layer_5.pkl +3 -0
- layer_6.pkl +3 -0
- layer_7.pkl +3 -0
- layer_8.pkl +3 -0
- layer_9.pkl +3 -0
- lm_head.pkl +3 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
counter.pkl
|
| 2 |
+
*.cu
|
config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dfloat11_version": "0.1.0"
|
| 3 |
+
}
|
counter_original.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6a8b405b0f5d58b264a6902d9975af85df4c6002341a04aa23a73aafaee39ac
|
| 3 |
+
size 270
|
decode.ptx
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by NVIDIA NVVM Compiler
|
| 3 |
+
//
|
| 4 |
+
// Compiler Build ID: CL-34097967
|
| 5 |
+
// Cuda compilation tools, release 12.4, V12.4.131
|
| 6 |
+
// Based on NVVM 7.0.1
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
.version 8.4
|
| 10 |
+
.target sm_52
|
| 11 |
+
.address_size 64
|
| 12 |
+
|
| 13 |
+
// .globl huffman_decode
|
| 14 |
+
.const .align 1 .b8 shared_mem_constants[1280] = {120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 112, 112, 111, 0, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 104, 104, 103, 0, 122, 122, 105, 105, 121, 121, 121, 121, 121, 121, 
121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 99, 99, 99, 99, 99, 99, 99, 99, 98, 98, 98, 98, 97, 97, 124, 0, 91, 91, 92, 92, 125, 125, 126, 126, 0, 81, 84, 85, 87, 88, 89, 90, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 
93, 93, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 32, 32, 0, 32, 32, 32, 32, 31, 31, 28, 27, 26, 25, 23, 22, 21, 20, 19, 18, 16, 15, 15, 13, 12, 11, 10, 10, 8, 7, 6, 5, 4, 3, 3, 2, 2, 3, 10, 15, 17, 24, 31, 31};
|
| 15 |
+
.extern .shared .align 16 .b8 shared_mem[];
|
| 16 |
+
|
| 17 |
+
// Kernel entry huffman_decode (compiler-generated PTX). Phases visible in the body:
//   (1) copy the 1280-byte shared_mem_constants table into shared_mem[0..1279];
//   (2) stage this thread's slice of the param_0 byte stream at shared_mem+1280;
//   (3) counting pass over the staged bits; (4) block-wide prefix sum of the counts;
//   (5) decode pass that builds 16-bit values in shared memory; (6) copy-out to param_4.
// NOTE(review): parameter roles below are inferred from how each one is used in this
// kernel; the original CUDA source is not in view - confirm names against it.
.visible .entry huffman_decode(
|
| 18 |
+
.param .u64 huffman_decode_param_0, // pointer; bytes are read from it with ld.global.nc.u8 and staged in shared memory
|
| 19 |
+
.param .u64 huffman_decode_param_1, // pointer; one byte is read per produced 16-bit value in the decode pass
|
| 20 |
+
.param .u64 huffman_decode_param_2, // pointer; one u32 is read at index %ctaid.x
|
| 21 |
+
.param .u64 huffman_decode_param_3, // pointer; a 5-bit field is read at bit offset 5 * global thread id
|
| 22 |
+
.param .u64 huffman_decode_param_4, // pointer; 16-bit results are stored here with st.global.u16
|
| 23 |
+
.param .u32 huffman_decode_param_5, // multiplied by thread ids to get byte offsets - presumably bytes per thread
|
| 24 |
+
.param .u32 huffman_decode_param_6, // upper bound on byte offsets into param_0 - presumably total byte count
|
| 25 |
+
.param .u32 huffman_decode_param_7 // upper bound on output element indices - presumably total element count
|
| 26 |
+
)
|
| 27 |
+
{
|
| 28 |
+
.reg .pred %p<33>;
|
| 29 |
+
.reg .b16 %rs<48>;
|
| 30 |
+
.reg .b32 %r<253>;
|
| 31 |
+
.reg .b64 %rd<102>;
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
ld.param.u64 %rd26, [huffman_decode_param_0];
|
| 35 |
+
ld.param.u64 %rd22, [huffman_decode_param_1];
|
| 36 |
+
ld.param.u64 %rd23, [huffman_decode_param_2];
|
| 37 |
+
ld.param.u64 %rd24, [huffman_decode_param_3];
|
| 38 |
+
ld.param.u64 %rd25, [huffman_decode_param_4];
|
| 39 |
+
ld.param.u32 %r65, [huffman_decode_param_5];
|
| 40 |
+
ld.param.u32 %r66, [huffman_decode_param_6];
|
| 41 |
+
ld.param.u32 %r67, [huffman_decode_param_7];
|
| 42 |
+
cvta.to.global.u64 %rd1, %rd26; // %rd1 = param_0 as a global-space address
|
| 43 |
+
mov.u32 %r1, %ntid.x; // %r1 = threads per block (x)
|
| 44 |
+
mul.lo.s32 %r2, %r1, %r65; // %r2 = %ntid.x * param_5
|
| 45 |
+
mov.u32 %r68, shared_mem; // %r68 = address of the dynamic shared buffer
|
| 46 |
+
add.s32 %r3, %r68, %r2; // %r3 = shared_mem + %ntid.x * param_5
|
| 47 |
+
shl.b32 %r69, %r1, 2; // %r69 = 4 * %ntid.x
|
| 48 |
+
add.s32 %r4, %r2, %r69; // %r4 = %ntid.x * param_5 + 4 * %ntid.x
|
| 49 |
+
mov.u32 %r5, %ctaid.x; // %r5 = block index (x)
|
| 50 |
+
mov.u32 %r252, %tid.x; // %r252 = thread index in block; reused as a loop variable in the copy-out loop
|
| 51 |
+
mad.lo.s32 %r7, %r5, %r1, %r252; // %r7 = global thread id = %ctaid.x * %ntid.x + %tid.x
|
| 52 |
+
mul.lo.s32 %r8, %r252, %r65; // %r8 = %tid.x * param_5 (this thread's byte offset inside the block's staged bytes)
|
| 53 |
+
setp.gt.s32 %p1, %r252, 1279; // threads with %tid.x above 1279 have no table byte to copy
|
| 54 |
+
@%p1 bra $L__BB0_3;
|
| 55 |
+
|
| 56 |
+
// Copy loop: shared_mem[i] = shared_mem_constants[i] for i = %tid.x, %tid.x + %ntid.x, ... while i is below 1280.
mov.u32 %r235, %r252; // i = %tid.x
|
| 57 |
+
|
| 58 |
+
$L__BB0_2:
|
| 59 |
+
cvt.s64.s32 %rd27, %r235;
|
| 60 |
+
mov.u64 %rd28, shared_mem_constants;
|
| 61 |
+
add.s64 %rd29, %rd28, %rd27;
|
| 62 |
+
ld.const.u8 %rs19, [%rd29]; // read table byte i from constant memory
|
| 63 |
+
add.s32 %r71, %r68, %r235; // %r68 holds the address of shared_mem
|
| 64 |
+
st.volatile.shared.u8 [%r71], %rs19; // write it to shared_mem[i]
|
| 65 |
+
add.s32 %r235, %r235, %r1; // i += %ntid.x
|
| 66 |
+
setp.lt.s32 %p2, %r235, 1280;
|
| 67 |
+
@%p2 bra $L__BB0_2;
|
| 68 |
+
|
| 69 |
+
// Compute how many bytes of param_0 this thread stages into shared memory (%r14).
$L__BB0_3:
|
| 70 |
+
add.s32 %r11, %r1, -1;
|
| 71 |
+
setp.eq.s32 %p3, %r252, %r11; // %p3 = this is the last thread of the block (%tid.x == %ntid.x - 1); reused later
|
| 72 |
+
add.s32 %r72, %r65, 4;
|
| 73 |
+
selp.b32 %r12, %r72, %r65, %p3; // %r12 = param_5 + 4 for the last thread, param_5 otherwise
|
| 74 |
+
mul.lo.s32 %r13, %r7, %r65; // %r13 = global thread id * param_5 = this thread's byte offset into param_0
|
| 75 |
+
sub.s32 %r73, %r66, %r13; // %r73 = param_6 - %r13 (bytes left before the param_6 bound)
|
| 76 |
+
min.s32 %r14, %r12, %r73; // %r14 = bytes to copy, clamped to the param_6 bound
|
| 77 |
+
setp.gt.s32 %p4, %r14, 0;
|
| 78 |
+
@%p4 bra $L__BB0_4; // something to copy
|
| 79 |
+
bra.uni $L__BB0_10; // nothing to copy: go straight to the barrier
|
| 80 |
+
|
| 81 |
+
// Stage this thread's bytes from param_0 into shared memory at offset 1280 + %r8, four bytes per iteration.
// %r14 is the byte count computed at $L__BB0_3; %r13 is the thread's byte offset into param_0.
$L__BB0_4:
|
| 82 |
+
not.b32 %r75, %r66;
|
| 83 |
+
add.s32 %r76, %r13, %r75;
|
| 84 |
+
not.b32 %r77, %r12;
|
| 85 |
+
max.s32 %r78, %r76, %r77;
|
| 86 |
+
mov.u32 %r79, -2;
|
| 87 |
+
sub.s32 %r80, %r79, %r78; // %r80 = min(%r12, param_6 - %r13) - 1, i.e. %r14 - 1, rebuilt with bitwise-not and max
|
| 88 |
+
and.b32 %r240, %r14, 3; // %r240 = leftover bytes after the 4x unrolled loop
|
| 89 |
+
setp.lt.u32 %p5, %r80, 3; // fewer than 4 bytes in total: skip the unrolled loop
|
| 90 |
+
mov.u32 %r238, 0; // %r238 = bytes copied so far
|
| 91 |
+
@%p5 bra $L__BB0_7;
|
| 92 |
+
|
| 93 |
+
sub.s32 %r237, %r14, %r240; // %r237 = bytes handled by the unrolled loop (a multiple of 4)
|
| 94 |
+
mov.u32 %r238, 0;
|
| 95 |
+
|
| 96 |
+
$L__BB0_6:
|
| 97 |
+
add.s32 %r82, %r238, %r13;
|
| 98 |
+
cvt.s64.s32 %rd30, %r82;
|
| 99 |
+
add.s64 %rd31, %rd1, %rd30; // source address = param_0 + %r13 + %r238
|
| 100 |
+
ld.global.nc.u8 %rs20, [%rd31];
|
| 101 |
+
add.s32 %r83, %r8, %r238;
|
| 102 |
+
add.s32 %r85, %r68, %r83; // destination base = shared_mem + %r8 + %r238 (the +1280 is in the store offsets)
|
| 103 |
+
st.volatile.shared.u8 [%r85+1280], %rs20;
|
| 104 |
+
ld.global.nc.u8 %rs21, [%rd31+1];
|
| 105 |
+
st.volatile.shared.u8 [%r85+1281], %rs21;
|
| 106 |
+
ld.global.nc.u8 %rs22, [%rd31+2];
|
| 107 |
+
st.volatile.shared.u8 [%r85+1282], %rs22;
|
| 108 |
+
ld.global.nc.u8 %rs23, [%rd31+3];
|
| 109 |
+
st.volatile.shared.u8 [%r85+1283], %rs23;
|
| 110 |
+
add.s32 %r238, %r238, 4;
|
| 111 |
+
add.s32 %r237, %r237, -4;
|
| 112 |
+
setp.ne.s32 %p6, %r237, 0;
|
| 113 |
+
@%p6 bra $L__BB0_6;
|
| 114 |
+
|
| 115 |
+
// Remainder of the staging copy: move the last %r240 (0 to 3) bytes one at a time.
// %r238 is the number of bytes already copied by the unrolled loop.
$L__BB0_7:
|
| 116 |
+
setp.eq.s32 %p7, %r240, 0;
|
| 117 |
+
@%p7 bra $L__BB0_10; // no leftover bytes
|
| 118 |
+
|
| 119 |
+
add.s32 %r86, %r238, %r8;
|
| 120 |
+
add.s32 %r88, %r68, %r86;
|
| 121 |
+
add.s32 %r239, %r88, 1280; // %r239 = shared destination = shared_mem + 1280 + %r8 + %r238
|
| 122 |
+
add.s32 %r89, %r238, %r13;
|
| 123 |
+
cvt.s64.s32 %rd32, %r89;
|
| 124 |
+
add.s64 %rd97, %rd1, %rd32; // %rd97 = global source = param_0 + %r13 + %r238
|
| 125 |
+
|
| 126 |
+
$L__BB0_9:
|
| 127 |
+
.pragma "nounroll";
|
| 128 |
+
ld.global.nc.u8 %rs24, [%rd97];
|
| 129 |
+
st.volatile.shared.u8 [%r239], %rs24;
|
| 130 |
+
add.s32 %r239, %r239, 1;
|
| 131 |
+
add.s64 %rd97, %rd97, 1;
|
| 132 |
+
add.s32 %r240, %r240, -1;
|
| 133 |
+
setp.ne.s32 %p8, %r240, 0;
|
| 134 |
+
@%p8 bra $L__BB0_9;
|
| 135 |
+
|
| 136 |
+
// After the barrier: read this thread's 5-bit field from param_3, load the first 8 staged bytes as a
// big-endian 64-bit bit buffer, and discard the first (5-bit field) bits of that buffer.
// NOTE(review): the 5-bit field looks like a per-thread starting bit offset ("gap") - confirm.
$L__BB0_10:
|
| 137 |
+
add.s32 %r27, %r3, 1284; // %r27 = base of the u32 slot array: shared_mem + %ntid.x * param_5 + 1284
|
| 138 |
+
bar.sync 0; // table copy and staged bytes are complete for the whole block
|
| 139 |
+
mul.lo.s32 %r93, %r7, 5; // bit position of this thread's 5-bit field = 5 * global thread id
|
| 140 |
+
shr.s32 %r94, %r93, 31;
|
| 141 |
+
shr.u32 %r95, %r94, 29;
|
| 142 |
+
add.s32 %r96, %r93, %r95;
|
| 143 |
+
shr.s32 %r97, %r96, 3; // %r97 = %r93 / 8 (signed division, rounds toward zero) = byte index
|
| 144 |
+
cvt.s64.s32 %rd33, %r97;
|
| 145 |
+
cvta.to.global.u64 %rd34, %rd24; // %rd24 was loaded from param_3
|
| 146 |
+
add.s64 %rd35, %rd34, %rd33;
|
| 147 |
+
ld.global.nc.u8 %rs25, [%rd35+1]; // second byte of the 16-bit window
|
| 148 |
+
cvt.u32.u16 %r98, %rs25;
|
| 149 |
+
and.b32 %r99, %r98, 255;
|
| 150 |
+
ld.global.nc.u8 %rs26, [%rd35]; // first byte of the 16-bit window
|
| 151 |
+
cvt.u32.u16 %r100, %rs26;
|
| 152 |
+
prmt.b32 %r249, %r100, %r99, 30212; // selector 0x7604: result = first byte * 256 + second byte (big-endian 16 bits)
|
| 153 |
+
mov.u32 %r241, 0; // %r241 = symbols counted so far in the counting pass
|
| 154 |
+
and.b32 %r101, %r96, -8;
|
| 155 |
+
sub.s32 %r102, %r101, %r93; // %r102 = -(%r93 mod 8)
|
| 156 |
+
add.s32 %r103, %r102, 11; // shift = 16 - 5 - (bit offset inside the byte)
|
| 157 |
+
shr.u32 %r104, %r249, %r103; // the 5-bit field is now in the low bits of %r104
|
| 158 |
+
cvt.u64.u32 %rd36, %r104;
|
| 159 |
+
cvt.u16.u32 %rs27, %r104;
|
| 160 |
+
and.b16 %rs45, %rs27, 31; // %rs45 = 5-bit field; starts the consumed-bit counter of the decode pass
|
| 161 |
+
add.s32 %r29, %r68, %r8; // %r29 = shared_mem + %tid.x * param_5 (staged bytes start at %r29 + 1280)
|
| 162 |
+
ld.volatile.shared.u8 %rd37, [%r29+1280]; // the next 8 loads assemble staged bytes 0..7, most significant byte first
|
| 163 |
+
shl.b64 %rd38, %rd37, 56;
|
| 164 |
+
ld.volatile.shared.u8 %rd39, [%r29+1281];
|
| 165 |
+
shl.b64 %rd40, %rd39, 48;
|
| 166 |
+
or.b64 %rd41, %rd40, %rd38;
|
| 167 |
+
ld.volatile.shared.u8 %rd42, [%r29+1282];
|
| 168 |
+
shl.b64 %rd43, %rd42, 40;
|
| 169 |
+
or.b64 %rd44, %rd41, %rd43;
|
| 170 |
+
ld.volatile.shared.u8 %rd45, [%r29+1283];
|
| 171 |
+
shl.b64 %rd46, %rd45, 32;
|
| 172 |
+
or.b64 %rd47, %rd44, %rd46;
|
| 173 |
+
ld.volatile.shared.u8 %r105, [%r29+1284];
|
| 174 |
+
mul.wide.u32 %rd48, %r105, 16777216; // multiply by 2^24 = shift left by 24
|
| 175 |
+
or.b64 %rd49, %rd47, %rd48;
|
| 176 |
+
ld.volatile.shared.u8 %r106, [%r29+1285];
|
| 177 |
+
mul.wide.u32 %rd50, %r106, 65536; // shift left by 16
|
| 178 |
+
ld.volatile.shared.u8 %r107, [%r29+1286];
|
| 179 |
+
mul.wide.u32 %rd51, %r107, 256; // shift left by 8
|
| 180 |
+
or.b64 %rd52, %rd49, %rd50;
|
| 181 |
+
ld.volatile.shared.u8 %rd53, [%r29+1287];
|
| 182 |
+
or.b64 %rd54, %rd52, %rd51;
|
| 183 |
+
or.b64 %rd55, %rd54, %rd53; // %rd55 = first 8 staged bytes as a big-endian 64-bit value
|
| 184 |
+
add.s32 %r251, %r8, 8; // %r251 = staged-byte offset of the next refill for the decode pass
|
| 185 |
+
and.b64 %rd5, %rd36, 31; // %rd5 = 5-bit field, kept to re-seed the decode pass
|
| 186 |
+
and.b32 %r108, %r104, 31;
|
| 187 |
+
shl.b64 %rd98, %rd55, %r108; // %rd98 = counting-pass bit buffer with the first (5-bit field) bits dropped
|
| 188 |
+
mov.u32 %r109, -8;
|
| 189 |
+
sub.s32 %r31, %r109, %r8; // %r31 = -8 - %r8, used to turn %r243 into "bytes refilled so far"
|
| 190 |
+
cvta.to.global.u64 %rd7, %rd25; // %rd7 = param_4 (global)
|
| 191 |
+
cvta.to.global.u64 %rd8, %rd22; // %rd8 = param_1 (global)
|
| 192 |
+
cvta.to.global.u64 %rd9, %rd23; // %rd9 = param_2 (global)
|
| 193 |
+
add.s32 %r32, %r68, %r4; // %r32 = shared_mem + %ntid.x * param_5 + 4 * %ntid.x (the 16-bit staging area is at %r32 + 1288)
|
| 194 |
+
mov.u16 %rs43, %rs45; // %rs43 = consumed-bit counter of the counting pass
|
| 195 |
+
mov.u32 %r243, %r251; // %r243 = staged-byte offset of the next refill for the counting pass
|
| 196 |
+
bra.uni $L__BB0_11;
|
| 197 |
+
|
| 198 |
+
// Counting-pass refill: entered when the consumed-bit counter %rs43 has reached 32 or more.
// Loads the next 4 staged bytes (big-endian) and ORs them into the 64-bit bit buffer %rd98.
$L__BB0_47:
|
| 199 |
+
add.s32 %r224, %r68, %r243; // %r68 = shared_mem address, %r243 = offset of the next unread staged bytes
|
| 200 |
+
ld.volatile.shared.u8 %r225, [%r224+1280];
|
| 201 |
+
shl.b32 %r226, %r225, 24;
|
| 202 |
+
ld.volatile.shared.u8 %r227, [%r224+1281];
|
| 203 |
+
shl.b32 %r228, %r227, 16;
|
| 204 |
+
or.b32 %r229, %r228, %r226;
|
| 205 |
+
ld.volatile.shared.u8 %rs42, [%r224+1282];
|
| 206 |
+
mul.wide.u16 %r230, %rs42, 256; // multiply by 256 = shift left by 8
|
| 207 |
+
or.b32 %r231, %r229, %r230;
|
| 208 |
+
ld.volatile.shared.u8 %r232, [%r224+1283];
|
| 209 |
+
or.b32 %r249, %r231, %r232; // %r249 = the 4 bytes as a big-endian 32-bit word
|
| 210 |
+
add.s32 %r243, %r243, 4; // advance the refill offset
|
| 211 |
+
cvt.u64.u32 %rd92, %r249;
|
| 212 |
+
cvt.u64.u16 %rd93, %rs43;
|
| 213 |
+
and.b64 %rd94, %rd93, 255;
|
| 214 |
+
add.s64 %rd95, %rd94, 4294967264; // adds 0xFFFFFFE0; after truncation to 32 bits this is (counter - 32)
|
| 215 |
+
cvt.u32.u64 %r233, %rd95;
|
| 216 |
+
shl.b64 %rd96, %rd92, %r233; // place the new word just below the unconsumed bits
|
| 217 |
+
or.b64 %rd98, %rd96, %rd98;
|
| 218 |
+
add.s16 %rs43, %rs43, -32; // 32 fresh bits were added
|
| 219 |
+
mov.u32 %r241, %r36; // carry the running symbol count into the next iteration
|
| 220 |
+
|
| 221 |
+
// Counting pass, one symbol per iteration. The top bytes of the bit buffer %rd98 index up to four
// 256-entry tables in shared memory (offsets 0, 256, 512, 768); a zero entry means "try the next table".
// The resulting symbol indexes a fifth table at offset 1024 whose entry is used as the number of bits to consume.
// NOTE(review): this matches a multi-level Huffman lookup with a code-length table - confirm against the encoder.
$L__BB0_11:
|
| 222 |
+
shr.u64 %rd56, %rd98, 56; // bits 63..56 of the buffer
|
| 223 |
+
cvt.u32.u64 %r110, %rd56;
|
| 224 |
+
add.s32 %r112, %r68, %r110; // %r68 holds the address of shared_mem
|
| 225 |
+
ld.volatile.shared.u8 %rs44, [%r112]; // table at offset 0
|
| 226 |
+
setp.ne.s16 %p9, %rs44, 0;
|
| 227 |
+
@%p9 bra $L__BB0_15;
|
| 228 |
+
|
| 229 |
+
shr.u64 %rd57, %rd98, 48;
|
| 230 |
+
cvt.u32.u64 %r113, %rd57;
|
| 231 |
+
and.b32 %r114, %r113, 255; // bits 55..48
|
| 232 |
+
add.s32 %r116, %r68, %r114;
|
| 233 |
+
ld.volatile.shared.u8 %rs44, [%r116+256]; // table at offset 256
|
| 234 |
+
setp.ne.s16 %p10, %rs44, 0;
|
| 235 |
+
@%p10 bra $L__BB0_15;
|
| 236 |
+
|
| 237 |
+
shr.u64 %rd58, %rd98, 40;
|
| 238 |
+
cvt.u32.u64 %r117, %rd58;
|
| 239 |
+
and.b32 %r118, %r117, 255; // bits 47..40
|
| 240 |
+
add.s32 %r120, %r68, %r118;
|
| 241 |
+
ld.volatile.shared.u8 %rs44, [%r120+512]; // table at offset 512
|
| 242 |
+
setp.ne.s16 %p11, %rs44, 0;
|
| 243 |
+
@%p11 bra $L__BB0_15;
|
| 244 |
+
|
| 245 |
+
shr.u64 %rd59, %rd98, 32;
|
| 246 |
+
cvt.u32.u64 %r121, %rd59;
|
| 247 |
+
and.b32 %r122, %r121, 255; // bits 39..32
|
| 248 |
+
add.s32 %r124, %r68, %r122;
|
| 249 |
+
ld.volatile.shared.u8 %rs44, [%r124+768]; // table at offset 768; result is used without a zero check
|
| 250 |
+
|
| 251 |
+
$L__BB0_15:
|
| 252 |
+
add.s32 %r36, %r241, 1; // %r36 = symbol count including the one just looked up
|
| 253 |
+
cvt.u32.u16 %r125, %rs44;
|
| 254 |
+
and.b32 %r126, %r125, 255;
|
| 255 |
+
add.s32 %r128, %r68, %r126;
|
| 256 |
+
ld.volatile.shared.u8 %rs28, [%r128+1024]; // entry of the table at offset 1024 for this symbol
|
| 257 |
+
cvt.u32.u16 %r129, %rs28;
|
| 258 |
+
and.b32 %r130, %r129, 255;
|
| 259 |
+
shl.b64 %rd98, %rd98, %r130; // drop that many bits from the buffer
|
| 260 |
+
add.s16 %rs43, %rs28, %rs43; // consumed-bit counter += entry
|
| 261 |
+
and.b16 %rs29, %rs43, 248;
|
| 262 |
+
shr.u16 %rs30, %rs29, 3; // whole bytes covered by the low 8 bits of the counter
|
| 263 |
+
cvt.u32.u16 %r131, %rs30;
|
| 264 |
+
add.s32 %r132, %r31, %r243; // %r243 - 8 - %r8 = bytes added by refills so far
|
| 265 |
+
add.s32 %r133, %r132, %r131; // byte offset of the next unread bit inside this thread's slice
|
| 266 |
+
setp.lt.u32 %p12, %r133, %r65; // still inside the thread's param_5 bytes?
|
| 267 |
+
@%p12 bra $L__BB0_46;
|
| 268 |
+
bra.uni $L__BB0_16; // slice exhausted: counting is finished
|
| 269 |
+
|
| 270 |
+
$L__BB0_46:
|
| 271 |
+
and.b16 %rs41, %rs43, 255;
|
| 272 |
+
setp.lt.u16 %p32, %rs41, 32; // fewer than 32 bits consumed since the last refill?
|
| 273 |
+
mov.u32 %r241, %r36;
|
| 274 |
+
@%p32 bra $L__BB0_11; // yes: decode the next symbol directly
|
| 275 |
+
bra.uni $L__BB0_47; // no: refill 32 bits first
|
| 276 |
+
|
| 277 |
+
// Publish per-thread symbol counts into an array of u32 slots in shared memory. The slot for index i
// lives at %r3 + 1284 + 4*i, where %r3 = shared_mem + %ntid.x * param_5.
$L__BB0_16:
|
| 278 |
+
setp.eq.s32 %p13, %r252, 0; // thread 0 takes a separate path
|
| 279 |
+
shl.b32 %r134, %r252, 2;
|
| 280 |
+
add.s32 %r37, %r3, %r134; // [%r37+1284] = this thread's slot
|
| 281 |
+
add.s32 %r38, %r3, %r69; // %r69 = 4 * %ntid.x, so [%r38+1284] = slot index %ntid.x (one past the last thread)
|
| 282 |
+
@%p13 bra $L__BB0_18;
|
| 283 |
+
|
| 284 |
+
add.s32 %r234, %r241, 1; // same value as %r36 computed at $L__BB0_15
|
| 285 |
+
st.volatile.shared.u32 [%r37+1284], %r234; // slot[%tid.x] = this thread's symbol count
|
| 286 |
+
bra.uni $L__BB0_19;
|
| 287 |
+
|
| 288 |
+
$L__BB0_18:
|
| 289 |
+
mul.wide.u32 %rd60, %r5, 4; // %r5 = %ctaid.x
|
| 290 |
+
add.s64 %rd61, %rd9, %rd60; // %rd9 = param_2 (global)
|
| 291 |
+
ld.global.nc.u32 %r136, [%rd61]; // u32 value of param_2 at index %ctaid.x
|
| 292 |
+
st.volatile.shared.u32 [%r38+1284], %r136; // keep it in slot index %ntid.x
|
| 293 |
+
ld.volatile.shared.u32 %r137, [%r38+1284];
|
| 294 |
+
add.s32 %r138, %r137, %r36;
|
| 295 |
+
st.volatile.shared.u32 [%r3+1284], %r138; // slot[0] = param_2[%ctaid.x] + thread 0's symbol count
|
| 296 |
+
|
| 297 |
+
// First sweep over the u32 slot array: for stride = 2, 4, 8, ... up to %ntid.x, every thread whose
// (%tid.x + 1) is a multiple of stride adds slot[%tid.x - stride/2] into slot[%tid.x].
// NOTE(review): this matches the up-sweep half of a work-efficient scan; it presumably assumes a
// power-of-two block size - confirm the launch configuration.
$L__BB0_19:
|
| 298 |
+
bar.sync 0; // all slots have been written
|
| 299 |
+
setp.lt.u32 %p14, %r1, 2; // a single-thread block has nothing to combine
|
| 300 |
+
@%p14 bra $L__BB0_24;
|
| 301 |
+
|
| 302 |
+
add.s32 %r39, %r252, 1; // %tid.x + 1
|
| 303 |
+
mov.u32 %r244, 2; // stride
|
| 304 |
+
|
| 305 |
+
$L__BB0_21:
|
| 306 |
+
rem.u32 %r140, %r39, %r244;
|
| 307 |
+
setp.ne.s32 %p15, %r140, 0;
|
| 308 |
+
@%p15 bra $L__BB0_23; // this thread is idle for the current stride
|
| 309 |
+
|
| 310 |
+
shr.u32 %r141, %r244, 1;
|
| 311 |
+
sub.s32 %r142, %r252, %r141; // partner index = %tid.x - stride/2
|
| 312 |
+
shl.b32 %r143, %r142, 2;
|
| 313 |
+
add.s32 %r144, %r27, %r143; // %r27 = address of slot 0
|
| 314 |
+
ld.volatile.shared.u32 %r145, [%r37+1284]; // own slot
|
| 315 |
+
ld.volatile.shared.u32 %r146, [%r144]; // partner slot
|
| 316 |
+
add.s32 %r147, %r145, %r146;
|
| 317 |
+
st.volatile.shared.u32 [%r37+1284], %r147; // own slot += partner slot
|
| 318 |
+
|
| 319 |
+
$L__BB0_23:
|
| 320 |
+
bar.sync 0; // one barrier per stride
|
| 321 |
+
shl.b32 %r244, %r244, 1; // stride *= 2
|
| 322 |
+
setp.le.u32 %p16, %r244, %r1;
|
| 323 |
+
@%p16 bra $L__BB0_21;
|
| 324 |
+
|
| 325 |
+
// Second sweep over the u32 slot array. Thread 0 first zeroes slot index %ntid.x - 1; then, for
// stride = %ntid.x, %ntid.x/2, ... down to 2, every thread whose (%tid.x + 1) is a multiple of stride
// replaces (slot[partner], slot[self]) with (old slot[self], old slot[self] + old slot[partner]).
// NOTE(review): this matches the down-sweep half of an exclusive prefix sum - confirm.
$L__BB0_24:
|
| 326 |
+
setp.ne.s32 %p17, %r252, 0;
|
| 327 |
+
@%p17 bra $L__BB0_26;
|
| 328 |
+
|
| 329 |
+
mov.u32 %r148, 0;
|
| 330 |
+
st.volatile.shared.u32 [%r38+1280], %r148; // [%r38+1284] is slot index %ntid.x, so this clears slot index %ntid.x - 1
|
| 331 |
+
|
| 332 |
+
$L__BB0_26:
|
| 333 |
+
bar.sync 0;
|
| 334 |
+
setp.lt.s32 %p18, %r1, 2;
|
| 335 |
+
@%p18 bra $L__BB0_31;
|
| 336 |
+
|
| 337 |
+
add.s32 %r42, %r252, 1; // %tid.x + 1
|
| 338 |
+
mov.u32 %r245, %r1; // stride starts at %ntid.x
|
| 339 |
+
|
| 340 |
+
$L__BB0_28:
|
| 341 |
+
rem.u32 %r149, %r42, %r245;
|
| 342 |
+
setp.eq.s32 %p19, %r149, 0;
|
| 343 |
+
@%p19 bra $L__BB0_29;
|
| 344 |
+
bra.uni $L__BB0_30; // this thread is idle for the current stride
|
| 345 |
+
|
| 346 |
+
$L__BB0_29:
|
| 347 |
+
shr.u32 %r150, %r245, 1;
|
| 348 |
+
sub.s32 %r151, %r252, %r150; // partner index = %tid.x - stride/2
|
| 349 |
+
shl.b32 %r152, %r151, 2;
|
| 350 |
+
add.s32 %r153, %r27, %r152; // %r27 = address of slot 0
|
| 351 |
+
ld.volatile.shared.u32 %r154, [%r37+1284];
|
| 352 |
+
ld.volatile.shared.u32 %r155, [%r153];
|
| 353 |
+
add.s32 %r156, %r154, %r155;
|
| 354 |
+
st.volatile.shared.u32 [%r37+1284], %r156; // own slot = own + partner
|
| 355 |
+
ld.volatile.shared.u32 %r157, [%r153];
|
| 356 |
+
ld.volatile.shared.u32 %r158, [%r37+1284];
|
| 357 |
+
sub.s32 %r159, %r158, %r157;
|
| 358 |
+
st.volatile.shared.u32 [%r153], %r159; // partner slot = (own + partner) - partner = previous own value
|
| 359 |
+
|
| 360 |
+
$L__BB0_30:
|
| 361 |
+
shr.u32 %r44, %r245, 1; // next stride = stride / 2
|
| 362 |
+
bar.sync 0;
|
| 363 |
+
setp.gt.u32 %p20, %r245, 3; // continue while the next stride is at least 2
|
| 364 |
+
mov.u32 %r245, %r44;
|
| 365 |
+
@%p20 bra $L__BB0_28;
|
| 366 |
+
|
| 367 |
+
// Fix-up executed only by the last thread of the block (%p3 was set from %tid.x == %ntid.x - 1).
// Slot addresses: [%r3+1284] = slot 0, [%r37+1284] = this thread's slot, [%r38+1284] = slot index %ntid.x.
$L__BB0_31:
|
| 368 |
+
@%p3 bra $L__BB0_32;
|
| 369 |
+
bra.uni $L__BB0_33;
|
| 370 |
+
|
| 371 |
+
$L__BB0_32:
|
| 372 |
+
ld.volatile.shared.u32 %r160, [%r38+1284]; // value thread 0 stored there earlier (loaded from param_2 at index %ctaid.x)
|
| 373 |
+
st.volatile.shared.u32 [%r3+1284], %r160; // slot 0 = that value
|
| 374 |
+
ld.volatile.shared.u32 %r161, [%r37+1284]; // this thread's slot after both sweeps
|
| 375 |
+
add.s32 %r162, %r161, %r36; // plus this thread's own symbol count
|
| 376 |
+
st.volatile.shared.u32 [%r38+1284], %r162; // slot index %ntid.x = sum; read back after the decode pass as the block's end index
|
| 377 |
+
|
| 378 |
+
// Decode-pass setup: read this thread's first output index from its slot, rebuild the 64-bit bit
// buffer from the first 8 staged bytes, and compute the shared-memory address of its first 16-bit result.
$L__BB0_33:
|
| 379 |
+
bar.sync 0;
|
| 380 |
+
ld.volatile.shared.u32 %r163, [%r37+1284]; // %r163 = this thread's slot value = index of its first output element
|
| 381 |
+
cvt.u64.u32 %rd12, %r163;
|
| 382 |
+
ld.volatile.shared.u32 %r45, [%r3+1284]; // %r45 = slot 0 = index of the block's first output element
|
| 383 |
+
add.s32 %r46, %r163, %r36; // %r46 = first index + symbol count = this thread's end index (exclusive)
|
| 384 |
+
ld.volatile.shared.u8 %rd62, [%r29+1280]; // %r29 = shared_mem + %tid.x * param_5; bytes 0..7 assembled big-endian below
|
| 385 |
+
shl.b64 %rd63, %rd62, 56;
|
| 386 |
+
ld.volatile.shared.u8 %rd64, [%r29+1281];
|
| 387 |
+
shl.b64 %rd65, %rd64, 48;
|
| 388 |
+
or.b64 %rd66, %rd65, %rd63;
|
| 389 |
+
ld.volatile.shared.u8 %rd67, [%r29+1282];
|
| 390 |
+
shl.b64 %rd68, %rd67, 40;
|
| 391 |
+
or.b64 %rd69, %rd66, %rd68;
|
| 392 |
+
ld.volatile.shared.u8 %rd70, [%r29+1283];
|
| 393 |
+
shl.b64 %rd71, %rd70, 32;
|
| 394 |
+
or.b64 %rd72, %rd69, %rd71;
|
| 395 |
+
ld.volatile.shared.u8 %r164, [%r29+1284];
|
| 396 |
+
mul.wide.u32 %rd73, %r164, 16777216; // shift left by 24
|
| 397 |
+
ld.volatile.shared.u8 %r165, [%r29+1285];
|
| 398 |
+
mul.wide.u32 %rd74, %r165, 65536; // shift left by 16
|
| 399 |
+
ld.volatile.shared.u8 %r166, [%r29+1286];
|
| 400 |
+
mul.wide.u32 %rd75, %r166, 256; // shift left by 8
|
| 401 |
+
ld.volatile.shared.u8 %rd76, [%r29+1287];
|
| 402 |
+
or.b64 %rd77, %rd72, %rd73;
|
| 403 |
+
or.b64 %rd78, %rd77, %rd74;
|
| 404 |
+
or.b64 %rd79, %rd78, %rd75;
|
| 405 |
+
or.b64 %rd80, %rd79, %rd76;
|
| 406 |
+
cvt.u32.u64 %r167, %rd5; // %rd5 = the 5-bit field read from param_3
|
| 407 |
+
shl.b64 %rd100, %rd80, %r167; // %rd100 = decode-pass bit buffer with those leading bits dropped
|
| 408 |
+
setp.ge.u32 %p22, %r163, %r67; // first index already at or past param_7: nothing to decode
|
| 409 |
+
@%p22 bra $L__BB0_43;
|
| 410 |
+
|
| 411 |
+
cvt.u32.u64 %r168, %rd12;
|
| 412 |
+
add.s64 %rd99, %rd8, %rd12; // %rd99 = param_1 + first index (one byte per element)
|
| 413 |
+
shl.b32 %r169, %r168, 1;
|
| 414 |
+
add.s32 %r170, %r4, %r169;
|
| 415 |
+
add.s32 %r171, %r170, 1288;
|
| 416 |
+
shl.b32 %r172, %r45, 1;
|
| 417 |
+
sub.s32 %r173, %r171, %r172;
|
| 418 |
+
add.s32 %r247, %r68, %r173; // %r247 = shared_mem + %r4 + 1288 + 2 * (first index - block first index)
|
| 419 |
+
add.s32 %r246, %r168, 1; // %r246 = index following the element currently being decoded
|
| 420 |
+
bra.uni $L__BB0_35;
|
| 421 |
+
|
| 422 |
+
// Decode-pass loop head. $L__BB0_42 advances the three per-element cursors; $L__BB0_35 repeats the
// table walk of the counting pass on the decode buffer %rd100: tables at shared offsets 0, 256, 512
// and 768 are indexed by successive bytes of the buffer, and a zero entry falls through to the next table.
$L__BB0_42:
|
| 423 |
+
add.s64 %rd99, %rd99, 1; // next param_1 byte
|
| 424 |
+
add.s32 %r247, %r247, 2; // next 16-bit slot in the shared staging area
|
| 425 |
+
add.s32 %r246, %r246, 1; // next element index
|
| 426 |
+
|
| 427 |
+
$L__BB0_35:
|
| 428 |
+
shr.u64 %rd81, %rd100, 56; // bits 63..56
|
| 429 |
+
cvt.u32.u64 %r175, %rd81;
|
| 430 |
+
add.s32 %r177, %r68, %r175; // %r68 holds the address of shared_mem
|
| 431 |
+
ld.volatile.shared.u8 %rs46, [%r177]; // table at offset 0
|
| 432 |
+
setp.ne.s16 %p23, %rs46, 0;
|
| 433 |
+
@%p23 bra $L__BB0_39;
|
| 434 |
+
|
| 435 |
+
shr.u64 %rd82, %rd100, 48;
|
| 436 |
+
cvt.u32.u64 %r178, %rd82;
|
| 437 |
+
and.b32 %r179, %r178, 255; // bits 55..48
|
| 438 |
+
add.s32 %r181, %r68, %r179;
|
| 439 |
+
ld.volatile.shared.u8 %rs46, [%r181+256]; // table at offset 256
|
| 440 |
+
setp.ne.s16 %p24, %rs46, 0;
|
| 441 |
+
@%p24 bra $L__BB0_39;
|
| 442 |
+
|
| 443 |
+
shr.u64 %rd83, %rd100, 40;
|
| 444 |
+
cvt.u32.u64 %r182, %rd83;
|
| 445 |
+
and.b32 %r183, %r182, 255; // bits 47..40
|
| 446 |
+
add.s32 %r185, %r68, %r183;
|
| 447 |
+
ld.volatile.shared.u8 %rs46, [%r185+512]; // table at offset 512
|
| 448 |
+
setp.ne.s16 %p25, %rs46, 0;
|
| 449 |
+
@%p25 bra $L__BB0_39;
|
| 450 |
+
|
| 451 |
+
shr.u64 %rd84, %rd100, 32;
|
| 452 |
+
cvt.u32.u64 %r186, %rd84;
|
| 453 |
+
and.b32 %r187, %r186, 255; // bits 39..32
|
| 454 |
+
add.s32 %r189, %r68, %r187;
|
| 455 |
+
ld.volatile.shared.u8 %rs46, [%r189+768]; // table at offset 768; result is used without a zero check
|
| 456 |
+
|
| 457 |
+
// Build one 16-bit result from the looked-up symbol %rs46 and one byte of param_1, store it in the
// shared staging area, then consume the symbol's bits and refill the buffer when 32 or more bits are used.
// Result layout: bit 15 = bit 7 of the param_1 byte, bits 14..7 = symbol, bits 6..0 = low 7 bits of the byte.
// NOTE(review): a 1/8/7 split matches bfloat16 (sign / exponent / mantissa) - presumably the intent; confirm.
$L__BB0_39:
|
| 458 |
+
ld.global.nc.u8 %rs31, [%rd99]; // param_1 byte for this element
|
| 459 |
+
and.b16 %rs32, %rs31, 128; // top bit of that byte
|
| 460 |
+
and.b16 %rs33, %rs46, 254;
|
| 461 |
+
shr.u16 %rs34, %rs33, 1; // symbol bits 7..1
|
| 462 |
+
or.b16 %rs35, %rs32, %rs34; // high byte of the result
|
| 463 |
+
mul.wide.u16 %r190, %rs35, 256; // move it to bits 15..8
|
| 464 |
+
and.b16 %rs36, %rs31, 127; // low 7 bits of the param_1 byte
|
| 465 |
+
cvt.u32.u16 %r191, %rs46;
|
| 466 |
+
cvt.u32.u16 %r192, %rs36;
|
| 467 |
+
bfi.b32 %r193, %r191, %r192, 7, 9; // insert the symbol at bit 7 above the 7 low bits
|
| 468 |
+
and.b32 %r194, %r193, 255; // low byte = symbol bit 0 followed by the 7 low bits
|
| 469 |
+
and.b32 %r195, %r249, -65536; // keeps the upper half of %r249; only the low 16 bits are stored below
|
| 470 |
+
or.b32 %r196, %r195, %r194;
|
| 471 |
+
or.b32 %r249, %r196, %r190;
|
| 472 |
+
st.volatile.shared.u16 [%r247], %r249; // write the 16-bit result to the shared staging area
|
| 473 |
+
setp.ge.u32 %p26, %r246, %r67; // next index at or past param_7?
|
| 474 |
+
setp.ge.u32 %p27, %r246, %r46; // next index at or past this thread's end index?
|
| 475 |
+
or.pred %p28, %p26, %p27;
|
| 476 |
+
@%p28 bra $L__BB0_43; // either one: this thread is done decoding
|
| 477 |
+
|
| 478 |
+
and.b32 %r198, %r191, 255;
|
| 479 |
+
add.s32 %r200, %r68, %r198;
|
| 480 |
+
ld.volatile.shared.u8 %rs37, [%r200+1024]; // entry of the table at offset 1024 for this symbol (bits to consume)
|
| 481 |
+
cvt.u32.u16 %r201, %rs37;
|
| 482 |
+
and.b32 %r202, %r201, 255;
|
| 483 |
+
shl.b64 %rd100, %rd100, %r202; // drop the consumed bits
|
| 484 |
+
add.s16 %rs45, %rs37, %rs45; // consumed-bit counter += entry
|
| 485 |
+
and.b16 %rs38, %rs45, 255;
|
| 486 |
+
setp.lt.u16 %p29, %rs38, 32;
|
| 487 |
+
@%p29 bra $L__BB0_42; // fewer than 32 bits consumed: no refill needed
|
| 488 |
+
|
| 489 |
+
add.s32 %r204, %r68, %r251; // %r251 = offset of the next unread staged bytes
|
| 490 |
+
ld.volatile.shared.u8 %r205, [%r204+1280];
|
| 491 |
+
shl.b32 %r206, %r205, 24;
|
| 492 |
+
ld.volatile.shared.u8 %r207, [%r204+1281];
|
| 493 |
+
shl.b32 %r208, %r207, 16;
|
| 494 |
+
or.b32 %r209, %r208, %r206;
|
| 495 |
+
ld.volatile.shared.u8 %rs39, [%r204+1282];
|
| 496 |
+
mul.wide.u16 %r210, %rs39, 256;
|
| 497 |
+
or.b32 %r211, %r209, %r210;
|
| 498 |
+
ld.volatile.shared.u8 %r212, [%r204+1283];
|
| 499 |
+
or.b32 %r249, %r211, %r212; // next 4 staged bytes as a big-endian 32-bit word
|
| 500 |
+
add.s32 %r251, %r251, 4;
|
| 501 |
+
cvt.u64.u32 %rd85, %r249;
|
| 502 |
+
cvt.u64.u16 %rd86, %rs45;
|
| 503 |
+
and.b64 %rd87, %rd86, 255;
|
| 504 |
+
add.s64 %rd88, %rd87, 4294967264; // adds 0xFFFFFFE0; after truncation to 32 bits this is (counter - 32)
|
| 505 |
+
cvt.u32.u64 %r213, %rd88;
|
| 506 |
+
shl.b64 %rd89, %rd85, %r213; // place the new word just below the unconsumed bits
|
| 507 |
+
or.b64 %rd100, %rd89, %rd100;
|
| 508 |
+
add.s16 %rs45, %rs45, -32; // 32 fresh bits were added
|
| 509 |
+
bra.uni $L__BB0_42;
|
| 510 |
+
|
| 511 |
+
// Copy-out: after a barrier, the block's threads cooperatively copy the 16-bit results from the shared
// staging area to param_4. Thread t handles local elements t, t + %ntid.x, t + 2 * %ntid.x, ...
$L__BB0_43:
|
| 512 |
+
bar.sync 0; // every thread has finished writing its results to shared memory
|
| 513 |
+
ld.volatile.shared.u32 %r214, [%r38+1284]; // slot index %ntid.x = the block's end index (written by the last thread)
|
| 514 |
+
sub.s32 %r215, %r214, %r45; // minus the block's first index (%r45) = elements decoded by this block
|
| 515 |
+
sub.s32 %r60, %r67, %r45; // elements left before the param_7 bound
|
| 516 |
+
min.u32 %r216, %r215, %r60; // number of elements to write out
|
| 517 |
+
setp.ge.u32 %p30, %r252, %r216;
|
| 518 |
+
@%p30 bra $L__BB0_45;
|
| 519 |
+
|
| 520 |
+
$L__BB0_44:
|
| 521 |
+
shl.b32 %r217, %r252, 1;
|
| 522 |
+
add.s32 %r218, %r32, %r217; // %r32 = shared_mem + %ntid.x * param_5 + 4 * %ntid.x
|
| 523 |
+
ld.volatile.shared.u16 %rs40, [%r218+1288]; // staged result for local element %r252
|
| 524 |
+
add.s32 %r219, %r252, %r45; // global element index
|
| 525 |
+
mul.wide.u32 %rd90, %r219, 2; // 2 bytes per element
|
| 526 |
+
add.s64 %rd91, %rd7, %rd90; // %rd7 = param_4 (global)
|
| 527 |
+
st.global.u16 [%rd91], %rs40;
|
| 528 |
+
ld.volatile.shared.u32 %r220, [%r38+1284]; // the bound is re-read on every iteration (volatile load)
|
| 529 |
+
sub.s32 %r221, %r220, %r45;
|
| 530 |
+
min.u32 %r222, %r221, %r60;
|
| 531 |
+
add.s32 %r252, %r252, %r1; // advance by %ntid.x
|
| 532 |
+
setp.lt.u32 %p31, %r252, %r222;
|
| 533 |
+
@%p31 bra $L__BB0_44;
|
| 534 |
+
|
| 535 |
+
$L__BB0_45:
|
| 536 |
+
ret;
|
| 537 |
+
|
| 538 |
+
}
|
| 539 |
+
|
layer_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4c01dcac753e1bca5c633fe7df1cd6d74a3cf425b7e3364778d9c20114074a5
|
| 3 |
+
size 758603326
|
layer_1.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:294b42e51f93d30d3cc2cdeb8ed3a807a11dc775a3db6a8022374ccecc11d38a
|
| 3 |
+
size 753471698
|
layer_10.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:509815b7173d47de56adb5f53e76ba790d44201e9ef52cada89ae3db152f5b68
|
| 3 |
+
size 751783012
|
layer_11.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:909d2d5a6d13652b2243b8f1de932a07706db07a0b8313d497898785758c5baf
|
| 3 |
+
size 751751314
|
layer_12.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:358282f3a74d48c83406fd32bcf314a4b903a962f0754b1dadca2f2622bfe466
|
| 3 |
+
size 751875767
|
layer_13.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6678559cd0e32f034fd57de4ab88251a9bde6336435b82e8d4824316895ab346
|
| 3 |
+
size 751892664
|
layer_14.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43e4616f6298bef9fbce486505a580772eb0d5eb0cd27ba52ed302c0c1238ee4
|
| 3 |
+
size 751809969
|
layer_15.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74d8660201d5a6e8639f2cb51a50789b02eecdca323ef865e73166c684ddb75b
|
| 3 |
+
size 751676108
|
layer_16.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e000973dd898635ddd8fac3e86458f2f22fd2a74d36aed2f67c5dc868651456
|
| 3 |
+
size 751630430
|
layer_17.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdd7c745e2cacc59e9cd8d5de054009d417d8b9f955eaf5791d27898207ac74c
|
| 3 |
+
size 751610977
|
layer_18.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d67e93f6066e9bd30173505b4c656a3c6a0e45ea9389ad9a3a3a50e5db8681ce
|
| 3 |
+
size 751474979
|
layer_19.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e94ca47c23ee8050269a7abac3796d421c0c76e5f5bf6500813baad91e1effca
|
| 3 |
+
size 751448233
|
layer_2.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df9f492480b7ec3d8d305c7754b482ba1eb05c6c47fddfb9223cc1f0ad470c96
|
| 3 |
+
size 754325380
|
layer_20.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f1df815fdecdd4ed84dbd02a3e95ad8fd525469a426ef466ddea33e7a4ce066
|
| 3 |
+
size 751306616
|
layer_21.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ceb78dc40cbc3e7d1d0631558433f0789d23c89cf10d7ce7849ef7f0daaa9fd
|
| 3 |
+
size 751282410
|
layer_22.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12762755739b97e22179d5062857d8146d62c3888df15dc60df3a4e0d16f5308
|
| 3 |
+
size 751293389
|
layer_23.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d4597269cd56615848163bc9ce7198abe444be464c27efcaffdf5269ab4265e
|
| 3 |
+
size 751293812
|
layer_24.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4e1321b679c60332a334faf78cf1863f7b042507d85839cd95f7f37d8cb8eb1
|
| 3 |
+
size 751193486
|
layer_25.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:297d65ce9acb406acb02bac73a475a3236de3f7df3c7752bcb742e7d2ad96d49
|
| 3 |
+
size 751121450
|
layer_26.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52a2689015f0b6da486fba1b804254627710244a22c1e9a2694be4deb7c4ddc8
|
| 3 |
+
size 751052181
|
layer_27.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed31813d56c8c765a0395cfc39223675df2df2af86158eeab4889a147f3c0839
|
| 3 |
+
size 751199805
|
layer_28.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d05913a7bd23f751580e949234fbf544389c256ea079f53597e030c680dafc3
|
| 3 |
+
size 751120584
|
layer_29.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1812d691b600ae6c9184500a00ba3ff1efc01aabc330a6f5b505732d7839afc8
|
| 3 |
+
size 751093621
|
layer_3.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8312b02f653fcd7897ac1f04a672eb89cc90609db319425099fe68984e48ac26
|
| 3 |
+
size 752699382
|
layer_30.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4716d07dd28fed5c5247c0a01e36a58fcb1f8e043d98433ae01938cbb17583de
|
| 3 |
+
size 751033703
|
layer_31.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a13a1624d363b857bae780cc92359a09887c1287ef46c178847f3851f553a216
|
| 3 |
+
size 750950229
|
layer_32.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f72699b7e30a1ba233796182604123ee12de9056366ddd0730a94f279700584
|
| 3 |
+
size 750969981
|
layer_33.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cccf47a74e7b1875d71d19f3b8f3bb0c124b616f9145eb4572b03cc24a030775
|
| 3 |
+
size 751016141
|
layer_34.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74b120d0c47a012978423e49b2a5840f627e86e759d5e34fe87029f68102dc01
|
| 3 |
+
size 751080560
|
layer_35.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59025d4ade28ddc26bf39a3795d761e4997bc58481be9e0d451d2a0c959a2713
|
| 3 |
+
size 751199796
|
layer_36.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99dbfff5b2c742adeca6d759fa3c1636047bcc329545dbf4c319dad949b47800
|
| 3 |
+
size 751278784
|
layer_37.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdbf4cc5eb186ab18cd97569cd5475e17189d296f0732330ed4817214d69e77d
|
| 3 |
+
size 751610563
|
layer_38.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a6b3e620b241cfb278e2e827b38972f396bacd49e795c51b2ee2cd2fab80617
|
| 3 |
+
size 751681420
|
layer_39.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cef6e2c8fcbbf71868d5de86279b4a766351a8f2b5f8108626be35d3b5ff851
|
| 3 |
+
size 753153030
|
layer_4.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9b195a6cf53c7b650c17662a75647d0f519a5000d470a4f3d1998aae4a63b47
|
| 3 |
+
size 752543953
|
layer_5.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ef0189c5d2d781bc6e4865daf8dca279332c472ad29e2105ea6fbe410cfa27d
|
| 3 |
+
size 752135655
|
layer_6.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97e4345c015c3dbe51a26a66b6ce2b7fba1adb4dca87c3c2fdbd6558dd8af736
|
| 3 |
+
size 752335495
|
layer_7.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f2b15310a708dcc585c123b662273047a308007e052b2e9666cbe65bed10181
|
| 3 |
+
size 752082894
|
layer_8.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a31472c2e5e2316d13c7cab78db48b19d2116ad197d176e2abee8462378e802
|
| 3 |
+
size 751955327
|
layer_9.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35d8da267336d290326e310f2024ea0757ca72989e58368cef97f40be9701388
|
| 3 |
+
size 751822822
|
lm_head.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b0cd341e6c6a7ed0d21a6e8d680232c6aa35d901a43437604014c14e6fe5cec
|
| 3 |
+
size 910765294
|