AUVDiDo commited on
Commit
0bb6f61
·
verified ·
1 Parent(s): ff467ee

Delete Llama-3.3-70B-Instruct.txt

Browse files
Files changed (1) hide show
  1. Llama-3.3-70B-Instruct.txt +0 -116
Llama-3.3-70B-Instruct.txt DELETED
@@ -1,116 +0,0 @@
1
- ---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
2
- Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
3
- ---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
4
- cudaLaunchKernel 13.27% 1.418s 13.27% 1.418s 3.593us 394586
5
- aten::as_strided 1.08% 115.744ms 1.08% 115.744ms 0.337us 343173
6
- aten::to 0.54% 57.976ms 33.12% 3.537s 13.122us 269580
7
- cudaFuncSetAttribute 1.29% 137.964ms 1.29% 137.964ms 0.840us 164256
8
- aten::transpose 1.04% 110.650ms 1.59% 170.262ms 1.107us 153856
9
- aten::slice 1.23% 131.454ms 1.57% 167.184ms 1.340us 124799
10
- aten::view 0.67% 71.079ms 0.67% 71.079ms 0.626us 113537
11
- aten::copy_ 8.59% 917.312ms 29.53% 3.154s 30.919us 102023
12
- aten::mul 5.14% 549.342ms 7.92% 846.232ms 9.106us 92928
13
- aten::reshape 0.77% 81.773ms 4.02% 429.704ms 4.630us 92800
14
- aten::_unsafe_view 0.41% 43.844ms 0.41% 43.844ms 0.474us 92416
15
- aten::empty_strided 3.27% 348.892ms 3.68% 393.075ms 4.319us 91016
16
- aten::_to_copy 1.19% 127.636ms 32.58% 3.479s 43.213us 80519
17
- cudaEventRecord 0.47% 50.468ms 0.47% 50.468ms 0.653us 77312
18
- cudaStreamWaitEvent 1.97% 210.324ms 1.97% 210.324ms 2.720us 77312
19
- aten::matmul 1.48% 158.298ms 22.96% 2.452s 34.087us 71936
20
- aten::linear 0.68% 73.006ms 24.23% 2.588s 36.036us 71808
21
- aten::t 0.69% 73.657ms 1.43% 152.759ms 2.127us 71808
22
- aten::mm 13.62% 1.455s 19.60% 2.093s 29.148us 71808
23
- aten::empty 1.75% 186.943ms 1.75% 186.943ms 2.628us 71141
24
- aten::add 3.44% 366.966ms 5.02% 536.139ms 8.672us 61824
25
- cudaFuncGetAttributes 1.42% 151.850ms 1.42% 151.850ms 2.958us 51343
26
- cudaLaunchKernelExC 1.57% 167.686ms 1.57% 167.686ms 3.267us 51328
27
- aten::unsqueeze 0.40% 42.589ms 0.51% 54.686ms 1.311us 41729
28
- aten::cat 3.05% 325.722ms 4.48% 478.806ms 11.626us 41184
29
- cudaMemcpyAsync 2.85% 303.978ms 2.85% 303.978ms 7.537us 40329
30
- aten::empty_like 0.27% 28.633ms 1.37% 146.555ms 4.673us 31360
31
- cudaDeviceGetAttribute 0.08% 8.631ms 0.08% 8.631ms 0.403us 21401
32
- aten::clone 0.33% 35.761ms 2.83% 301.802ms 14.291us 21119
33
- aten::expand 0.29% 30.827ms 0.36% 38.344ms 1.838us 20864
34
- cuLaunchKernel 0.57% 60.723ms 0.57% 60.723ms 2.947us 20608
35
- aten::pow 1.54% 164.532ms 2.42% 258.327ms 12.535us 20608
36
- aten::result_type 0.05% 5.705ms 0.05% 5.705ms 0.277us 20608
37
- aten::mean 1.72% 183.427ms 2.39% 255.152ms 12.381us 20608
38
- aten::rsqrt 1.18% 126.077ms 1.87% 199.269ms 9.669us 20608
39
- aten::neg 1.28% 136.597ms 2.00% 213.198ms 10.410us 20480
40
- cudaStreamIsCapturing 0.09% 9.314ms 0.09% 9.314ms 0.883us 10551
41
- aten::scaled_dot_product_attention 0.70% 74.586ms 4.81% 513.450ms 50.142us 10240
42
- aten::_scaled_dot_product_flash_attention 0.60% 64.179ms 4.11% 438.864ms 42.858us 10240
43
- aten::_flash_attention_forward 1.14% 121.600ms 3.17% 338.182ms 33.026us 10240
44
- aten::silu 0.72% 77.113ms 1.08% 115.025ms 11.233us 10240
45
- cudaPeekAtLastError 0.00% 216.828us 0.00% 216.828us 0.077us 2820
46
- cudaMemsetAsync 0.03% 3.515ms 0.03% 3.515ms 3.051us 1152
47
- aten::item 0.01% 599.904us 0.68% 73.102ms 94.938us 770
48
- aten::_local_scalar_dense 0.02% 2.496ms 0.68% 72.503ms 94.159us 770
49
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags 0.16% 16.964ms 0.16% 16.964ms 25.820us 657
50
- cudaStreamSynchronize 0.62% 65.837ms 0.62% 65.837ms 101.443us 649
51
- aten::fill_ 0.02% 2.516ms 0.08% 8.853ms 13.768us 643
52
- aten::eq 0.04% 4.230ms 0.11% 11.692ms 18.240us 641
53
- aten::select 0.01% 1.268ms 0.01% 1.562ms 3.033us 515
54
- aten::is_nonzero 0.00% 270.018us 0.07% 7.411ms 19.201us 386
55
- aten::masked_fill_ 0.02% 1.942ms 0.41% 43.750ms 113.933us 384
56
- aten::lt 0.02% 1.824ms 0.06% 5.914ms 23.010us 257
57
- aten::cumsum 0.04% 4.100ms 0.08% 8.919ms 34.703us 257
58
- aten::sub 0.02% 1.757ms 0.08% 9.054ms 35.228us 257
59
- aten::ge 0.02% 1.756ms 0.02% 2.529ms 9.878us 256
60
- aten::resize_ 0.01% 732.984us 0.01% 732.984us 2.863us 256
61
- aten::div 0.02% 1.708ms 0.07% 7.755ms 30.294us 256
62
- aten::masked_fill 0.01% 777.172us 0.06% 6.097ms 23.817us 256
63
- aten::softmax 0.00% 324.262us 0.09% 10.077ms 39.363us 256
64
- aten::_softmax 0.01% 1.380ms 0.09% 9.753ms 38.096us 256
65
- aten::max 0.02% 1.698ms 0.10% 10.692ms 41.764us 256
66
- aten::bitwise_and 0.01% 1.476ms 0.08% 8.821ms 34.459us 256
67
- aten::sum 0.02% 2.235ms 0.11% 11.433ms 44.662us 256
68
- aten::full 0.00% 489.682us 0.02% 2.645ms 10.332us 256
69
- aten::__or__ 0.00% 254.864us 0.02% 2.553ms 9.971us 256
70
- aten::bitwise_or 0.01% 1.588ms 0.02% 2.298ms 8.975us 256
71
- cudaOccupancyMaxActiveClusters 0.00% 95.980us 0.00% 95.980us 0.387us 248
72
- aten::lift_fresh 0.00% 29.450us 0.00% 29.450us 0.221us 133
73
- aten::any 0.01% 1.184ms 0.09% 9.246ms 70.579us 131
74
- aten::isin 0.01% 1.499ms 0.22% 23.226ms 180.048us 129
75
- aten::embedding 0.01% 610.386us 0.04% 3.806ms 29.733us 128
76
- aten::index_select 0.01% 1.322ms 0.03% 2.970ms 23.204us 128
77
- aten::all 0.01% 1.239ms 0.02% 1.798ms 14.048us 128
78
- aten::bmm 0.66% 70.555ms 0.83% 88.254ms 689.487us 128
79
- aten::cos 0.01% 1.069ms 0.02% 2.460ms 19.215us 128
80
- aten::sin 0.01% 1.054ms 0.02% 2.378ms 18.579us 128
81
- aten::new_ones 0.00% 363.206us 0.02% 2.356ms 18.405us 128
82
- aten::new_empty 0.00% 146.671us 0.01% 643.930us 5.031us 128
83
- aten::topk 0.05% 5.737ms 0.33% 35.086ms 274.110us 128
84
- aten::sort 0.03% 3.500ms 0.29% 30.697ms 239.824us 128
85
- aten::le 0.01% 1.203ms 0.02% 1.626ms 12.704us 128
86
- aten::scatter 0.03% 3.105ms 0.07% 7.853ms 61.351us 128
87
- aten::multinomial 0.03% 2.894ms 1.11% 118.961ms 929.384us 128
88
- aten::min 0.01% 1.061ms 0.09% 9.143ms 71.430us 128
89
- aten::exponential_ 0.01% 965.092us 0.02% 2.324ms 18.158us 128
90
- aten::argmax 0.01% 778.371us 0.05% 5.145ms 40.196us 128
91
- aten::squeeze 0.00% 351.418us 0.00% 381.833us 2.983us 128
92
- aten::rsub 0.00% 335.894us 0.02% 1.628ms 12.719us 128
93
- aten::bitwise_not 0.01% 823.513us 0.01% 1.239ms 9.676us 128
94
- aten::__and__ 0.00% 109.667us 0.01% 1.253ms 9.786us 128
95
- aten::index 0.01% 1.528ms 0.12% 13.014ms 102.476us 127
96
- cudaMalloc 1.84% 196.481ms 1.84% 196.481ms 2.487ms 79
97
- cudaDeviceCanAccessPeer 0.00% 29.525us 0.00% 29.525us 1.476us 20
98
- cudaDeviceEnablePeerAccess 13.49% 1.441s 13.49% 1.441s 72.062ms 20
99
- cudaGetDriverEntryPoint 0.00% 10.629us 0.00% 10.629us 0.664us 16
100
- cudaOccupancyAvailableDynamicSMemPerBlock 0.00% 21.142us 0.00% 21.142us 1.321us 16
101
- cudaFree 0.11% 12.251ms 0.11% 12.251ms 1.531ms 8
102
- cudaGetSymbolAddress 0.00% 245.636us 0.00% 245.636us 30.705us 8
103
- cudaStreamCreate 0.00% 167.896us 0.00% 167.896us 20.987us 8
104
- cudaStreamDestroy 0.00% 49.125us 0.00% 49.125us 6.141us 8
105
- aten::detach_ 0.00% 8.144us 0.00% 16.141us 3.228us 5
106
- detach_ 0.00% 7.997us 0.00% 7.997us 1.599us 5
107
- cudaHostAlloc 0.02% 1.896ms 0.02% 1.896ms 947.933us 2
108
- aten::detach 0.00% 3.805us 0.00% 8.961us 4.481us 2
109
- detach 0.00% 5.156us 0.00% 5.156us 2.578us 2
110
- aten::resolve_conj 0.00% 0.844us 0.00% 0.844us 0.422us 2
111
- aten::resolve_neg 0.00% 0.456us 0.00% 0.456us 0.228us 2
112
- aten::ones 0.00% 6.404us 0.00% 61.959us 61.959us 1
113
- aten::ones_like 0.00% 8.571us 0.00% 24.098us 24.098us 1
114
- cudaGetDeviceCount 0.00% 0.144us 0.00% 0.144us 0.144us 1
115
- ---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
116
- Self CPU time total: 10.681s