Delete Llama-3.3-70B-Instruct.txt
Browse files- Llama-3.3-70B-Instruct.txt +0 -116
Llama-3.3-70B-Instruct.txt
DELETED
|
@@ -1,116 +0,0 @@
|
|
| 1 |
-
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
|
| 2 |
-
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
|
| 3 |
-
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
|
| 4 |
-
cudaLaunchKernel 13.27% 1.418s 13.27% 1.418s 3.593us 394586
|
| 5 |
-
aten::as_strided 1.08% 115.744ms 1.08% 115.744ms 0.337us 343173
|
| 6 |
-
aten::to 0.54% 57.976ms 33.12% 3.537s 13.122us 269580
|
| 7 |
-
cudaFuncSetAttribute 1.29% 137.964ms 1.29% 137.964ms 0.840us 164256
|
| 8 |
-
aten::transpose 1.04% 110.650ms 1.59% 170.262ms 1.107us 153856
|
| 9 |
-
aten::slice 1.23% 131.454ms 1.57% 167.184ms 1.340us 124799
|
| 10 |
-
aten::view 0.67% 71.079ms 0.67% 71.079ms 0.626us 113537
|
| 11 |
-
aten::copy_ 8.59% 917.312ms 29.53% 3.154s 30.919us 102023
|
| 12 |
-
aten::mul 5.14% 549.342ms 7.92% 846.232ms 9.106us 92928
|
| 13 |
-
aten::reshape 0.77% 81.773ms 4.02% 429.704ms 4.630us 92800
|
| 14 |
-
aten::_unsafe_view 0.41% 43.844ms 0.41% 43.844ms 0.474us 92416
|
| 15 |
-
aten::empty_strided 3.27% 348.892ms 3.68% 393.075ms 4.319us 91016
|
| 16 |
-
aten::_to_copy 1.19% 127.636ms 32.58% 3.479s 43.213us 80519
|
| 17 |
-
cudaEventRecord 0.47% 50.468ms 0.47% 50.468ms 0.653us 77312
|
| 18 |
-
cudaStreamWaitEvent 1.97% 210.324ms 1.97% 210.324ms 2.720us 77312
|
| 19 |
-
aten::matmul 1.48% 158.298ms 22.96% 2.452s 34.087us 71936
|
| 20 |
-
aten::linear 0.68% 73.006ms 24.23% 2.588s 36.036us 71808
|
| 21 |
-
aten::t 0.69% 73.657ms 1.43% 152.759ms 2.127us 71808
|
| 22 |
-
aten::mm 13.62% 1.455s 19.60% 2.093s 29.148us 71808
|
| 23 |
-
aten::empty 1.75% 186.943ms 1.75% 186.943ms 2.628us 71141
|
| 24 |
-
aten::add 3.44% 366.966ms 5.02% 536.139ms 8.672us 61824
|
| 25 |
-
cudaFuncGetAttributes 1.42% 151.850ms 1.42% 151.850ms 2.958us 51343
|
| 26 |
-
cudaLaunchKernelExC 1.57% 167.686ms 1.57% 167.686ms 3.267us 51328
|
| 27 |
-
aten::unsqueeze 0.40% 42.589ms 0.51% 54.686ms 1.311us 41729
|
| 28 |
-
aten::cat 3.05% 325.722ms 4.48% 478.806ms 11.626us 41184
|
| 29 |
-
cudaMemcpyAsync 2.85% 303.978ms 2.85% 303.978ms 7.537us 40329
|
| 30 |
-
aten::empty_like 0.27% 28.633ms 1.37% 146.555ms 4.673us 31360
|
| 31 |
-
cudaDeviceGetAttribute 0.08% 8.631ms 0.08% 8.631ms 0.403us 21401
|
| 32 |
-
aten::clone 0.33% 35.761ms 2.83% 301.802ms 14.291us 21119
|
| 33 |
-
aten::expand 0.29% 30.827ms 0.36% 38.344ms 1.838us 20864
|
| 34 |
-
cuLaunchKernel 0.57% 60.723ms 0.57% 60.723ms 2.947us 20608
|
| 35 |
-
aten::pow 1.54% 164.532ms 2.42% 258.327ms 12.535us 20608
|
| 36 |
-
aten::result_type 0.05% 5.705ms 0.05% 5.705ms 0.277us 20608
|
| 37 |
-
aten::mean 1.72% 183.427ms 2.39% 255.152ms 12.381us 20608
|
| 38 |
-
aten::rsqrt 1.18% 126.077ms 1.87% 199.269ms 9.669us 20608
|
| 39 |
-
aten::neg 1.28% 136.597ms 2.00% 213.198ms 10.410us 20480
|
| 40 |
-
cudaStreamIsCapturing 0.09% 9.314ms 0.09% 9.314ms 0.883us 10551
|
| 41 |
-
aten::scaled_dot_product_attention 0.70% 74.586ms 4.81% 513.450ms 50.142us 10240
|
| 42 |
-
aten::_scaled_dot_product_flash_attention 0.60% 64.179ms 4.11% 438.864ms 42.858us 10240
|
| 43 |
-
aten::_flash_attention_forward 1.14% 121.600ms 3.17% 338.182ms 33.026us 10240
|
| 44 |
-
aten::silu 0.72% 77.113ms 1.08% 115.025ms 11.233us 10240
|
| 45 |
-
cudaPeekAtLastError 0.00% 216.828us 0.00% 216.828us 0.077us 2820
|
| 46 |
-
cudaMemsetAsync 0.03% 3.515ms 0.03% 3.515ms 3.051us 1152
|
| 47 |
-
aten::item 0.01% 599.904us 0.68% 73.102ms 94.938us 770
|
| 48 |
-
aten::_local_scalar_dense 0.02% 2.496ms 0.68% 72.503ms 94.159us 770
|
| 49 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags 0.16% 16.964ms 0.16% 16.964ms 25.820us 657
|
| 50 |
-
cudaStreamSynchronize 0.62% 65.837ms 0.62% 65.837ms 101.443us 649
|
| 51 |
-
aten::fill_ 0.02% 2.516ms 0.08% 8.853ms 13.768us 643
|
| 52 |
-
aten::eq 0.04% 4.230ms 0.11% 11.692ms 18.240us 641
|
| 53 |
-
aten::select 0.01% 1.268ms 0.01% 1.562ms 3.033us 515
|
| 54 |
-
aten::is_nonzero 0.00% 270.018us 0.07% 7.411ms 19.201us 386
|
| 55 |
-
aten::masked_fill_ 0.02% 1.942ms 0.41% 43.750ms 113.933us 384
|
| 56 |
-
aten::lt 0.02% 1.824ms 0.06% 5.914ms 23.010us 257
|
| 57 |
-
aten::cumsum 0.04% 4.100ms 0.08% 8.919ms 34.703us 257
|
| 58 |
-
aten::sub 0.02% 1.757ms 0.08% 9.054ms 35.228us 257
|
| 59 |
-
aten::ge 0.02% 1.756ms 0.02% 2.529ms 9.878us 256
|
| 60 |
-
aten::resize_ 0.01% 732.984us 0.01% 732.984us 2.863us 256
|
| 61 |
-
aten::div 0.02% 1.708ms 0.07% 7.755ms 30.294us 256
|
| 62 |
-
aten::masked_fill 0.01% 777.172us 0.06% 6.097ms 23.817us 256
|
| 63 |
-
aten::softmax 0.00% 324.262us 0.09% 10.077ms 39.363us 256
|
| 64 |
-
aten::_softmax 0.01% 1.380ms 0.09% 9.753ms 38.096us 256
|
| 65 |
-
aten::max 0.02% 1.698ms 0.10% 10.692ms 41.764us 256
|
| 66 |
-
aten::bitwise_and 0.01% 1.476ms 0.08% 8.821ms 34.459us 256
|
| 67 |
-
aten::sum 0.02% 2.235ms 0.11% 11.433ms 44.662us 256
|
| 68 |
-
aten::full 0.00% 489.682us 0.02% 2.645ms 10.332us 256
|
| 69 |
-
aten::__or__ 0.00% 254.864us 0.02% 2.553ms 9.971us 256
|
| 70 |
-
aten::bitwise_or 0.01% 1.588ms 0.02% 2.298ms 8.975us 256
|
| 71 |
-
cudaOccupancyMaxActiveClusters 0.00% 95.980us 0.00% 95.980us 0.387us 248
|
| 72 |
-
aten::lift_fresh 0.00% 29.450us 0.00% 29.450us 0.221us 133
|
| 73 |
-
aten::any 0.01% 1.184ms 0.09% 9.246ms 70.579us 131
|
| 74 |
-
aten::isin 0.01% 1.499ms 0.22% 23.226ms 180.048us 129
|
| 75 |
-
aten::embedding 0.01% 610.386us 0.04% 3.806ms 29.733us 128
|
| 76 |
-
aten::index_select 0.01% 1.322ms 0.03% 2.970ms 23.204us 128
|
| 77 |
-
aten::all 0.01% 1.239ms 0.02% 1.798ms 14.048us 128
|
| 78 |
-
aten::bmm 0.66% 70.555ms 0.83% 88.254ms 689.487us 128
|
| 79 |
-
aten::cos 0.01% 1.069ms 0.02% 2.460ms 19.215us 128
|
| 80 |
-
aten::sin 0.01% 1.054ms 0.02% 2.378ms 18.579us 128
|
| 81 |
-
aten::new_ones 0.00% 363.206us 0.02% 2.356ms 18.405us 128
|
| 82 |
-
aten::new_empty 0.00% 146.671us 0.01% 643.930us 5.031us 128
|
| 83 |
-
aten::topk 0.05% 5.737ms 0.33% 35.086ms 274.110us 128
|
| 84 |
-
aten::sort 0.03% 3.500ms 0.29% 30.697ms 239.824us 128
|
| 85 |
-
aten::le 0.01% 1.203ms 0.02% 1.626ms 12.704us 128
|
| 86 |
-
aten::scatter 0.03% 3.105ms 0.07% 7.853ms 61.351us 128
|
| 87 |
-
aten::multinomial 0.03% 2.894ms 1.11% 118.961ms 929.384us 128
|
| 88 |
-
aten::min 0.01% 1.061ms 0.09% 9.143ms 71.430us 128
|
| 89 |
-
aten::exponential_ 0.01% 965.092us 0.02% 2.324ms 18.158us 128
|
| 90 |
-
aten::argmax 0.01% 778.371us 0.05% 5.145ms 40.196us 128
|
| 91 |
-
aten::squeeze 0.00% 351.418us 0.00% 381.833us 2.983us 128
|
| 92 |
-
aten::rsub 0.00% 335.894us 0.02% 1.628ms 12.719us 128
|
| 93 |
-
aten::bitwise_not 0.01% 823.513us 0.01% 1.239ms 9.676us 128
|
| 94 |
-
aten::__and__ 0.00% 109.667us 0.01% 1.253ms 9.786us 128
|
| 95 |
-
aten::index 0.01% 1.528ms 0.12% 13.014ms 102.476us 127
|
| 96 |
-
cudaMalloc 1.84% 196.481ms 1.84% 196.481ms 2.487ms 79
|
| 97 |
-
cudaDeviceCanAccessPeer 0.00% 29.525us 0.00% 29.525us 1.476us 20
|
| 98 |
-
cudaDeviceEnablePeerAccess 13.49% 1.441s 13.49% 1.441s 72.062ms 20
|
| 99 |
-
cudaGetDriverEntryPoint 0.00% 10.629us 0.00% 10.629us 0.664us 16
|
| 100 |
-
cudaOccupancyAvailableDynamicSMemPerBlock 0.00% 21.142us 0.00% 21.142us 1.321us 16
|
| 101 |
-
cudaFree 0.11% 12.251ms 0.11% 12.251ms 1.531ms 8
|
| 102 |
-
cudaGetSymbolAddress 0.00% 245.636us 0.00% 245.636us 30.705us 8
|
| 103 |
-
cudaStreamCreate 0.00% 167.896us 0.00% 167.896us 20.987us 8
|
| 104 |
-
cudaStreamDestroy 0.00% 49.125us 0.00% 49.125us 6.141us 8
|
| 105 |
-
aten::detach_ 0.00% 8.144us 0.00% 16.141us 3.228us 5
|
| 106 |
-
detach_ 0.00% 7.997us 0.00% 7.997us 1.599us 5
|
| 107 |
-
cudaHostAlloc 0.02% 1.896ms 0.02% 1.896ms 947.933us 2
|
| 108 |
-
aten::detach 0.00% 3.805us 0.00% 8.961us 4.481us 2
|
| 109 |
-
detach 0.00% 5.156us 0.00% 5.156us 2.578us 2
|
| 110 |
-
aten::resolve_conj 0.00% 0.844us 0.00% 0.844us 0.422us 2
|
| 111 |
-
aten::resolve_neg 0.00% 0.456us 0.00% 0.456us 0.228us 2
|
| 112 |
-
aten::ones 0.00% 6.404us 0.00% 61.959us 61.959us 1
|
| 113 |
-
aten::ones_like 0.00% 8.571us 0.00% 24.098us 24.098us 1
|
| 114 |
-
cudaGetDeviceCount 0.00% 0.144us 0.00% 0.144us 0.144us 1
|
| 115 |
-
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
|
| 116 |
-
Self CPU time total: 10.681s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|