Add files using upload-large-folder tool
- tmp3/Llama-3.3-70B-Instruct.csv +89 -0
- tmp3/Llama-3.3-70B-Instruct.txt +116 -0
- tmp3/report1.nsys-rep +3 -0
- tmp3/report1.sqlite +3 -0
- tmp3/run.py +43 -0
tmp3/Llama-3.3-70B-Instruct.csv
ADDED
@@ -0,0 +1,89 @@
Processing [report1.sqlite] with [/usr/local/cuda-12.6/NsightSystems-cli-2024.4.2/target-linux-x64/reports/cuda_gpu_kern_sum.py]...
Time (%),Total Time (ns),Instances,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name
82.1,5473376566,46917,116660.8,155134.0,47456,691065,59637.8,sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off_kernel__5x_cublas
2.6,175570808,18560,9459.6,9536.0,8960,11136,322.2,void cutlass::Kernel2<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)
1.7,115144769,37601,3062.3,3072.0,2271,5120,312.1,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::BinaryFunctor<c10::BFloat16, c10::BFloat16, c10::BFloat16, at::native::binary_internal::MulFunctor<float>>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
1.2,77083281,18720,4117.7,4032.0,3103,5568,576.9,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::direct_copy_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 3)]::operator ()() const::[lambda() (instance 12)]::operator ()() const::[lambda(c10::BFloat16) (instance 1)]>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
1.1,72863511,18837,3868.1,3904.0,3360,4992,256.9,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<float, at::native::MeanOps<float, float, float, float>, unsigned int, float, (int)4>>(T3)"
1.0,65670069,18880,3478.3,3520.0,2751,6145,274.5,"void at::native::<unnamed>::CatArrayBatchedCopy<at::native::<unnamed>::OpaqueType<(unsigned int)2>, unsigned int, (int)4, (int)64, (int)64>(T1 *, at::native::<unnamed>::CatArrInputTensorMetadata<T1, T2, T4, T5>, at::native::<unnamed>::TensorSizeStride<T2, (unsigned int)4>, int, T2)"
1.0,64508122,19071,3382.5,3424.0,1311,3904,215.5,"void at::native::unrolled_elementwise_kernel<at::native::direct_copy_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 3)]::operator ()() const::[lambda() (instance 12)]::operator ()() const::[lambda(c10::BFloat16) (instance 1)], at::detail::Array<char *, (int)2>, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T3, T4, T5, T6)"
1.0,63819319,37280,1711.9,1663.0,1215,3232,289.2,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::CUDAFunctor_add<c10::BFloat16>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.9,61070759,18400,3319.1,3327.0,2815,3935,205.5,"void at::native::<unnamed>::CatArrayBatchedCopy_contig<at::native::<unnamed>::OpaqueType<(unsigned int)2>, unsigned int, (int)4, (int)128, (int)1>(T1 *, at::native::<unnamed>::CatArrInputTensorMetadata<T1, T2, T4, T5>, at::native::<unnamed>::TensorSizeStride<T2, (unsigned int)4>, int, T2)"
0.8,56060276,19071,2939.6,2912.0,1600,4320,140.0,"void at::native::unrolled_elementwise_kernel<at::native::direct_copy_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 3)]::operator ()() const::[lambda() (instance 7)]::operator ()() const::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T3, T4, T5, T6)"
0.8,54023440,28036,1926.9,1855.0,1536,5151,229.2,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor<c10::BFloat16, c10::BFloat16, c10::BFloat16, at::native::binary_internal::MulFunctor<float>>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.8,53346719,18720,2849.7,2848.0,2367,4128,334.1,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::neg_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 2)]::operator ()() const::[lambda() (instance 9)]::operator ()() const::[lambda(c10::BFloat16) (instance 1)]>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.7,48946665,6080,8050.4,8352.0,6144,8640,749.5,"void pytorch_flash::flash_fwd_kernel<pytorch_flash::Flash_fwd_kernel_traits<(int)128, (int)128, (int)64, (int)4, (bool)0, (bool)0, cutlass::bfloat16_t, pytorch_flash::Flash_kernel_traits<(int)128, (int)128, (int)64, (int)4, cutlass::bfloat16_t>>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0>(pytorch_flash::Flash_fwd_params)"
0.7,46533021,18837,2470.3,2464.0,2240,3616,104.6,"void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast<at::native::BinaryFunctor<float, float, float, at::native::binary_internal::MulFunctor<float>>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.6,42061064,18720,2246.9,2240.0,2143,3040,85.1,"void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams<T6>, const T4 *, const T5 *, T5 *, const T6 *, const T6 *, const T7 *, const T4 *, T7 *, void *, long, T6 *, int *)"
0.5,32922283,18837,1747.7,1728.0,1280,2369,157.6,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 2)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.5,32854413,18837,1744.1,1729.0,1279,2592,150.6,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::CUDAFunctorOnSelf_add<float>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.4,29117804,18837,1545.8,1536.0,1471,2624,81.5,"void at::native::vectorized_elementwise_kernel<(int)4, void at::native::<unnamed>::pow_tensor_scalar_kernel_impl<float, float>(at::TensorIteratorBase &, T2)::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.3,21731120,9360,2321.7,2304.0,2080,4256,187.9,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::<unnamed>::silu_kernel(at::TensorIteratorBase &)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 6)]::operator ()() const::[lambda(c10::BFloat16) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.3,20063273,3200,6269.8,6272.0,5952,6624,110.2,"void pytorch_flash::flash_fwd_splitkv_kernel<pytorch_flash::Flash_fwd_kernel_traits<(int)128, (int)64, (int)128, (int)4, (bool)0, (bool)0, cutlass::bfloat16_t, pytorch_flash::Flash_kernel_traits<(int)128, (int)64, (int)128, (int)4, cutlass::bfloat16_t>>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)1, (bool)0>(pytorch_flash::Flash_fwd_params)"
0.2,12733119,234,54415.0,54400.0,53376,55647,413.7,"void at::native::<unnamed>::cunn_SoftMaxForward<(int)4, float, float, float, at::native::<unnamed>::SoftMaxForwardEpilogue>(T4 *, const T2 *, int)"
0.1,8760909,3200,2737.8,2752.0,2527,2912,72.7,"void pytorch_flash::flash_fwd_splitkv_combine_kernel<pytorch_flash::Flash_fwd_kernel_traits<(int)128, (int)64, (int)128, (int)4, (bool)0, (bool)0, cutlass::bfloat16_t, pytorch_flash::Flash_kernel_traits<(int)128, (int)64, (int)128, (int)4, cutlass::bfloat16_t>>, (int)4, (int)1, (bool)1>(pytorch_flash::Flash_fwd_params)"
0.1,4589018,468,9805.6,9760.5,9184,10688,278.1,"void at::native::mbtopk::radixFindKthValues<float, unsigned int, unsigned int, (int)1>(at::cuda::detail::TensorInfo<const T1, T2>, unsigned int, unsigned int *, unsigned int, T2, int, int, unsigned int, T3, unsigned int *, T3 *, short *, T1 *)"
0.1,4353862,468,9303.1,9376.0,8320,11231,614.2,"void at_cuda_detail::cub::DeviceRadixSortOnesweepKernel<at_cuda_detail::cub::DeviceRadixSortPolicy<float, at::cuda::cub::detail::OpaqueType<(int)8>, unsigned long long>::Policy900, (bool)0, float, at::cuda::cub::detail::OpaqueType<(int)8>, unsigned long long, int, int, at_cuda_detail::cub::detail::identity_decomposer_t>(T7 *, T7 *, T5 *, const T5 *, T3 *, const T3 *, T4 *, const T4 *, T6, int, int, T8)"
0.1,3345970,117,28598.0,28672.0,27073,29601,551.5,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<float, at::native::ArgMaxOps<float>, unsigned int, long, (int)4>>(T3)"
0.0,1650221,160,10313.9,10320.5,9728,11521,362.2,void cutlass::Kernel2<cutlass_80_tensorop_s16816gemm_bf16_128x64_64x6_tn_align8>(T1::Params)
0.0,1644379,117,14054.5,14112.0,13633,14751,258.8,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::MinNanFunctor<float>>, unsigned int, float, (int)4>>(T3)"
0.0,1640787,117,14023.8,14016.0,13663,14496,243.0,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::MaxNanFunctor<float>>, unsigned int, float, (int)4>>(T3)"
0.0,1625400,117,13892.3,13920.0,13471,14400,254.4,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, float, (int)4>>(T3)"
0.0,1146431,468,2449.6,2432.0,2336,2816,67.6,"void at::native::mbtopk::computeBlockwiseWithinKCounts<unsigned int>(T1 *, short *, unsigned int, int, bool, unsigned int *, unsigned int)"
0.0,746519,80,9331.5,9392.0,8192,10112,369.4,"void pytorch_flash::flash_fwd_kernel<pytorch_flash::Flash_fwd_kernel_traits<(int)128, (int)128, (int)64, (int)4, (bool)0, (bool)0, cutlass::bfloat16_t, pytorch_flash::Flash_kernel_traits<(int)128, (int)128, (int)64, (int)4, cutlass::bfloat16_t>>, (bool)0, (bool)1, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0>(pytorch_flash::Flash_fwd_params)"
0.0,738617,234,3156.5,3167.0,2848,3456,114.0,"void at_cuda_detail::cub::DeviceScanByKeyKernel<at_cuda_detail::cub::DeviceScanByKeyPolicy<at_cuda_detail::cub::TransformInputIterator<unsigned int, at::native::mbtopk::BlockIdxToKey, at_cuda_detail::cub::CountingInputIterator<unsigned int, unsigned int>, long>, unsigned int, unsigned int, cuda::std::__4::plus<void>>::Policy900, at_cuda_detail::cub::TransformInputIterator<unsigned int, at::native::mbtopk::BlockIdxToKey, at_cuda_detail::cub::CountingInputIterator<unsigned int, unsigned int>, long>, unsigned int *, unsigned int *, at_cuda_detail::cub::ReduceByKeyScanTileState<unsigned int, int, (bool)1>, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, at_cuda_detail::cub::NullType, int, unsigned int, unsigned int>(T2, T11 *, T3, T4, T5, int, T6, T7, T8, T9)"
0.0,689337,117,5891.8,5920.0,5471,6368,213.5,"void at::native::radixSortKVInPlace<(int)-2, (int)-1, (int)32, (int)4, float, long, unsigned int>(at::cuda::detail::TensorInfo<T5, T7>, T7, T7, T7, at::cuda::detail::TensorInfo<T6, T7>, T7, bool)"
0.0,629725,234,2691.1,2784.0,2208,3424,300.5,"void at::native::<unnamed>::CatArrayBatchedCopy_aligned16_contig<at::native::<unnamed>::OpaqueType<(unsigned int)8>, unsigned int, (int)2, (int)128, (int)1>(T1 *, at::native::<unnamed>::CatArrInputTensorMetadata<T1, T2, T4, T5>, at::native::<unnamed>::TensorSizeStride<T2, (unsigned int)4>, int, T2)"
0.0,606459,117,5183.4,5279.0,4512,5408,203.0,"void at::native::mbtopk::gatherTopK<float, unsigned int, (int)1>(at::cuda::detail::TensorInfo<const T1, T2>, T2, T2, bool, unsigned int, T2, at::cuda::detail::TensorInfo<T1, T2>, T2, at::cuda::detail::TensorInfo<long, T2>, T2, unsigned int, unsigned int, T1 *, unsigned int *, unsigned int *, unsigned int)"
0.0,566850,351,1615.0,1568.0,1280,2176,234.8,"void at::native::vectorized_elementwise_kernel<(int)4, void at::native::compare_scalar_kernel<float>(at::TensorIteratorBase &, at::native::<unnamed>::OpType, T1)::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,543898,160,3399.4,3392.0,3072,3808,254.5,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.0,503368,351,1434.1,1472.0,1311,1569,96.9,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor<long, long, bool, at::native::<unnamed>::CompareEqFunctor<long>>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,472540,117,4038.8,4032.0,3905,4192,43.0,"void at::native::_scatter_gather_elementwise_kernel<(int)128, (int)4, void at::native::_cuda_scatter_gather_internal_kernel<(bool)1, at::native::OpaqueType<(int)1>>::operator ()<at::native::TensorAssign>(at::TensorIterator &, long, long, long, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.0,429087,352,1219.0,1264.0,1087,1600,98.2,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::FillFunctor<bool>, at::detail::Array<char *, (int)1>>(int, T2, T3)"
0.0,427098,117,3650.4,3648.0,3615,3712,18.8,"void at_cuda_detail::cub::DeviceRadixSortHistogramKernel<at_cuda_detail::cub::DeviceRadixSortPolicy<float, at::cuda::cub::detail::OpaqueType<(int)8>, unsigned long long>::Policy900, (bool)0, float, unsigned long long, at_cuda_detail::cub::detail::identity_decomposer_t>(T4 *, const T3 *, T4, int, int, T5)"
0.0,421793,234,1802.5,1792.0,1727,1857,31.8,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::<unnamed>::masked_fill_kernel(at::TensorIterator &, const c10::Scalar &)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 7)]::operator ()() const::[lambda(float, bool) (instance 1)], at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,419424,117,3584.8,3584.0,3360,4000,130.6,"void at_cuda_detail::cub::DeviceScanKernel<at_cuda_detail::cub::DeviceScanPolicy<float, std::plus<float>>::Policy900, const float *, float *, at_cuda_detail::cub::ScanTileState<float, (bool)1>, std::plus<float>, at_cuda_detail::cub::NullType, int, float>(T2, T3, T4, int, T5, T6, T7)"
0.0,419098,116,3612.9,3663.5,3328,4064,150.5,void cutlass::Kernel2<cutlass_80_simt_sgemm_128x32_8x5_tn_align1>(T1::Params)
0.0,379354,117,3242.3,3263.0,2816,3776,234.5,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<bool, at::native::func_wrapper_t<bool, at::native::and_kernel_cuda(at::TensorIterator &)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 12)]::operator ()() const::[lambda(bool, bool) (instance 1)]>, unsigned int, bool, (int)4>>(T3)"
0.0,373581,234,1596.5,1600.0,1280,2080,185.6,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::CUDAFunctorOnSelf_add<long>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,371615,120,3096.8,3120.0,2784,3583,207.0,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<bool, at::native::func_wrapper_t<bool, at::native::or_kernel_cuda(at::TensorIterator &)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 12)]::operator ()() const::[lambda(bool, bool) (instance 1)]>, unsigned int, bool, (int)4>>(T3)"
0.0,366906,234,1568.0,1600.0,1376,1920,119.5,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor<long, long, long, at::native::binary_internal::MulFunctor<long>>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,362405,117,3097.5,3104.0,2976,3232,43.5,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::<unnamed>::CompareFunctor<float>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.0,357757,234,1528.9,1567.5,1376,1823,114.2,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor<float, float, float, at::native::binary_internal::MulFunctor<float>>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,350008,117,2991.5,3007.0,2943,3040,24.0,"void at::native::<unnamed>::distribution_elementwise_grid_stride_kernel<float, (int)4, void at::native::templates::cuda::uniform_and_transform<float, float, (unsigned long)4, at::CUDAGeneratorImpl *, void at::native::templates::cuda::exponential_kernel<at::CUDAGeneratorImpl *>(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, T5)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::<unnamed>::distribution_nullary_kernel<float, float, (int)4, at::CUDAGeneratorImpl *, void at::native::templates::cuda::uniform_and_transform<float, float, (unsigned long)4, at::CUDAGeneratorImpl *, void at::native::templates::cuda::exponential_kernel<at::CUDAGeneratorImpl *>(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, T5)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::templates::cuda::exponential_kernel<at::CUDAGeneratorImpl *>(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, const T5 &, T6)::[lambda(int, float) (instance 1)]>(int, at::PhiloxCudaState, T3, T4)"
0.0,342395,234,1463.2,1488.0,1311,1632,100.1,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor<bool, bool, bool, at::native::BitwiseOrFunctor<bool>>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,335393,116,2891.3,2848.0,2720,3424,102.6,"void at::native::<unnamed>::indexSelectSmallIndex<c10::BFloat16, long, unsigned int, (int)2, (int)2, (int)-2>(at::cuda::detail::TensorInfo<T1, T3>, at::cuda::detail::TensorInfo<const T1, T3>, at::cuda::detail::TensorInfo<const T2, T3>, int, int, T3, long)"
0.0,328441,117,2807.2,2911.0,2464,3200,228.4,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<long, at::native::func_wrapper_t<long, at::native::MaxNanFunctor<long>>, unsigned int, long, (int)4>>(T3)"
0.0,326081,234,1393.5,1424.0,1215,1569,146.3,"void at_cuda_detail::cub::DeviceScanByKeyInitKernel<at_cuda_detail::cub::ReduceByKeyScanTileState<unsigned int, int, (bool)1>, at_cuda_detail::cub::TransformInputIterator<unsigned int, at::native::mbtopk::BlockIdxToKey, at_cuda_detail::cub::CountingInputIterator<unsigned int, unsigned int>, long>>(T1, T2, std::iterator_traits<T2>::value_type *, unsigned int, int)"
0.0,322298,117,2754.7,2848.0,2432,3168,213.5,"void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp<long, at::native::func_wrapper_t<long, at::native::sum_functor<long, long, long>::operator ()(at::TensorIterator &)::[lambda(long, long) (instance 1)]>, unsigned int, long, (int)4>>(T3)"
0.0,301087,116,2595.6,2608.0,2208,3008,255.9,"void at::native::index_elementwise_kernel<(int)128, (int)4, void at::native::gpu_index_kernel<void at::native::index_kernel_impl<at::native::OpaqueType<(int)8>>(at::TensorIteratorBase &, c10::ArrayRef<long>, c10::ArrayRef<long>)::[lambda(char *, const char *, long) (instance 1)]>(at::TensorIteratorBase &, c10::ArrayRef<long>, c10::ArrayRef<long>, const T1 &)::[lambda(int) (instance 1)]>(long, T3)"
0.0,282998,117,2418.8,2432.0,2335,2497,46.6,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor<float, float, float, at::native::binary_internal::DivFunctor<float>>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,282531,116,2435.6,2512.0,2111,2912,272.9,"void at::native::<unnamed>::CatArrayBatchedCopy_aligned16_contig<at::native::<unnamed>::OpaqueType<(unsigned int)4>, unsigned int, (int)3, (int)128, (int)1>(T1 *, at::native::<unnamed>::CatArrInputTensorMetadata<T1, T2, T4, T5>, at::native::<unnamed>::TensorSizeStride<T2, (unsigned int)4>, int, T2)"
0.0,249796,118,2116.9,2160.0,1856,2433,170.5,"void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast<at::native::BinaryFunctor<long, long, bool, at::native::<unnamed>::CompareEqFunctor<long>>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)"
0.0,241437,118,2046.1,2047.0,1792,2432,87.4,"void at_cuda_detail::cub::DeviceScanKernel<at_cuda_detail::cub::DeviceScanPolicy<long, std::plus<long>>::Policy900, const long *, long *, at_cuda_detail::cub::ScanTileState<long, (bool)1>, std::plus<long>, at_cuda_detail::cub::NullType, int, long>(T2, T3, T4, int, T5, T6, T7)"
0.0,240282,117,2053.7,2048.0,1984,2112,22.1,"at::native::<unnamed>::fill_reverse_indices_kernel(long *, int, at::cuda::detail::IntDivider<unsigned int>)"
0.0,222944,117,1905.5,1920.0,1760,2176,100.4,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::cos_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 2)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,221632,117,1894.3,1920.0,1728,2208,105.4,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::sin_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 2)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,217469,117,1858.7,1984.0,1663,2080,179.3,"void at::native::mbtopk::computeBlockwiseKthCounts<unsigned int>(T1 *, short *, unsigned int, unsigned int, unsigned int *)"
0.0,217211,117,1856.5,1856.0,1663,2080,76.4,"void at::native::unrolled_elementwise_kernel<at::native::BinaryFunctor<long, long, long, at::native::BitwiseAndFunctor<long>>, at::detail::Array<char *, (int)3>, TrivialOffsetCalculator<(int)2, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)2>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T3, T4, T5, T6)"
0.0,208992,117,1786.3,1824.0,1663,2112,100.8,"void at::native::vectorized_elementwise_kernel<(int)4, void at::native::compare_scalar_kernel<long>(at::TensorIteratorBase &, at::native::<unnamed>::OpType, T1)::[lambda(long) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,199384,117,1704.1,1696.0,1663,2112,58.0,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BUnaryFunctor<float, float, float, at::native::binary_internal::MulFunctor<float>>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,194342,117,1661.0,1664.0,1472,1793,71.2,"void at::native::unrolled_elementwise_kernel<at::native::direct_copy_kernel_cuda(at::TensorIteratorBase &)::[lambda() (instance 3)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(long) (instance 1)], at::detail::Array<char *, (int)2>, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T3, T4, T5, T6)"
0.0,190154,117,1625.2,1600.0,1503,1729,92.4,"void at_cuda_detail::cub::DeviceRadixSortExclusiveSumKernel<at_cuda_detail::cub::DeviceRadixSortPolicy<float, at::cuda::cub::detail::OpaqueType<(int)8>, unsigned long long>::Policy900, unsigned long long>(T2 *)"
0.0,187007,117,1598.4,1568.0,1313,2177,176.1,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::<unnamed>::masked_fill_kernel(at::TensorIterator &, const c10::Scalar &)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(long, bool) (instance 1)], at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,184958,117,1580.8,1632.0,1408,1952,160.4,"void at::native::mbtopk::fill<unsigned int, unsigned int>(T1 *, T1, T2)"
0.0,184767,117,1579.2,1568.0,1311,1856,140.0,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::CUDAFunctor_add<long>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,179481,117,1534.0,1567.0,1376,2048,116.7,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor<bool, bool, bool, at::native::BitwiseAndFunctor<bool>>, at::detail::Array<char *, (int)3>>(int, T2, T3)"
0.0,169534,117,1449.0,1408.0,1312,1631,97.4,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::bitwise_not_kernel_cuda(at::TensorIteratorBase &)::[lambda(bool) (instance 1)], at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,167964,118,1423.4,1360.0,1248,1600,143.8,"void at_cuda_detail::cub::DeviceScanInitKernel<at_cuda_detail::cub::ScanTileState<long, (bool)1>>(T1, int)"
0.0,167266,119,1405.6,1440.0,1280,1728,109.1,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::FillFunctor<long>, at::detail::Array<char *, (int)1>>(int, T2, T3)"
0.0,166104,117,1419.7,1503.0,1247,1600,143.6,"void at_cuda_detail::cub::DeviceScanInitKernel<at_cuda_detail::cub::ScanTileState<float, (bool)1>>(T1, int)"
0.0,165696,117,1416.2,1376.0,1280,1537,93.7,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::AUnaryFunctor<float, float, bool, at::native::<unnamed>::CompareEqFunctor<float>>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,164197,117,1403.4,1408.0,1280,1536,93.4,"void at::native::vectorized_elementwise_kernel<(int)4, at::native::CUDAFunctorOnOther_add<long>, at::detail::Array<char *, (int)2>>(int, T2, T3)"
0.0,142269,117,1216.0,1184.0,1088,1344,96.5,"void at::native::unrolled_elementwise_kernel<at::native::FillFunctor<bool>, at::detail::Array<char *, (int)1>, TrivialOffsetCalculator<(int)0, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T3, T4, T5, T6)"
0.0,5888,1,5888.0,5888.0,5888,5888,0.0,"void at::native::<unnamed>::indexSelectLargeIndex<c10::BFloat16, long, unsigned int, (int)2, (int)2, (int)-2, (bool)1>(at::cuda::detail::TensorInfo<T1, T3>, at::cuda::detail::TensorInfo<const T1, T3>, at::cuda::detail::TensorInfo<const T2, T3>, int, int, T3, T3, long)"
0.0,3232,1,3232.0,3232.0,3232,3232,0.0,void cutlass::Kernel2<cutlass_80_simt_sgemm_128x32_8x5_nn_align1>(T1::Params)
0.0,2720,1,2720.0,2720.0,2720,2720,0.0,"void at::native::<unnamed>::CatArrayBatchedCopy<at::native::<unnamed>::OpaqueType<(unsigned int)4>, unsigned int, (int)3, (int)64, (int)64>(T1 *, at::native::<unnamed>::CatArrInputTensorMetadata<T1, T2, T4, T5>, at::native::<unnamed>::TensorSizeStride<T2, (unsigned int)4>, int, T2)"
0.0,1728,1,1728.0,1728.0,1728,1728,0.0,"void at::native::unrolled_elementwise_kernel<void at::native::compare_scalar_kernel<long>(at::TensorIteratorBase &, at::native::<unnamed>::OpType, T1)::[lambda(long) (instance 1)], at::detail::Array<char *, (int)2>, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T3, T4, T5, T6)"
0.0,1409,1,1409.0,1409.0,1409,1409,0.0,"void at::native::unrolled_elementwise_kernel<at::native::CUDAFunctorOnSelf_add<long>, at::detail::Array<char *, (int)2>, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T3, T4, T5, T6)"
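Note: the CSV above is raw nsys stats output (hence the "Processing [...]" banner ahead of the header row), so it can be regenerated and inspected programmatically. A minimal sketch, assuming nsys 2024.x is on PATH and report1.sqlite sits in the working directory:

import csv
import subprocess

# Re-run the cuda_gpu_kern_sum report against the exported SQLite database;
# --format csv makes the body machine-readable.
out = subprocess.run(
    ["nsys", "stats", "--report", "cuda_gpu_kern_sum", "--format", "csv", "report1.sqlite"],
    capture_output=True, text=True, check=True,
).stdout

# Skip the "Processing [...]" banner that precedes the CSV header.
lines = out.splitlines()
start = next(i for i, line in enumerate(lines) if line.startswith("Time (%)"))
rows = list(csv.DictReader(lines[start:]))

# The first row dominates: the sm90 cuBLAS bf16 GEMM holds ~82% of total GPU time.
print(rows[0]["Time (%)"], rows[0]["Instances"], rows[0]["Name"])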
tmp3/Llama-3.3-70B-Instruct.txt
ADDED
@@ -0,0 +1,116 @@
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
cudaLaunchKernel 13.27% 1.418s 13.27% 1.418s 3.593us 394586
aten::as_strided 1.08% 115.744ms 1.08% 115.744ms 0.337us 343173
aten::to 0.54% 57.976ms 33.12% 3.537s 13.122us 269580
cudaFuncSetAttribute 1.29% 137.964ms 1.29% 137.964ms 0.840us 164256
aten::transpose 1.04% 110.650ms 1.59% 170.262ms 1.107us 153856
aten::slice 1.23% 131.454ms 1.57% 167.184ms 1.340us 124799
aten::view 0.67% 71.079ms 0.67% 71.079ms 0.626us 113537
aten::copy_ 8.59% 917.312ms 29.53% 3.154s 30.919us 102023
aten::mul 5.14% 549.342ms 7.92% 846.232ms 9.106us 92928
aten::reshape 0.77% 81.773ms 4.02% 429.704ms 4.630us 92800
aten::_unsafe_view 0.41% 43.844ms 0.41% 43.844ms 0.474us 92416
aten::empty_strided 3.27% 348.892ms 3.68% 393.075ms 4.319us 91016
aten::_to_copy 1.19% 127.636ms 32.58% 3.479s 43.213us 80519
cudaEventRecord 0.47% 50.468ms 0.47% 50.468ms 0.653us 77312
cudaStreamWaitEvent 1.97% 210.324ms 1.97% 210.324ms 2.720us 77312
aten::matmul 1.48% 158.298ms 22.96% 2.452s 34.087us 71936
aten::linear 0.68% 73.006ms 24.23% 2.588s 36.036us 71808
aten::t 0.69% 73.657ms 1.43% 152.759ms 2.127us 71808
aten::mm 13.62% 1.455s 19.60% 2.093s 29.148us 71808
aten::empty 1.75% 186.943ms 1.75% 186.943ms 2.628us 71141
aten::add 3.44% 366.966ms 5.02% 536.139ms 8.672us 61824
cudaFuncGetAttributes 1.42% 151.850ms 1.42% 151.850ms 2.958us 51343
cudaLaunchKernelExC 1.57% 167.686ms 1.57% 167.686ms 3.267us 51328
aten::unsqueeze 0.40% 42.589ms 0.51% 54.686ms 1.311us 41729
aten::cat 3.05% 325.722ms 4.48% 478.806ms 11.626us 41184
cudaMemcpyAsync 2.85% 303.978ms 2.85% 303.978ms 7.537us 40329
aten::empty_like 0.27% 28.633ms 1.37% 146.555ms 4.673us 31360
cudaDeviceGetAttribute 0.08% 8.631ms 0.08% 8.631ms 0.403us 21401
aten::clone 0.33% 35.761ms 2.83% 301.802ms 14.291us 21119
aten::expand 0.29% 30.827ms 0.36% 38.344ms 1.838us 20864
cuLaunchKernel 0.57% 60.723ms 0.57% 60.723ms 2.947us 20608
aten::pow 1.54% 164.532ms 2.42% 258.327ms 12.535us 20608
aten::result_type 0.05% 5.705ms 0.05% 5.705ms 0.277us 20608
aten::mean 1.72% 183.427ms 2.39% 255.152ms 12.381us 20608
aten::rsqrt 1.18% 126.077ms 1.87% 199.269ms 9.669us 20608
aten::neg 1.28% 136.597ms 2.00% 213.198ms 10.410us 20480
cudaStreamIsCapturing 0.09% 9.314ms 0.09% 9.314ms 0.883us 10551
aten::scaled_dot_product_attention 0.70% 74.586ms 4.81% 513.450ms 50.142us 10240
aten::_scaled_dot_product_flash_attention 0.60% 64.179ms 4.11% 438.864ms 42.858us 10240
aten::_flash_attention_forward 1.14% 121.600ms 3.17% 338.182ms 33.026us 10240
aten::silu 0.72% 77.113ms 1.08% 115.025ms 11.233us 10240
cudaPeekAtLastError 0.00% 216.828us 0.00% 216.828us 0.077us 2820
cudaMemsetAsync 0.03% 3.515ms 0.03% 3.515ms 3.051us 1152
aten::item 0.01% 599.904us 0.68% 73.102ms 94.938us 770
aten::_local_scalar_dense 0.02% 2.496ms 0.68% 72.503ms 94.159us 770
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags 0.16% 16.964ms 0.16% 16.964ms 25.820us 657
cudaStreamSynchronize 0.62% 65.837ms 0.62% 65.837ms 101.443us 649
aten::fill_ 0.02% 2.516ms 0.08% 8.853ms 13.768us 643
aten::eq 0.04% 4.230ms 0.11% 11.692ms 18.240us 641
aten::select 0.01% 1.268ms 0.01% 1.562ms 3.033us 515
aten::is_nonzero 0.00% 270.018us 0.07% 7.411ms 19.201us 386
aten::masked_fill_ 0.02% 1.942ms 0.41% 43.750ms 113.933us 384
aten::lt 0.02% 1.824ms 0.06% 5.914ms 23.010us 257
aten::cumsum 0.04% 4.100ms 0.08% 8.919ms 34.703us 257
aten::sub 0.02% 1.757ms 0.08% 9.054ms 35.228us 257
aten::ge 0.02% 1.756ms 0.02% 2.529ms 9.878us 256
aten::resize_ 0.01% 732.984us 0.01% 732.984us 2.863us 256
aten::div 0.02% 1.708ms 0.07% 7.755ms 30.294us 256
aten::masked_fill 0.01% 777.172us 0.06% 6.097ms 23.817us 256
aten::softmax 0.00% 324.262us 0.09% 10.077ms 39.363us 256
aten::_softmax 0.01% 1.380ms 0.09% 9.753ms 38.096us 256
aten::max 0.02% 1.698ms 0.10% 10.692ms 41.764us 256
aten::bitwise_and 0.01% 1.476ms 0.08% 8.821ms 34.459us 256
aten::sum 0.02% 2.235ms 0.11% 11.433ms 44.662us 256
aten::full 0.00% 489.682us 0.02% 2.645ms 10.332us 256
aten::__or__ 0.00% 254.864us 0.02% 2.553ms 9.971us 256
aten::bitwise_or 0.01% 1.588ms 0.02% 2.298ms 8.975us 256
cudaOccupancyMaxActiveClusters 0.00% 95.980us 0.00% 95.980us 0.387us 248
aten::lift_fresh 0.00% 29.450us 0.00% 29.450us 0.221us 133
aten::any 0.01% 1.184ms 0.09% 9.246ms 70.579us 131
aten::isin 0.01% 1.499ms 0.22% 23.226ms 180.048us 129
aten::embedding 0.01% 610.386us 0.04% 3.806ms 29.733us 128
aten::index_select 0.01% 1.322ms 0.03% 2.970ms 23.204us 128
aten::all 0.01% 1.239ms 0.02% 1.798ms 14.048us 128
aten::bmm 0.66% 70.555ms 0.83% 88.254ms 689.487us 128
aten::cos 0.01% 1.069ms 0.02% 2.460ms 19.215us 128
aten::sin 0.01% 1.054ms 0.02% 2.378ms 18.579us 128
aten::new_ones 0.00% 363.206us 0.02% 2.356ms 18.405us 128
aten::new_empty 0.00% 146.671us 0.01% 643.930us 5.031us 128
aten::topk 0.05% 5.737ms 0.33% 35.086ms 274.110us 128
aten::sort 0.03% 3.500ms 0.29% 30.697ms 239.824us 128
aten::le 0.01% 1.203ms 0.02% 1.626ms 12.704us 128
aten::scatter 0.03% 3.105ms 0.07% 7.853ms 61.351us 128
aten::multinomial 0.03% 2.894ms 1.11% 118.961ms 929.384us 128
aten::min 0.01% 1.061ms 0.09% 9.143ms 71.430us 128
aten::exponential_ 0.01% 965.092us 0.02% 2.324ms 18.158us 128
aten::argmax 0.01% 778.371us 0.05% 5.145ms 40.196us 128
aten::squeeze 0.00% 351.418us 0.00% 381.833us 2.983us 128
aten::rsub 0.00% 335.894us 0.02% 1.628ms 12.719us 128
aten::bitwise_not 0.01% 823.513us 0.01% 1.239ms 9.676us 128
aten::__and__ 0.00% 109.667us 0.01% 1.253ms 9.786us 128
aten::index 0.01% 1.528ms 0.12% 13.014ms 102.476us 127
cudaMalloc 1.84% 196.481ms 1.84% 196.481ms 2.487ms 79
cudaDeviceCanAccessPeer 0.00% 29.525us 0.00% 29.525us 1.476us 20
cudaDeviceEnablePeerAccess 13.49% 1.441s 13.49% 1.441s 72.062ms 20
cudaGetDriverEntryPoint 0.00% 10.629us 0.00% 10.629us 0.664us 16
cudaOccupancyAvailableDynamicSMemPerBlock 0.00% 21.142us 0.00% 21.142us 1.321us 16
cudaFree 0.11% 12.251ms 0.11% 12.251ms 1.531ms 8
cudaGetSymbolAddress 0.00% 245.636us 0.00% 245.636us 30.705us 8
cudaStreamCreate 0.00% 167.896us 0.00% 167.896us 20.987us 8
cudaStreamDestroy 0.00% 49.125us 0.00% 49.125us 6.141us 8
aten::detach_ 0.00% 8.144us 0.00% 16.141us 3.228us 5
detach_ 0.00% 7.997us 0.00% 7.997us 1.599us 5
cudaHostAlloc 0.02% 1.896ms 0.02% 1.896ms 947.933us 2
aten::detach 0.00% 3.805us 0.00% 8.961us 4.481us 2
detach 0.00% 5.156us 0.00% 5.156us 2.578us 2
aten::resolve_conj 0.00% 0.844us 0.00% 0.844us 0.422us 2
aten::resolve_neg 0.00% 0.456us 0.00% 0.456us 0.228us 2
aten::ones 0.00% 6.404us 0.00% 61.959us 61.959us 1
aten::ones_like 0.00% 8.571us 0.00% 24.098us 24.098us 1
cudaGetDeviceCount 0.00% 0.144us 0.00% 0.144us 0.144us 1
---------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.681s
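Rather than re-parsing this fixed-width text, the same aggregates are available directly from the profiler object while run.py is still running. A short sketch, assuming the profile object p from run.py is in scope:

# key_averages() yields FunctionEventAvg entries with .key, .count, and
# .self_cpu_time_total (in microseconds).
events = p.key_averages()
total_self_us = sum(e.self_cpu_time_total for e in events)
launch = next(e for e in events if e.key == "cudaLaunchKernel")
print(f"cudaLaunchKernel: {launch.count} calls, "
      f"{100 * launch.self_cpu_time_total / total_self_us:.2f}% of self CPU time")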
tmp3/report1.nsys-rep
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:15e00468457559c63991c4b19ec86ed122486a4104e96986d23b956c67a6e25d
size 34269530
tmp3/report1.sqlite
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf21594ce3ea0d2727a99c1663e7d97f61d9d28c4c109db444bef4992736e4de
size 117813248
tmp3/run.py
ADDED
@@ -0,0 +1,43 @@
import transformers
import torch
from modelscope import snapshot_download

model_id = snapshot_download("LLM-Research/Llama-3.3-70B-Instruct")

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]


with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        # torch.profiler.ProfilerActivity.CUDA,  # enabling CPU alone is enough to capture aten function calls
    ],
    #record_shapes=True,
    #with_stack=True,
) as p:
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    print(outputs[0]["generated_text"][-1])

table_str = p.key_averages().table(
    sort_by="count",
    row_limit=-1,
    max_src_column_width=100,
    max_name_column_width=100,  # limit the column width
)
with open("Llama-3.3-70B-Instruct.txt", "wt") as f:
    f.write(table_str)
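run.py itself only produces the operator table above; the report1.nsys-rep and report1.sqlite files in this commit come from a separate Nsight Systems capture. A plausible invocation (the actual flags are not recorded in this commit):

import subprocess

# Capture a timeline into report1.nsys-rep; nsys stats then exports
# report1.sqlite from it on first use.
subprocess.run(["nsys", "profile", "-o", "report1", "python", "run.py"], check=True)
subprocess.run(["nsys", "stats", "--report", "cuda_gpu_kern_sum", "report1.nsys-rep"], check=True)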
|