TUHs's picture
Upload 207 files
29b9c56
==11704== NVPROF is profiling process 11704, command: python profile.py -f -j 1
==11704== Profiling application: python profile.py -f -j 1
==11704== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 50.51% 3.6099ms 6 601.65us 466.91us 748.76us void spatialDepthwiseConvolutionUpdateOutput<float, float, unsigned int, int=0>(THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=1, int, DefaultPtrTraits>, bool, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
13.71% 979.45us 13 75.342us 896ns 967.77us [CUDA memcpy HtoD]
10.56% 754.39us 6 125.73us 122.75us 129.60us void kernelPointwiseApply2<TensorTakeOp<float, int, int=-2>, float, long, unsigned int, int=1, int=1>(OffsetInfo<int, TensorTakeOp<float, int, int=-2>, float>, OffsetInfo<int=-2, TensorTakeOp<float, int, int=-2>, long>, TensorTakeOp<float, int, int=-2>, float)
6.80% 485.85us 10 48.585us 2.1440us 123.36us void kernelPointwiseApply3<TensorAddOp<long>, long, long, long, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorAddOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, OffsetInfo<long, long, int=2>, long, long)
5.35% 382.21us 12 31.850us 27.232us 36.736us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)
3.27% 233.79us 6 38.965us 35.455us 42.464us void kernelPointwiseApply3<TensorAddOp<float>, float, float, float, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=2>, float, float)
3.07% 219.65us 6 36.607us 35.359us 37.856us void kernelPointwiseApply3<TensorSubOp<float>, float, float, float, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorSubOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=2>, float, float)
2.43% 173.50us 3 57.833us 54.975us 59.519us void kernelPointwiseApply2<TensorDivConstantOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorDivConstantOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)
2.43% 173.47us 6 28.911us 27.584us 32.768us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)
0.37% 26.208us 24 1.0920us 992ns 1.4720us [CUDA memcpy DtoH]
0.36% 25.503us 6 4.2500us 4.1600us 4.3520us void kernelReduceAll<long, unsigned int, long, thrust::identity<long>, ReduceMax<long>, int=1>(TensorInfo<long, unsigned int>, unsigned int, long, long, thrust::identity<long>, long*)
0.31% 22.495us 6 3.7490us 3.7110us 3.8720us void kernelReduceAll<long, unsigned int, long, thrust::identity<long>, ReduceMin<long>, int=1>(TensorInfo<long, unsigned int>, unsigned int, long, long, thrust::identity<long>, long*)
0.22% 15.456us 12 1.2880us 960ns 1.5680us void kernelPointwiseApply2<TensorMulConstantOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorMulConstantOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long)
0.19% 13.312us 12 1.1090us 928ns 1.3120us void kernelPointwiseApply1<TensorFillOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorFillOp<long>, long, unsigned int>, long, long)
0.17% 12.256us 10 1.2250us 832ns 1.7920us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>(thrust::device_ptr<long>, long)
0.13% 9.5680us 6 1.5940us 1.4720us 1.7920us void kernelPointwiseApply2<TensorRemainderOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorRemainderOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long)
0.13% 9.2480us 6 1.5410us 1.5040us 1.6000us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)
API calls: 99.47% 6.15817s 10 615.82ms 24.250us 6.15283s cudaMalloc
0.22% 13.492ms 185 72.930us 215ns 4.7025ms cuDeviceGetAttribute
0.16% 9.9405ms 2 4.9703ms 4.8841ms 5.0564ms cudaGetDeviceProperties
0.03% 2.0786ms 37 56.179us 2.4000us 655.49us cudaStreamSynchronize
0.03% 2.0602ms 113 18.231us 8.6980us 141.58us cudaLaunch
0.03% 1.7155ms 37 46.364us 7.5740us 1.1114ms cudaMemcpyAsync
0.02% 1.2330ms 2 616.50us 539.44us 693.55us cuDeviceTotalMem
0.01% 799.20us 2 399.60us 257.80us 541.41us cuDeviceGetName
0.01% 726.74us 1370 530ns 425ns 13.177us cudaGetDevice
0.00% 267.61us 461 580ns 463ns 9.3500us cudaSetDevice
0.00% 131.14us 562 233ns 174ns 1.1050us cudaSetupArgument
0.00% 54.167us 10 5.4160us 4.6610us 8.0000us cudaFuncGetAttributes
0.00% 38.099us 113 337ns 227ns 1.2180us cudaConfigureCall
0.00% 31.952us 119 268ns 201ns 579ns cudaGetLastError
0.00% 8.5300us 10 853ns 713ns 1.2310us cudaDeviceGetAttribute
0.00% 5.5960us 13 430ns 162ns 1.4770us cudaGetDeviceCount
0.00% 5.0520us 20 252ns 170ns 380ns cudaPeekAtLastError
0.00% 2.7840us 4 696ns 244ns 1.8040us cuDeviceGetCount
0.00% 1.8990us 3 633ns 257ns 1.2610us cuDeviceGet
0.00% 928ns 1 928ns 928ns 928ns cuInit
0.00% 792ns 1 792ns 792ns 792ns cuDriverGetVersion