Spaces:
Sleeping
Sleeping
| ==11704== NVPROF is profiling process 11704, command: python profile.py -f -j 1 | |
| ==11704== Profiling application: python profile.py -f -j 1 | |
| ==11704== Profiling result: | |
| Type Time(%) Time Calls Avg Min Max Name | |
| GPU activities: 50.51% 3.6099ms 6 601.65us 466.91us 748.76us void spatialDepthwiseConvolutionUpdateOutput<float, float, unsigned int, int=0>(THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=1, int, DefaultPtrTraits>, bool, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int) | |
| 13.71% 979.45us 13 75.342us 896ns 967.77us [CUDA memcpy HtoD] | |
| 10.56% 754.39us 6 125.73us 122.75us 129.60us void kernelPointwiseApply2<TensorTakeOp<float, int, int=-2>, float, long, unsigned int, int=1, int=1>(OffsetInfo<int, TensorTakeOp<float, int, int=-2>, float>, OffsetInfo<int=-2, TensorTakeOp<float, int, int=-2>, long>, TensorTakeOp<float, int, int=-2>, float) | |
| 6.80% 485.85us 10 48.585us 2.1440us 123.36us void kernelPointwiseApply3<TensorAddOp<long>, long, long, long, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorAddOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, OffsetInfo<long, long, int=2>, long, long) | |
| 5.35% 382.21us 12 31.850us 27.232us 36.736us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float) | |
| 3.27% 233.79us 6 38.965us 35.455us 42.464us void kernelPointwiseApply3<TensorAddOp<float>, float, float, float, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=2>, float, float) | |
| 3.07% 219.65us 6 36.607us 35.359us 37.856us void kernelPointwiseApply3<TensorSubOp<float>, float, float, float, unsigned int, int=1, int=2, int=2>(OffsetInfo<TensorSubOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=2>, float, float) | |
| 2.43% 173.50us 3 57.833us 54.975us 59.519us void kernelPointwiseApply2<TensorDivConstantOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorDivConstantOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float) | |
| 2.43% 173.47us 6 28.911us 27.584us 32.768us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float) | |
| 0.37% 26.208us 24 1.0920us 992ns 1.4720us [CUDA memcpy DtoH] | |
| 0.36% 25.503us 6 4.2500us 4.1600us 4.3520us void kernelReduceAll<long, unsigned int, long, thrust::identity<long>, ReduceMax<long>, int=1>(TensorInfo<long, unsigned int>, unsigned int, long, long, thrust::identity<long>, long*) | |
| 0.31% 22.495us 6 3.7490us 3.7110us 3.8720us void kernelReduceAll<long, unsigned int, long, thrust::identity<long>, ReduceMin<long>, int=1>(TensorInfo<long, unsigned int>, unsigned int, long, long, thrust::identity<long>, long*) | |
| 0.22% 15.456us 12 1.2880us 960ns 1.5680us void kernelPointwiseApply2<TensorMulConstantOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorMulConstantOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long) | |
| 0.19% 13.312us 12 1.1090us 928ns 1.3120us void kernelPointwiseApply1<TensorFillOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorFillOp<long>, long, unsigned int>, long, long) | |
| 0.17% 12.256us 10 1.2250us 832ns 1.7920us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>(thrust::device_ptr<long>, long) | |
| 0.13% 9.5680us 6 1.5940us 1.4720us 1.7920us void kernelPointwiseApply2<TensorRemainderOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorRemainderOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long) | |
| 0.13% 9.2480us 6 1.5410us 1.5040us 1.6000us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float) | |
| API calls: 99.47% 6.15817s 10 615.82ms 24.250us 6.15283s cudaMalloc | |
| 0.22% 13.492ms 185 72.930us 215ns 4.7025ms cuDeviceGetAttribute | |
| 0.16% 9.9405ms 2 4.9703ms 4.8841ms 5.0564ms cudaGetDeviceProperties | |
| 0.03% 2.0786ms 37 56.179us 2.4000us 655.49us cudaStreamSynchronize | |
| 0.03% 2.0602ms 113 18.231us 8.6980us 141.58us cudaLaunch | |
| 0.03% 1.7155ms 37 46.364us 7.5740us 1.1114ms cudaMemcpyAsync | |
| 0.02% 1.2330ms 2 616.50us 539.44us 693.55us cuDeviceTotalMem | |
| 0.01% 799.20us 2 399.60us 257.80us 541.41us cuDeviceGetName | |
| 0.01% 726.74us 1370 530ns 425ns 13.177us cudaGetDevice | |
| 0.00% 267.61us 461 580ns 463ns 9.3500us cudaSetDevice | |
| 0.00% 131.14us 562 233ns 174ns 1.1050us cudaSetupArgument | |
| 0.00% 54.167us 10 5.4160us 4.6610us 8.0000us cudaFuncGetAttributes | |
| 0.00% 38.099us 113 337ns 227ns 1.2180us cudaConfigureCall | |
| 0.00% 31.952us 119 268ns 201ns 579ns cudaGetLastError | |
| 0.00% 8.5300us 10 853ns 713ns 1.2310us cudaDeviceGetAttribute | |
| 0.00% 5.5960us 13 430ns 162ns 1.4770us cudaGetDeviceCount | |
| 0.00% 5.0520us 20 252ns 170ns 380ns cudaPeekAtLastError | |
| 0.00% 2.7840us 4 696ns 244ns 1.8040us cuDeviceGetCount | |
| 0.00% 1.8990us 3 633ns 257ns 1.2610us cuDeviceGet | |
| 0.00% 928ns 1 928ns 928ns 928ns cuInit | |
| 0.00% 792ns 1 792ns 792ns 792ns cuDriverGetVersion | |