File size: 7,436 Bytes
29b9c56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
python profile.py -f -j 2
Type Time (%) Time (ms)  Calls       Avg       Min       Max                                               Name
GPU activities:    20.96    3.5466     24  147.77us  68.544us  243.27us  void spatialDepthwiseConvolutionUpdateOutput<f...
GPU activities:    18.18    3.0757    196  15.692us     992ns  2.8695ms                                 [CUDA memcpy DtoH]
GPU activities:    12.21    2.0655     48  43.031us  9.4720us  97.825us  void kernelPointwiseApply2<TensorTakeOp<float,...
GPU activities:    10.92    1.8471     80  23.089us  1.5680us  98.785us  void kernelPointwiseApply3<TensorAddOp<long>, ...
GPU activities:     4.94   0.83662     48  17.429us  7.5840us  28.129us  void indexSelectSmallIndex<float, unsigned int...
GPU activities:     4.56   0.77092     97  7.9470us     864ns  680.74us                                 [CUDA memcpy HtoD]
GPU activities:     4.15   0.70253     36  19.514us  2.3360us  56.832us  void kernelPointwiseApply2<CopyOp<float, float...
GPU activities:     3.28    0.5545     60  9.2410us     928ns  50.721us  void kernelPointwiseApply2<CopyOp<float, float...
GPU activities:     2.95   0.49908     24  20.795us  2.2080us  70.657us  void kernelPointwiseApply3<TensorAddOp<float>,...
GPU activities:     2.81   0.47514     24  19.797us  2.7840us  55.105us  void CatArrayBatchedCopy<float, unsigned int, ...
GPU activities:     2.11   0.35709     12  29.757us  11.104us  48.929us  void kernelPointwiseApply2<TensorDivConstantOp...
GPU activities:      2.1   0.35585     36  9.8840us  2.7200us  27.457us  void kernelPointwiseApply2<CopyOp<float, float...
GPU activities:     1.33   0.22563     12  18.802us  2.7840us  35.009us  void kernelPointwiseApply3<TensorAddOp<float>,...
GPU activities:     1.25   0.21111      6  35.184us  15.904us  54.657us  void indexSelectSmallIndex<float, unsigned int...
GPU activities:     1.24   0.20992     12  17.493us  3.7760us  31.232us  void kernelPointwiseApply3<TensorSubOp<float>,...
GPU activities:      1.2   0.20346     48  4.2380us  4.0640us  4.4160us  void kernelReduceAll<long, unsigned int, long,...
GPU activities:     1.12   0.18906     48  3.9380us  3.8720us  4.0640us  void kernelReduceAll<long, unsigned int, long,...
GPU activities:     0.98   0.16602      3  55.339us  20.928us  89.602us  void kernelReduceAllPass1<float, unsigned int,...
GPU activities:     0.85   0.14416     24  6.0060us  4.9930us  6.4960us  void kernelPointwiseApply2<CopyOp<float, float...
GPU activities:     0.69   0.11655     96  1.2140us     928ns  1.5680us  void kernelPointwiseApply2<TensorMulConstantOp...
GPU activities:     0.51  0.087104     96     907ns     768ns  1.1200us  void kernelPointwiseApply1<TensorFillOp<long>,...
GPU activities:     0.51  0.086562     80  1.0820us     800ns  1.7920us  void thrust::cuda_cub::core::_kernel_agent<thr...
GPU activities:     0.47  0.079553     48  1.6570us  1.6000us  1.8240us  void kernelPointwiseApply2<TensorRemainderOp<l...
GPU activities:     0.33  0.056065      6  9.3440us  2.5600us  16.320us  void kernelPointwiseApply3<TensorSubOp<float>,...
GPU activities:     0.22  0.037281      6  6.2130us  2.1760us  10.337us  void kernelPointwiseApply2<Tensor_neg_Float_Op...
GPU activities:     0.05  0.008096      4  2.0240us     800ns  5.4720us  void kernelPointwiseApply1<TensorFillOp<float>...
GPU activities:     0.04  0.006912      3  2.3040us  2.2080us  2.4960us  void kernelReduceAllPass2<float, ReduceAdd<flo...
GPU activities:     0.02  0.003552      4     888ns     864ns     928ns                                      [CUDA memset]
GPU activities:     0.02  0.003168      3  1.0560us     928ns  1.3120us  void kernelPointwiseApply1<TensorDivConstantOp...
         Total:      100   16.9208   1184                                                                                 
Total (no mem):    77.26   13.0742    891                                                                                 
     API calls:    95.84   4198.84     18  233.27ms  8.2740us  4.19006s                                         cudaMalloc
     API calls:     3.05    133.79      1  133.79ms  133.79ms  133.79ms                              cudaDeviceSynchronize
     API calls:     0.29    12.761    887  14.386us  6.2380us  93.892us                                         cudaLaunch
     API calls:     0.24    10.469    293  35.729us  4.9150us  4.6638ms                                    cudaMemcpyAsync
     API calls:     0.12    5.4644  10489     520ns     255ns  11.760us                                      cudaGetDevice
     API calls:     0.12    5.0971    185  27.551us     130ns  1.1779ms                               cuDeviceGetAttribute
     API calls:     0.11    5.0105      2  2.5052ms  2.4969ms  2.5136ms                            cudaGetDeviceProperties
     API calls:     0.05    2.1212    269  7.8850us  1.6810us  190.90us                              cudaStreamSynchronize
     API calls:     0.05    2.0268   3374     600ns     279ns  3.2670us                                      cudaSetDevice
     API calls:     0.04    1.6823      1  1.6823ms  1.6823ms  1.6823ms                                      cudaHostAlloc
     API calls:     0.02     1.002    936  1.0700us     107ns  761.44us                                   cudaGetLastError
     API calls:     0.02   0.96456   4190     230ns     106ns  12.219us                                  cudaSetupArgument
     API calls:     0.01   0.48824      2  244.12us  241.64us  246.60us                                    cuDeviceGetName
     API calls:     0.01   0.38307     80  4.7880us  2.9260us  11.740us                              cudaFuncGetAttributes
     API calls:     0.01   0.30248      2  151.24us  150.97us  151.51us                                   cuDeviceTotalMem
     API calls:     0.01   0.27449    887     309ns     141ns  1.7810us                                  cudaConfigureCall
     API calls:     0.01   0.22652      4  56.629us  6.5850us  169.10us                                    cudaMemsetAsync
     API calls:        0  0.075646     23  3.2880us  2.0090us  6.8090us                                     cudaEventQuery
     API calls:        0  0.058023     80     725ns     394ns  2.0390us                             cudaDeviceGetAttribute
     API calls:        0   0.04959     24  2.0660us  1.2120us  4.9820us                           cudaEventCreateWithFlags
     API calls:        0  0.042829     24  1.7840us  1.0720us  4.6350us                                    cudaEventRecord
     API calls:        0  0.042213     23  1.8350us  1.0540us  4.3870us                                   cudaEventDestroy
     API calls:        0  0.038338    160     239ns     106ns     755ns                                cudaPeekAtLastError
     API calls:        0  0.019943     14  1.4240us     110ns  15.481us                                 cudaGetDeviceCount
     API calls:        0  0.001852      4     463ns     136ns  1.2240us                                   cuDeviceGetCount
     API calls:        0  0.001286      3     428ns     147ns     913ns                                        cuDeviceGet
     API calls:        0  0.000884      1     884ns     884ns     884ns                                             cuInit
     API calls:        0  0.000541      1     541ns     541ns     541ns                                 cuDriverGetVersion
         Total:      100   4381.23  21977