diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.25s +Cell: nv | 0.24s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:47 2025       
+
Fri Dec 19 22:48:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   37C    P0             82W /  350W |       0MiB /  46068MiB |     14%      Default |
+| N/A   28C    P0             90W /  350W |       0MiB /  46068MiB |     10%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.16s
+Cell: benchmark | 6.59s
  | 
 
 Raw
@@ -3992,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.993us      3625.73%     144.993us     144.993us             1  
-                               hf_kernels_causal_conv1d         6.46%     138.804us        99.34%       2.133ms       2.133ms       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn         4.85%     104.092us        92.88%       1.994ms     664.706us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      25.081us        85.08%       1.827ms     608.915us       3.999us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.999us       100.00%       3.999us       1.333us             3  
-                                Activity Buffer Request        81.62%       1.752ms        81.62%       1.752ms       1.752ms       1.408us        35.21%       1.408us       1.408us             1  
-                                       aten::empty_like         0.88%      18.799us         2.95%      63.281us      21.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.07%      44.482us         2.07%      44.482us      14.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.29%      49.201us         2.29%      49.201us      16.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.66%      14.170us         0.66%      14.170us      14.170us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     138.175us      3373.41%     138.175us     138.175us             1  
+                               hf_kernels_causal_conv1d         9.76%     215.575us        99.34%       2.195ms       2.195ms       0.000us         0.00%       5.536us       5.536us             1  
+                                         CausalConv1dFn         4.55%     100.523us        89.58%       1.980ms     659.841us       0.000us         0.00%       5.536us       1.845us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.12%      24.839us        82.43%       1.821ms     607.150us       4.096us       100.00%       5.536us       1.845us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.096us       100.00%       4.096us       1.365us             3  
+                                Activity Buffer Request        79.20%       1.750ms        79.20%       1.750ms       1.750ms       1.440us        35.16%       1.440us       1.440us             1  
+                                       aten::empty_like         0.80%      17.780us         2.60%      57.551us      19.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.80%      39.771us         1.80%      39.771us      13.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.11%      46.542us         2.11%      46.542us      15.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.66%      14.640us         0.66%      14.640us      14.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.147ms
-Self CUDA time total: 3.999us
+Self CPU time total: 2.210ms
+Self CUDA time total: 4.096us
 
 
 
@@ -4014,18 +4014,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.991us      3231.60%     120.991us     120.991us             1  
-                               hf_kernels_causal_conv1d         3.92%      79.473us        99.73%       2.024ms       2.024ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.49%      70.790us        95.81%       1.944ms     648.019us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.702us        90.78%       1.842ms     614.002us       3.744us       100.00%       4.992us       1.664us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.799us      3119.63%     116.799us     116.799us             1  
+                               hf_kernels_causal_conv1d         4.07%      82.232us        99.70%       2.017ms       2.017ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.50%      70.722us        95.64%       1.934ms     644.801us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.19%      23.990us        90.71%       1.835ms     611.576us       3.744us       100.00%       4.992us       1.664us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        88.06%       1.787ms        88.06%       1.787ms       1.787ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.40%       8.072us         1.54%      31.262us      10.421us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.14%      23.190us         1.14%      23.190us       7.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.51%      30.580us         1.51%      30.580us      10.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.530us         0.27%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        88.04%       1.781ms        88.04%       1.781ms       1.781ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.35%       7.172us         1.43%      28.951us       9.650us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.08%      21.779us         1.08%      21.779us       7.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.48%      29.931us         1.48%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       6.050us         0.30%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
+Self CPU time total: 2.023ms
 Self CUDA time total: 3.744us
 
 
@@ -4036,19 +4036,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.553us      3165.29%     119.553us     119.553us             1  
-                               hf_kernels_causal_conv1d         5.28%     102.383us        99.71%       1.932ms       1.932ms       0.000us         0.00%       5.026us       5.026us             1  
-                                         CausalConv1dFn         3.60%      69.861us        94.43%       1.830ms     610.001us       0.000us         0.00%       5.026us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.29%      25.081us        89.25%       1.730ms     576.561us       3.777us       100.00%       5.026us       1.675us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        86.29%       1.672ms        86.29%       1.672ms       1.672ms       1.249us        33.07%       1.249us       1.249us             1  
-                                       aten::empty_like         0.46%       8.829us         1.57%      30.461us      10.154us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.12%      21.632us         1.12%      21.632us       7.211us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.67%      32.330us         1.67%      32.330us      10.777us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.571us         0.29%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.407us      3135.87%     117.407us     117.407us             1  
+                               hf_kernels_causal_conv1d         7.37%     145.913us        99.70%       1.974ms       1.974ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.62%      71.732us        92.33%       1.828ms     609.363us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      22.761us        87.20%       1.727ms     575.503us       3.744us       100.00%       4.992us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
+                                Activity Buffer Request        84.49%       1.673ms        84.49%       1.673ms       1.673ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.40%       7.929us         1.51%      29.850us       9.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.11%      21.921us         1.11%      21.921us       7.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.55%      30.741us         1.55%      30.741us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       6.010us         0.30%       6.010us       6.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.938ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.980ms
+Self CUDA time total: 3.744us
 
 
 
@@ -4058,18 +4058,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.951us      3441.50%     129.951us     129.951us             1  
-                               hf_kernels_causal_conv1d         4.54%     106.623us        99.77%       2.341ms       2.341ms       0.000us         0.00%       5.024us       5.024us             1  
-                                         CausalConv1dFn         3.24%      75.911us        95.23%       2.234ms     744.741us       0.000us         0.00%       5.024us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.97%      22.670us        90.70%       2.128ms     709.344us       3.776us       100.00%       5.024us       1.675us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.761us      3330.53%     125.761us     125.761us             1  
+                               hf_kernels_causal_conv1d         5.00%     114.852us        99.75%       2.290ms       2.290ms       0.000us         0.00%       5.024us       5.024us             1  
+                                         CausalConv1dFn         3.16%      72.552us        94.75%       2.175ms     724.966us       0.000us         0.00%       5.024us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.94%      21.579us        90.35%       2.074ms     691.325us       3.776us       100.00%       5.024us       1.675us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        79.32%       1.861ms        79.32%       1.861ms       1.861ms       1.248us        33.05%       1.248us       1.248us             1  
-                                       aten::empty_like         0.36%       8.382us         1.29%      30.281us      10.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.93%      21.899us         0.93%      21.899us       7.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.41%     244.307us        10.41%     244.307us      81.436us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.280us         0.23%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        78.71%       1.807ms        78.71%       1.807ms       1.807ms       1.248us        33.05%       1.248us       1.248us             1  
+                                       aten::empty_like         0.34%       7.750us         1.24%      28.370us       9.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.90%      20.620us         0.90%      20.620us       6.873us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.71%     245.786us        10.71%     245.786us      81.929us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.631us         0.25%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.346ms
+Self CPU time total: 2.295ms
 Self CUDA time total: 3.776us
 
 
@@ -4080,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.920us      2724.83%     129.920us     129.920us             1  
-                               hf_kernels_causal_conv1d         4.79%     104.612us        99.77%       2.179ms       2.179ms       0.000us         0.00%       6.368us       6.368us             1  
-                                         CausalConv1dFn         3.53%      77.212us        94.98%       2.075ms     691.584us       0.000us         0.00%       6.368us       2.123us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.05%      22.870us        90.05%       1.967ms     655.679us       4.768us       100.00%       6.368us       2.123us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
-                                Activity Buffer Request        78.82%       1.722ms        78.82%       1.722ms       1.722ms       1.600us        33.56%       1.600us       1.600us             1  
-                                       aten::empty_like         0.38%       8.362us         1.40%      30.502us      10.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.01%      22.140us         1.01%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.18%     222.375us        10.18%     222.375us      74.125us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.100us         0.23%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.607us      2512.65%     120.607us     120.607us             1  
+                               hf_kernels_causal_conv1d         4.85%     103.512us        99.74%       2.128ms       2.128ms       0.000us         0.00%       6.432us       6.432us             1  
+                                         CausalConv1dFn         3.24%      69.180us        94.89%       2.024ms     674.755us       0.000us         0.00%       6.432us       2.144us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      23.061us        90.26%       1.925ms     641.807us       4.800us       100.00%       6.432us       2.144us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
+                                Activity Buffer Request        78.33%       1.671ms        78.33%       1.671ms       1.671ms       1.632us        34.00%       1.632us       1.632us             1  
+                                       aten::empty_like         0.38%       8.041us         1.39%      29.662us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.01%      21.621us         1.01%      21.621us       7.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.85%     231.515us        10.85%     231.515us      77.172us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.440us         0.26%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.184ms
-Self CUDA time total: 4.768us
+Self CPU time total: 2.133ms
+Self CUDA time total: 4.800us
 
 
 
@@ -4102,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.830us      2433.96%     116.830us     116.830us             1  
-                               hf_kernels_causal_conv1d        11.09%      77.122us        99.27%     690.457us     690.457us       0.000us         0.00%       6.432us       6.432us             1  
-                                         CausalConv1dFn        10.04%      69.842us        88.18%     613.335us     204.445us       0.000us         0.00%       6.432us       2.144us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.34%      23.210us        74.09%     515.312us     171.771us       4.800us       100.00%       6.432us       2.144us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        40.97%     284.977us        40.97%     284.977us     284.977us       1.632us        34.00%       1.632us       1.632us             1  
-                                       aten::empty_like         1.03%       7.161us         4.05%      28.181us       9.394us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.02%      21.020us         3.02%      21.020us       7.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        29.78%     207.125us        29.78%     207.125us      69.042us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.73%       5.090us         0.73%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.085us      2372.25%     113.085us     113.085us             1  
+                               hf_kernels_causal_conv1d        10.68%      79.331us        99.24%     737.226us     737.226us       0.000us         0.00%       6.367us       6.367us             1  
+                                         CausalConv1dFn         8.90%      66.092us        88.56%     657.895us     219.298us       0.000us         0.00%       6.367us       2.122us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.16%      23.443us        75.78%     562.993us     187.664us       4.767us       100.00%       6.367us       2.122us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.767us       100.00%       4.767us       1.589us             3  
+                                Activity Buffer Request        41.95%     311.656us        41.95%     311.656us     311.656us       1.600us        33.56%       1.600us       1.600us             1  
+                                       aten::empty_like         0.97%       7.240us         3.88%      28.810us       9.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.90%      21.570us         2.90%      21.570us       7.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.68%     227.894us        30.68%     227.894us      75.965us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.76%       5.680us         0.76%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 695.547us
-Self CUDA time total: 4.800us
+Self CPU time total: 742.906us
+Self CUDA time total: 4.767us
 
 
 
@@ -4124,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.990us      1138.94%     120.990us     120.990us             1  
-                               hf_kernels_causal_conv1d         8.25%      76.440us        99.44%     921.732us     921.732us       0.000us         0.00%      14.206us      14.206us             1  
-                                         CausalConv1dFn         7.43%      68.843us        91.20%     845.292us     281.764us       0.000us         0.00%      14.206us       4.735us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      23.801us        80.53%     746.458us     248.819us      10.623us       100.00%      14.206us       4.735us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.623us       100.00%      10.623us       3.541us             3  
-                                Activity Buffer Request        58.01%     537.643us        58.01%     537.643us     537.643us       3.583us        33.73%       3.583us       3.583us             1  
-                                       aten::empty_like         0.88%       8.201us         3.24%      29.991us       9.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.35%      21.790us         2.35%      21.790us       7.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.96%     185.014us        19.96%     185.014us      61.671us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.56%       5.149us         0.56%       5.149us       5.149us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.783us      1121.44%     118.783us     118.783us             1  
+                               hf_kernels_causal_conv1d         4.59%     101.592us        99.77%       2.207ms       2.207ms       0.000us         0.00%      14.144us      14.144us             1  
+                                         CausalConv1dFn         3.12%      68.981us        95.17%       2.105ms     701.765us       0.000us         0.00%      14.144us       4.715us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.00%      22.142us        90.78%       2.008ms     669.378us      10.592us       100.00%      14.144us       4.715us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us       100.00%      10.592us       3.531us             3  
+                                Activity Buffer Request        79.96%       1.769ms        79.96%       1.769ms       1.769ms       3.552us        33.53%       3.552us       3.552us             1  
+                                       aten::empty_like         0.35%       7.660us         1.27%      28.180us       9.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.93%      20.520us         0.93%      20.520us       6.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.83%     217.354us         9.83%     217.354us      72.451us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.140us         0.23%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 926.881us
-Self CUDA time total: 10.623us
+Self CPU time total: 2.212ms
+Self CUDA time total: 10.592us
 
 
 
@@ -4146,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.338us      1124.43%     122.338us     122.338us             1  
-                               hf_kernels_causal_conv1d         8.49%      80.052us        99.43%     937.274us     937.274us       0.000us         0.00%      14.560us      14.560us             1  
-                                         CausalConv1dFn         7.62%      71.802us        90.94%     857.222us     285.741us       0.000us         0.00%      14.560us       4.853us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      24.180us        80.29%     756.859us     252.286us      10.880us       100.00%      14.560us       4.853us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.880us       100.00%      10.880us       3.627us             3  
-                                Activity Buffer Request        59.85%     564.194us        59.85%     564.194us     564.194us       3.680us        33.82%       3.680us       3.680us             1  
-                                       aten::empty_like         0.81%       7.670us         3.03%      28.561us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.22%      20.891us         2.22%      20.891us       6.964us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        17.87%     168.485us        17.87%     168.485us      56.162us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.57%       5.361us         0.57%       5.361us       5.361us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.020us      1041.95%     113.020us     113.020us             1  
+                               hf_kernels_causal_conv1d        12.31%      79.151us        99.13%     637.563us     637.563us       0.000us         0.00%      14.495us      14.495us             1  
+                                         CausalConv1dFn        10.43%      67.063us        86.83%     558.412us     186.137us       0.000us         0.00%      14.495us       4.832us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.37%      21.690us        72.11%     463.759us     154.586us      10.847us       100.00%      14.495us       4.832us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.847us       100.00%      10.847us       3.616us             3  
+                                Activity Buffer Request        35.82%     230.365us        35.82%     230.365us     230.365us       3.648us        33.63%       3.648us       3.648us             1  
+                                       aten::empty_like         1.10%       7.080us         4.29%      27.590us       9.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.19%      20.510us         3.19%      20.510us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.92%     211.704us        32.92%     211.704us      70.568us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.87%       5.571us         0.87%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 942.635us
-Self CUDA time total: 10.880us
+Self CPU time total: 643.134us
+Self CUDA time total: 10.847us
 
 
 
@@ -4168,19 +4168,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.254us      1095.61%     120.254us     120.254us             1  
-                               hf_kernels_causal_conv1d        12.25%      73.512us        99.17%     595.034us     595.034us       0.000us         0.00%      14.688us      14.688us             1  
-                                         CausalConv1dFn        11.59%      69.552us        86.91%     521.522us     173.841us       0.000us         0.00%      14.688us       4.896us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.21%      25.240us        70.04%     420.260us     140.087us      10.976us       100.00%      14.688us       4.896us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        38.36%     230.206us        38.36%     230.206us     230.206us       3.712us        33.82%       3.712us       3.712us             1  
-                                       aten::empty_like         1.43%       8.570us         5.28%      31.710us      10.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.86%      23.140us         3.86%      23.140us       7.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.47%     164.814us        27.47%     164.814us      54.938us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       5.010us         0.83%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.798us      1140.44%     124.798us     124.798us             1  
+                               hf_kernels_causal_conv1d         8.17%      74.152us        99.44%     901.990us     901.990us       0.000us         0.00%      14.655us      14.655us             1  
+                                         CausalConv1dFn         7.50%      68.001us        91.26%     827.838us     275.946us       0.000us         0.00%      14.655us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.46%      22.272us        79.87%     724.486us     241.495us      10.943us       100.00%      14.655us       4.885us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
+                                Activity Buffer Request        54.33%     492.860us        54.33%     492.860us     492.860us       3.712us        33.92%       3.712us       3.712us             1  
+                                       aten::empty_like         0.85%       7.700us         3.90%      35.351us      11.784us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.05%      27.651us         3.05%      27.651us       9.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        23.08%     209.354us        23.08%     209.354us      69.785us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.56%       5.090us         0.56%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 600.044us
-Self CUDA time total: 10.976us
+Self CPU time total: 907.080us
+Self CUDA time total: 10.943us
 
 
 
@@ -4190,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.621us      1053.10%     118.621us     118.621us             1  
-                               hf_kernels_causal_conv1d        18.68%      98.702us        99.03%     523.193us     523.193us       0.000us         0.00%      15.040us      15.040us             1  
-                                         CausalConv1dFn        13.73%      72.543us        80.34%     424.491us     141.497us       0.000us         0.00%      15.040us       5.013us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.48%      23.691us        60.95%     322.037us     107.346us      11.264us       100.00%      15.040us       5.013us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.264us       100.00%      11.264us       3.755us             3  
-                                Activity Buffer Request        26.02%     137.473us        26.02%     137.473us     137.473us       3.776us        33.52%       3.776us       3.776us             1  
-                                       aten::empty_like         1.47%       7.771us         5.66%      29.911us       9.970us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.19%      22.140us         4.19%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.45%     160.873us        30.45%     160.873us      53.624us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.97%       5.150us         0.97%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.030us       959.16%     112.030us     112.030us             1  
+                               hf_kernels_causal_conv1d        12.54%      75.713us        99.10%     598.293us     598.293us       0.000us         0.00%      15.584us      15.584us             1  
+                                         CausalConv1dFn        11.49%      69.370us        86.56%     522.580us     174.193us       0.000us         0.00%      15.584us       5.195us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.73%      22.541us        70.59%     426.160us     142.053us      11.680us       100.00%      15.584us       5.195us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.680us       100.00%      11.680us       3.893us             3  
+                                Activity Buffer Request        32.97%     199.044us        32.97%     199.044us     199.044us       3.904us        33.42%       3.904us       3.904us             1  
+                                       aten::empty_like         1.17%       7.060us         4.48%      27.050us       9.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.31%      19.990us         3.31%      19.990us       6.663us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.89%     204.575us        33.89%     204.575us      68.192us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.90%       5.430us         0.90%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 528.343us
-Self CUDA time total: 11.264us
+Self CPU time total: 603.723us
+Self CUDA time total: 11.680us
 
 
 
@@ -4212,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.863us       259.30%     128.863us     128.863us             1  
-                               hf_kernels_causal_conv1d         3.60%      75.972us        99.76%       2.104ms       2.104ms       0.000us         0.00%      82.688us      82.688us             1  
-                                         CausalConv1dFn         3.37%      70.961us        96.16%       2.028ms     675.870us       0.000us         0.00%      82.688us      27.563us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.16%      24.491us        91.36%       1.926ms     642.136us      49.696us       100.00%      82.688us      27.563us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.696us       100.00%      49.696us      16.565us             3  
-                                Activity Buffer Request        82.37%       1.737ms        82.37%       1.737ms       1.737ms      32.992us        66.39%      32.992us      32.992us             1  
-                                       aten::empty_like         0.41%       8.720us         1.43%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.02%      21.520us         1.02%      21.520us       7.173us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.83%     165.005us         7.83%     165.005us      55.002us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.070us         0.24%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.399us       245.20%     122.399us     122.399us             1  
+                               hf_kernels_causal_conv1d         8.97%      73.082us        99.30%     808.948us     808.948us       0.000us         0.00%      83.103us      83.103us             1  
+                                         CausalConv1dFn         8.32%      67.782us        90.33%     735.866us     245.289us       0.000us         0.00%      83.103us      27.701us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.86%      23.300us        78.56%     639.924us     213.308us      49.919us       100.00%      83.103us      27.701us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.919us       100.00%      49.919us      16.640us             3  
+                                Activity Buffer Request        50.48%     411.239us        50.48%     411.239us     411.239us      33.184us        66.48%      33.184us      33.184us             1  
+                                       aten::empty_like         0.94%       7.630us         3.46%      28.160us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.52%      20.530us         2.52%      20.530us       6.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.21%     205.385us        25.21%     205.385us      68.462us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.70%       5.670us         0.70%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.109ms
-Self CUDA time total: 49.696us
+Self CPU time total: 814.618us
+Self CUDA time total: 49.919us
 
 
 
@@ -4234,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.493us       263.09%     122.493us     122.493us             1  
-                               hf_kernels_causal_conv1d        15.75%      73.853us        98.92%     463.792us     463.792us       0.000us         0.00%      76.190us      76.190us             1  
-                                         CausalConv1dFn        14.80%      69.370us        83.17%     389.939us     129.980us       0.000us         0.00%      76.190us      25.397us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.26%      24.662us        62.03%     290.847us      96.949us      46.559us       100.00%      76.190us      25.397us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.559us       100.00%      46.559us      15.520us             3  
-                                Activity Buffer Request        22.78%     106.792us        22.78%     106.792us     106.792us      29.631us        63.64%      29.631us      29.631us             1  
-                                       aten::empty_like         1.83%       8.590us         6.34%      29.722us       9.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.51%      21.132us         4.51%      21.132us       7.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.00%     159.393us        34.00%     159.393us      53.131us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.08%       5.060us         1.08%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.502us       271.23%     125.502us     125.502us             1  
+                               hf_kernels_causal_conv1d        16.33%      98.732us        99.17%     599.704us     599.704us       0.000us         0.00%      75.486us      75.486us             1  
+                                         CausalConv1dFn        10.93%      66.081us        82.84%     500.972us     166.991us       0.000us         0.00%      75.486us      25.162us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.99%      24.120us        67.22%     406.509us     135.503us      46.271us       100.00%      75.486us      25.162us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.271us       100.00%      46.271us      15.424us             3  
+                                Activity Buffer Request        29.69%     179.524us        29.69%     179.524us     179.524us      29.215us        63.14%      29.215us      29.215us             1  
+                                       aten::empty_like         1.23%       7.440us         4.69%      28.382us       9.461us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.46%      20.942us         3.46%      20.942us       6.981us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.55%     202.865us        33.55%     202.865us      67.622us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.010us         0.83%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 468.852us
-Self CUDA time total: 46.559us
+Self CPU time total: 604.714us
+Self CUDA time total: 46.271us
 
 
 
@@ -4256,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.279us      3132.21%     121.279us     121.279us             1  
-                               hf_kernels_causal_conv1d         4.47%      97.233us        99.75%       2.167ms       2.167ms       0.000us         0.00%       5.088us       5.088us             1  
-                                         CausalConv1dFn         3.35%      72.763us        95.28%       2.070ms     690.087us       0.000us         0.00%       5.088us       1.696us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.09%      23.651us        90.51%       1.967ms     655.553us       3.872us       100.00%       5.088us       1.696us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.872us       100.00%       3.872us       1.291us             3  
-                                Activity Buffer Request        81.94%       1.781ms        81.94%       1.781ms       1.781ms       1.216us        31.40%       1.216us       1.216us             1  
-                                       aten::empty_like         0.44%       9.520us         1.42%      30.840us      10.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      21.320us         0.98%      21.320us       7.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.48%     162.473us         7.48%     162.473us      54.158us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.420us         0.25%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.631us      2933.93%     113.631us     113.631us             1  
+                               hf_kernels_causal_conv1d         9.16%      84.942us        99.40%     921.770us     921.770us       0.000us         0.00%       5.121us       5.121us             1  
+                                         CausalConv1dFn         7.39%      68.541us        90.24%     836.828us     278.943us       0.000us         0.00%       5.121us       1.707us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.33%      21.600us        79.89%     740.886us     246.962us       3.873us       100.00%       5.121us       1.707us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.873us       100.00%       3.873us       1.291us             3  
+                                Activity Buffer Request        58.56%     543.041us        58.56%     543.041us     543.041us       1.248us        32.22%       1.248us       1.248us             1  
+                                       aten::empty_like         0.81%       7.541us         2.95%      27.401us       9.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.14%      19.860us         2.14%      19.860us       6.620us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.01%     176.245us        19.01%     176.245us      58.748us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.60%       5.591us         0.60%       5.591us       5.591us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.173ms
-Self CUDA time total: 3.872us
+Self CPU time total: 927.361us
+Self CUDA time total: 3.873us
 
 
 
@@ -4278,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.259us      3001.54%     115.259us     115.259us             1  
-                               hf_kernels_causal_conv1d        21.11%     101.882us        98.95%     477.452us     477.452us       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn        14.32%      69.113us        77.83%     375.570us     125.190us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.08%      24.499us        57.45%     277.196us      92.399us       3.840us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
-                                Activity Buffer Request        19.54%      94.283us        19.54%      94.283us      94.283us       1.216us        31.67%       1.216us       1.216us             1  
-                                       aten::empty_like         1.58%       7.611us         6.06%      29.261us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.49%      21.650us         4.49%      21.650us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.83%     158.414us        32.83%     158.414us      52.805us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.05%       5.080us         1.05%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     110.365us      2851.07%     110.365us     110.365us             1  
+                               hf_kernels_causal_conv1d        16.99%      89.483us        99.03%     521.432us     521.432us       0.000us         0.00%       5.087us       5.087us             1  
+                                         CausalConv1dFn        12.41%      65.321us        82.04%     431.949us     143.983us       0.000us         0.00%       5.087us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.32%      22.760us        64.08%     337.387us     112.462us       3.871us       100.00%       5.087us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.871us       100.00%       3.871us       1.290us             3  
+                                Activity Buffer Request        29.67%     156.244us        29.67%     156.244us     156.244us       1.216us        31.41%       1.216us       1.216us             1  
+                                       aten::empty_like         1.31%       6.900us         5.55%      29.241us       9.747us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.24%      22.341us         4.24%      22.341us       7.447us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.08%     158.383us        30.08%     158.383us      52.794us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       5.101us         0.97%       5.101us       5.101us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 482.532us
-Self CUDA time total: 3.840us
+Self CPU time total: 526.533us
+Self CUDA time total: 3.871us
 
 
 
@@ -4300,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.752us      2996.14%     122.752us     122.752us             1  
-                               hf_kernels_causal_conv1d         4.60%      95.013us        99.74%       2.062ms       2.062ms       0.000us         0.00%       5.473us       5.473us             1  
-                                         CausalConv1dFn         3.49%      72.262us        95.14%       1.967ms     655.743us       0.000us         0.00%       5.473us       1.824us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.30%      26.862us        90.17%       1.864ms     621.455us       4.097us       100.00%       5.473us       1.824us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.097us       100.00%       4.097us       1.366us             3  
-                                Activity Buffer Request        81.16%       1.678ms        81.16%       1.678ms       1.678ms       1.376us        33.59%       1.376us       1.376us             1  
-                                       aten::empty_like         0.40%       8.350us         1.48%      30.601us      10.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.08%      22.251us         1.08%      22.251us       7.417us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.71%     159.333us         7.71%     159.333us      53.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       5.450us         0.26%       5.450us       5.450us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.351us      2916.19%     120.351us     120.351us             1  
+                               hf_kernels_causal_conv1d         4.61%      96.291us        99.73%       2.085ms       2.085ms       0.000us         0.00%       5.502us       5.502us             1  
+                                         CausalConv1dFn         3.35%      69.993us        95.12%       1.989ms     662.875us       0.000us         0.00%       5.502us       1.834us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.651us        90.41%       1.890ms     630.060us       4.127us       100.00%       5.502us       1.834us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.127us       100.00%       4.127us       1.376us             3  
+                                Activity Buffer Request        81.44%       1.703ms        81.44%       1.703ms       1.703ms       1.375us        33.32%       1.375us       1.375us             1  
+                                       aten::empty_like         0.37%       7.800us         1.36%      28.450us       9.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.99%      20.650us         0.99%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.84%     163.843us         7.84%     163.843us      54.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.691us         0.27%       5.691us       5.691us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.068ms
-Self CUDA time total: 4.097us
+Self CPU time total: 2.091ms
+Self CUDA time total: 4.127us
 
 
 
@@ -4322,19 +4322,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.482us      3037.69%     123.482us     123.482us             1  
-                               hf_kernels_causal_conv1d        22.91%     113.331us        98.90%     489.311us     489.311us       0.000us         0.00%       5.441us       5.441us             1  
-                                         CausalConv1dFn        13.95%      69.033us        75.99%     375.980us     125.327us       0.000us         0.00%       5.441us       1.814us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.14%      25.431us        55.79%     276.017us      92.006us       4.065us       100.00%       5.441us       1.814us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
-                                Activity Buffer Request        18.49%      91.492us        18.49%      91.492us      91.492us       1.376us        33.85%       1.376us       1.376us             1  
-                                       aten::empty_like         1.55%       7.680us         6.25%      30.930us      10.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.70%      23.250us         4.70%      23.250us       7.750us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.16%     159.094us        32.16%     159.094us      53.031us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.440us         1.10%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     111.930us      2798.95%     111.930us     111.930us             1  
+                               hf_kernels_causal_conv1d        17.68%      85.272us        98.86%     476.700us     476.700us       0.000us         0.00%       5.343us       5.343us             1  
+                                         CausalConv1dFn        13.80%      66.543us        81.17%     391.428us     130.476us       0.000us         0.00%       5.343us       1.781us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.74%      22.878us        61.63%     297.205us      99.068us       3.999us       100.00%       5.343us       1.781us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.999us       100.00%       3.999us       1.333us             3  
+                                Activity Buffer Request        24.06%     116.003us        24.06%     116.003us     116.003us       1.344us        33.61%       1.344us       1.344us             1  
+                                       aten::empty_like         1.53%       7.380us         5.74%      27.680us       9.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.21%      20.300us         4.21%      20.300us       6.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.83%     158.324us        32.83%     158.324us      52.775us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.14%       5.510us         1.14%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 494.751us
-Self CUDA time total: 4.065us
+Self CPU time total: 482.210us
+Self CUDA time total: 3.999us
 
 
 
@@ -4344,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.726us      2264.25%     121.726us     121.726us             1  
-                               hf_kernels_causal_conv1d         4.87%     106.551us        99.77%       2.181ms       2.181ms       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn         3.29%      71.843us        94.90%       2.074ms     691.407us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.24%      27.032us        90.28%       1.973ms     657.779us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        81.68%       1.785ms        81.68%       1.785ms       1.785ms       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         0.35%       7.600us         1.33%      29.041us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      21.441us         0.98%      21.441us       7.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.36%     160.923us         7.36%     160.923us      53.641us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.940us         0.23%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.287us      2224.65%     120.287us     120.287us             1  
+                               hf_kernels_causal_conv1d         4.63%      99.053us        99.76%       2.136ms       2.136ms       0.000us         0.00%       7.231us       7.231us             1  
+                                         CausalConv1dFn         3.38%      72.351us        95.13%       2.037ms     679.021us       0.000us         0.00%       7.231us       2.410us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.97%      20.681us        90.42%       1.936ms     645.411us       5.407us       100.00%       7.231us       2.410us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.407us       100.00%       5.407us       1.802us             3  
+                                Activity Buffer Request        82.08%       1.758ms        82.08%       1.758ms       1.758ms       1.824us        33.73%       1.824us       1.824us             1  
+                                       aten::empty_like         0.37%       7.890us         1.33%      28.480us       9.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.96%      20.590us         0.96%      20.590us       6.863us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.38%     157.953us         7.38%     157.953us      52.651us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.231us         0.24%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.186ms
-Self CUDA time total: 5.376us
+Self CPU time total: 2.141ms
+Self CUDA time total: 5.407us
 
 
 
@@ -4366,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.382us      2165.25%     116.382us     116.382us             1  
-                               hf_kernels_causal_conv1d        21.05%     101.062us        98.84%     474.631us     474.631us       0.000us         0.00%       7.198us       7.198us             1  
-                                         CausalConv1dFn        14.89%      71.522us        77.79%     373.569us     124.523us       0.000us         0.00%       7.198us       2.399us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.88%      23.451us        56.74%     272.486us      90.829us       5.375us       100.00%       7.198us       2.399us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.375us       100.00%       5.375us       1.792us             3  
-                                Activity Buffer Request        18.95%      91.012us        18.95%      91.012us      91.012us       1.823us        33.92%       1.823us       1.823us             1  
-                                       aten::empty_like         1.53%       7.370us         6.16%      29.561us       9.854us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.62%      22.191us         4.62%      22.191us       7.397us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.91%     158.023us        32.91%     158.023us      52.674us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.16%       5.571us         1.16%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     110.429us      2029.94%     110.429us     110.429us             1  
+                               hf_kernels_causal_conv1d        19.18%      90.602us        98.83%     466.890us     466.890us       0.000us         0.00%       7.296us       7.296us             1  
+                                         CausalConv1dFn        14.59%      68.912us        79.65%     376.288us     125.429us       0.000us         0.00%       7.296us       2.432us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.94%      23.331us        59.24%     279.846us      93.282us       5.440us       100.00%       7.296us       2.432us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.440us       100.00%       5.440us       1.813us             3  
+                                Activity Buffer Request        20.96%      99.012us        20.96%      99.012us      99.012us       1.856us        34.12%       1.856us       1.856us             1  
+                                       aten::empty_like         1.50%       7.079us         5.83%      27.530us       9.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.33%      20.451us         4.33%      20.451us       6.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.34%     157.503us        33.34%     157.503us      52.501us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.17%       5.520us         1.17%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 480.202us
-Self CUDA time total: 5.375us
+Self CPU time total: 472.410us
+Self CUDA time total: 5.440us
 
 
 
@@ -4388,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.143us       748.98%     130.143us     130.143us             1  
-                               hf_kernels_causal_conv1d         4.89%     101.833us        99.76%       2.079ms       2.079ms       0.000us         0.00%      23.200us      23.200us             1  
-                                         CausalConv1dFn         3.45%      71.962us        94.87%       1.978ms     659.169us       0.000us         0.00%      23.200us       7.733us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.451us        89.92%       1.874ms     624.738us      17.376us       100.00%      23.200us       7.733us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.376us       100.00%      17.376us       5.792us             3  
-                                Activity Buffer Request        80.93%       1.687ms        80.93%       1.687ms       1.687ms       5.824us        33.52%       5.824us       5.824us             1  
-                                       aten::empty_like         0.43%       8.941us         1.50%      31.331us      10.444us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.07%      22.390us         1.07%      22.390us       7.463us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.86%     163.753us         7.86%     163.753us      54.584us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.080us         0.24%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.191us       690.48%     120.191us     120.191us             1  
+                               hf_kernels_causal_conv1d         4.71%     101.512us        99.75%       2.151ms       2.151ms       0.000us         0.00%      23.231us      23.231us             1  
+                                         CausalConv1dFn         3.18%      68.482us        95.04%       2.050ms     683.192us       0.000us         0.00%      23.231us       7.744us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.07%      23.020us        90.55%       1.953ms     650.931us      17.407us       100.00%      23.231us       7.744us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.407us       100.00%      17.407us       5.802us             3  
+                                Activity Buffer Request        82.08%       1.770ms        82.08%       1.770ms       1.770ms       5.824us        33.46%       5.824us       5.824us             1  
+                                       aten::empty_like         0.34%       7.400us         1.31%      28.300us       9.433us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.97%      20.900us         0.97%      20.900us       6.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.40%     159.664us         7.40%     159.664us      53.221us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.480us         0.25%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.084ms
-Self CUDA time total: 17.376us
+Self CPU time total: 2.157ms
+Self CUDA time total: 17.407us
 
 
 
@@ -4410,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.030us       694.61%     124.030us     124.030us             1  
-                               hf_kernels_causal_conv1d        16.72%      76.202us        98.90%     450.691us     450.691us       0.000us         0.00%      23.872us      23.872us             1  
-                                         CausalConv1dFn        15.17%      69.121us        82.18%     374.489us     124.830us       0.000us         0.00%      23.872us       7.957us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      23.551us        60.29%     274.737us      91.579us      17.856us       100.00%      23.872us       7.957us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.856us       100.00%      17.856us       5.952us             3  
-                                Activity Buffer Request        20.41%      93.033us        20.41%      93.033us      93.033us       6.016us        33.69%       6.016us       6.016us             1  
-                                       aten::empty_like         1.72%       7.841us         6.72%      30.631us      10.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         5.00%      22.790us         5.00%      22.790us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.70%     158.153us        34.70%     158.153us      52.718us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.020us         1.10%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.167us       646.17%     115.167us     115.167us             1  
+                               hf_kernels_causal_conv1d        20.44%      95.313us        98.80%     460.750us     460.750us       0.000us         0.00%      23.806us      23.806us             1  
+                                         CausalConv1dFn        14.34%      66.892us        78.36%     365.437us     121.812us       0.000us         0.00%      23.806us       7.935us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.79%      22.348us        57.90%     270.035us      90.012us      17.823us       100.00%      23.806us       7.935us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.823us       100.00%      17.823us       5.941us             3  
+                                Activity Buffer Request        19.71%      91.942us        19.71%      91.942us      91.942us       5.983us        33.57%       5.983us       5.983us             1  
+                                       aten::empty_like         1.51%       7.049us         6.11%      28.510us       9.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.60%      21.461us         4.60%      21.461us       7.154us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.40%     155.745us        33.40%     155.745us      51.915us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.20%       5.611us         1.20%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 455.711us
-Self CUDA time total: 17.856us
+Self CPU time total: 466.361us
+Self CUDA time total: 17.823us
 
 
 
@@ -4432,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.247us       722.62%     129.247us     129.247us             1  
-                               hf_kernels_causal_conv1d         3.52%      75.733us        99.76%       2.146ms       2.146ms       0.000us         0.00%      23.870us      23.870us             1  
-                                         CausalConv1dFn         3.45%      74.171us        96.24%       2.070ms     690.050us       0.000us         0.00%      23.870us       7.957us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      24.680us        91.39%       1.966ms     655.299us      17.886us       100.00%      23.870us       7.957us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.886us       100.00%      17.886us       5.962us             3  
-                                Activity Buffer Request        82.74%       1.780ms        82.74%       1.780ms       1.780ms       5.984us        33.46%       5.984us       5.984us             1  
-                                       aten::empty_like         0.37%       7.882us         1.40%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.03%      22.200us         1.03%      22.200us       7.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.51%     161.493us         7.51%     161.493us      53.831us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.140us         0.24%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.864us       721.68%     128.864us     128.864us             1  
+                               hf_kernels_causal_conv1d         4.82%     101.713us        99.74%       2.105ms       2.105ms       0.000us         0.00%      23.872us      23.872us             1  
+                                         CausalConv1dFn         3.47%      73.151us        94.92%       2.003ms     667.604us       0.000us         0.00%      23.872us       7.957us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.00%      21.171us        90.08%       1.901ms     633.524us      17.856us       100.00%      23.872us       7.957us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.856us       100.00%      17.856us       5.952us             3  
+                                Activity Buffer Request        81.34%       1.716ms        81.34%       1.716ms       1.716ms       6.016us        33.69%       6.016us       6.016us             1  
+                                       aten::empty_like         0.39%       8.269us         1.38%      29.091us       9.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.99%      20.822us         0.99%      20.822us       6.941us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.73%     163.083us         7.73%     163.083us      54.361us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.420us         0.26%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.151ms
-Self CUDA time total: 17.886us
+Self CPU time total: 2.110ms
+Self CUDA time total: 17.856us
 
 
 
@@ -4454,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.387us       688.73%     127.387us     127.387us             1  
-                               hf_kernels_causal_conv1d        17.30%      79.433us        98.83%     453.742us     453.742us       0.000us         0.00%      24.704us      24.704us             1  
-                                         CausalConv1dFn        15.18%      69.712us        81.53%     374.309us     124.770us       0.000us         0.00%      24.704us       8.235us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.33%      24.481us        59.78%     274.427us      91.476us      18.496us       100.00%      24.704us       8.235us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.496us       100.00%      18.496us       6.165us             3  
-                                Activity Buffer Request        19.95%      91.582us        19.95%      91.582us      91.582us       6.208us        33.56%       6.208us       6.208us             1  
-                                       aten::empty_like         1.61%       7.379us         6.57%      30.170us      10.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.96%      22.791us         4.96%      22.791us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.50%     158.364us        34.50%     158.364us      52.788us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.17%       5.350us         1.17%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.180us       600.52%     113.180us     113.180us             1  
+                               hf_kernels_causal_conv1d        20.91%      97.193us        98.79%     459.100us     459.100us       0.000us         0.00%      25.055us      25.055us             1  
+                                         CausalConv1dFn        14.11%      65.571us        77.87%     361.907us     120.636us       0.000us         0.00%      25.055us       8.352us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.76%      22.110us        57.70%     268.175us      89.392us      18.847us       100.00%      25.055us       8.352us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.847us       100.00%      18.847us       6.282us             3  
+                                Activity Buffer Request        19.33%      89.852us        19.33%      89.852us      89.852us       6.208us        32.94%       6.208us       6.208us             1  
+                                       aten::empty_like         1.50%       6.961us         6.06%      28.161us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.56%      21.200us         4.56%      21.200us       7.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.61%     156.213us        33.61%     156.213us      52.071us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.21%       5.640us         1.21%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 459.092us
-Self CUDA time total: 18.496us
+Self CPU time total: 464.740us
+Self CUDA time total: 18.847us
 
 
 
@@ -4476,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         4.39%      92.153us        99.77%       2.095ms       2.095ms       0.000us         0.00%     161.921us     161.921us             1  
-                                         CausalConv1dFn         3.68%      77.182us        95.38%       2.002ms     667.453us       0.000us         0.00%     161.921us      53.974us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      23.952us        90.24%       1.894ms     631.499us      97.345us       100.00%     161.921us      53.974us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     141.279us       145.13%     141.279us     141.279us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.345us       100.00%      97.345us      32.448us             3  
-                                Activity Buffer Request        81.45%       1.710ms        81.45%       1.710ms       1.710ms      64.576us        66.34%      64.576us      64.576us             1  
-                                       aten::empty_like         0.41%       8.600us         1.46%      30.680us      10.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.05%      22.080us         1.05%      22.080us       7.360us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.65%     160.703us         7.65%     160.703us      53.568us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.850us         0.23%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         4.80%     101.712us        99.74%       2.115ms       2.115ms       0.000us         0.00%     161.662us     161.662us             1  
+                                         CausalConv1dFn         3.27%      69.301us        94.94%       2.014ms     671.195us       0.000us         0.00%     161.662us      53.887us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.10%      23.392us        90.33%       1.916ms     638.601us      97.023us       100.00%     161.662us      53.887us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.911us       134.93%     130.911us     130.911us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.023us       100.00%      97.023us      32.341us             3  
+                                Activity Buffer Request        81.56%       1.730ms        81.56%       1.730ms       1.730ms      64.639us        66.62%      64.639us      64.639us             1  
+                                       aten::empty_like         0.39%       8.180us         1.34%      28.480us       9.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.96%      20.300us         0.96%      20.300us       6.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.66%     162.533us         7.66%     162.533us      54.178us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.581us         0.26%       5.581us       5.581us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.099ms
-Self CUDA time total: 97.345us
+Self CPU time total: 2.121ms
+Self CUDA time total: 97.023us
 
 
 
@@ -4498,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        16.74%      76.723us        98.88%     453.112us     453.112us       0.000us         0.00%     162.555us     162.555us             1  
-                                         CausalConv1dFn        14.95%      68.512us        82.13%     376.389us     125.463us       0.000us         0.00%     162.555us      54.185us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.28%      24.211us        60.48%     277.136us      92.379us      98.269us       100.00%     162.555us      54.185us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.235us       132.53%     130.235us     130.235us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.269us       100.00%      98.269us      32.756us             3  
-                                Activity Buffer Request        20.17%      92.422us        20.17%      92.422us      92.422us      64.286us        65.42%      64.286us      64.286us             1  
-                                       aten::empty_like         1.77%       8.119us         6.71%      30.741us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.94%      22.622us         4.94%      22.622us       7.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.02%     160.503us        35.02%     160.503us      53.501us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.12%       5.150us         1.12%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        19.73%      90.504us        98.74%     452.900us     452.900us       0.000us         0.00%     165.018us     165.018us             1  
+                                         CausalConv1dFn        14.46%      66.339us        79.01%     362.396us     120.799us       0.000us         0.00%     165.018us      55.006us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.97%      22.791us        58.44%     268.046us      89.349us      99.612us       100.00%     165.018us      55.006us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.188us       124.67%     124.188us     124.188us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.612us       100.00%      99.612us      33.204us             3  
+                                Activity Buffer Request        19.01%      87.182us        19.01%      87.182us      87.182us      65.406us        65.66%      65.406us      65.406us             1  
+                                       aten::empty_like         1.58%       7.240us         6.11%      28.011us       9.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.53%      20.771us         4.53%      20.771us       6.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.46%     158.073us        34.46%     158.073us      52.691us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.26%       5.760us         1.26%       5.760us       5.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 458.262us
-Self CUDA time total: 98.269us
+Self CPU time total: 458.660us
+Self CUDA time total: 99.612us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4542,14 +4542,15 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
-
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 11 files: 45%|████▌ | 5/11 [00:00<00:00, 48.10it/s] -Fetching 11 files: 91%|█████████ | 10/11 [00:01<00:00, 4.70it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.98it/s]
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 9%|▉ | 1/11 [00:00<00:02, 4.50it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.34it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.31it/s]

Artifacts:

causal_conv1d.jsonl