diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 18:56:39 2025       
+
Fri Dec 19 19:54:47 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   37C    P0             82W /  350W |       0MiB /  46068MiB |     14%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 10.23s
+Cell: benchmark | 6.16s
  | 
 
 Raw
@@ -3992,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.896us      3791.12%     152.896us     152.896us             1  
-                               hf_kernels_causal_conv1d         7.07%     153.743us        99.32%       2.160ms       2.160ms       0.000us         0.00%       5.442us       5.442us             1  
-                                         CausalConv1dFn         4.91%     106.822us        92.25%       2.006ms     668.831us       0.000us         0.00%       5.442us       1.814us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      24.830us        84.38%       1.835ms     611.780us       4.033us       100.00%       5.442us       1.814us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.033us       100.00%       4.033us       1.344us             3  
-                                Activity Buffer Request        80.94%       1.760ms        80.94%       1.760ms       1.760ms       1.409us        34.94%       1.409us       1.409us             1  
-                                       aten::empty_like         0.92%      19.971us         2.96%      64.331us      21.444us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.04%      44.360us         2.04%      44.360us      14.787us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.30%      50.042us         2.30%      50.042us      16.681us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.68%      14.871us         0.68%      14.871us      14.871us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.993us      3625.73%     144.993us     144.993us             1  
+                               hf_kernels_causal_conv1d         6.46%     138.804us        99.34%       2.133ms       2.133ms       0.000us         0.00%       5.407us       5.407us             1  
+                                         CausalConv1dFn         4.85%     104.092us        92.88%       1.994ms     664.706us       0.000us         0.00%       5.407us       1.802us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      25.081us        85.08%       1.827ms     608.915us       3.999us       100.00%       5.407us       1.802us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.999us       100.00%       3.999us       1.333us             3  
+                                Activity Buffer Request        81.62%       1.752ms        81.62%       1.752ms       1.752ms       1.408us        35.21%       1.408us       1.408us             1  
+                                       aten::empty_like         0.88%      18.799us         2.95%      63.281us      21.094us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.07%      44.482us         2.07%      44.482us      14.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.29%      49.201us         2.29%      49.201us      16.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.66%      14.170us         0.66%      14.170us      14.170us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.175ms
-Self CUDA time total: 4.033us
+Self CPU time total: 2.147ms
+Self CUDA time total: 3.999us
 
 
 
@@ -4014,18 +4014,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.502us      3138.41%     117.502us     117.502us             1  
-                               hf_kernels_causal_conv1d         4.12%      80.030us        99.72%       1.937ms       1.937ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.57%      69.262us        95.60%       1.857ms     618.894us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.24%      24.108us        90.50%       1.758ms     585.930us       3.744us       100.00%       4.992us       1.664us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.991us      3231.60%     120.991us     120.991us             1  
+                               hf_kernels_causal_conv1d         3.92%      79.473us        99.73%       2.024ms       2.024ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.49%      70.790us        95.81%       1.944ms     648.019us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.702us        90.78%       1.842ms     614.002us       3.744us       100.00%       4.992us       1.664us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        87.68%       1.703ms        87.68%       1.703ms       1.703ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.41%       7.921us         1.53%      29.631us       9.877us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.12%      21.710us         1.12%      21.710us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.59%      30.823us         1.59%      30.823us      10.274us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.491us         0.28%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        88.06%       1.787ms        88.06%       1.787ms       1.787ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.40%       8.072us         1.54%      31.262us      10.421us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.14%      23.190us         1.14%      23.190us       7.730us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.51%      30.580us         1.51%      30.580us      10.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.530us         0.27%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.942ms
+Self CPU time total: 2.029ms
 Self CUDA time total: 3.744us
 
 
@@ -4036,19 +4036,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.807us      3093.14%     115.807us     115.807us             1  
-                               hf_kernels_causal_conv1d         4.02%      76.850us        99.72%       1.908ms       1.908ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.58%      68.550us        95.70%       1.831ms     610.351us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.25%      23.951us        90.61%       1.734ms     577.870us       3.744us       100.00%       4.992us       1.664us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        87.74%       1.679ms        87.74%       1.679ms       1.679ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.43%       8.141us         1.51%      28.892us       9.631us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.08%      20.751us         1.08%      20.751us       6.917us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.62%      30.931us         1.62%      30.931us      10.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.380us         0.28%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.553us      3165.29%     119.553us     119.553us             1  
+                               hf_kernels_causal_conv1d         5.28%     102.383us        99.71%       1.932ms       1.932ms       0.000us         0.00%       5.026us       5.026us             1  
+                                         CausalConv1dFn         3.60%      69.861us        94.43%       1.830ms     610.001us       0.000us         0.00%       5.026us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.29%      25.081us        89.25%       1.730ms     576.561us       3.777us       100.00%       5.026us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
+                                Activity Buffer Request        86.29%       1.672ms        86.29%       1.672ms       1.672ms       1.249us        33.07%       1.249us       1.249us             1  
+                                       aten::empty_like         0.46%       8.829us         1.57%      30.461us      10.154us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.12%      21.632us         1.12%      21.632us       7.211us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.67%      32.330us         1.67%      32.330us      10.777us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.571us         0.29%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.913ms
-Self CUDA time total: 3.744us
+Self CPU time total: 1.938ms
+Self CUDA time total: 3.777us
 
 
 
@@ -4058,19 +4058,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.191us      3366.36%     128.191us     128.191us             1  
-                               hf_kernels_causal_conv1d         3.49%      79.180us        99.78%       2.264ms       2.264ms       0.000us         0.00%       5.088us       5.088us             1  
-                                         CausalConv1dFn         3.32%      75.331us        96.29%       2.185ms     728.346us       0.000us         0.00%       5.088us       1.696us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.06%      23.951us        91.65%       2.080ms     693.279us       3.808us       100.00%       5.088us       1.696us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
-                                Activity Buffer Request        79.24%       1.798ms        79.24%       1.798ms       1.798ms       1.280us        33.61%       1.280us       1.280us             1  
-                                       aten::empty_like         0.38%       8.540us         1.32%      29.871us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.94%      21.331us         0.94%      21.331us       7.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.35%     257.655us        11.35%     257.655us      85.885us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.22%       5.020us         0.22%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.951us      3441.50%     129.951us     129.951us             1  
+                               hf_kernels_causal_conv1d         4.54%     106.623us        99.77%       2.341ms       2.341ms       0.000us         0.00%       5.024us       5.024us             1  
+                                         CausalConv1dFn         3.24%      75.911us        95.23%       2.234ms     744.741us       0.000us         0.00%       5.024us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.97%      22.670us        90.70%       2.128ms     709.344us       3.776us       100.00%       5.024us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        79.32%       1.861ms        79.32%       1.861ms       1.861ms       1.248us        33.05%       1.248us       1.248us             1  
+                                       aten::empty_like         0.36%       8.382us         1.29%      30.281us      10.094us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.93%      21.899us         0.93%      21.899us       7.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.41%     244.307us        10.41%     244.307us      81.436us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.280us         0.23%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.269ms
-Self CUDA time total: 3.808us
+Self CPU time total: 2.346ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4080,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.753us      2573.98%     122.753us     122.753us             1  
-                               hf_kernels_causal_conv1d         3.87%      79.741us        99.76%       2.057ms       2.057ms       0.000us         0.00%       6.369us       6.369us             1  
-                                         CausalConv1dFn         3.55%      73.230us        95.89%       1.978ms     659.221us       0.000us         0.00%       6.369us       2.123us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.292us        90.96%       1.876ms     625.288us       4.769us       100.00%       6.369us       2.123us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.769us       100.00%       4.769us       1.590us             3  
-                                Activity Buffer Request        80.59%       1.662ms        80.59%       1.662ms       1.662ms       1.600us        33.55%       1.600us       1.600us             1  
-                                       aten::empty_like         0.38%       7.870us         1.39%      28.571us       9.524us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.00%      20.701us         1.00%      20.701us       6.900us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.24%     190.473us         9.24%     190.473us      63.491us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.940us         0.24%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.920us      2724.83%     129.920us     129.920us             1  
+                               hf_kernels_causal_conv1d         4.79%     104.612us        99.77%       2.179ms       2.179ms       0.000us         0.00%       6.368us       6.368us             1  
+                                         CausalConv1dFn         3.53%      77.212us        94.98%       2.075ms     691.584us       0.000us         0.00%       6.368us       2.123us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.05%      22.870us        90.05%       1.967ms     655.679us       4.768us       100.00%       6.368us       2.123us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
+                                Activity Buffer Request        78.82%       1.722ms        78.82%       1.722ms       1.722ms       1.600us        33.56%       1.600us       1.600us             1  
+                                       aten::empty_like         0.38%       8.362us         1.40%      30.502us      10.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.01%      22.140us         1.01%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.18%     222.375us        10.18%     222.375us      74.125us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.100us         0.23%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.062ms
-Self CUDA time total: 4.769us
+Self CPU time total: 2.184ms
+Self CUDA time total: 4.768us
 
 
 
@@ -4102,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.129us      2403.33%     116.129us     116.129us             1  
-                               hf_kernels_causal_conv1d        14.08%      75.841us        99.01%     533.389us     533.389us       0.000us         0.00%       6.464us       6.464us             1  
-                                         CausalConv1dFn        12.88%      69.403us        84.93%     457.548us     152.516us       0.000us         0.00%       6.464us       2.155us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.42%      23.809us        66.86%     360.195us     120.065us       4.832us       100.00%       6.464us       2.155us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.832us       100.00%       4.832us       1.611us             3  
-                                Activity Buffer Request        31.92%     171.983us        31.92%     171.983us     171.983us       1.632us        33.77%       1.632us       1.632us             1  
-                                       aten::empty_like         1.47%       7.920us         5.19%      27.950us       9.317us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.72%      20.030us         3.72%      20.030us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.52%     164.403us        30.52%     164.403us      54.801us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.99%       5.360us         0.99%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.830us      2433.96%     116.830us     116.830us             1  
+                               hf_kernels_causal_conv1d        11.09%      77.122us        99.27%     690.457us     690.457us       0.000us         0.00%       6.432us       6.432us             1  
+                                         CausalConv1dFn        10.04%      69.842us        88.18%     613.335us     204.445us       0.000us         0.00%       6.432us       2.144us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.34%      23.210us        74.09%     515.312us     171.771us       4.800us       100.00%       6.432us       2.144us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
+                                Activity Buffer Request        40.97%     284.977us        40.97%     284.977us     284.977us       1.632us        34.00%       1.632us       1.632us             1  
+                                       aten::empty_like         1.03%       7.161us         4.05%      28.181us       9.394us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.02%      21.020us         3.02%      21.020us       7.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.78%     207.125us        29.78%     207.125us      69.042us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.73%       5.090us         0.73%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 538.749us
-Self CUDA time total: 4.832us
+Self CPU time total: 695.547us
+Self CUDA time total: 4.800us
 
 
 
@@ -4124,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.560us      1146.71%     122.560us     122.560us             1  
-                               hf_kernels_causal_conv1d         3.81%      80.881us        99.76%       2.118ms       2.118ms       0.000us         0.00%      14.272us      14.272us             1  
-                                         CausalConv1dFn         3.44%      73.000us        95.95%       2.037ms     679.028us       0.000us         0.00%      14.272us       4.757us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      22.840us        91.17%       1.936ms     645.207us      10.688us       100.00%      14.272us       4.757us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.688us       100.00%      10.688us       3.563us             3  
-                                Activity Buffer Request        82.40%       1.750ms        82.40%       1.750ms       1.750ms       3.584us        33.53%       3.584us       3.584us             1  
-                                       aten::empty_like         0.36%       7.642us         1.34%      28.462us       9.487us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      20.820us         0.98%      20.820us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.69%     163.253us         7.69%     163.253us      54.418us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.130us         0.24%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.990us      1138.94%     120.990us     120.990us             1  
+                               hf_kernels_causal_conv1d         8.25%      76.440us        99.44%     921.732us     921.732us       0.000us         0.00%      14.206us      14.206us             1  
+                                         CausalConv1dFn         7.43%      68.843us        91.20%     845.292us     281.764us       0.000us         0.00%      14.206us       4.735us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      23.801us        80.53%     746.458us     248.819us      10.623us       100.00%      14.206us       4.735us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.623us       100.00%      10.623us       3.541us             3  
+                                Activity Buffer Request        58.01%     537.643us        58.01%     537.643us     537.643us       3.583us        33.73%       3.583us       3.583us             1  
+                                       aten::empty_like         0.88%       8.201us         3.24%      29.991us       9.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.35%      21.790us         2.35%      21.790us       7.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.96%     185.014us        19.96%     185.014us      61.671us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.56%       5.149us         0.56%       5.149us       5.149us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.123ms
-Self CUDA time total: 10.688us
+Self CPU time total: 926.881us
+Self CUDA time total: 10.623us
 
 
 
@@ -4146,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.638us      1078.28%     116.638us     116.638us             1  
-                               hf_kernels_causal_conv1d        17.39%      80.341us        98.91%     457.098us     457.098us       0.000us         0.00%      14.401us      14.401us             1  
-                                         CausalConv1dFn        15.12%      69.882us        81.53%     376.757us     125.586us       0.000us         0.00%      14.401us       4.800us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.92%      22.749us        60.38%     279.004us      93.001us      10.817us       100.00%      14.401us       4.800us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.817us       100.00%      10.817us       3.606us             3  
-                                Activity Buffer Request        22.08%     102.032us        22.08%     102.032us     102.032us       3.584us        33.13%       3.584us       3.584us             1  
-                                       aten::empty_like         1.64%       7.601us         6.03%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.39%      20.270us         4.39%      20.270us       6.757us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.37%     154.223us        33.37%     154.223us      51.408us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.09%       5.020us         1.09%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.338us      1124.43%     122.338us     122.338us             1  
+                               hf_kernels_causal_conv1d         8.49%      80.052us        99.43%     937.274us     937.274us       0.000us         0.00%      14.560us      14.560us             1  
+                                         CausalConv1dFn         7.62%      71.802us        90.94%     857.222us     285.741us       0.000us         0.00%      14.560us       4.853us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      24.180us        80.29%     756.859us     252.286us      10.880us       100.00%      14.560us       4.853us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.880us       100.00%      10.880us       3.627us             3  
+                                Activity Buffer Request        59.85%     564.194us        59.85%     564.194us     564.194us       3.680us        33.82%       3.680us       3.680us             1  
+                                       aten::empty_like         0.81%       7.670us         3.03%      28.561us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.22%      20.891us         2.22%      20.891us       6.964us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        17.87%     168.485us        17.87%     168.485us      56.162us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.57%       5.361us         0.57%       5.361us       5.361us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 462.118us
-Self CUDA time total: 10.817us
+Self CPU time total: 942.635us
+Self CUDA time total: 10.880us
 
 
 
@@ -4168,18 +4168,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.873us      1165.02%     127.873us     127.873us             1  
-                               hf_kernels_causal_conv1d         3.83%      78.590us        99.73%       2.045ms       2.045ms       0.000us         0.00%      14.656us      14.656us             1  
-                                         CausalConv1dFn         3.56%      73.060us        95.90%       1.966ms     655.431us       0.000us         0.00%      14.656us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.04%      21.271us        90.90%       1.864ms     621.251us      10.976us       100.00%      14.656us       4.885us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.254us      1095.61%     120.254us     120.254us             1  
+                               hf_kernels_causal_conv1d        12.25%      73.512us        99.17%     595.034us     595.034us       0.000us         0.00%      14.688us      14.688us             1  
+                                         CausalConv1dFn        11.59%      69.552us        86.91%     521.522us     173.841us       0.000us         0.00%      14.688us       4.896us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.21%      25.240us        70.04%     420.260us     140.087us      10.976us       100.00%      14.688us       4.896us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        82.22%       1.686ms        82.22%       1.686ms       1.686ms       3.680us        33.53%       3.680us       3.680us             1  
-                                       aten::empty_like         0.42%       8.532us         1.44%      29.482us       9.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.02%      20.950us         1.02%      20.950us       6.983us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.64%     156.623us         7.64%     156.623us      52.208us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.481us         0.27%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        38.36%     230.206us        38.36%     230.206us     230.206us       3.712us        33.82%       3.712us       3.712us             1  
+                                       aten::empty_like         1.43%       8.570us         5.28%      31.710us      10.570us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.86%      23.140us         3.86%      23.140us       7.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.47%     164.814us        27.47%     164.814us      54.938us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.010us         0.83%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.050ms
+Self CPU time total: 600.044us
 Self CUDA time total: 10.976us
 
 
@@ -4190,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.132us      1060.65%     119.132us     119.132us             1  
-                               hf_kernels_causal_conv1d        16.93%      76.370us        98.84%     445.867us     445.867us       0.000us         0.00%      14.976us      14.976us             1  
-                                         CausalConv1dFn        15.59%      70.303us        81.91%     369.497us     123.166us       0.000us         0.00%      14.976us       4.992us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      24.158us        60.06%     270.914us      90.305us      11.232us       100.00%      14.976us       4.992us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.232us       100.00%      11.232us       3.744us             3  
-                                Activity Buffer Request        20.43%      92.161us        20.43%      92.161us      92.161us       3.744us        33.33%       3.744us       3.744us             1  
-                                       aten::empty_like         1.59%       7.169us         6.27%      28.280us       9.427us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.68%      21.111us         4.68%      21.111us       7.037us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.27%     154.595us        34.27%     154.595us      51.532us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.16%       5.220us         1.16%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.621us      1053.10%     118.621us     118.621us             1  
+                               hf_kernels_causal_conv1d        18.68%      98.702us        99.03%     523.193us     523.193us       0.000us         0.00%      15.040us      15.040us             1  
+                                         CausalConv1dFn        13.73%      72.543us        80.34%     424.491us     141.497us       0.000us         0.00%      15.040us       5.013us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.48%      23.691us        60.95%     322.037us     107.346us      11.264us       100.00%      15.040us       5.013us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.264us       100.00%      11.264us       3.755us             3  
+                                Activity Buffer Request        26.02%     137.473us        26.02%     137.473us     137.473us       3.776us        33.52%       3.776us       3.776us             1  
+                                       aten::empty_like         1.47%       7.771us         5.66%      29.911us       9.970us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.19%      22.140us         4.19%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.45%     160.873us        30.45%     160.873us      53.624us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       5.150us         0.97%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 451.087us
-Self CUDA time total: 11.232us
+Self CPU time total: 528.343us
+Self CUDA time total: 11.264us
 
 
 
@@ -4212,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.111us       251.50%     126.111us     126.111us             1  
-                               hf_kernels_causal_conv1d         3.58%      73.510us        99.72%       2.049ms       2.049ms       0.000us         0.00%      83.392us      83.392us             1  
-                                         CausalConv1dFn         3.49%      71.722us        96.15%       1.975ms     658.418us       0.000us         0.00%      83.392us      27.797us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      23.351us        91.25%       1.875ms     624.854us      50.144us       100.00%      83.392us      27.797us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.144us       100.00%      50.144us      16.715us             3  
-                                Activity Buffer Request        82.17%       1.688ms        82.17%       1.688ms       1.688ms      33.248us        66.31%      33.248us      33.248us             1  
-                                       aten::empty_like         0.39%       8.051us         1.41%      28.971us       9.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.02%      20.920us         1.02%      20.920us       6.973us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.94%     163.112us         7.94%     163.112us      54.371us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.660us         0.28%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.863us       259.30%     128.863us     128.863us             1  
+                               hf_kernels_causal_conv1d         3.60%      75.972us        99.76%       2.104ms       2.104ms       0.000us         0.00%      82.688us      82.688us             1  
+                                         CausalConv1dFn         3.37%      70.961us        96.16%       2.028ms     675.870us       0.000us         0.00%      82.688us      27.563us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.16%      24.491us        91.36%       1.926ms     642.136us      49.696us       100.00%      82.688us      27.563us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.696us       100.00%      49.696us      16.565us             3  
+                                Activity Buffer Request        82.37%       1.737ms        82.37%       1.737ms       1.737ms      32.992us        66.39%      32.992us      32.992us             1  
+                                       aten::empty_like         0.41%       8.720us         1.43%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.02%      21.520us         1.02%      21.520us       7.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.83%     165.005us         7.83%     165.005us      55.002us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.070us         0.24%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.054ms
-Self CUDA time total: 50.144us
+Self CPU time total: 2.109ms
+Self CUDA time total: 49.696us
 
 
 
@@ -4234,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.959us       269.13%     124.959us     124.959us             1  
-                               hf_kernels_causal_conv1d        16.37%      75.480us        98.93%     456.147us     456.147us       0.000us         0.00%      75.742us      75.742us             1  
-                                         CausalConv1dFn        15.52%      71.561us        82.56%     380.667us     126.889us       0.000us         0.00%      75.742us      25.247us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.18%      23.872us        60.72%     279.966us      93.322us      46.431us       100.00%      75.742us      25.247us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.431us       100.00%      46.431us      15.477us             3  
-                                Activity Buffer Request        21.66%      99.871us        21.66%      99.871us      99.871us      29.311us        63.13%      29.311us      29.311us             1  
-                                       aten::empty_like         1.56%       7.180us         6.32%      29.140us       9.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.76%      21.960us         4.76%      21.960us       7.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.88%     156.223us        33.88%     156.223us      52.074us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.07%       4.930us         1.07%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.493us       263.09%     122.493us     122.493us             1  
+                               hf_kernels_causal_conv1d        15.75%      73.853us        98.92%     463.792us     463.792us       0.000us         0.00%      76.190us      76.190us             1  
+                                         CausalConv1dFn        14.80%      69.370us        83.17%     389.939us     129.980us       0.000us         0.00%      76.190us      25.397us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.26%      24.662us        62.03%     290.847us      96.949us      46.559us       100.00%      76.190us      25.397us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.559us       100.00%      46.559us      15.520us             3  
+                                Activity Buffer Request        22.78%     106.792us        22.78%     106.792us     106.792us      29.631us        63.64%      29.631us      29.631us             1  
+                                       aten::empty_like         1.83%       8.590us         6.34%      29.722us       9.907us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.51%      21.132us         4.51%      21.132us       7.044us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.00%     159.393us        34.00%     159.393us      53.131us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.08%       5.060us         1.08%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 461.077us
-Self CUDA time total: 46.431us
+Self CPU time total: 468.852us
+Self CUDA time total: 46.559us
 
 
 
@@ -4256,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.136us      3207.50%     123.136us     123.136us             1  
-                               hf_kernels_causal_conv1d         3.53%      75.250us        99.73%       2.124ms       2.124ms       0.000us         0.00%       5.055us       5.055us             1  
-                                         CausalConv1dFn         3.40%      72.360us        96.19%       2.049ms     682.905us       0.000us         0.00%       5.055us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.03%      21.891us        91.44%       1.947ms     649.155us       3.839us       100.00%       5.055us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.839us       100.00%       3.839us       1.280us             3  
-                                Activity Buffer Request        83.03%       1.768ms        83.03%       1.768ms       1.768ms       1.216us        31.67%       1.216us       1.216us             1  
-                                       aten::empty_like         0.38%       8.101us         1.36%      28.892us       9.631us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      20.791us         0.98%      20.791us       6.930us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.38%     157.153us         7.38%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.800us         0.27%       5.800us       5.800us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.279us      3132.21%     121.279us     121.279us             1  
+                               hf_kernels_causal_conv1d         4.47%      97.233us        99.75%       2.167ms       2.167ms       0.000us         0.00%       5.088us       5.088us             1  
+                                         CausalConv1dFn         3.35%      72.763us        95.28%       2.070ms     690.087us       0.000us         0.00%       5.088us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.09%      23.651us        90.51%       1.967ms     655.553us       3.872us       100.00%       5.088us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.872us       100.00%       3.872us       1.291us             3  
+                                Activity Buffer Request        81.94%       1.781ms        81.94%       1.781ms       1.781ms       1.216us        31.40%       1.216us       1.216us             1  
+                                       aten::empty_like         0.44%       9.520us         1.42%      30.840us      10.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.98%      21.320us         0.98%      21.320us       7.107us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.48%     162.473us         7.48%     162.473us      54.158us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.420us         0.25%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.130ms
-Self CUDA time total: 3.839us
+Self CPU time total: 2.173ms
+Self CUDA time total: 3.872us
 
 
 
@@ -4278,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.287us      2977.90%     116.287us     116.287us             1  
-                               hf_kernels_causal_conv1d        16.77%      75.910us        98.91%     447.847us     447.847us       0.000us         0.00%       5.153us       5.153us             1  
-                                         CausalConv1dFn        15.17%      68.691us        82.15%     371.937us     123.979us       0.000us         0.00%       5.153us       1.718us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.60%      25.371us        60.63%     274.535us      91.512us       3.905us       100.00%       5.153us       1.718us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        20.81%      94.211us        20.81%      94.211us      94.211us       1.248us        31.96%       1.248us       1.248us             1  
-                                       aten::empty_like         1.64%       7.411us         6.34%      28.711us       9.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.70%      21.300us         4.70%      21.300us       7.100us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.22%     154.953us        34.22%     154.953us      51.651us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.09%       4.920us         1.09%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.259us      3001.54%     115.259us     115.259us             1  
+                               hf_kernels_causal_conv1d        21.11%     101.882us        98.95%     477.452us     477.452us       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn        14.32%      69.113us        77.83%     375.570us     125.190us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.08%      24.499us        57.45%     277.196us      92.399us       3.840us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
+                                Activity Buffer Request        19.54%      94.283us        19.54%      94.283us      94.283us       1.216us        31.67%       1.216us       1.216us             1  
+                                       aten::empty_like         1.58%       7.611us         6.06%      29.261us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.49%      21.650us         4.49%      21.650us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.83%     158.414us        32.83%     158.414us      52.805us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.05%       5.080us         1.05%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 452.767us
-Self CUDA time total: 3.905us
+Self CPU time total: 482.532us
+Self CUDA time total: 3.840us
 
 
 
@@ -4300,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.009us      2882.27%     119.009us     119.009us             1  
-                               hf_kernels_causal_conv1d         3.71%      75.410us        99.73%       2.029ms       2.029ms       0.000us         0.00%       5.506us       5.506us             1  
-                                         CausalConv1dFn         3.78%      76.960us        96.02%       1.953ms     651.101us       0.000us         0.00%       5.506us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.18%      24.051us        90.82%       1.848ms     615.834us       4.129us       100.00%       5.506us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.129us       100.00%       4.129us       1.376us             3  
-                                Activity Buffer Request        81.81%       1.664ms        81.81%       1.664ms       1.664ms       1.377us        33.35%       1.377us       1.377us             1  
-                                       aten::empty_like         0.36%       7.250us         1.42%      28.841us       9.614us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.06%      21.591us         1.06%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.83%     159.204us         7.83%     159.204us      53.068us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.590us         0.27%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.752us      2996.14%     122.752us     122.752us             1  
+                               hf_kernels_causal_conv1d         4.60%      95.013us        99.74%       2.062ms       2.062ms       0.000us         0.00%       5.473us       5.473us             1  
+                                         CausalConv1dFn         3.49%      72.262us        95.14%       1.967ms     655.743us       0.000us         0.00%       5.473us       1.824us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.30%      26.862us        90.17%       1.864ms     621.455us       4.097us       100.00%       5.473us       1.824us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.097us       100.00%       4.097us       1.366us             3  
+                                Activity Buffer Request        81.16%       1.678ms        81.16%       1.678ms       1.678ms       1.376us        33.59%       1.376us       1.376us             1  
+                                       aten::empty_like         0.40%       8.350us         1.48%      30.601us      10.200us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.08%      22.251us         1.08%      22.251us       7.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.71%     159.333us         7.71%     159.333us      53.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.450us         0.26%       5.450us       5.450us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.034ms
-Self CUDA time total: 4.129us
+Self CPU time total: 2.068ms
+Self CUDA time total: 4.097us
 
 
 
@@ -4322,19 +4322,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.758us      2821.38%     113.758us     113.758us             1  
-                               hf_kernels_causal_conv1d        16.75%      75.370us        98.94%     445.237us     445.237us       0.000us         0.00%       5.376us       5.376us             1  
-                                         CausalConv1dFn        15.04%      67.702us        82.19%     369.867us     123.289us       0.000us         0.00%       5.376us       1.792us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.18%      23.329us        60.99%     274.454us      91.485us       4.032us       100.00%       5.376us       1.792us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        21.24%      95.561us        21.24%      95.561us      95.561us       1.344us        33.33%       1.344us       1.344us             1  
-                                       aten::empty_like         1.71%       7.680us         6.16%      27.711us       9.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.45%      20.031us         4.45%      20.031us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.57%     155.564us        34.57%     155.564us      51.855us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.06%       4.760us         1.06%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.482us      3037.69%     123.482us     123.482us             1  
+                               hf_kernels_causal_conv1d        22.91%     113.331us        98.90%     489.311us     489.311us       0.000us         0.00%       5.441us       5.441us             1  
+                                         CausalConv1dFn        13.95%      69.033us        75.99%     375.980us     125.327us       0.000us         0.00%       5.441us       1.814us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.14%      25.431us        55.79%     276.017us      92.006us       4.065us       100.00%       5.441us       1.814us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        18.49%      91.492us        18.49%      91.492us      91.492us       1.376us        33.85%       1.376us       1.376us             1  
+                                       aten::empty_like         1.55%       7.680us         6.25%      30.930us      10.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.70%      23.250us         4.70%      23.250us       7.750us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.16%     159.094us        32.16%     159.094us      53.031us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       5.440us         1.10%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 449.997us
-Self CUDA time total: 4.032us
+Self CPU time total: 494.751us
+Self CUDA time total: 4.065us
 
 
 
@@ -4344,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2257.83%     119.936us     119.936us             1  
-                               hf_kernels_causal_conv1d         3.67%      76.002us        99.74%       2.065ms       2.065ms       0.000us         0.00%       7.104us       7.104us             1  
-                                         CausalConv1dFn         3.38%      70.021us        96.07%       1.989ms     663.104us       0.000us         0.00%       7.104us       2.368us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.09%      22.660us        91.27%       1.890ms     629.940us       5.312us       100.00%       7.104us       2.368us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.312us       100.00%       5.312us       1.771us             3  
-                                Activity Buffer Request        82.69%       1.712ms        82.69%       1.712ms       1.712ms       1.792us        33.73%       1.792us       1.792us             1  
-                                       aten::empty_like         0.39%       8.121us         1.42%      29.471us       9.824us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.03%      21.350us         1.03%      21.350us       7.117us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.48%     154.872us         7.48%     154.872us      51.624us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       5.360us         0.26%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.726us      2264.25%     121.726us     121.726us             1  
+                               hf_kernels_causal_conv1d         4.87%     106.551us        99.77%       2.181ms       2.181ms       0.000us         0.00%       7.200us       7.200us             1  
+                                         CausalConv1dFn         3.29%      71.843us        94.90%       2.074ms     691.407us       0.000us         0.00%       7.200us       2.400us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.24%      27.032us        90.28%       1.973ms     657.779us       5.376us       100.00%       7.200us       2.400us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
+                                Activity Buffer Request        81.68%       1.785ms        81.68%       1.785ms       1.785ms       1.824us        33.93%       1.824us       1.824us             1  
+                                       aten::empty_like         0.35%       7.600us         1.33%      29.041us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.98%      21.441us         0.98%      21.441us       7.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.36%     160.923us         7.36%     160.923us      53.641us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       4.940us         0.23%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.071ms
-Self CUDA time total: 5.312us
+Self CPU time total: 2.186ms
+Self CUDA time total: 5.376us
 
 
 
@@ -4366,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.719us      2209.14%     118.719us     118.719us             1  
-                               hf_kernels_causal_conv1d        11.31%      74.461us        99.17%     653.171us     653.171us       0.000us         0.00%       7.197us       7.197us             1  
-                                         CausalConv1dFn        10.87%      71.560us        87.87%     578.710us     192.903us       0.000us         0.00%       7.197us       2.399us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.42%      22.540us        72.76%     479.219us     159.740us       5.374us       100.00%       7.197us       2.399us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.374us       100.00%       5.374us       1.791us             3  
-                                Activity Buffer Request        44.55%     293.385us        44.55%     293.385us     293.385us       1.823us        33.92%       1.823us       1.823us             1  
-                                       aten::empty_like         1.11%       7.281us         4.24%      27.931us       9.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.14%      20.650us         3.14%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.79%     163.294us        24.79%     163.294us      54.431us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       5.440us         0.83%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.382us      2165.25%     116.382us     116.382us             1  
+                               hf_kernels_causal_conv1d        21.05%     101.062us        98.84%     474.631us     474.631us       0.000us         0.00%       7.198us       7.198us             1  
+                                         CausalConv1dFn        14.89%      71.522us        77.79%     373.569us     124.523us       0.000us         0.00%       7.198us       2.399us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.88%      23.451us        56.74%     272.486us      90.829us       5.375us       100.00%       7.198us       2.399us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.375us       100.00%       5.375us       1.792us             3  
+                                Activity Buffer Request        18.95%      91.012us        18.95%      91.012us      91.012us       1.823us        33.92%       1.823us       1.823us             1  
+                                       aten::empty_like         1.53%       7.370us         6.16%      29.561us       9.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.62%      22.191us         4.62%      22.191us       7.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.91%     158.023us        32.91%     158.023us      52.674us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.16%       5.571us         1.16%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 658.611us
-Self CUDA time total: 5.374us
+Self CPU time total: 480.202us
+Self CUDA time total: 5.375us
 
 
 
@@ -4388,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.560us       679.82%     118.560us     118.560us             1  
-                               hf_kernels_causal_conv1d        11.95%      75.120us        99.18%     623.480us     623.480us       0.000us         0.00%      23.264us      23.264us             1  
-                                         CausalConv1dFn        10.51%      66.091us        87.23%     548.360us     182.787us       0.000us         0.00%      23.264us       7.755us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.07%      25.561us        72.28%     454.367us     151.456us      17.440us       100.00%      23.264us       7.755us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
-                                Activity Buffer Request        43.50%     273.454us        43.50%     273.454us     273.454us       5.824us        33.39%       5.824us       5.824us             1  
-                                       aten::empty_like         1.15%       7.260us         4.44%      27.902us       9.301us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.28%      20.642us         3.28%      20.642us       6.881us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.71%     155.352us        24.71%     155.352us      51.784us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.82%       5.171us         0.82%       5.171us       5.171us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.143us       748.98%     130.143us     130.143us             1  
+                               hf_kernels_causal_conv1d         4.89%     101.833us        99.76%       2.079ms       2.079ms       0.000us         0.00%      23.200us      23.200us             1  
+                                         CausalConv1dFn         3.45%      71.962us        94.87%       1.978ms     659.169us       0.000us         0.00%      23.200us       7.733us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.451us        89.92%       1.874ms     624.738us      17.376us       100.00%      23.200us       7.733us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.376us       100.00%      17.376us       5.792us             3  
+                                Activity Buffer Request        80.93%       1.687ms        80.93%       1.687ms       1.687ms       5.824us        33.52%       5.824us       5.824us             1  
+                                       aten::empty_like         0.43%       8.941us         1.50%      31.331us      10.444us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.07%      22.390us         1.07%      22.390us       7.463us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.86%     163.753us         7.86%     163.753us      54.584us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.080us         0.24%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 628.651us
-Self CUDA time total: 17.440us
+Self CPU time total: 2.084ms
+Self CUDA time total: 17.376us
 
 
 
@@ -4410,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.548us       667.16%     119.548us     119.548us             1  
-                               hf_kernels_causal_conv1d        15.17%      73.960us        98.98%     482.488us     482.488us       0.000us         0.00%      23.967us      23.967us             1  
-                                         CausalConv1dFn        14.28%      69.601us        83.81%     408.528us     136.176us       0.000us         0.00%      23.967us       7.989us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.13%      25.009us        63.88%     311.375us     103.792us      17.919us       100.00%      23.967us       7.989us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.919us       100.00%      17.919us       5.973us             3  
-                                Activity Buffer Request        27.03%     131.752us        27.03%     131.752us     131.752us       6.048us        33.75%       6.048us       6.048us             1  
-                                       aten::empty_like         1.47%       7.161us         5.65%      27.552us       9.184us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.18%      20.391us         4.18%      20.391us       6.797us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.72%     154.614us        31.72%     154.614us      51.538us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.02%       4.960us         1.02%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.030us       694.61%     124.030us     124.030us             1  
+                               hf_kernels_causal_conv1d        16.72%      76.202us        98.90%     450.691us     450.691us       0.000us         0.00%      23.872us      23.872us             1  
+                                         CausalConv1dFn        15.17%      69.121us        82.18%     374.489us     124.830us       0.000us         0.00%      23.872us       7.957us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      23.551us        60.29%     274.737us      91.579us      17.856us       100.00%      23.872us       7.957us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.856us       100.00%      17.856us       5.952us             3  
+                                Activity Buffer Request        20.41%      93.033us        20.41%      93.033us      93.033us       6.016us        33.69%       6.016us       6.016us             1  
+                                       aten::empty_like         1.72%       7.841us         6.72%      30.631us      10.210us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         5.00%      22.790us         5.00%      22.790us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.70%     158.153us        34.70%     158.153us      52.718us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       5.020us         1.10%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 487.448us
-Self CUDA time total: 17.919us
+Self CPU time total: 455.711us
+Self CUDA time total: 17.856us
 
 
 
@@ -4432,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.341us       679.59%     121.341us     121.341us             1  
-                               hf_kernels_causal_conv1d        15.05%      74.270us        98.98%     488.588us     488.588us       0.000us         0.00%      23.871us      23.871us             1  
-                                         CausalConv1dFn        13.97%      68.960us        83.93%     414.318us     138.106us       0.000us         0.00%      23.871us       7.957us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.63%      22.873us        64.07%     316.257us     105.419us      17.855us       100.00%      23.871us       7.957us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.855us       100.00%      17.855us       5.952us             3  
-                                Activity Buffer Request        27.73%     136.862us        27.73%     136.862us     136.862us       6.016us        33.69%       6.016us       6.016us             1  
-                                       aten::empty_like         1.71%       8.441us         5.90%      29.101us       9.700us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.19%      20.660us         4.19%      20.660us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.71%     156.522us        31.71%     156.522us      52.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.02%       5.050us         1.02%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.247us       722.62%     129.247us     129.247us             1  
+                               hf_kernels_causal_conv1d         3.52%      75.733us        99.76%       2.146ms       2.146ms       0.000us         0.00%      23.870us      23.870us             1  
+                                         CausalConv1dFn         3.45%      74.171us        96.24%       2.070ms     690.050us       0.000us         0.00%      23.870us       7.957us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      24.680us        91.39%       1.966ms     655.299us      17.886us       100.00%      23.870us       7.957us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.886us       100.00%      17.886us       5.962us             3  
+                                Activity Buffer Request        82.74%       1.780ms        82.74%       1.780ms       1.780ms       5.984us        33.46%       5.984us       5.984us             1  
+                                       aten::empty_like         0.37%       7.882us         1.40%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.03%      22.200us         1.03%      22.200us       7.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.51%     161.493us         7.51%     161.493us      53.831us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.140us         0.24%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 493.638us
-Self CUDA time total: 17.855us
+Self CPU time total: 2.151ms
+Self CUDA time total: 17.886us
 
 
 
@@ -4454,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.470us       631.86%     117.470us     117.470us             1  
-                               hf_kernels_causal_conv1d        16.24%      72.220us        98.74%     438.977us     438.977us       0.000us         0.00%      24.831us      24.831us             1  
-                                         CausalConv1dFn        14.93%      66.373us        82.49%     366.757us     122.252us       0.000us         0.00%      24.831us       8.277us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.20%      23.120us        61.32%     272.634us      90.878us      18.591us       100.00%      24.831us       8.277us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.591us       100.00%      18.591us       6.197us             3  
-                                Activity Buffer Request        21.26%      94.541us        21.26%      94.541us      94.541us       6.240us        33.56%       6.240us       6.240us             1  
-                                       aten::empty_like         1.56%       6.930us         6.24%      27.750us       9.250us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.68%      20.820us         4.68%      20.820us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.86%     154.973us        34.86%     154.973us      51.658us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.26%       5.620us         1.26%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.387us       688.73%     127.387us     127.387us             1  
+                               hf_kernels_causal_conv1d        17.30%      79.433us        98.83%     453.742us     453.742us       0.000us         0.00%      24.704us      24.704us             1  
+                                         CausalConv1dFn        15.18%      69.712us        81.53%     374.309us     124.770us       0.000us         0.00%      24.704us       8.235us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.33%      24.481us        59.78%     274.427us      91.476us      18.496us       100.00%      24.704us       8.235us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.496us       100.00%      18.496us       6.165us             3  
+                                Activity Buffer Request        19.95%      91.582us        19.95%      91.582us      91.582us       6.208us        33.56%       6.208us       6.208us             1  
+                                       aten::empty_like         1.61%       7.379us         6.57%      30.170us      10.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.96%      22.791us         4.96%      22.791us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.50%     158.364us        34.50%     158.364us      52.788us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.17%       5.350us         1.17%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 444.597us
-Self CUDA time total: 18.591us
+Self CPU time total: 459.092us
+Self CUDA time total: 18.496us
 
 
 
@@ -4476,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        11.95%      72.560us        99.11%     601.930us     601.930us       0.000us         0.00%     162.301us     162.301us             1  
-                                         CausalConv1dFn        11.62%      70.560us        87.17%     529.370us     176.457us       0.000us         0.00%     162.301us      54.100us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.15%      25.181us        71.05%     431.498us     143.833us      97.726us       100.00%     162.301us      54.100us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.654us       133.69%     130.654us     130.654us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.726us       100.00%      97.726us      32.575us             3  
-                                Activity Buffer Request        38.68%     234.904us        38.68%     234.904us     234.904us      64.575us        66.08%      64.575us      64.575us             1  
-                                       aten::empty_like         1.24%       7.541us         4.50%      27.312us       9.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.26%      19.771us         3.26%      19.771us       6.590us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.22%     171.413us        28.22%     171.413us      57.138us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.89%       5.380us         0.89%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         4.39%      92.153us        99.77%       2.095ms       2.095ms       0.000us         0.00%     161.921us     161.921us             1  
+                                         CausalConv1dFn         3.68%      77.182us        95.38%       2.002ms     667.453us       0.000us         0.00%     161.921us      53.974us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      23.952us        90.24%       1.894ms     631.499us      97.345us       100.00%     161.921us      53.974us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     141.279us       145.13%     141.279us     141.279us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.345us       100.00%      97.345us      32.448us             3  
+                                Activity Buffer Request        81.45%       1.710ms        81.45%       1.710ms       1.710ms      64.576us        66.34%      64.576us      64.576us             1  
+                                       aten::empty_like         0.41%       8.600us         1.46%      30.680us      10.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.05%      22.080us         1.05%      22.080us       7.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.65%     160.703us         7.65%     160.703us      53.568us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       4.850us         0.23%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 607.310us
-Self CUDA time total: 97.726us
+Self CPU time total: 2.099ms
+Self CUDA time total: 97.345us
 
 
 
@@ -4498,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        16.49%      73.870us        98.87%     442.877us     442.877us       0.000us         0.00%     163.140us     163.140us             1  
-                                         CausalConv1dFn        15.37%      68.833us        82.38%     369.007us     123.002us       0.000us         0.00%     163.140us      54.380us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.34%      23.921us        60.99%     273.184us      91.061us      98.402us       100.00%     163.140us      54.380us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.828us       131.94%     129.828us     129.828us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.402us       100.00%      98.402us      32.801us             3  
-                                Activity Buffer Request        20.78%      93.061us        20.78%      93.061us      93.061us      64.738us        65.79%      64.738us      64.738us             1  
-                                       aten::empty_like         1.66%       7.440us         6.03%      26.990us       8.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.36%      19.550us         4.36%      19.550us       6.517us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.87%     156.202us        34.87%     156.202us      52.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.13%       5.069us         1.13%       5.069us       5.069us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        16.74%      76.723us        98.88%     453.112us     453.112us       0.000us         0.00%     162.555us     162.555us             1  
+                                         CausalConv1dFn        14.95%      68.512us        82.13%     376.389us     125.463us       0.000us         0.00%     162.555us      54.185us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.28%      24.211us        60.48%     277.136us      92.379us      98.269us       100.00%     162.555us      54.185us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.235us       132.53%     130.235us     130.235us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.269us       100.00%      98.269us      32.756us             3  
+                                Activity Buffer Request        20.17%      92.422us        20.17%      92.422us      92.422us      64.286us        65.42%      64.286us      64.286us             1  
+                                       aten::empty_like         1.77%       8.119us         6.71%      30.741us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.94%      22.622us         4.94%      22.622us       7.541us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.02%     160.503us        35.02%     160.503us      53.501us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.12%       5.150us         1.12%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 447.946us
-Self CUDA time total: 98.402us
+Self CPU time total: 458.262us
+Self CUDA time total: 98.269us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4542,14 +4542,14 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. -Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 8.38it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.41it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.50it/s]
+Fetching 11 files: 45%|████▌ | 5/11 [00:00<00:00, 48.10it/s] +Fetching 11 files: 91%|█████████ | 10/11 [00:01<00:00, 4.70it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.98it/s]

Artifacts:

causal_conv1d.jsonl