diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.22s +Cell: nv | 0.25s | Raw @@ -3905,16 +3905,16 @@ Cell: nv | 0.22s
-
Mon Nov 10 21:57:49 2025       
+
Fri Dec 19 18:56:39 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             77W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   33C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.22s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 10.37s
+Cell: benchmark | 10.23s
  | 
 
 Raw
@@ -3992,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     156.321us      3758.62%     156.321us     156.321us             1  
-                               hf_kernels_causal_conv1d         6.87%     159.072us        99.36%       2.300ms       2.300ms       0.000us         0.00%       5.599us       5.599us             1  
-                                         CausalConv1dFn         4.82%     111.622us        92.49%       2.141ms     713.785us       0.000us         0.00%       5.599us       1.866us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.19%      27.462us        84.76%       1.962ms     654.127us       4.159us       100.00%       5.599us       1.866us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.159us       100.00%       4.159us       1.386us             3  
-                                Activity Buffer Request        81.39%       1.884ms        81.39%       1.884ms       1.884ms       1.440us        34.62%       1.440us       1.440us             1  
-                                       aten::empty_like         0.94%      21.650us         2.91%      67.351us      22.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.97%      45.701us         1.97%      45.701us      15.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.18%      50.500us         2.18%      50.500us      16.833us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.64%      14.811us         0.64%      14.811us      14.811us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.896us      3791.12%     152.896us     152.896us             1  
+                               hf_kernels_causal_conv1d         7.07%     153.743us        99.32%       2.160ms       2.160ms       0.000us         0.00%       5.442us       5.442us             1  
+                                         CausalConv1dFn         4.91%     106.822us        92.25%       2.006ms     668.831us       0.000us         0.00%       5.442us       1.814us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      24.830us        84.38%       1.835ms     611.780us       4.033us       100.00%       5.442us       1.814us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.033us       100.00%       4.033us       1.344us             3  
+                                Activity Buffer Request        80.94%       1.760ms        80.94%       1.760ms       1.760ms       1.409us        34.94%       1.409us       1.409us             1  
+                                       aten::empty_like         0.92%      19.971us         2.96%      64.331us      21.444us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.04%      44.360us         2.04%      44.360us      14.787us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.30%      50.042us         2.30%      50.042us      16.681us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.68%      14.871us         0.68%      14.871us      14.871us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.315ms
-Self CUDA time total: 4.159us
+Self CPU time total: 2.175ms
+Self CUDA time total: 4.033us
 
 
 
@@ -4014,18 +4014,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.455us      3297.41%     123.455us     123.455us             1  
-                               hf_kernels_causal_conv1d         4.13%      83.101us        99.73%       2.009ms       2.009ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.66%      73.760us        95.61%       1.926ms     641.917us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      23.071us        90.47%       1.822ms     607.420us       3.744us       100.00%       4.992us       1.664us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.502us      3138.41%     117.502us     117.502us             1  
+                               hf_kernels_causal_conv1d         4.12%      80.030us        99.72%       1.937ms       1.937ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.57%      69.262us        95.60%       1.857ms     618.894us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.24%      24.108us        90.50%       1.758ms     585.930us       3.744us       100.00%       4.992us       1.664us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        87.83%       1.769ms        87.83%       1.769ms       1.769ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.39%       7.860us         1.48%      29.730us       9.910us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.09%      21.870us         1.09%      21.870us       7.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.49%      30.082us         1.49%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.421us         0.27%       5.421us       5.421us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        87.68%       1.703ms        87.68%       1.703ms       1.703ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.41%       7.921us         1.53%      29.631us       9.877us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.12%      21.710us         1.12%      21.710us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.59%      30.823us         1.59%      30.823us      10.274us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.491us         0.28%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.014ms
+Self CPU time total: 1.942ms
 Self CUDA time total: 3.744us
 
 
@@ -4036,18 +4036,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.263us      3185.44%     119.263us     119.263us             1  
-                               hf_kernels_causal_conv1d         3.91%      78.640us        99.72%       2.003ms       2.003ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.57%      71.661us        95.80%       1.925ms     641.537us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      22.781us        90.75%       1.823ms     607.693us       3.744us       100.00%       4.992us       1.664us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.807us      3093.14%     115.807us     115.807us             1  
+                               hf_kernels_causal_conv1d         4.02%      76.850us        99.72%       1.908ms       1.908ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.58%      68.550us        95.70%       1.831ms     610.351us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.25%      23.951us        90.61%       1.734ms     577.870us       3.744us       100.00%       4.992us       1.664us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        88.14%       1.771ms        88.14%       1.771ms       1.771ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.41%       8.160us         1.49%      29.872us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.08%      21.712us         1.08%      21.712us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.48%      29.670us         1.48%      29.670us       9.890us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.669us         0.28%       5.669us       5.669us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        87.74%       1.679ms        87.74%       1.679ms       1.679ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.43%       8.141us         1.51%      28.892us       9.631us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.08%      20.751us         1.08%      20.751us       6.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.62%      30.931us         1.62%      30.931us      10.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.380us         0.28%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.009ms
+Self CPU time total: 1.913ms
 Self CUDA time total: 3.744us
 
 
@@ -4058,19 +4058,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.790us      3253.81%     121.790us     121.790us             1  
-                               hf_kernels_causal_conv1d         3.48%      76.970us        99.77%       2.208ms       2.208ms       0.000us         0.00%       4.991us       4.991us             1  
-                                         CausalConv1dFn         3.33%      73.753us        96.30%       2.131ms     710.368us       0.000us         0.00%       4.991us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.03%      22.770us        91.66%       2.029ms     676.184us       3.743us       100.00%       4.991us       1.664us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.743us       100.00%       3.743us       1.248us             3  
-                                Activity Buffer Request        81.47%       1.803ms        81.47%       1.803ms       1.803ms       1.248us        33.34%       1.248us       1.248us             1  
-                                       aten::empty_like         0.36%       7.858us         1.30%      28.800us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.95%      20.942us         0.95%      20.942us       6.981us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.17%     202.863us         9.17%     202.863us      67.621us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.991us         0.23%       4.991us       4.991us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.191us      3366.36%     128.191us     128.191us             1  
+                               hf_kernels_causal_conv1d         3.49%      79.180us        99.78%       2.264ms       2.264ms       0.000us         0.00%       5.088us       5.088us             1  
+                                         CausalConv1dFn         3.32%      75.331us        96.29%       2.185ms     728.346us       0.000us         0.00%       5.088us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.06%      23.951us        91.65%       2.080ms     693.279us       3.808us       100.00%       5.088us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
+                                Activity Buffer Request        79.24%       1.798ms        79.24%       1.798ms       1.798ms       1.280us        33.61%       1.280us       1.280us             1  
+                                       aten::empty_like         0.38%       8.540us         1.32%      29.871us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.94%      21.331us         0.94%      21.331us       7.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.35%     257.655us        11.35%     257.655us      85.885us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.22%       5.020us         0.22%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.213ms
-Self CUDA time total: 3.743us
+Self CPU time total: 2.269ms
+Self CUDA time total: 3.808us
 
 
 
@@ -4080,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.073us      2547.57%     123.073us     123.073us             1  
-                               hf_kernels_causal_conv1d         3.82%      79.680us        99.75%       2.083ms       2.083ms       0.000us         0.00%       6.463us       6.463us             1  
-                                         CausalConv1dFn         3.53%      73.692us        95.93%       2.003ms     667.744us       0.000us         0.00%       6.463us       2.154us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      24.371us        90.98%       1.900ms     633.257us       4.831us       100.00%       6.463us       2.154us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.831us       100.00%       4.831us       1.610us             3  
-                                Activity Buffer Request        81.73%       1.707ms        81.73%       1.707ms       1.707ms       1.632us        33.78%       1.632us       1.632us             1  
-                                       aten::empty_like         0.42%       8.791us         1.43%      29.771us       9.924us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.00%      20.980us         1.00%      20.980us       6.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.08%     168.682us         8.08%     168.682us      56.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.250us         0.25%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.753us      2573.98%     122.753us     122.753us             1  
+                               hf_kernels_causal_conv1d         3.87%      79.741us        99.76%       2.057ms       2.057ms       0.000us         0.00%       6.369us       6.369us             1  
+                                         CausalConv1dFn         3.55%      73.230us        95.89%       1.978ms     659.221us       0.000us         0.00%       6.369us       2.123us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.292us        90.96%       1.876ms     625.288us       4.769us       100.00%       6.369us       2.123us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.769us       100.00%       4.769us       1.590us             3  
+                                Activity Buffer Request        80.59%       1.662ms        80.59%       1.662ms       1.662ms       1.600us        33.55%       1.600us       1.600us             1  
+                                       aten::empty_like         0.38%       7.870us         1.39%      28.571us       9.524us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.00%      20.701us         1.00%      20.701us       6.900us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.24%     190.473us         9.24%     190.473us      63.491us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       4.940us         0.24%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.088ms
-Self CUDA time total: 4.831us
+Self CPU time total: 2.062ms
+Self CUDA time total: 4.769us
 
 
 
@@ -4102,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.883us      2373.55%     113.883us     113.883us             1  
-                               hf_kernels_causal_conv1d        15.03%      75.250us        99.01%     495.717us     495.717us       0.000us         0.00%       6.430us       6.430us             1  
-                                         CausalConv1dFn        13.70%      68.601us        83.98%     420.467us     140.156us       0.000us         0.00%       6.430us       2.143us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.03%      25.190us        64.69%     323.874us     107.958us       4.798us       100.00%       6.430us       2.143us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.798us       100.00%       4.798us       1.599us             3  
-                                Activity Buffer Request        28.01%     140.222us        28.01%     140.222us     140.222us       1.632us        34.01%       1.632us       1.632us             1  
-                                       aten::empty_like         1.45%       7.260us         5.59%      27.992us       9.331us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.14%      20.732us         4.14%      20.732us       6.911us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.65%     158.462us        31.65%     158.462us      52.821us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.99%       4.940us         0.99%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.129us      2403.33%     116.129us     116.129us             1  
+                               hf_kernels_causal_conv1d        14.08%      75.841us        99.01%     533.389us     533.389us       0.000us         0.00%       6.464us       6.464us             1  
+                                         CausalConv1dFn        12.88%      69.403us        84.93%     457.548us     152.516us       0.000us         0.00%       6.464us       2.155us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.42%      23.809us        66.86%     360.195us     120.065us       4.832us       100.00%       6.464us       2.155us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.832us       100.00%       4.832us       1.611us             3  
+                                Activity Buffer Request        31.92%     171.983us        31.92%     171.983us     171.983us       1.632us        33.77%       1.632us       1.632us             1  
+                                       aten::empty_like         1.47%       7.920us         5.19%      27.950us       9.317us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.72%      20.030us         3.72%      20.030us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.52%     164.403us        30.52%     164.403us      54.801us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.99%       5.360us         0.99%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 500.657us
-Self CUDA time total: 4.798us
+Self CPU time total: 538.749us
+Self CUDA time total: 4.832us
 
 
 
@@ -4124,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.365us      1148.32%     122.365us     122.365us             1  
-                               hf_kernels_causal_conv1d         3.51%      76.530us        99.77%       2.176ms       2.176ms       0.000us         0.00%      14.208us      14.208us             1  
-                                         CausalConv1dFn         3.29%      71.713us        96.26%       2.099ms     699.771us       0.000us         0.00%      14.208us       4.736us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.11%      24.170us        91.65%       1.999ms     666.274us      10.656us       100.00%      14.208us       4.736us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
-                                Activity Buffer Request        82.90%       1.808ms        82.90%       1.808ms       1.808ms       3.552us        33.33%       3.552us       3.552us             1  
-                                       aten::empty_like         0.37%       8.070us         1.32%      28.780us       9.593us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.95%      20.710us         0.95%      20.710us       6.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.64%     166.713us         7.64%     166.713us      55.571us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.051us         0.23%       5.051us       5.051us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.560us      1146.71%     122.560us     122.560us             1  
+                               hf_kernels_causal_conv1d         3.81%      80.881us        99.76%       2.118ms       2.118ms       0.000us         0.00%      14.272us      14.272us             1  
+                                         CausalConv1dFn         3.44%      73.000us        95.95%       2.037ms     679.028us       0.000us         0.00%      14.272us       4.757us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      22.840us        91.17%       1.936ms     645.207us      10.688us       100.00%      14.272us       4.757us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.688us       100.00%      10.688us       3.563us             3  
+                                Activity Buffer Request        82.40%       1.750ms        82.40%       1.750ms       1.750ms       3.584us        33.53%       3.584us       3.584us             1  
+                                       aten::empty_like         0.36%       7.642us         1.34%      28.462us       9.487us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.98%      20.820us         0.98%      20.820us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.69%     163.253us         7.69%     163.253us      54.418us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.130us         0.24%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.181ms
-Self CUDA time total: 10.656us
+Self CPU time total: 2.123ms
+Self CUDA time total: 10.688us
 
 
 
@@ -4146,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.676us      1057.08%     115.676us     115.676us             1  
-                               hf_kernels_causal_conv1d        15.90%      75.141us        98.97%     467.777us     467.777us       0.000us         0.00%      14.654us      14.654us             1  
-                                         CausalConv1dFn        14.89%      70.359us        83.07%     392.636us     130.879us       0.000us         0.00%      14.654us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.95%      23.391us        62.24%     294.186us      98.062us      10.943us       100.00%      14.654us       4.885us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
-                                Activity Buffer Request        23.54%     111.281us        23.54%     111.281us     111.281us       3.711us        33.91%       3.711us       3.711us             1  
-                                       aten::empty_like         1.66%       7.830us         5.94%      28.091us       9.364us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.29%      20.261us         4.29%      20.261us       6.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.75%     159.514us        33.75%     159.514us      53.171us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.03%       4.890us         1.03%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.638us      1078.28%     116.638us     116.638us             1  
+                               hf_kernels_causal_conv1d        17.39%      80.341us        98.91%     457.098us     457.098us       0.000us         0.00%      14.401us      14.401us             1  
+                                         CausalConv1dFn        15.12%      69.882us        81.53%     376.757us     125.586us       0.000us         0.00%      14.401us       4.800us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.92%      22.749us        60.38%     279.004us      93.001us      10.817us       100.00%      14.401us       4.800us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.817us       100.00%      10.817us       3.606us             3  
+                                Activity Buffer Request        22.08%     102.032us        22.08%     102.032us     102.032us       3.584us        33.13%       3.584us       3.584us             1  
+                                       aten::empty_like         1.64%       7.601us         6.03%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.39%      20.270us         4.39%      20.270us       6.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.37%     154.223us        33.37%     154.223us      51.408us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.09%       5.020us         1.09%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 472.667us
-Self CUDA time total: 10.943us
+Self CPU time total: 462.118us
+Self CUDA time total: 10.817us
 
 
 
@@ -4168,18 +4168,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.422us      1124.47%     123.422us     123.422us             1  
-                               hf_kernels_causal_conv1d         3.69%      77.100us        99.75%       2.084ms       2.084ms       0.000us         0.00%      14.656us      14.656us             1  
-                                         CausalConv1dFn         3.52%      73.471us        96.06%       2.007ms     668.988us       0.000us         0.00%      14.656us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.660us        90.70%       1.895ms     631.647us      10.976us       100.00%      14.656us       4.885us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.873us      1165.02%     127.873us     127.873us             1  
+                               hf_kernels_causal_conv1d         3.83%      78.590us        99.73%       2.045ms       2.045ms       0.000us         0.00%      14.656us      14.656us             1  
+                                         CausalConv1dFn         3.56%      73.060us        95.90%       1.966ms     655.431us       0.000us         0.00%      14.656us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.04%      21.271us        90.90%       1.864ms     621.251us      10.976us       100.00%      14.656us       4.885us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        81.81%       1.709ms        81.81%       1.709ms       1.709ms       3.680us        33.53%       3.680us       3.680us             1  
-                                       aten::empty_like         0.81%      17.020us         1.85%      38.551us      12.850us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.03%      21.531us         1.03%      21.531us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.76%     162.104us         7.76%     162.104us      54.035us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.260us         0.25%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        82.22%       1.686ms        82.22%       1.686ms       1.686ms       3.680us        33.53%       3.680us       3.680us             1  
+                                       aten::empty_like         0.42%       8.532us         1.44%      29.482us       9.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.02%      20.950us         1.02%      20.950us       6.983us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.64%     156.623us         7.64%     156.623us      52.208us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.481us         0.27%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.089ms
+Self CPU time total: 2.050ms
 Self CUDA time total: 10.976us
 
 
@@ -4190,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.952us      1044.29%     117.952us     117.952us             1  
-                               hf_kernels_causal_conv1d        16.01%      73.960us        98.90%     456.837us     456.837us       0.000us         0.00%      15.071us      15.071us             1  
-                                         CausalConv1dFn        15.53%      71.741us        82.89%     382.877us     127.626us       0.000us         0.00%      15.071us       5.024us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.93%      22.791us        61.20%     282.685us      94.228us      11.295us       100.00%      15.071us       5.024us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.295us       100.00%      11.295us       3.765us             3  
-                                Activity Buffer Request        21.70%     100.232us        21.70%     100.232us     100.232us       3.776us        33.43%       3.776us       3.776us             1  
-                                       aten::empty_like         1.73%       7.970us         6.16%      28.451us       9.484us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.43%      20.481us         4.43%      20.481us       6.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.57%     159.662us        34.57%     159.662us      53.221us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.060us         1.10%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.132us      1060.65%     119.132us     119.132us             1  
+                               hf_kernels_causal_conv1d        16.93%      76.370us        98.84%     445.867us     445.867us       0.000us         0.00%      14.976us      14.976us             1  
+                                         CausalConv1dFn        15.59%      70.303us        81.91%     369.497us     123.166us       0.000us         0.00%      14.976us       4.992us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      24.158us        60.06%     270.914us      90.305us      11.232us       100.00%      14.976us       4.992us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.232us       100.00%      11.232us       3.744us             3  
+                                Activity Buffer Request        20.43%      92.161us        20.43%      92.161us      92.161us       3.744us        33.33%       3.744us       3.744us             1  
+                                       aten::empty_like         1.59%       7.169us         6.27%      28.280us       9.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.68%      21.111us         4.68%      21.111us       7.037us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.27%     154.595us        34.27%     154.595us      51.532us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.16%       5.220us         1.16%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 461.897us
-Self CUDA time total: 11.295us
+Self CPU time total: 451.087us
+Self CUDA time total: 11.232us
 
 
 
@@ -4212,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.158us       256.57%     128.158us     128.158us             1  
-                               hf_kernels_causal_conv1d         3.51%      75.280us        99.75%       2.140ms       2.140ms       0.000us         0.00%      83.102us      83.102us             1  
-                                         CausalConv1dFn         3.36%      72.172us        96.24%       2.065ms     688.218us       0.000us         0.00%      83.102us      27.701us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      24.540us        91.55%       1.964ms     654.657us      49.951us       100.00%      83.102us      27.701us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.951us       100.00%      49.951us      16.650us             3  
-                                Activity Buffer Request        82.86%       1.778ms        82.86%       1.778ms       1.778ms      33.151us        66.37%      33.151us      33.151us             1  
-                                       aten::empty_like         0.37%       7.920us         1.33%      28.510us       9.503us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.96%      20.590us         0.96%      20.590us       6.863us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.54%     161.824us         7.54%     161.824us      53.941us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.290us         0.25%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.111us       251.50%     126.111us     126.111us             1  
+                               hf_kernels_causal_conv1d         3.58%      73.510us        99.72%       2.049ms       2.049ms       0.000us         0.00%      83.392us      83.392us             1  
+                                         CausalConv1dFn         3.49%      71.722us        96.15%       1.975ms     658.418us       0.000us         0.00%      83.392us      27.797us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      23.351us        91.25%       1.875ms     624.854us      50.144us       100.00%      83.392us      27.797us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.144us       100.00%      50.144us      16.715us             3  
+                                Activity Buffer Request        82.17%       1.688ms        82.17%       1.688ms       1.688ms      33.248us        66.31%      33.248us      33.248us             1  
+                                       aten::empty_like         0.39%       8.051us         1.41%      28.971us       9.657us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.02%      20.920us         1.02%      20.920us       6.973us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.94%     163.112us         7.94%     163.112us      54.371us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.660us         0.28%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.145ms
-Self CUDA time total: 49.951us
+Self CPU time total: 2.054ms
+Self CUDA time total: 50.144us
 
 
 
@@ -4234,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.310us       261.10%     121.310us     121.310us             1  
-                               hf_kernels_causal_conv1d        16.42%      74.560us        98.88%     448.987us     448.987us       0.000us         0.00%      75.933us      75.933us             1  
-                                         CausalConv1dFn        15.28%      69.392us        82.46%     374.427us     124.809us       0.000us         0.00%      75.933us      25.311us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.01%      22.740us        60.80%     276.074us      92.025us      46.462us       100.00%      75.933us      25.311us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.462us       100.00%      46.462us      15.487us             3  
-                                Activity Buffer Request        21.27%      96.581us        21.27%      96.581us      96.581us      29.471us        63.43%      29.471us      29.471us             1  
-                                       aten::empty_like         1.63%       7.411us         6.38%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.75%      21.550us         4.75%      21.550us       7.183us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.52%     156.753us        34.52%     156.753us      52.251us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.12%       5.090us         1.12%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.959us       269.13%     124.959us     124.959us             1  
+                               hf_kernels_causal_conv1d        16.37%      75.480us        98.93%     456.147us     456.147us       0.000us         0.00%      75.742us      75.742us             1  
+                                         CausalConv1dFn        15.52%      71.561us        82.56%     380.667us     126.889us       0.000us         0.00%      75.742us      25.247us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.18%      23.872us        60.72%     279.966us      93.322us      46.431us       100.00%      75.742us      25.247us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.431us       100.00%      46.431us      15.477us             3  
+                                Activity Buffer Request        21.66%      99.871us        21.66%      99.871us      99.871us      29.311us        63.13%      29.311us      29.311us             1  
+                                       aten::empty_like         1.56%       7.180us         6.32%      29.140us       9.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.76%      21.960us         4.76%      21.960us       7.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.88%     156.223us        33.88%     156.223us      52.074us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.07%       4.930us         1.07%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 454.077us
-Self CUDA time total: 46.462us
+Self CPU time total: 461.077us
+Self CUDA time total: 46.431us
 
 
 
@@ -4256,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.254us      3312.35%     128.254us     128.254us             1  
-                               hf_kernels_causal_conv1d         3.31%      74.540us        99.77%       2.245ms       2.245ms       0.000us         0.00%       5.120us       5.120us             1  
-                                         CausalConv1dFn         3.41%      76.802us        96.46%       2.170ms     723.418us       0.000us         0.00%       5.120us       1.707us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      24.209us        91.78%       2.065ms     688.374us       3.872us       100.00%       5.120us       1.707us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.872us       100.00%       3.872us       1.291us             3  
-                                Activity Buffer Request        83.69%       1.883ms        83.69%       1.883ms       1.883ms       1.248us        32.23%       1.248us       1.248us             1  
-                                       aten::empty_like         0.34%       7.679us         1.26%      28.331us       9.444us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.92%      20.652us         0.92%      20.652us       6.884us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.01%     157.803us         7.01%     157.803us      52.601us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.180us         0.23%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.136us      3207.50%     123.136us     123.136us             1  
+                               hf_kernels_causal_conv1d         3.53%      75.250us        99.73%       2.124ms       2.124ms       0.000us         0.00%       5.055us       5.055us             1  
+                                         CausalConv1dFn         3.40%      72.360us        96.19%       2.049ms     682.905us       0.000us         0.00%       5.055us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.03%      21.891us        91.44%       1.947ms     649.155us       3.839us       100.00%       5.055us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.839us       100.00%       3.839us       1.280us             3  
+                                Activity Buffer Request        83.03%       1.768ms        83.03%       1.768ms       1.768ms       1.216us        31.67%       1.216us       1.216us             1  
+                                       aten::empty_like         0.38%       8.101us         1.36%      28.892us       9.631us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.98%      20.791us         0.98%      20.791us       6.930us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.38%     157.153us         7.38%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.800us         0.27%       5.800us       5.800us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.250ms
-Self CUDA time total: 3.872us
+Self CPU time total: 2.130ms
+Self CUDA time total: 3.839us
 
 
 
@@ -4278,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.470us      3059.11%     117.470us     117.470us             1  
-                               hf_kernels_causal_conv1d        16.52%      75.490us        98.91%     451.907us     451.907us       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn        15.55%      71.061us        82.39%     376.417us     125.472us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.27%      24.090us        60.40%     275.984us      91.995us       3.840us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
-                                Activity Buffer Request        20.75%      94.821us        20.75%      94.821us      94.821us       1.216us        31.67%       1.216us       1.216us             1  
-                                       aten::empty_like         1.80%       8.242us         6.43%      29.372us       9.791us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.62%      21.130us         4.62%      21.130us       7.043us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.38%     157.073us        34.38%     157.073us      52.358us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.09%       4.990us         1.09%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.287us      2977.90%     116.287us     116.287us             1  
+                               hf_kernels_causal_conv1d        16.77%      75.910us        98.91%     447.847us     447.847us       0.000us         0.00%       5.153us       5.153us             1  
+                                         CausalConv1dFn        15.17%      68.691us        82.15%     371.937us     123.979us       0.000us         0.00%       5.153us       1.718us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.60%      25.371us        60.63%     274.535us      91.512us       3.905us       100.00%       5.153us       1.718us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
+                                Activity Buffer Request        20.81%      94.211us        20.81%      94.211us      94.211us       1.248us        31.96%       1.248us       1.248us             1  
+                                       aten::empty_like         1.64%       7.411us         6.34%      28.711us       9.570us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.70%      21.300us         4.70%      21.300us       7.100us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.22%     154.953us        34.22%     154.953us      51.651us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.09%       4.920us         1.09%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 456.897us
-Self CUDA time total: 3.840us
+Self CPU time total: 452.767us
+Self CUDA time total: 3.905us
 
 
 
@@ -4300,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.191us      2958.18%     120.191us     120.191us             1  
-                               hf_kernels_causal_conv1d         3.64%      78.360us        99.76%       2.149ms       2.149ms       0.000us         0.00%       5.406us       5.406us             1  
-                                         CausalConv1dFn         3.37%      72.531us        96.13%       2.071ms     690.275us       0.000us         0.00%       5.406us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.05%      22.591us        91.41%       1.969ms     656.417us       4.063us       100.00%       5.406us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        83.09%       1.790ms        83.09%       1.790ms       1.790ms       1.343us        33.05%       1.343us       1.343us             1  
-                                       aten::empty_like         0.37%       8.020us         1.35%      29.041us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      21.021us         0.98%      21.021us       7.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.27%     156.703us         7.27%     156.703us      52.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.100us         0.24%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.009us      2882.27%     119.009us     119.009us             1  
+                               hf_kernels_causal_conv1d         3.71%      75.410us        99.73%       2.029ms       2.029ms       0.000us         0.00%       5.506us       5.506us             1  
+                                         CausalConv1dFn         3.78%      76.960us        96.02%       1.953ms     651.101us       0.000us         0.00%       5.506us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.18%      24.051us        90.82%       1.848ms     615.834us       4.129us       100.00%       5.506us       1.835us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.129us       100.00%       4.129us       1.376us             3  
+                                Activity Buffer Request        81.81%       1.664ms        81.81%       1.664ms       1.664ms       1.377us        33.35%       1.377us       1.377us             1  
+                                       aten::empty_like         0.36%       7.250us         1.42%      28.841us       9.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.06%      21.591us         1.06%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.83%     159.204us         7.83%     159.204us      53.068us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.590us         0.27%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.154ms
-Self CUDA time total: 4.063us
+Self CPU time total: 2.034ms
+Self CUDA time total: 4.129us
 
 
 
@@ -4322,18 +4322,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.509us      2988.81%     120.509us     120.509us             1  
-                               hf_kernels_causal_conv1d        16.24%      73.950us        98.87%     450.317us     450.317us       0.000us         0.00%       5.376us       5.376us             1  
-                                         CausalConv1dFn        17.23%      78.473us        82.64%     376.367us     125.456us       0.000us         0.00%       5.376us       1.792us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.08%      23.119us        59.28%     269.974us      89.991us       4.032us       100.00%       5.376us       1.792us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.758us      2821.38%     113.758us     113.758us             1  
+                               hf_kernels_causal_conv1d        16.75%      75.370us        98.94%     445.237us     445.237us       0.000us         0.00%       5.376us       5.376us             1  
+                                         CausalConv1dFn        15.04%      67.702us        82.19%     369.867us     123.289us       0.000us         0.00%       5.376us       1.792us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.18%      23.329us        60.99%     274.454us      91.485us       4.032us       100.00%       5.376us       1.792us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        19.95%      90.851us        19.95%      90.851us      90.851us       1.344us        33.33%       1.344us       1.344us             1  
-                                       aten::empty_like         1.73%       7.890us         6.13%      27.920us       9.307us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.40%      20.030us         4.40%      20.030us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.25%     156.004us        34.25%     156.004us      52.001us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.13%       5.130us         1.13%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        21.24%      95.561us        21.24%      95.561us      95.561us       1.344us        33.33%       1.344us       1.344us             1  
+                                       aten::empty_like         1.71%       7.680us         6.16%      27.711us       9.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.45%      20.031us         4.45%      20.031us       6.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.57%     155.564us        34.57%     155.564us      51.855us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.06%       4.760us         1.06%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 455.447us
+Self CPU time total: 449.997us
 Self CUDA time total: 4.032us
 
 
@@ -4344,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.767us      2334.71%     124.767us     124.767us             1  
-                               hf_kernels_causal_conv1d         3.64%      76.791us        99.75%       2.102ms       2.102ms       0.000us         0.00%       7.168us       7.168us             1  
-                                         CausalConv1dFn         3.46%      72.920us        96.11%       2.025ms     674.997us       0.000us         0.00%       7.168us       2.389us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      22.730us        91.24%       1.923ms     640.840us       5.344us       100.00%       7.168us       2.389us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.344us       100.00%       5.344us       1.781us             3  
-                                Activity Buffer Request        82.66%       1.742ms        82.66%       1.742ms       1.742ms       1.824us        34.13%       1.824us       1.824us             1  
-                                       aten::empty_like         0.40%       8.480us         1.40%      29.552us       9.851us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.00%      21.072us         1.00%      21.072us       7.024us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.51%     158.242us         7.51%     158.242us      52.747us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.220us         0.25%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2257.83%     119.936us     119.936us             1  
+                               hf_kernels_causal_conv1d         3.67%      76.002us        99.74%       2.065ms       2.065ms       0.000us         0.00%       7.104us       7.104us             1  
+                                         CausalConv1dFn         3.38%      70.021us        96.07%       1.989ms     663.104us       0.000us         0.00%       7.104us       2.368us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.09%      22.660us        91.27%       1.890ms     629.940us       5.312us       100.00%       7.104us       2.368us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.312us       100.00%       5.312us       1.771us             3  
+                                Activity Buffer Request        82.69%       1.712ms        82.69%       1.712ms       1.712ms       1.792us        33.73%       1.792us       1.792us             1  
+                                       aten::empty_like         0.39%       8.121us         1.42%      29.471us       9.824us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.03%      21.350us         1.03%      21.350us       7.117us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.48%     154.872us         7.48%     154.872us      51.624us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.360us         0.26%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.107ms
-Self CUDA time total: 5.344us
+Self CPU time total: 2.071ms
+Self CUDA time total: 5.312us
 
 
 
@@ -4366,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     114.399us      2127.96%     114.399us     114.399us             1  
-                               hf_kernels_causal_conv1d        16.62%      75.320us        98.88%     448.097us     448.097us       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn        15.04%      68.172us        82.26%     372.777us     124.259us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      22.881us        60.95%     276.214us      92.071us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        20.71%      93.851us        20.71%      93.851us      93.851us       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         1.68%       7.630us         6.27%      28.391us       9.464us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.58%      20.761us         4.58%      20.761us       6.920us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.19%     159.482us        35.19%     159.482us      53.161us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.12%       5.070us         1.12%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.719us      2209.14%     118.719us     118.719us             1  
+                               hf_kernels_causal_conv1d        11.31%      74.461us        99.17%     653.171us     653.171us       0.000us         0.00%       7.197us       7.197us             1  
+                                         CausalConv1dFn        10.87%      71.560us        87.87%     578.710us     192.903us       0.000us         0.00%       7.197us       2.399us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.42%      22.540us        72.76%     479.219us     159.740us       5.374us       100.00%       7.197us       2.399us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.374us       100.00%       5.374us       1.791us             3  
+                                Activity Buffer Request        44.55%     293.385us        44.55%     293.385us     293.385us       1.823us        33.92%       1.823us       1.823us             1  
+                                       aten::empty_like         1.11%       7.281us         4.24%      27.931us       9.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.14%      20.650us         3.14%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.79%     163.294us        24.79%     163.294us      54.431us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.440us         0.83%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 453.167us
-Self CUDA time total: 5.376us
+Self CPU time total: 658.611us
+Self CUDA time total: 5.374us
 
 
 
@@ -4388,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.887us       696.30%     121.887us     121.887us             1  
-                               hf_kernels_causal_conv1d         3.44%      74.640us        99.77%       2.162ms       2.162ms       0.000us         0.00%      23.361us      23.361us             1  
-                                         CausalConv1dFn         3.19%      69.031us        96.32%       2.087ms     695.668us       0.000us         0.00%      23.361us       7.787us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.10%      23.730us        91.78%       1.989ms     662.904us      17.505us       100.00%      23.361us       7.787us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.505us       100.00%      17.505us       5.835us             3  
-                                Activity Buffer Request        82.75%       1.793ms        82.75%       1.793ms       1.793ms       5.856us        33.45%       5.856us       5.856us             1  
-                                       aten::empty_like         0.40%       8.582us         1.35%      29.262us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.95%      20.680us         0.95%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.94%     172.113us         7.94%     172.113us      57.371us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.069us         0.23%       5.069us       5.069us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.560us       679.82%     118.560us     118.560us             1  
+                               hf_kernels_causal_conv1d        11.95%      75.120us        99.18%     623.480us     623.480us       0.000us         0.00%      23.264us      23.264us             1  
+                                         CausalConv1dFn        10.51%      66.091us        87.23%     548.360us     182.787us       0.000us         0.00%      23.264us       7.755us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.07%      25.561us        72.28%     454.367us     151.456us      17.440us       100.00%      23.264us       7.755us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
+                                Activity Buffer Request        43.50%     273.454us        43.50%     273.454us     273.454us       5.824us        33.39%       5.824us       5.824us             1  
+                                       aten::empty_like         1.15%       7.260us         4.44%      27.902us       9.301us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.28%      20.642us         3.28%      20.642us       6.881us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.71%     155.352us        24.71%     155.352us      51.784us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.82%       5.171us         0.82%       5.171us       5.171us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.167ms
-Self CUDA time total: 17.505us
+Self CPU time total: 628.651us
+Self CUDA time total: 17.440us
 
 
 
@@ -4410,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.997us       664.91%     119.997us     119.997us             1  
-                               hf_kernels_causal_conv1d        16.46%      76.510us        98.91%     459.857us     459.857us       0.000us         0.00%      24.063us      24.063us             1  
-                                         CausalConv1dFn        14.99%      69.691us        82.45%     383.347us     127.782us       0.000us         0.00%      24.063us       8.021us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.12%      23.810us        61.53%     286.094us      95.365us      18.047us       100.00%      24.063us       8.021us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
-                                Activity Buffer Request        22.64%     105.271us        22.64%     105.271us     105.271us       6.016us        33.34%       6.016us       6.016us             1  
-                                       aten::empty_like         1.59%       7.411us         5.93%      27.562us       9.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.33%      20.151us         4.33%      20.151us       6.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.77%     157.013us        33.77%     157.013us      52.338us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.09%       5.080us         1.09%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.548us       667.16%     119.548us     119.548us             1  
+                               hf_kernels_causal_conv1d        15.17%      73.960us        98.98%     482.488us     482.488us       0.000us         0.00%      23.967us      23.967us             1  
+                                         CausalConv1dFn        14.28%      69.601us        83.81%     408.528us     136.176us       0.000us         0.00%      23.967us       7.989us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.13%      25.009us        63.88%     311.375us     103.792us      17.919us       100.00%      23.967us       7.989us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.919us       100.00%      17.919us       5.973us             3  
+                                Activity Buffer Request        27.03%     131.752us        27.03%     131.752us     131.752us       6.048us        33.75%       6.048us       6.048us             1  
+                                       aten::empty_like         1.47%       7.161us         5.65%      27.552us       9.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.18%      20.391us         4.18%      20.391us       6.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.72%     154.614us        31.72%     154.614us      51.538us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.02%       4.960us         1.02%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 464.937us
-Self CUDA time total: 18.047us
+Self CPU time total: 487.448us
+Self CUDA time total: 17.919us
 
 
 
@@ -4432,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.983us       701.78%     125.983us     125.983us             1  
-                               hf_kernels_causal_conv1d         3.62%      75.400us        99.76%       2.076ms       2.076ms       0.000us         0.00%      23.968us      23.968us             1  
-                                         CausalConv1dFn         3.51%      72.963us        96.14%       2.001ms     667.008us       0.000us         0.00%      23.968us       7.989us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      24.320us        91.19%       1.898ms     632.703us      17.952us       100.00%      23.968us       7.989us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.952us       100.00%      17.952us       5.984us             3  
-                                Activity Buffer Request        82.20%       1.711ms        82.20%       1.711ms       1.711ms       6.016us        33.51%       6.016us       6.016us             1  
-                                       aten::empty_like         0.41%       8.499us         1.44%      29.950us       9.983us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.03%      21.451us         1.03%      21.451us       7.150us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.83%     162.893us         7.83%     162.893us      54.298us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.969us         0.24%       4.969us       4.969us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.341us       679.59%     121.341us     121.341us             1  
+                               hf_kernels_causal_conv1d        15.05%      74.270us        98.98%     488.588us     488.588us       0.000us         0.00%      23.871us      23.871us             1  
+                                         CausalConv1dFn        13.97%      68.960us        83.93%     414.318us     138.106us       0.000us         0.00%      23.871us       7.957us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.63%      22.873us        64.07%     316.257us     105.419us      17.855us       100.00%      23.871us       7.957us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.855us       100.00%      17.855us       5.952us             3  
+                                Activity Buffer Request        27.73%     136.862us        27.73%     136.862us     136.862us       6.016us        33.69%       6.016us       6.016us             1  
+                                       aten::empty_like         1.71%       8.441us         5.90%      29.101us       9.700us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.19%      20.660us         4.19%      20.660us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.71%     156.522us        31.71%     156.522us      52.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.02%       5.050us         1.02%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.081ms
-Self CUDA time total: 17.952us
+Self CPU time total: 493.638us
+Self CUDA time total: 17.855us
 
 
 
@@ -4454,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.901us       639.40%     119.901us     119.901us             1  
-                               hf_kernels_causal_conv1d        11.47%      73.600us        99.21%     636.820us     636.820us       0.000us         0.00%      25.088us      25.088us             1  
-                                         CausalConv1dFn        11.28%      72.380us        87.74%     563.220us     187.740us       0.000us         0.00%      25.088us       8.363us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.65%      23.431us        72.11%     462.887us     154.296us      18.752us       100.00%      25.088us       8.363us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.752us       100.00%      18.752us       6.251us             3  
-                                Activity Buffer Request        43.62%     280.014us        43.62%     280.014us     280.014us       6.336us        33.79%       6.336us       6.336us             1  
-                                       aten::empty_like         1.22%       7.832us         4.35%      27.953us       9.318us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.13%      20.121us         3.13%      20.121us       6.707us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.84%     159.442us        24.84%     159.442us      53.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.79%       5.080us         0.79%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.470us       631.86%     117.470us     117.470us             1  
+                               hf_kernels_causal_conv1d        16.24%      72.220us        98.74%     438.977us     438.977us       0.000us         0.00%      24.831us      24.831us             1  
+                                         CausalConv1dFn        14.93%      66.373us        82.49%     366.757us     122.252us       0.000us         0.00%      24.831us       8.277us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.20%      23.120us        61.32%     272.634us      90.878us      18.591us       100.00%      24.831us       8.277us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.591us       100.00%      18.591us       6.197us             3  
+                                Activity Buffer Request        21.26%      94.541us        21.26%      94.541us      94.541us       6.240us        33.56%       6.240us       6.240us             1  
+                                       aten::empty_like         1.56%       6.930us         6.24%      27.750us       9.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.68%      20.820us         4.68%      20.820us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.86%     154.973us        34.86%     154.973us      51.658us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.26%       5.620us         1.26%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 641.900us
-Self CUDA time total: 18.752us
+Self CPU time total: 444.597us
+Self CUDA time total: 18.591us
 
 
 
@@ -4476,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        11.42%      73.310us        99.16%     636.780us     636.780us       0.000us         0.00%     162.591us     162.591us             1  
-                                         CausalConv1dFn        11.12%      71.382us        87.74%     563.470us     187.823us       0.000us         0.00%     162.591us      54.197us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.58%      22.989us        72.14%     463.287us     154.429us      97.631us       100.00%     162.591us      54.197us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.208us       133.37%     130.208us     130.208us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.631us       100.00%      97.631us      32.544us             3  
-                                Activity Buffer Request        43.38%     278.604us        43.38%     278.604us     278.604us      64.960us        66.54%      64.960us      64.960us             1  
-                                       aten::empty_like         1.24%       7.950us         4.48%      28.801us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.25%      20.851us         3.25%      20.851us       6.950us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        25.18%     161.694us        25.18%     161.694us      53.898us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.84%       5.420us         0.84%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.95%      72.560us        99.11%     601.930us     601.930us       0.000us         0.00%     162.301us     162.301us             1  
+                                         CausalConv1dFn        11.62%      70.560us        87.17%     529.370us     176.457us       0.000us         0.00%     162.301us      54.100us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.15%      25.181us        71.05%     431.498us     143.833us      97.726us       100.00%     162.301us      54.100us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.654us       133.69%     130.654us     130.654us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.726us       100.00%      97.726us      32.575us             3  
+                                Activity Buffer Request        38.68%     234.904us        38.68%     234.904us     234.904us      64.575us        66.08%      64.575us      64.575us             1  
+                                       aten::empty_like         1.24%       7.541us         4.50%      27.312us       9.104us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.26%      19.771us         3.26%      19.771us       6.590us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.22%     171.413us        28.22%     171.413us      57.138us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.89%       5.380us         0.89%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 642.200us
-Self CUDA time total: 97.631us
+Self CPU time total: 607.310us
+Self CUDA time total: 97.726us
 
 
 
@@ -4498,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        13.89%      72.060us        98.98%     513.378us     513.378us       0.000us         0.00%     163.263us     163.263us             1  
-                                         CausalConv1dFn        13.96%      72.421us        85.08%     441.318us     147.106us       0.000us         0.00%     163.263us      54.421us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.45%      23.099us        65.49%     339.676us     113.225us      98.623us       100.00%     163.263us      54.421us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.111us       131.93%     130.111us     130.111us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.623us       100.00%      98.623us      32.874us             3  
-                                Activity Buffer Request        30.19%     156.612us        30.19%     156.612us     156.612us      64.640us        65.54%      64.640us      64.640us             1  
-                                       aten::empty_like         1.62%       8.391us         5.63%      29.221us       9.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.02%      20.830us         4.02%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.84%     159.965us        30.84%     159.965us      53.322us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.02%       5.310us         1.02%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        16.49%      73.870us        98.87%     442.877us     442.877us       0.000us         0.00%     163.140us     163.140us             1  
+                                         CausalConv1dFn        15.37%      68.833us        82.38%     369.007us     123.002us       0.000us         0.00%     163.140us      54.380us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.34%      23.921us        60.99%     273.184us      91.061us      98.402us       100.00%     163.140us      54.380us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.828us       131.94%     129.828us     129.828us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.402us       100.00%      98.402us      32.801us             3  
+                                Activity Buffer Request        20.78%      93.061us        20.78%      93.061us      93.061us      64.738us        65.79%      64.738us      64.738us             1  
+                                       aten::empty_like         1.66%       7.440us         6.03%      26.990us       8.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.36%      19.550us         4.36%      19.550us       6.517us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.87%     156.202us        34.87%     156.202us      52.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.13%       5.069us         1.13%       5.069us       5.069us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 518.688us
-Self CUDA time total: 98.623us
+Self CPU time total: 447.946us
+Self CUDA time total: 98.402us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4520,7 +4520,7 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.04  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
@@ -4542,14 +4542,14 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
-
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.51it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.51it/s]
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. + +Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 8.38it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.41it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.50it/s]

Artifacts:

causal_conv1d.jsonl