diff --git "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" --- "a/causal_conv1d/impls/hf_kernels_causal_conv1d.html" +++ "b/causal_conv1d/impls/hf_kernels_causal_conv1d.html" @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.24s
-
Wed Oct 29 14:27:09 2025       
+
Wed Oct 29 15:50:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.24s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   29C    P0             78W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3928,7 @@ Cell: nv | 0.24s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 5.79s
+Cell: benchmark | 9.51s
  | 
 
 Raw
@@ -3973,19 +3981,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     151.393us      3724.31%     151.393us     151.393us             1  
-                               hf_kernels_causal_conv1d         8.95%     166.324us        99.62%       1.852ms       1.852ms       0.000us         0.00%       5.505us       5.505us             1  
-                                         CausalConv1dFn         6.05%     112.563us        90.67%       1.686ms     561.934us       0.000us         0.00%       5.505us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.172us        80.97%       1.505ms     501.826us       4.065us       100.00%       5.505us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
-                                Activity Buffer Request        77.14%       1.434ms        77.14%       1.434ms       1.434ms       1.440us        35.42%       1.440us       1.440us             1  
-                                       aten::empty_like         1.03%      19.059us         3.64%      67.761us      22.587us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.62%      48.702us         2.62%      48.702us      16.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.42%      45.061us         2.42%      45.061us      15.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.38%       7.150us         0.38%       7.150us       7.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     146.174us      3568.70%     146.174us     146.174us             1  
+                               hf_kernels_causal_conv1d         8.17%     151.282us        99.60%       1.845ms       1.845ms       0.000us         0.00%       5.536us       5.536us             1  
+                                         CausalConv1dFn         5.96%     110.474us        91.44%       1.694ms     564.683us       0.000us         0.00%       5.536us       1.845us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.25%      23.111us        81.80%       1.516ms     505.182us       4.096us       100.00%       5.536us       1.845us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.096us       100.00%       4.096us       1.365us             3  
+                                Activity Buffer Request        78.05%       1.446ms        78.05%       1.446ms       1.446ms       1.440us        35.16%       1.440us       1.440us             1  
+                                       aten::empty_like         1.06%      19.700us         3.67%      68.031us      22.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.61%      48.331us         2.61%      48.331us      16.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.50%      46.381us         2.50%      46.381us      15.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.40%       7.370us         0.40%       7.370us       7.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.859ms
-Self CUDA time total: 4.065us
+Self CPU time total: 1.853ms
+Self CUDA time total: 4.096us
 
 
 
@@ -3995,19 +4003,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.439us      3456.32%     129.439us     129.439us             1  
-                               hf_kernels_causal_conv1d         5.79%      99.043us        99.68%       1.706ms       1.706ms       0.000us         0.00%       4.994us       4.994us             1  
-                                         CausalConv1dFn         4.71%      80.562us        93.90%       1.607ms     535.793us       0.000us         0.00%       4.994us       1.665us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      25.130us        87.50%       1.498ms     499.285us       3.745us       100.00%       4.994us       1.665us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.745us       100.00%       3.745us       1.248us             3  
-                                Activity Buffer Request        84.17%       1.441ms        84.17%       1.441ms       1.441ms       1.249us        33.35%       1.249us       1.249us             1  
-                                       aten::empty_like         0.47%       7.980us         1.69%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      20.981us         1.23%      20.981us       6.994us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.86%      31.821us         1.86%      31.821us      10.607us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.32%       5.430us         0.32%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.319us      3789.89%     144.319us     144.319us             1  
+                               hf_kernels_causal_conv1d         4.94%      83.592us        99.69%       1.687ms       1.687ms       0.000us         0.00%       5.088us       5.088us             1  
+                                         CausalConv1dFn         5.57%      94.202us        94.76%       1.604ms     534.586us       0.000us         0.00%       5.088us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.53%      25.920us        87.50%       1.481ms     493.624us       3.808us       100.00%       5.088us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
+                                Activity Buffer Request        84.14%       1.424ms        84.14%       1.424ms       1.424ms       1.280us        33.61%       1.280us       1.280us             1  
+                                       aten::empty_like         0.45%       7.561us         1.69%      28.682us       9.561us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.25%      21.121us         1.25%      21.121us       7.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.83%      30.901us         1.83%      30.901us      10.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.170us         0.31%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.712ms
-Self CUDA time total: 3.745us
+Self CPU time total: 1.693ms
+Self CUDA time total: 3.808us
 
 
 
@@ -4017,19 +4025,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.098us      3285.62%     124.098us     124.098us             1  
-                               hf_kernels_causal_conv1d         5.52%      95.683us        99.69%       1.728ms       1.728ms       0.000us         0.00%       5.057us       5.057us             1  
-                                         CausalConv1dFn         4.48%      77.582us        94.17%       1.632ms     544.020us       0.000us         0.00%       5.057us       1.686us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      24.830us        87.99%       1.525ms     508.322us       3.777us       100.00%       5.057us       1.686us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        84.76%       1.469ms        84.76%       1.469ms       1.469ms       1.280us        33.89%       1.280us       1.280us             1  
-                                       aten::empty_like         0.46%       7.920us         1.70%      29.511us       9.837us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      21.591us         1.25%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.80%      31.261us         1.80%      31.261us      10.420us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.301us         0.31%       5.301us       5.301us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.942us      3149.95%     118.942us     118.942us             1  
+                               hf_kernels_causal_conv1d         4.70%      79.942us        99.69%       1.694ms       1.694ms       0.000us         0.00%       5.024us       5.024us             1  
+                                         CausalConv1dFn         4.32%      73.340us        94.98%       1.614ms     538.022us       0.000us         0.00%       5.024us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      23.852us        89.01%       1.513ms     504.182us       3.776us       100.00%       5.024us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        85.86%       1.459ms        85.86%       1.459ms       1.459ms       1.248us        33.05%       1.248us       1.248us             1  
+                                       aten::empty_like         0.44%       7.502us         1.66%      28.182us       9.394us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      20.680us         1.22%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.75%      29.690us         1.75%      29.690us       9.897us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.340us         0.31%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.733ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.699ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4039,19 +4047,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.729us      3378.36%     129.729us     129.729us             1  
-                               hf_kernels_causal_conv1d         5.03%      97.232us        99.72%       1.927ms       1.927ms       0.000us         0.00%       5.120us       5.120us             1  
-                                         CausalConv1dFn         4.11%      79.452us        94.69%       1.830ms     610.049us       0.000us         0.00%       5.120us       1.707us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.27%      24.481us        89.03%       1.721ms     573.588us       3.840us       100.00%       5.120us       1.707us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
-                                Activity Buffer Request        76.40%       1.477ms        76.40%       1.477ms       1.477ms       1.280us        33.33%       1.280us       1.280us             1  
-                                       aten::empty_like         0.41%       7.951us         1.55%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.14%      21.980us         1.14%      21.980us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.36%     219.575us        11.36%     219.575us      73.192us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.490us         0.28%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.814us      3251.63%     122.814us     122.814us             1  
+                               hf_kernels_causal_conv1d         4.64%      85.642us        99.73%       1.840ms       1.840ms       0.000us         0.00%       5.025us       5.025us             1  
+                                         CausalConv1dFn         3.90%      72.023us        95.09%       1.754ms     584.757us       0.000us         0.00%       5.025us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.39%      25.651us        89.62%       1.653ms     551.112us       3.777us       100.00%       5.025us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
+                                Activity Buffer Request        78.74%       1.453ms        78.74%       1.453ms       1.453ms       1.248us        33.04%       1.248us       1.248us             1  
+                                       aten::empty_like         0.42%       7.802us         1.57%      28.911us       9.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.14%      21.109us         1.14%      21.109us       7.036us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.48%     174.913us         9.48%     174.913us      58.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.000us         0.27%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.933ms
-Self CUDA time total: 3.840us
+Self CPU time total: 1.845ms
+Self CUDA time total: 3.777us
 
 
 
@@ -4061,19 +4069,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.080us      2644.30%     126.080us     126.080us             1  
-                               hf_kernels_causal_conv1d         5.18%     102.863us        99.75%       1.979ms       1.979ms       0.000us         0.00%       6.368us       6.368us             1  
-                                         CausalConv1dFn         3.95%      78.303us        94.57%       1.876ms     625.402us       0.000us         0.00%       6.368us       2.123us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.140us        89.14%       1.768ms     589.491us       4.768us       100.00%       6.368us       2.123us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
-                                Activity Buffer Request        79.49%       1.577ms        79.49%       1.577ms       1.577ms       1.600us        33.56%       1.600us       1.600us             1  
-                                       aten::empty_like         0.40%       7.900us         1.48%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.09%      21.530us         1.09%      21.530us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.43%     167.184us         8.43%     167.184us      55.728us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.910us         0.25%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.008us      2597.30%     123.008us     123.008us             1  
+                               hf_kernels_causal_conv1d         4.59%      83.953us        99.73%       1.825ms       1.825ms       0.000us         0.00%       6.337us       6.337us             1  
+                                         CausalConv1dFn         3.99%      73.081us        95.14%       1.741ms     580.330us       0.000us         0.00%       6.337us       2.112us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.48%      27.090us        89.51%       1.638ms     546.026us       4.736us       100.00%       6.337us       2.112us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.736us       100.00%       4.736us       1.579us             3  
+                                Activity Buffer Request        78.87%       1.443ms        78.87%       1.443ms       1.443ms       1.601us        33.80%       1.601us       1.601us             1  
+                                       aten::empty_like         0.45%       8.280us         1.63%      29.831us       9.944us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.551us         1.18%      21.551us       7.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.16%     167.714us         9.16%     167.714us      55.905us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.030us         0.27%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.984ms
-Self CUDA time total: 4.768us
+Self CPU time total: 1.830ms
+Self CUDA time total: 4.736us
 
 
 
@@ -4083,19 +4091,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.055us      2488.80%     121.055us     121.055us             1  
-                               hf_kernels_causal_conv1d        13.09%      78.123us        99.20%     592.205us     592.205us       0.000us         0.00%       6.528us       6.528us             1  
-                                         CausalConv1dFn        13.01%      77.643us        86.11%     514.082us     171.361us       0.000us         0.00%       6.528us       2.176us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      24.929us        68.36%     408.089us     136.030us       4.864us       100.00%       6.528us       2.176us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.864us       100.00%       4.864us       1.621us             3  
-                                Activity Buffer Request        36.63%     218.665us        36.63%     218.665us     218.665us       1.664us        34.21%       1.664us       1.664us             1  
-                                       aten::empty_like         1.31%       7.839us         4.75%      28.350us       9.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.44%      20.511us         3.44%      20.511us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.55%     164.495us        27.55%     164.495us      54.832us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.80%       4.790us         0.80%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.566us      2381.84%     113.566us     113.566us             1  
+                               hf_kernels_causal_conv1d        13.06%      81.391us        99.15%     617.944us     617.944us       0.000us         0.00%       6.400us       6.400us             1  
+                                         CausalConv1dFn        11.13%      69.381us        86.09%     536.553us     178.851us       0.000us         0.00%       6.400us       2.133us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.09%      25.520us        70.57%     439.840us     146.613us       4.768us       100.00%       6.400us       2.133us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
+                                Activity Buffer Request        39.92%     248.796us        39.92%     248.796us     248.796us       1.632us        34.23%       1.632us       1.632us             1  
+                                       aten::empty_like         1.16%       7.221us         4.39%      27.332us       9.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.23%      20.111us         3.23%      20.111us       6.704us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.56%     165.524us        26.56%     165.524us      55.175us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.280us         0.85%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 596.995us
-Self CUDA time total: 4.864us
+Self CPU time total: 623.224us
+Self CUDA time total: 4.768us
 
 
 
@@ -4105,19 +4113,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.031us      1201.49%     128.031us     128.031us             1  
-                               hf_kernels_causal_conv1d         5.58%     105.873us        99.72%       1.893ms       1.893ms       0.000us         0.00%      14.208us      14.208us             1  
-                                         CausalConv1dFn         4.13%      78.341us        94.14%       1.787ms     595.748us       0.000us         0.00%      14.208us       4.736us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      27.570us        88.49%       1.680ms     559.957us      10.656us       100.00%      14.208us       4.736us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
-                                Activity Buffer Request        77.94%       1.480ms        77.94%       1.480ms       1.480ms       3.552us        33.33%       3.552us       3.552us             1  
-                                       aten::empty_like         0.41%       7.812us         1.53%      29.032us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.12%      21.220us         1.12%      21.220us       7.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.09%     172.624us         9.09%     172.624us      57.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.330us         0.28%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.383us      1119.53%     120.383us     120.383us             1  
+                               hf_kernels_causal_conv1d         4.38%      80.811us        99.69%       1.838ms       1.838ms       0.000us         0.00%      14.338us      14.338us             1  
+                                         CausalConv1dFn         3.88%      71.502us        95.31%       1.758ms     585.854us       0.000us         0.00%      14.338us       4.779us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      26.240us        89.89%       1.658ms     552.523us      10.753us       100.00%      14.338us       4.779us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.753us       100.00%      10.753us       3.584us             3  
+                                Activity Buffer Request        79.47%       1.465ms        79.47%       1.465ms       1.465ms       3.585us        33.34%       3.585us       3.585us             1  
+                                       aten::empty_like         0.42%       7.711us         1.54%      28.491us       9.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.780us         1.13%      20.780us       6.927us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.00%     165.884us         9.00%     165.884us      55.295us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.720us         0.31%       5.720us       5.720us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.898ms
-Self CUDA time total: 10.656us
+Self CPU time total: 1.844ms
+Self CUDA time total: 10.753us
 
 
 
@@ -4127,19 +4135,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.524us      1119.66%     122.524us     122.524us             1  
-                               hf_kernels_causal_conv1d        19.00%     100.263us        99.02%     522.563us     522.563us       0.000us         0.00%      14.623us      14.623us             1  
-                                         CausalConv1dFn        14.56%      76.813us        80.02%     422.300us     140.767us       0.000us         0.00%      14.623us       4.874us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.621us        60.06%     316.927us     105.642us      10.943us       100.00%      14.623us       4.874us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
-                                Activity Buffer Request        24.63%     129.993us        24.63%     129.993us     129.993us       3.680us        33.63%       3.680us       3.680us             1  
-                                       aten::empty_like         1.53%       8.070us         5.41%      28.560us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.88%      20.490us         3.88%      20.490us       6.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.38%     160.313us        30.38%     160.313us      53.438us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       5.160us         0.98%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.901us      1062.24%     115.901us     115.901us             1  
+                               hf_kernels_causal_conv1d        13.49%      81.452us        99.17%     598.664us     598.664us       0.000us         0.00%      14.591us      14.591us             1  
+                                         CausalConv1dFn        11.49%      69.393us        85.68%     517.212us     172.404us       0.000us         0.00%      14.591us       4.864us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.25%      25.660us        69.54%     419.779us     139.926us      10.911us       100.00%      14.591us       4.864us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.911us       100.00%      10.911us       3.637us             3  
+                                Activity Buffer Request        38.07%     229.795us        38.07%     229.795us     229.795us       3.680us        33.73%       3.680us       3.680us             1  
+                                       aten::empty_like         1.23%       7.430us         4.64%      28.040us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.41%      20.610us         3.41%      20.610us       6.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.22%     164.324us        27.22%     164.324us      54.775us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.020us         0.83%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 527.723us
-Self CUDA time total: 10.943us
+Self CPU time total: 603.684us
+Self CUDA time total: 10.911us
 
 
 
@@ -4149,19 +4157,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us      1185.50%     130.879us     130.879us             1  
-                               hf_kernels_causal_conv1d         6.10%     112.423us        99.71%       1.839ms       1.839ms       0.000us         0.00%      14.752us      14.752us             1  
-                                         CausalConv1dFn         4.42%      81.553us        93.62%       1.726ms     575.457us       0.000us         0.00%      14.752us       4.917us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.629us        87.45%       1.613ms     537.533us      11.040us       100.00%      14.752us       4.917us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
-                                Activity Buffer Request        77.44%       1.428ms        77.44%       1.428ms       1.428ms       3.712us        33.62%       3.712us       3.712us             1  
-                                       aten::empty_like         0.46%       8.560us         1.75%      32.220us      10.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.28%      23.660us         1.28%      23.660us       7.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.67%     159.915us         8.67%     159.915us      53.305us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.260us         0.29%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.031us      1126.74%     124.031us     124.031us             1  
+                               hf_kernels_causal_conv1d         4.38%      80.211us        99.73%       1.825ms       1.825ms       0.000us         0.00%      14.688us      14.688us             1  
+                                         CausalConv1dFn         3.92%      71.693us        95.35%       1.744ms     581.490us       0.000us         0.00%      14.688us       4.896us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.35%      24.770us        89.82%       1.643ms     547.796us      11.008us       100.00%      14.688us       4.896us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.008us       100.00%      11.008us       3.669us             3  
+                                Activity Buffer Request        79.44%       1.453ms        79.44%       1.453ms       1.453ms       3.680us        33.43%       3.680us       3.680us             1  
+                                       aten::empty_like         0.44%       8.110us         1.61%      29.390us       9.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.16%      21.280us         1.16%      21.280us       7.093us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.03%     165.165us         9.03%     165.165us      55.055us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.921us         0.27%       4.921us       4.921us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.844ms
-Self CUDA time total: 11.040us
+Self CPU time total: 1.830ms
+Self CUDA time total: 11.008us
 
 
 
@@ -4171,19 +4179,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.988us      1097.16%     124.988us     124.988us             1  
-                               hf_kernels_causal_conv1d        14.68%      75.042us        98.95%     505.802us     505.802us       0.000us         0.00%      15.232us      15.232us             1  
-                                         CausalConv1dFn        15.20%      77.712us        84.27%     430.760us     143.587us       0.000us         0.00%      15.232us       5.077us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      24.091us        63.54%     324.777us     108.259us      11.392us       100.00%      15.232us       5.077us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us       100.00%      11.392us       3.797us             3  
-                                Activity Buffer Request        26.66%     136.263us        26.66%     136.263us     136.263us       3.840us        33.71%       3.840us       3.840us             1  
-                                       aten::empty_like         1.46%       7.441us         5.53%      28.271us       9.424us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.08%      20.830us         4.08%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.17%     164.423us        32.17%     164.423us      54.808us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.05%       5.351us         1.05%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.078us      1080.72%     122.078us     122.078us             1  
+                               hf_kernels_causal_conv1d        13.22%      78.432us        99.12%     587.944us     587.944us       0.000us         0.00%      15.072us      15.072us             1  
+                                         CausalConv1dFn        12.12%      71.922us        85.89%     509.512us     169.837us       0.000us         0.00%      15.072us       5.024us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.25%      25.220us        69.07%     409.719us     136.573us      11.296us       100.00%      15.072us       5.024us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.296us       100.00%      11.296us       3.765us             3  
+                                Activity Buffer Request        37.46%     222.215us        37.46%     222.215us     222.215us       3.776us        33.43%       3.776us       3.776us             1  
+                                       aten::empty_like         1.25%       7.430us         4.70%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.45%      20.441us         3.45%      20.441us       6.814us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.36%     162.284us        27.36%     162.284us      54.095us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.88%       5.240us         0.88%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 511.153us
-Self CUDA time total: 11.392us
+Self CPU time total: 593.184us
+Self CUDA time total: 11.296us
 
 
 
@@ -4193,19 +4201,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.775us       262.12%     131.775us     131.775us             1  
-                               hf_kernels_causal_conv1d         8.81%      77.263us        99.39%     871.362us     871.362us       0.000us         0.00%      83.680us      83.680us             1  
-                                         CausalConv1dFn         8.68%      76.121us        90.57%     794.099us     264.700us       0.000us         0.00%      83.680us      27.893us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.02%      26.501us        78.58%     688.947us     229.649us      50.272us       100.00%      83.680us      27.893us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.272us       100.00%      50.272us      16.757us             3  
-                                Activity Buffer Request        55.77%     488.972us        55.77%     488.972us     488.972us      33.408us        66.45%      33.408us      33.408us             1  
-                                       aten::empty_like         0.92%       8.040us         3.31%      29.031us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.39%      20.991us         2.39%      20.991us       6.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.79%     173.474us        19.79%     173.474us      57.825us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.61%       5.370us         0.61%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.648us       252.30%     127.648us     127.648us             1  
+                               hf_kernels_causal_conv1d         4.31%      79.103us        99.73%       1.830ms       1.830ms       0.000us         0.00%      84.257us      84.257us             1  
+                                         CausalConv1dFn         3.94%      72.391us        95.42%       1.751ms     583.740us       0.000us         0.00%      84.257us      28.086us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.941us        89.93%       1.650ms     550.139us      50.593us       100.00%      84.257us      28.086us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.593us       100.00%      50.593us      16.864us             3  
+                                Activity Buffer Request        79.45%       1.458ms        79.45%       1.458ms       1.458ms      33.664us        66.54%      33.664us      33.664us             1  
+                                       aten::empty_like         0.41%       7.590us         1.55%      28.411us       9.470us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.821us         1.13%      20.821us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.01%     165.403us         9.01%     165.403us      55.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 876.732us
-Self CUDA time total: 50.272us
+Self CPU time total: 1.835ms
+Self CUDA time total: 50.593us
 
 
 
@@ -4215,19 +4223,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.295us       247.23%     127.295us     127.295us             1  
-                               hf_kernels_causal_conv1d        15.09%      77.332us        99.04%     507.562us     507.562us       0.000us         0.00%      86.016us      86.016us             1  
-                                         CausalConv1dFn        14.68%      75.241us        83.95%     430.230us     143.410us       0.000us         0.00%      86.016us      28.672us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      25.861us        63.40%     324.927us     108.309us      51.488us       100.00%      86.016us      28.672us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.488us       100.00%      51.488us      17.163us             3  
-                                Activity Buffer Request        25.26%     129.463us        25.26%     129.463us     129.463us      34.528us        67.06%      34.528us      34.528us             1  
-                                       aten::empty_like         1.67%       8.561us         5.87%      30.062us      10.021us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.20%      21.501us         4.20%      21.501us       7.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.09%     169.603us        33.09%     169.603us      56.534us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.96%       4.929us         0.96%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.347us       241.52%     124.347us     124.347us             1  
+                               hf_kernels_causal_conv1d        13.88%      78.022us        99.09%     557.033us     557.033us       0.000us         0.00%      85.980us      85.980us             1  
+                                         CausalConv1dFn        12.54%      70.483us        85.21%     479.011us     159.670us       0.000us         0.00%      85.980us      28.660us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.34%      24.401us        67.64%     380.208us     126.736us      51.486us       100.00%      85.980us      28.660us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.486us       100.00%      51.486us      17.162us             3  
+                                Activity Buffer Request        34.82%     195.764us        34.82%     195.764us     195.764us      34.494us        67.00%      34.494us      34.494us             1  
+                                       aten::empty_like         1.33%       7.470us         5.04%      28.320us       9.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.71%      20.850us         3.71%      20.850us       6.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.47%     160.043us        28.47%     160.043us      53.348us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.91%       5.110us         0.91%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 512.491us
-Self CUDA time total: 51.488us
+Self CPU time total: 562.143us
+Self CUDA time total: 51.486us
 
 
 
@@ -4237,19 +4245,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.214us      3104.87%     121.214us     121.214us             1  
-                               hf_kernels_causal_conv1d         8.71%      75.123us        99.37%     856.672us     856.672us       0.000us         0.00%       5.184us       5.184us             1  
-                                         CausalConv1dFn         8.55%      73.741us        90.66%     781.549us     260.516us       0.000us         0.00%       5.184us       1.728us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.92%      25.150us        78.63%     677.857us     225.952us       3.904us       100.00%       5.184us       1.728us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        56.24%     484.832us        56.24%     484.832us     484.832us       1.280us        32.79%       1.280us       1.280us             1  
-                                       aten::empty_like         1.08%       9.311us         3.47%      29.951us       9.984us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.39%      20.640us         2.39%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.47%     167.875us        19.47%     167.875us      55.958us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.63%       5.440us         0.63%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.728us      3142.18%     121.728us     121.728us             1  
+                               hf_kernels_causal_conv1d         4.20%      76.603us        99.72%       1.818ms       1.818ms       0.000us         0.00%       5.123us       5.123us             1  
+                                         CausalConv1dFn         3.96%      72.231us        95.52%       1.742ms     580.506us       0.000us         0.00%       5.123us       1.708us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.49%      27.119us        89.93%       1.640ms     546.545us       3.874us       100.00%       5.123us       1.708us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.874us       100.00%       3.874us       1.291us             3  
+                                Activity Buffer Request        79.71%       1.453ms        79.71%       1.453ms       1.453ms       1.249us        32.24%       1.249us       1.249us             1  
+                                       aten::empty_like         0.42%       7.681us         1.63%      29.652us       9.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.21%      21.971us         1.21%      21.971us       7.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.74%     159.334us         8.74%     159.334us      53.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.020us         0.28%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 862.112us
-Self CUDA time total: 3.904us
+Self CPU time total: 1.823ms
+Self CUDA time total: 3.874us
 
 
 
@@ -4259,19 +4267,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.438us      3086.10%     121.438us     121.438us             1  
-                               hf_kernels_causal_conv1d        15.37%      74.422us        98.89%     478.921us     478.921us       0.000us         0.00%       5.183us       5.183us             1  
-                                         CausalConv1dFn        15.69%      75.972us        83.52%     404.499us     134.833us       0.000us         0.00%       5.183us       1.728us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.44%      26.330us        61.72%     298.936us      99.645us       3.935us       100.00%       5.183us       1.728us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.935us       100.00%       3.935us       1.312us             3  
-                                Activity Buffer Request        23.74%     114.963us        23.74%     114.963us     114.963us       1.248us        31.72%       1.248us       1.248us             1  
-                                       aten::empty_like         1.57%       7.609us         6.11%      29.591us       9.864us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.54%      21.982us         4.54%      21.982us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.55%     157.643us        32.55%     157.643us      52.548us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       5.391us         1.11%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.862us      2867.43%     112.862us     112.862us             1  
+                               hf_kernels_causal_conv1d        13.92%      73.542us        98.94%     522.552us     522.552us       0.000us         0.00%       5.216us       5.216us             1  
+                                         CausalConv1dFn        13.17%      69.571us        85.02%     449.010us     149.670us       0.000us         0.00%       5.216us       1.739us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.641us        66.59%     351.668us     117.223us       3.936us       100.00%       5.216us       1.739us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.936us       100.00%       3.936us       1.312us             3  
+                                Activity Buffer Request        31.20%     164.773us        31.20%     164.773us     164.773us       1.280us        32.52%       1.280us       1.280us             1  
+                                       aten::empty_like         1.39%       7.351us         5.26%      27.771us       9.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.87%      20.420us         3.87%      20.420us       6.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.34%     160.254us        30.34%     160.254us      53.418us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.06%       5.590us         1.06%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 484.312us
-Self CUDA time total: 3.935us
+Self CPU time total: 528.142us
+Self CUDA time total: 3.936us
 
 
 
@@ -4281,19 +4289,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.157us      3744.94%     152.157us     152.157us             1  
-                               hf_kernels_causal_conv1d        10.88%      77.931us        99.21%     710.327us     710.327us       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn        11.39%      81.522us        88.32%     632.396us     210.799us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.86%      27.639us        72.73%     520.742us     173.581us       4.063us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        44.05%     315.408us        44.05%     315.408us     315.408us       1.344us        33.08%       1.344us       1.344us             1  
-                                       aten::empty_like         1.15%       8.200us         4.21%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.06%      21.932us         3.06%      21.932us       7.311us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.82%     177.695us        24.82%     177.695us      59.232us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.79%       5.681us         0.79%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.630us      2893.73%     117.630us     117.630us             1  
+                               hf_kernels_causal_conv1d         4.22%      76.492us        99.73%       1.809ms       1.809ms       0.000us         0.00%       5.441us       5.441us             1  
+                                         CausalConv1dFn         3.89%      70.602us        95.52%       1.732ms     577.480us       0.000us         0.00%       5.441us       1.814us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.32%      23.990us        90.04%       1.633ms     544.346us       4.065us       100.00%       5.441us       1.814us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        79.17%       1.436ms        79.17%       1.436ms       1.436ms       1.376us        33.85%       1.376us       1.376us             1  
+                                       aten::empty_like         0.43%       7.870us         1.59%      28.801us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.15%      20.931us         1.15%      20.931us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.54%     173.024us         9.54%     173.024us      57.675us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.840us         0.27%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 716.008us
-Self CUDA time total: 4.063us
+Self CPU time total: 1.814ms
+Self CUDA time total: 4.065us
 
 
 
@@ -4303,19 +4311,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2951.18%     119.936us     119.936us             1  
-                               hf_kernels_causal_conv1d        15.86%      75.552us        99.00%     471.672us     471.672us       0.000us         0.00%       5.440us       5.440us             1  
-                                         CausalConv1dFn        16.03%      76.383us        83.14%     396.120us     132.040us       0.000us         0.00%       5.440us       1.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.35%      25.480us        61.26%     291.866us      97.289us       4.064us       100.00%       5.440us       1.813us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        23.14%     110.243us        23.14%     110.243us     110.243us       1.376us        33.86%       1.376us       1.376us             1  
-                                       aten::empty_like         1.53%       7.269us         5.85%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.32%      20.602us         4.32%      20.602us       6.867us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.77%     156.143us        32.77%     156.143us      52.048us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.00%       4.760us         1.00%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.957us      2780.14%     112.957us     112.957us             1  
+                               hf_kernels_causal_conv1d        13.63%      77.442us        99.02%     562.553us     562.553us       0.000us         0.00%       5.439us       5.439us             1  
+                                         CausalConv1dFn        12.09%      68.663us        85.39%     485.111us     161.704us       0.000us         0.00%       5.439us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.90%      27.850us        68.41%     388.648us     129.549us       4.063us       100.00%       5.439us       1.813us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        32.06%     182.124us        32.06%     182.124us     182.124us       1.376us        33.87%       1.376us       1.376us             1  
+                                       aten::empty_like         1.28%       7.270us         4.89%      27.800us       9.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.61%      20.530us         3.61%      20.530us       6.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.45%     178.674us        31.45%     178.674us      59.558us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.590us         0.98%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 476.432us
-Self CUDA time total: 4.064us
+Self CPU time total: 568.143us
+Self CUDA time total: 4.063us
 
 
 
@@ -4325,19 +4333,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.888us      2401.78%     129.888us     129.888us             1  
-                               hf_kernels_causal_conv1d        13.50%     106.873us        99.32%     785.980us     785.980us       0.000us         0.00%       7.264us       7.264us             1  
-                                         CausalConv1dFn        10.04%      79.422us        85.81%     679.107us     226.369us       0.000us         0.00%       7.264us       2.421us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.32%      26.310us        72.10%     570.564us     190.188us       5.408us       100.00%       7.264us       2.421us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
-                                Activity Buffer Request        48.81%     386.260us        48.81%     386.260us     386.260us       1.856us        34.32%       1.856us       1.856us             1  
-                                       aten::empty_like         1.01%       7.981us         3.68%      29.121us       9.707us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.67%      21.140us         2.67%      21.140us       7.047us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.96%     157.994us        19.96%     157.994us      52.665us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.68%       5.410us         0.68%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.623us      2193.88%     118.623us     118.623us             1  
+                               hf_kernels_causal_conv1d         4.12%      74.582us        99.72%       1.807ms       1.807ms       0.000us         0.00%       7.231us       7.231us             1  
+                                         CausalConv1dFn         3.94%      71.361us        95.60%       1.732ms     577.313us       0.000us         0.00%       7.231us       2.410us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.39%      25.271us        90.02%       1.631ms     543.639us       5.407us       100.00%       7.231us       2.410us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.407us       100.00%       5.407us       1.802us             3  
+                                Activity Buffer Request        79.36%       1.438ms        79.36%       1.438ms       1.438ms       1.824us        33.73%       1.824us       1.824us             1  
+                                       aten::empty_like         0.43%       7.860us         1.64%      29.661us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.20%      21.801us         1.20%      21.801us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.27%     167.954us         9.27%     167.954us      55.985us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.140us         0.28%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 791.390us
-Self CUDA time total: 5.408us
+Self CPU time total: 1.812ms
+Self CUDA time total: 5.407us
 
 
 
@@ -4347,19 +4355,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.463us      2151.92%     118.463us     118.463us             1  
-                               hf_kernels_causal_conv1d        19.47%      96.181us        98.96%     488.812us     488.812us       0.000us         0.00%       7.393us       7.393us             1  
-                                         CausalConv1dFn        15.19%      75.044us        79.49%     392.631us     130.877us       0.000us         0.00%       7.393us       2.464us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.31%      26.241us        58.39%     288.397us      96.132us       5.505us       100.00%       7.393us       2.464us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.505us       100.00%       5.505us       1.835us             3  
-                                Activity Buffer Request        21.50%     106.222us        21.50%     106.222us     106.222us       1.888us        34.30%       1.888us       1.888us             1  
-                                       aten::empty_like         1.50%       7.390us         5.91%      29.190us       9.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.41%      21.800us         4.41%      21.800us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.57%     155.934us        31.57%     155.934us      51.978us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.04%       5.140us         1.04%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.607us      2057.50%     112.607us     112.607us             1  
+                               hf_kernels_causal_conv1d        13.70%      73.872us        99.01%     534.033us     534.033us       0.000us         0.00%       7.361us       7.361us             1  
+                                         CausalConv1dFn        13.12%      70.792us        85.31%     460.161us     153.387us       0.000us         0.00%       7.361us       2.454us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.78%      25.770us        67.08%     361.838us     120.613us       5.473us       100.00%       7.361us       2.454us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.473us       100.00%       5.473us       1.824us             3  
+                                Activity Buffer Request        31.74%     171.214us        31.74%     171.214us     171.214us       1.888us        34.50%       1.888us       1.888us             1  
+                                       aten::empty_like         1.37%       7.381us         5.10%      27.531us       9.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.74%      20.150us         3.74%      20.150us       6.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.56%     164.854us        30.56%     164.854us      54.951us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.99%       5.340us         0.99%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 493.952us
-Self CUDA time total: 5.505us
+Self CPU time total: 539.373us
+Self CUDA time total: 5.473us
 
 
 
@@ -4369,19 +4377,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.279us       741.28%     129.279us     129.279us             1  
-                               hf_kernels_causal_conv1d         5.08%      91.861us        99.73%       1.805ms       1.805ms       0.000us         0.00%      23.296us      23.296us             1  
-                                         CausalConv1dFn         4.24%      76.815us        94.65%       1.713ms     571.078us       0.000us         0.00%      23.296us       7.765us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      25.791us        88.76%       1.607ms     535.516us      17.440us       100.00%      23.296us       7.765us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
-                                Activity Buffer Request        78.65%       1.424ms        78.65%       1.424ms       1.424ms       5.856us        33.58%       5.856us       5.856us             1  
-                                       aten::empty_like         0.47%       8.500us         1.65%      29.870us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.18%      21.370us         1.18%      21.370us       7.123us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.68%     157.163us         8.68%     157.163us      52.388us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.911us         0.27%       4.911us       4.911us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.390us       706.22%     123.390us     123.390us             1  
+                               hf_kernels_causal_conv1d         4.18%      75.923us        99.74%       1.812ms       1.812ms       0.000us         0.00%      23.328us      23.328us             1  
+                                         CausalConv1dFn         3.97%      72.132us        95.56%       1.736ms     578.683us       0.000us         0.00%      23.328us       7.776us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.789us        89.99%       1.635ms     544.959us      17.472us       100.00%      23.328us       7.776us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.472us       100.00%      17.472us       5.824us             3  
+                                Activity Buffer Request        79.65%       1.447ms        79.65%       1.447ms       1.447ms       5.856us        33.52%       5.856us       5.856us             1  
+                                       aten::empty_like         0.45%       8.169us         1.60%      29.040us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.15%      20.871us         1.15%      20.871us       6.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.97%     163.034us         8.97%     163.034us      54.345us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.790us         0.26%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.810ms
-Self CUDA time total: 17.440us
+Self CPU time total: 1.817ms
+Self CUDA time total: 17.472us
 
 
 
@@ -4391,19 +4399,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.324us       772.01%     139.324us     139.324us             1  
-                               hf_kernels_causal_conv1d        18.68%      93.362us        99.02%     494.883us     494.883us       0.000us         0.00%      24.095us      24.095us             1  
-                                         CausalConv1dFn        17.38%      86.843us        80.34%     401.521us     133.840us       0.000us         0.00%      24.095us       8.032us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      26.789us        57.15%     285.628us      95.209us      18.047us       100.00%      24.095us       8.032us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
-                                Activity Buffer Request        20.49%     102.403us        20.49%     102.403us     102.403us       6.048us        33.51%       6.048us       6.048us             1  
-                                       aten::empty_like         1.48%       7.399us         5.81%      29.050us       9.683us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.33%      21.651us         4.33%      21.651us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.30%     156.436us        31.30%     156.436us      52.145us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       4.890us         0.98%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.788us       680.84%     121.788us     121.788us             1  
+                               hf_kernels_causal_conv1d        14.15%      75.583us        99.15%     529.782us     529.782us       0.000us         0.00%      23.904us      23.904us             1  
+                                         CausalConv1dFn        14.61%      78.041us        85.01%     454.199us     151.400us       0.000us         0.00%      23.904us       7.968us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.06%      27.012us        64.90%     346.788us     115.596us      17.888us       100.00%      23.904us       7.968us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.888us       100.00%      17.888us       5.963us             3  
+                                Activity Buffer Request        29.57%     158.003us        29.57%     158.003us     158.003us       6.016us        33.63%       6.016us       6.016us             1  
+                                       aten::empty_like         1.39%       7.440us         5.50%      29.370us       9.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.10%      21.930us         4.10%      21.930us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.28%     161.773us        30.28%     161.773us      53.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       4.521us         0.85%       4.521us       4.521us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 499.773us
-Self CUDA time total: 18.047us
+Self CPU time total: 534.303us
+Self CUDA time total: 17.888us
 
 
 
@@ -4413,19 +4421,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.103us       748.58%     135.103us     135.103us             1  
-                               hf_kernels_causal_conv1d         5.37%      98.434us        99.69%       1.829ms       1.829ms       0.000us         0.00%      24.097us      24.097us             1  
-                                         CausalConv1dFn         4.35%      79.821us        94.33%       1.730ms     576.697us       0.000us         0.00%      24.097us       8.032us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.912us        88.33%       1.620ms     540.010us      18.048us       100.00%      24.097us       8.032us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.048us       100.00%      18.048us       6.016us             3  
-                                Activity Buffer Request        77.78%       1.427ms        77.78%       1.427ms       1.427ms       6.049us        33.52%       6.049us       6.049us             1  
-                                       aten::empty_like         0.47%       8.550us         1.65%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.18%      21.690us         1.18%      21.690us       7.230us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.19%     168.514us         9.19%     168.514us      56.171us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.620us         0.31%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.087us       696.82%     125.087us     125.087us             1  
+                               hf_kernels_causal_conv1d         4.34%      78.522us        99.74%       1.806ms       1.806ms       0.000us         0.00%      23.998us      23.998us             1  
+                                         CausalConv1dFn         4.03%      72.933us        95.40%       1.728ms     575.883us       0.000us         0.00%      23.998us       7.999us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.38%      25.019us        89.74%       1.625ms     541.689us      17.951us       100.00%      23.998us       7.999us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us       100.00%      17.951us       5.984us             3  
+                                Activity Buffer Request        79.32%       1.436ms        79.32%       1.436ms       1.436ms       6.047us        33.69%       6.047us       6.047us             1  
+                                       aten::empty_like         0.46%       8.289us         1.64%      29.650us       9.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.361us         1.18%      21.361us       7.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.04%     163.685us         9.04%     163.685us      54.562us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.781us         0.26%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.834ms
-Self CUDA time total: 18.048us
+Self CPU time total: 1.811ms
+Self CUDA time total: 17.951us
 
 
 
@@ -4435,19 +4443,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.684us       694.54%     130.684us     130.684us             1  
-                               hf_kernels_causal_conv1d        18.98%      97.223us        99.02%     507.183us     507.183us       0.000us         0.00%      25.120us      25.120us             1  
-                                         CausalConv1dFn        14.58%      74.692us        80.04%     409.960us     136.653us       0.000us         0.00%      25.120us       8.373us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.51%      33.321us        59.71%     305.838us     101.946us      18.816us       100.00%      25.120us       8.373us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us       100.00%      18.816us       6.272us             3  
-                                Activity Buffer Request        22.33%     114.353us        22.33%     114.353us     114.353us       6.304us        33.50%       6.304us       6.304us             1  
-                                       aten::empty_like         1.71%       8.769us         5.75%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.03%      20.661us         4.03%      20.661us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.88%     158.164us        30.88%     158.164us      52.721us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       5.010us         0.98%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.887us       630.82%     117.887us     117.887us             1  
+                               hf_kernels_causal_conv1d        11.57%      72.803us        99.15%     623.975us     623.975us       0.000us         0.00%      24.960us      24.960us             1  
+                                         CausalConv1dFn        11.13%      70.072us        87.58%     551.172us     183.724us       0.000us         0.00%      24.960us       8.320us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.22%      26.540us        71.97%     452.920us     150.973us      18.688us       100.00%      24.960us       8.320us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.688us       100.00%      18.688us       6.229us             3  
+                                Activity Buffer Request        41.60%     261.806us        41.60%     261.806us     261.806us       6.272us        33.56%       6.272us       6.272us             1  
+                                       aten::empty_like         1.19%       7.500us         4.48%      28.180us       9.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.29%      20.680us         3.29%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.15%     164.574us        26.15%     164.574us      54.858us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.340us         0.85%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 512.193us
-Self CUDA time total: 18.816us
+Self CPU time total: 629.315us
+Self CUDA time total: 18.688us
 
 
 
@@ -4457,19 +4465,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         6.14%     112.394us        99.70%       1.825ms       1.825ms       0.000us         0.00%     162.754us     162.754us             1  
-                                         CausalConv1dFn         4.41%      80.651us        93.56%       1.713ms     570.927us       0.000us         0.00%     162.754us      54.251us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      25.010us        87.54%       1.603ms     534.193us      97.985us       100.00%     162.754us      54.251us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.737us       147.71%     144.737us     144.737us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.985us       100.00%      97.985us      32.662us             3  
-                                Activity Buffer Request        77.36%       1.416ms        77.36%       1.416ms       1.416ms      64.769us        66.10%      64.769us      64.769us             1  
-                                       aten::empty_like         0.49%       8.901us         1.61%      29.551us       9.850us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.13%      20.650us         1.13%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.82%     161.445us         8.82%     161.445us      53.815us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.480us         0.30%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.55%      73.362us        99.20%     630.015us     630.015us       0.000us         0.00%     162.555us     162.555us             1  
+                                         CausalConv1dFn        11.07%      70.302us        87.65%     556.653us     185.551us       0.000us         0.00%     162.555us      54.185us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.16%      26.411us        72.21%     458.550us     152.850us      97.949us       100.00%     162.555us      54.185us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.645us       130.32%     127.645us     127.645us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.949us       100.00%      97.949us      32.650us             3  
+                                Activity Buffer Request        41.87%     265.926us        41.87%     265.926us     265.926us      64.606us        65.96%      64.606us      64.606us             1  
+                                       aten::empty_like         1.16%       7.350us         4.38%      27.801us       9.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.22%      20.451us         3.22%      20.451us       6.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.17%     166.213us        26.17%     166.213us      55.404us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.80%       5.050us         0.80%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.831ms
-Self CUDA time total: 97.985us
+Self CPU time total: 635.065us
+Self CUDA time total: 97.949us
 
 
 
@@ -4479,19 +4487,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        19.17%      96.654us        98.90%     498.573us     498.573us       0.000us         0.00%     163.900us     163.900us             1  
-                                         CausalConv1dFn        15.33%      77.291us        79.73%     401.919us     133.973us       0.000us         0.00%     163.900us      54.633us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      26.053us        58.73%     296.088us      98.696us      98.813us       100.00%     163.900us      54.633us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.981us       135.59%     133.981us     133.981us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.813us       100.00%      98.813us      32.938us             3  
-                                Activity Buffer Request        22.39%     112.882us        22.39%     112.882us     112.882us      65.087us        65.87%      65.087us      65.087us             1  
-                                       aten::empty_like         1.55%       7.820us         5.66%      28.540us       9.513us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.11%      20.720us         4.11%      20.720us       6.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.17%     157.153us        31.17%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.550us         1.10%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.83%      75.513us        99.19%     633.215us     633.215us       0.000us         0.00%     164.638us     164.638us             1  
+                                         CausalConv1dFn        11.21%      71.532us        87.37%     557.702us     185.901us       0.000us         0.00%     164.638us      54.879us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.76%      23.990us        71.75%     458.009us     152.670us      99.103us       100.00%     164.638us      54.879us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     132.254us       133.45%     132.254us     132.254us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.103us       100.00%      99.103us      33.034us             3  
+                                Activity Buffer Request        40.13%     256.155us        40.13%     256.155us     256.155us      65.535us        66.13%      65.535us      65.535us             1  
+                                       aten::empty_like         1.16%       7.400us         4.41%      28.161us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.25%      20.761us         3.25%      20.761us       6.920us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.86%     177.864us        27.86%     177.864us      59.288us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       5.140us         0.81%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 504.123us
-Self CUDA time total: 98.813us
+Self CPU time total: 638.355us
+Self CUDA time total: 99.103us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4501,12 +4509,12 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.06  True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.04  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.06  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
@@ -4517,18 +4525,19 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.95it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.21it/s]
+Fetching 11 files: 55%|█████▍ | 6/11 [00:00<00:00, 22.15it/s] +Fetching 11 files: 82%|████████▏ | 9/11 [00:01<00:00, 4.06it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.94it/s]

Artifacts:

causal_conv1d.jsonl