Spaces:

ror
/

performative_dashboard

Sleeping

App Files Files Community

ror HF Staff committed on Sep 30

Commit

e1f4b73

1 Parent(s): 55c8a69

Better plot

Browse files

Files changed (4) hide show

.gitignore +2 -0
bar_plot.py +64 -35
data.json +0 -1418
data.py +20 -1

.gitignore CHANGED Viewed

@@ -1,2 +1,4 @@
 __pycache__
 __ignore*

 __pycache__
 __ignore*
+*.json

bar_plot.py CHANGED Viewed

@@ -10,9 +10,16 @@ def hex_to_rgb(hex_color):
     r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
     return r, g, b
 def increase_brightness(r, g, b, factor):
     return tuple(map(lambda x: int(x + (255 - x) * factor), (r, g, b)))
 def increase_saturation(r, g, b, factor) -> tuple[int, int, int]:
     gray = 0.299 * r + 0.587 * g + 0.114 * b
     return tuple(map(lambda x: int(gray + (x - gray) * factor), (r, g, b)))
@@ -22,35 +29,54 @@ def rgb_to_hex(r, g, b):
     return f"#{r:02x}{g:02x}{b:02x}"
 # Color assignment function
-def get_color_for_config(config):
     # Determine the main hue for the attention implementation
     attn_implementation, sdpa_backend = config["attn_implementation"], config["sdpa_backend"]
     if attn_implementation == "eager":
-        main_hue = "#FF6B6B"
     elif attn_implementation == "sdpa":
         main_hue = {
-            None: "#4A90E2",
-            "math": "#408DDBFF",
-            "flash_attention": "#28767EFF",
-            "efficient_attention": "#605895FF",
-            "cudnn_attention": "#774AE2FF",
-        }[sdpa_backend]
     elif attn_implementation == "flash_attention_2":
-        main_hue = "#FFD700"
     else:
         raise ValueError(f"Unknown attention implementation: {attn_implementation}")
     # Apply color modifications for compilation and kernelization
     r, g, b = hex_to_rgb(main_hue)
     if config["compilation"]:
-        r, g, b = increase_brightness(r, g, b, 0.3)
     if config["kernelize"]:
-        r, g, b = increase_saturation(r, g, b, 0.8)
     # Return the color as a hex string
     return rgb_to_hex(r, g, b)
 def make_bar_kwargs(per_scenario_data: dict, key: str) -> tuple[dict, list]:
     bar_kwargs = {"x": [], "height": [], "color": [], "label": []}
@@ -65,6 +91,7 @@ def make_bar_kwargs(per_scenario_data: dict, key: str) -> tuple[dict, list]:
 def draw_bar_plot(ax: plt.Axes, bar_kwargs: dict, errors: list, title: str, ylabel: str):
     ax.set_facecolor('#000000')
     # Draw bars
     _ = ax.bar(**bar_kwargs, width=1.0, edgecolor='white', linewidth=1)
     # Add error bars
@@ -73,46 +100,49 @@ def draw_bar_plot(ax: plt.Axes, bar_kwargs: dict, errors: list, title: str, ylab
         fmt='none', ecolor='white', alpha=0.8, elinewidth=1.5, capthick=1.5, capsize=4,
     )
     # Set labels and title
-    ax.set_ylabel(ylabel, color='white', fontsize=14)
-    ax.set_title(title, color='white', fontsize=16, pad=20)
     # Set ticks and grid
     ax.set_xticks([])
-    ax.tick_params(colors='white')
-    ax.grid(True, alpha=0.3, color='white')
     # Truncate axis to better fit the bars
-    # new_ymin, new_ymax = 1e9, -1e9
-    # for h, e in zip(bar_kwargs["height"], errors):
-    #     new_ymin = min(new_ymin, 0.98 * (h - e))
-    #     new_ymax = max(new_ymax, 1.02 * (h + e))
-    # ymin, ymax = ax.get_ylim()
-    # ax.set_ylim(max(ymin, new_ymin), min(ymax, new_ymax))
 def create_matplotlib_bar_plot(per_scenario_data: dict):
     """Create side-by-side matplotlib bar charts for TTFT and TPOT data."""
-    # Create figure with dark theme - larger for more screen space
     plt.style.use('dark_background')
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 12))
     fig.patch.set_facecolor('#000000')
     # TTFT Plot (left)
     ttft_bars, ttft_errors = make_bar_kwargs(per_scenario_data, "ttft")
-    draw_bar_plot(ax1, ttft_bars, ttft_errors, "Time to first token (lower is better)", "TTFT (seconds)")
-    # TPOT Plot (right)
     itl_bars, itl_errors = make_bar_kwargs(per_scenario_data, "itl")
-    draw_bar_plot(ax2, itl_bars, itl_errors, "Time per output token (lower is better)", "ITL (seconds)")
     # Add common legend with full text
     legend_labels = ttft_bars["label"]  # Use full labels without truncation
     legend_handles = [plt.Rectangle((0,0),1,1, color=color) for color in ttft_bars["color"]]
-    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=1,
                bbox_to_anchor=(0.5, -0.05), facecolor='black', edgecolor='white',
-               labelcolor='white', fontsize=12)
-    # Tight layout with spacing between subplots and extra bottom space for legend
-    # plt.subplots_adjust(wspace=0.3, bottom=0.075)
     # Save plot to bytes with high DPI for crisp text
     buffer = io.BytesIO()
@@ -124,11 +154,10 @@ def create_matplotlib_bar_plot(per_scenario_data: dict):
     img_data = base64.b64encode(buffer.getvalue()).decode()
     plt.close(fig)
-    # Return HTML with embedded image - full height
     html = f"""
-    <div style="width: 100%; height: 100vh; background: #000; display: flex; justify-content: center; align-items: center;">
-        <img src="data:image/png;base64,{img_data}" style="width: 100%; height: 100%; object-fit: contain;" />
     </div>
     """
     return html

     r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
     return r, g, b
def blend_colors(rgb, hex_color, blend_strength):
    """Mix an RGB triple with a second color given as a hex string.

    ``blend_strength`` is the weight applied to ``rgb``; the color parsed
    from ``hex_color`` receives the complementary weight
    ``1 - blend_strength``. Each blended channel is truncated with ``int``.

    Returns a 3-tuple of ints.
    """
    other_rgb = hex_to_rgb(hex_color)
    complement = 1 - blend_strength
    # Index explicitly (rather than zip) so a too-short input still raises.
    return tuple(
        int(rgb[channel] * blend_strength + other_rgb[channel] * complement)
        for channel in (0, 1, 2)
    )
 def increase_brightness(r, g, b, factor):
     return tuple(map(lambda x: int(x + (255 - x) * factor), (r, g, b)))
def decrease_brightness(r, g, b, factor) -> tuple[int, int, int]:
    """Darken a color by scaling every channel by ``factor``.

    Args:
        r, g, b: Channel values, expected in the 0..255 range.
        factor: Multiplier applied to each channel; values in [0, 1] darken
            the color (0 gives black, 1 leaves it unchanged).

    Returns:
        The scaled ``(r, g, b)`` triple. Each channel is truncated to an int
        and clamped to 0..255, so an out-of-range ``factor`` can never yield
        a value that does not fit in two hex digits.
    """
    return tuple(min(255, max(0, int(channel * factor))) for channel in (r, g, b))
 def increase_saturation(r, g, b, factor) -> tuple[int, int, int]:
     gray = 0.299 * r + 0.587 * g + 0.114 * b
     return tuple(map(lambda x: int(gray + (x - gray) * factor), (r, g, b)))
     return f"#{r:02x}{g:02x}{b:02x}"
 # Color assignment function
+def get_color_for_config(config, filtered_on_compile_mode: bool = False):
     # Determine the main hue for the attention implementation
     attn_implementation, sdpa_backend = config["attn_implementation"], config["sdpa_backend"]
+    compilation = config["compilation"]
     if attn_implementation == "eager":
+        main_hue = "#FF4B4BFF" if compilation else "#FF4141FF"
     elif attn_implementation == "sdpa":
         main_hue = {
+            None:                  "#4A90E2" if compilation else "#2E82E1FF",
+            "math":                "#408DDB" if compilation else "#227BD3FF",
+            "flash_attention":     "#35A34D" if compilation else "#219F3CFF",
+            "efficient_attention": "#605895" if compilation else "#423691FF",
+            "cudnn_attention":     "#774AE2" if compilation else "#5D27DCFF",
+        }[sdpa_backend]  # fmt: off
     elif attn_implementation == "flash_attention_2":
+        main_hue = "#FFD700" if compilation else "#FFBF00FF"
     else:
         raise ValueError(f"Unknown attention implementation: {attn_implementation}")
     # Apply color modifications for compilation and kernelization
     r, g, b = hex_to_rgb(main_hue)
     if config["compilation"]:
+        delta = 0.2
+        delta += 0.2 * (len(config["compile_mode"]) - 7) / 8 if filtered_on_compile_mode else 0
+        r, g, b = increase_brightness(r, g, b, delta)
     if config["kernelize"]:
+        pass
+        # r, g, b = blend_colors((r, g, b), "#FF00F2FF", 0.7)
+        r, g, b = decrease_brightness(r, g, b, 0.8)
+        # r, g, b = increase_saturation(r, g, b, 0.9)
     # Return the color as a hex string
     return rgb_to_hex(r, g, b)
def reorder_data(per_scenario_data: dict) -> dict:
    """Return a new dict with the scenarios sorted into display order.

    Scenarios are ordered by attention implementation
    (``flash_attention_2``, then ``sdpa``, then ``eager``), then by SDPA
    backend, then by the ``kernelize`` flag, then by the ``compilation``
    flag. The sort is stable, so scenarios with identical keys keep their
    original relative order.

    Args:
        per_scenario_data: Mapping of scenario name to a dict that holds a
            ``"config"`` entry with the keys ``attn_implementation``,
            ``sdpa_backend``, ``kernelize`` and ``compilation``.

    Returns:
        A new dict with the same keys and the same value objects, in sorted
        order. The input mapping is not modified.

    Raises:
        KeyError: If a config is missing one of the keys above or uses an
            attention implementation other than the three listed.
    """
    attn_implementation_prio = {"flash_attention_2": 0, "sdpa": 1, "eager": 2}

    def sorting_fn(key: str) -> tuple:
        cfg = per_scenario_data[key]["config"]
        sdpa_backend = cfg["sdpa_backend"]
        return (
            attn_implementation_prio[cfg["attn_implementation"]],
            # ``sdpa_backend`` may be None or a string; comparing the two
            # directly raises TypeError, so sort "no backend" first and
            # compare the names only as strings.
            sdpa_backend is not None,
            sdpa_backend or "",
            cfg["kernelize"],
            cfg["compilation"],
        )

    return {key: per_scenario_data[key] for key in sorted(per_scenario_data, key=sorting_fn)}
 def make_bar_kwargs(per_scenario_data: dict, key: str) -> tuple[dict, list]:
     bar_kwargs = {"x": [], "height": [], "color": [], "label": []}
 def draw_bar_plot(ax: plt.Axes, bar_kwargs: dict, errors: list, title: str, ylabel: str):
     ax.set_facecolor('#000000')
+    # ax.grid(True, alpha=0.3, color='white')
     # Draw bars
     _ = ax.bar(**bar_kwargs, width=1.0, edgecolor='white', linewidth=1)
     # Add error bars
         fmt='none', ecolor='white', alpha=0.8, elinewidth=1.5, capthick=1.5, capsize=4,
     )
     # Set labels and title
+    ax.set_ylabel(ylabel, color='white', fontsize=16)
+    ax.set_title(title, color='white', fontsize=18, pad=20)
     # Set ticks and grid
     ax.set_xticks([])
+    ax.tick_params(colors='white', labelsize=13)
     # Truncate axis to better fit the bars
+    new_ymin, new_ymax = 1e9, -1e9
+    for h, e in zip(bar_kwargs["height"], errors):
+        new_ymin = min(new_ymin, 0.98 * (h - e))
+        new_ymax = max(new_ymax, 1.02 * (h + e))
+    ymin, ymax = ax.get_ylim()
+    ax.set_ylim(max(ymin, new_ymin), min(ymax, new_ymax))
 def create_matplotlib_bar_plot(per_scenario_data: dict):
     """Create side-by-side matplotlib bar charts for TTFT and TPOT data."""
+    # Create figure with dark theme - maximum size for full screen
     plt.style.use('dark_background')
+    fig, axs = plt.subplots(1, 3, figsize=(30, 16))
     fig.patch.set_facecolor('#000000')
+    # Reorganize data
+    per_scenario_data = reorder_data(per_scenario_data)
     # TTFT Plot (left)
     ttft_bars, ttft_errors = make_bar_kwargs(per_scenario_data, "ttft")
+    draw_bar_plot(axs[0], ttft_bars, ttft_errors, "Time to first token (lower is better)", "TTFT (seconds)")
+    # ITL Plot (right)
     itl_bars, itl_errors = make_bar_kwargs(per_scenario_data, "itl")
+    draw_bar_plot(axs[1], itl_bars, itl_errors, "Inter token latency (lower is better)", "ITL (seconds)")
+    # E2E Plot (right)
+    e2e_bars, e2e_errors = make_bar_kwargs(per_scenario_data, "e2e")
+    draw_bar_plot(axs[2], e2e_bars, e2e_errors, "End-to-end latency (lower is better)", "E2E (seconds)")
     # Add common legend with full text
     legend_labels = ttft_bars["label"]  # Use full labels without truncation
     legend_handles = [plt.Rectangle((0,0),1,1, color=color) for color in ttft_bars["color"]]
+    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=4,
                bbox_to_anchor=(0.5, -0.05), facecolor='black', edgecolor='white',
+               labelcolor='white', fontsize=14)
     # Save plot to bytes with high DPI for crisp text
     buffer = io.BytesIO()
     img_data = base64.b64encode(buffer.getvalue()).decode()
     plt.close(fig)
+    # Return HTML with embedded image - full page coverage
     html = f"""
+    <div style="width: 90vw; height: 90vh; background: #000; display: flex; justify-content: center; align-items: center; margin: 0; padding: 0; top: 0; left: 0;">
+        <img src="data:image/png;base64,{img_data}" style="width: 100%; height: 100%; object-fit: contain; max-width: none; max-height: none;" />
     </div>
     """
     return html

data.json DELETED Viewed

@@ -1,1418 +0,0 @@
-{
-    "eager_None_uncompiled_vanilla_with_cache": {
-        "metadata": {
-            "timestamp": "2025-09-26T12:00:15.841272",
-            "commit_id": null,
-            "hardware_info": {
-                "gpu_name": "AMD Instinct Mi325X VF",
-                "gpu_memory_total_mb": 255.6875,
-                "python_version": "3.12.10",
-                "torch_version": "2.7.1+rocm6.4.1.git2a215e4a"
-            },
-            "config": {
-                "name": "eager_None_uncompiled_vanilla_with_cache",
-                "warmup_iterations": 5,
-                "measurement_iterations": 20,
-                "gpu_monitoring": false,
-                "batch_size": 16,
-                "sequence_length": 128,
-                "num_tokens_to_generate": 128,
-                "attn_implementation": "eager",
-                "use_cache": true,
-                "sdpa_backend": null,
-                "compilation": false,
-                "compile_mode": null,
-                "compile_options": {},
-                "kernelize": false,
-                "device": "cuda",
-                "dtype": "torch.bfloat16"
-            }
-        },
-        "ttft": [
-            {
-                "wall_time": 0.07295582396909595,
-                "cuda_time": 0.07292783355712891,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07316389994230121,
-                "cuda_time": 0.0731438217163086,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07270528899971396,
-                "cuda_time": 0.0726801986694336,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07304733199998736,
-                "cuda_time": 0.07302810668945313,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07359540998004377,
-                "cuda_time": 0.07356825256347656,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07285187696106732,
-                "cuda_time": 0.07282662963867187,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07380507595371455,
-                "cuda_time": 0.07378445434570312,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07294069498311728,
-                "cuda_time": 0.07292146301269531,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0724824140779674,
-                "cuda_time": 0.07246123504638671,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07335149694699794,
-                "cuda_time": 0.07333294677734375,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07265867001842707,
-                "cuda_time": 0.07263875579833984,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07336580497212708,
-                "cuda_time": 0.07334489440917968,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07250520400702953,
-                "cuda_time": 0.07229901123046875,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07315384992398322,
-                "cuda_time": 0.07313486480712891,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07289102603681386,
-                "cuda_time": 0.07287146759033203,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07284493604674935,
-                "cuda_time": 0.07282552337646485,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07329084700904787,
-                "cuda_time": 0.07327109527587891,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07241688505746424,
-                "cuda_time": 0.07239772033691406,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07338041497860104,
-                "cuda_time": 0.07336121368408204,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07338539499323815,
-                "cuda_time": 0.07336421966552735,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ],
-        "tpot": [
-            {
-                "wall_time": 2.031788578024134,
-                "cuda_time": 2.0317476806640626,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0359795569675043,
-                "cuda_time": 2.0359508056640623,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0377820000285283,
-                "cuda_time": 2.0377520751953124,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.03635143104475,
-                "cuda_time": 2.0363153076171874,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0420283479616046,
-                "cuda_time": 2.0419971923828126,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.046406274079345,
-                "cuda_time": 2.046373046875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.848314146976918,
-                "cuda_time": 1.848263916015625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.8559249829268083,
-                "cuda_time": 1.855894287109375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.967564046033658,
-                "cuda_time": 1.96753173828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0448259089607745,
-                "cuda_time": 2.0447965087890627,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.8805117839947343,
-                "cuda_time": 1.8804791259765625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.054183567990549,
-                "cuda_time": 2.054150634765625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0287541430443525,
-                "cuda_time": 2.02872314453125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.031609061989002,
-                "cuda_time": 2.0315772705078126,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.030884437961504,
-                "cuda_time": 2.0308531494140625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.039256637915969,
-                "cuda_time": 2.0392255859375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.039441852015443,
-                "cuda_time": 2.0394091796875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.028935077949427,
-                "cuda_time": 2.0289049072265626,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0309303459944203,
-                "cuda_time": 2.0308995361328126,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.052516763098538,
-                "cuda_time": 2.052485107421875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ]
-    },
-    "eager_None_compiled_vanilla_with_cache": {
-        "metadata": {
-            "timestamp": "2025-09-26T12:01:13.033342",
-            "commit_id": null,
-            "hardware_info": {
-                "gpu_name": "AMD Instinct Mi325X VF",
-                "gpu_memory_total_mb": 255.6875,
-                "python_version": "3.12.10",
-                "torch_version": "2.7.1+rocm6.4.1.git2a215e4a"
-            },
-            "config": {
-                "name": "eager_None_compiled_vanilla_with_cache",
-                "warmup_iterations": 5,
-                "measurement_iterations": 20,
-                "gpu_monitoring": false,
-                "batch_size": 16,
-                "sequence_length": 128,
-                "num_tokens_to_generate": 128,
-                "attn_implementation": "eager",
-                "use_cache": true,
-                "sdpa_backend": null,
-                "compilation": true,
-                "compile_mode": "max-autotune",
-                "compile_options": {},
-                "kernelize": false,
-                "device": "cuda",
-                "dtype": "torch.bfloat16"
-            }
-        },
-        "ttft": [
-            {
-                "wall_time": 0.07427955605089664,
-                "cuda_time": 0.07425287628173828,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07495377992745489,
-                "cuda_time": 0.07493439483642578,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07390031509567052,
-                "cuda_time": 0.07387985229492187,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07471515703946352,
-                "cuda_time": 0.07469596099853516,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07410621899180114,
-                "cuda_time": 0.07408612823486328,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07513214694336057,
-                "cuda_time": 0.07511282348632813,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07445675204508007,
-                "cuda_time": 0.07443669128417969,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07454574899747968,
-                "cuda_time": 0.07452691650390625,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07436268392484635,
-                "cuda_time": 0.07433811950683594,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07478532497771084,
-                "cuda_time": 0.07476634979248047,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07541929103899747,
-                "cuda_time": 0.075388427734375,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07438250305131078,
-                "cuda_time": 0.07436360168457032,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07475267606787384,
-                "cuda_time": 0.07473255157470703,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07406081003136933,
-                "cuda_time": 0.07403617095947265,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07498570007737726,
-                "cuda_time": 0.07496607208251953,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07443700195290148,
-                "cuda_time": 0.07441815948486329,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07433398498687893,
-                "cuda_time": 0.07422721099853516,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07485785405151546,
-                "cuda_time": 0.07483879089355469,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07518403592985123,
-                "cuda_time": 0.07516366577148438,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0745964579982683,
-                "cuda_time": 0.07457791900634765,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ],
-        "tpot": [
-            {
-                "wall_time": 2.0495622069574893,
-                "cuda_time": 2.0495224609375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0417750619817525,
-                "cuda_time": 2.0417412109375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0438177799806,
-                "cuda_time": 2.043787109375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0555215378990397,
-                "cuda_time": 2.0554853515625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.052577171009034,
-                "cuda_time": 2.05254443359375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.057972935028374,
-                "cuda_time": 2.05794091796875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.048640146967955,
-                "cuda_time": 2.048610107421875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0477576749399304,
-                "cuda_time": 2.047728759765625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.043742422014475,
-                "cuda_time": 2.0437120361328125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.066631429013796,
-                "cuda_time": 2.066598388671875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.058080272981897,
-                "cuda_time": 2.058049072265625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0463295759400353,
-                "cuda_time": 2.0462979736328126,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.871039719088003,
-                "cuda_time": 1.871006103515625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.9884425880154595,
-                "cuda_time": 1.9884063720703125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.060368453967385,
-                "cuda_time": 2.060333984375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.070050645968877,
-                "cuda_time": 2.070018798828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0624818300129846,
-                "cuda_time": 2.062452880859375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.072273188037798,
-                "cuda_time": 2.0722421875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.0667856170330197,
-                "cuda_time": 2.06675244140625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 2.039454172947444,
-                "cuda_time": 2.0394228515625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ]
-    },
-    "sdpa_None_uncompiled_vanilla_with_cache": {
-        "metadata": {
-            "timestamp": "2025-09-26T12:01:57.531459",
-            "commit_id": null,
-            "hardware_info": {
-                "gpu_name": "AMD Instinct Mi325X VF",
-                "gpu_memory_total_mb": 255.6875,
-                "python_version": "3.12.10",
-                "torch_version": "2.7.1+rocm6.4.1.git2a215e4a"
-            },
-            "config": {
-                "name": "sdpa_None_uncompiled_vanilla_with_cache",
-                "warmup_iterations": 5,
-                "measurement_iterations": 20,
-                "gpu_monitoring": false,
-                "batch_size": 16,
-                "sequence_length": 128,
-                "num_tokens_to_generate": 128,
-                "attn_implementation": "sdpa",
-                "use_cache": true,
-                "sdpa_backend": null,
-                "compilation": false,
-                "compile_mode": null,
-                "compile_options": {},
-                "kernelize": false,
-                "device": "cuda",
-                "dtype": "torch.bfloat16"
-            }
-        },
-        "ttft": [
-            {
-                "wall_time": 0.06973504310008138,
-                "cuda_time": 0.06970318603515625,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06962730409577489,
-                "cuda_time": 0.069607666015625,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06969393300823867,
-                "cuda_time": 0.069668701171875,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07013454497791827,
-                "cuda_time": 0.07011565399169922,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06957653688732535,
-                "cuda_time": 0.06955630493164063,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06952459702733904,
-                "cuda_time": 0.06950542449951172,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0701530339429155,
-                "cuda_time": 0.07013269805908204,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06968465296085924,
-                "cuda_time": 0.06966370391845703,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06970080395694822,
-                "cuda_time": 0.06968074035644531,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0705471959663555,
-                "cuda_time": 0.07052652740478516,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06941466010175645,
-                "cuda_time": 0.06938239288330078,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06946662906557322,
-                "cuda_time": 0.06944795227050782,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07021664292551577,
-                "cuda_time": 0.0701964569091797,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0697313129203394,
-                "cuda_time": 0.0697121810913086,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07019988296087831,
-                "cuda_time": 0.07017969512939454,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06973098393063992,
-                "cuda_time": 0.06971258544921875,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06996345904190093,
-                "cuda_time": 0.06994325256347657,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.0691890650196001,
-                "cuda_time": 0.06917007446289063,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07019514299463481,
-                "cuda_time": 0.07002085876464843,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06993868900462985,
-                "cuda_time": 0.06991465759277343,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ],
-        "tpot": [
-            {
-                "wall_time": 1.568886692984961,
-                "cuda_time": 1.568811279296875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5646536540007219,
-                "cuda_time": 1.564624755859375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.569601257913746,
-                "cuda_time": 1.5695728759765626,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.56203376094345,
-                "cuda_time": 1.5620035400390626,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5592194920172915,
-                "cuda_time": 1.5591905517578124,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5600104450713843,
-                "cuda_time": 1.5599837646484376,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.556260335026309,
-                "cuda_time": 1.5562335205078126,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5623370950343087,
-                "cuda_time": 1.5623018798828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.56083326600492,
-                "cuda_time": 1.5608055419921876,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.561068672919646,
-                "cuda_time": 1.561042724609375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5576978439930826,
-                "cuda_time": 1.557672119140625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5631060280138627,
-                "cuda_time": 1.56307666015625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.564229413983412,
-                "cuda_time": 1.5642041015625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5635379790328443,
-                "cuda_time": 1.563511474609375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5589708760380745,
-                "cuda_time": 1.55894482421875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5711359959095716,
-                "cuda_time": 1.5711026611328125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5941198619548231,
-                "cuda_time": 1.5940926513671876,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5766646160045639,
-                "cuda_time": 1.5766334228515626,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.579302111058496,
-                "cuda_time": 1.57926708984375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5677615479798988,
-                "cuda_time": 1.567731689453125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ]
-    },
-    "sdpa_None_compiled_vanilla_with_cache": {
-        "metadata": {
-            "timestamp": "2025-09-26T12:02:42.414361",
-            "commit_id": null,
-            "hardware_info": {
-                "gpu_name": "AMD Instinct Mi325X VF",
-                "gpu_memory_total_mb": 255.6875,
-                "python_version": "3.12.10",
-                "torch_version": "2.7.1+rocm6.4.1.git2a215e4a"
-            },
-            "config": {
-                "name": "sdpa_None_compiled_vanilla_with_cache",
-                "warmup_iterations": 5,
-                "measurement_iterations": 20,
-                "gpu_monitoring": false,
-                "batch_size": 16,
-                "sequence_length": 128,
-                "num_tokens_to_generate": 128,
-                "attn_implementation": "sdpa",
-                "use_cache": true,
-                "sdpa_backend": null,
-                "compilation": true,
-                "compile_mode": "max-autotune",
-                "compile_options": {},
-                "kernelize": false,
-                "device": "cuda",
-                "dtype": "torch.bfloat16"
-            }
-        },
-        "ttft": [
-            {
-                "wall_time": 0.06976361200213432,
-                "cuda_time": 0.06957430267333985,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06963270506821573,
-                "cuda_time": 0.06961402130126954,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06991932890377939,
-                "cuda_time": 0.06972110748291016,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06952144694514573,
-                "cuda_time": 0.06950206756591797,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06935929099563509,
-                "cuda_time": 0.06933879089355469,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06907697801943868,
-                "cuda_time": 0.06905767822265625,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06930454296525568,
-                "cuda_time": 0.06927911376953125,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06966740405187011,
-                "cuda_time": 0.06963846588134766,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06912526697851717,
-                "cuda_time": 0.06909512329101562,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06950624799355865,
-                "cuda_time": 0.06948722839355469,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06985958106815815,
-                "cuda_time": 0.06983929443359375,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06962528603617102,
-                "cuda_time": 0.06960670471191406,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06962068600114435,
-                "cuda_time": 0.069595703125,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06950531899929047,
-                "cuda_time": 0.06948622131347656,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06971742305904627,
-                "cuda_time": 0.06969742584228515,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07027899206150323,
-                "cuda_time": 0.07026025390625,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06971817300654948,
-                "cuda_time": 0.06969794464111329,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.06987840996589512,
-                "cuda_time": 0.06985985565185547,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07004970591515303,
-                "cuda_time": 0.07003005981445312,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 0.07010770495980978,
-                "cuda_time": 0.07008773803710938,
-                "batch_size": 16,
-                "new_tokens": 1,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ],
-        "tpot": [
-            {
-                "wall_time": 1.5897287370171398,
-                "cuda_time": 1.589665771484375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.586211972986348,
-                "cuda_time": 1.5861837158203125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5882492290111259,
-                "cuda_time": 1.5882193603515624,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5917098950594664,
-                "cuda_time": 1.5916837158203125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.6004555160179734,
-                "cuda_time": 1.600423828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5869074059883133,
-                "cuda_time": 1.586880859375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5914721400476992,
-                "cuda_time": 1.5914462890625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5851828149752691,
-                "cuda_time": 1.58515673828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5776644350262359,
-                "cuda_time": 1.5776337890625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5777849049773067,
-                "cuda_time": 1.577759521484375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5800251649925485,
-                "cuda_time": 1.5799962158203125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5810497630154714,
-                "cuda_time": 1.5809893798828125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5747365789720789,
-                "cuda_time": 1.57470703125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5829719919711351,
-                "cuda_time": 1.5828975830078125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5985179379349574,
-                "cuda_time": 1.5984254150390624,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5925404160516337,
-                "cuda_time": 1.5925126953125,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5975769789656624,
-                "cuda_time": 1.5975234375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5931882929289714,
-                "cuda_time": 1.593160400390625,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5850070579908788,
-                "cuda_time": 1.5849803466796875,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            },
-            {
-                "wall_time": 1.5830075700068846,
-                "cuda_time": 1.5829818115234375,
-                "batch_size": 16,
-                "new_tokens": 128,
-                "use_cuda_time": false,
-                "gpu_metrics": null
-            }
-        ]
-    }
-}

data.py CHANGED Viewed

@@ -2,6 +2,9 @@ import json
 import numpy as np
 from typing import Optional
 class ModelBenchmarkData:
@@ -22,12 +25,28 @@ class ModelBenchmarkData:
         num_tokens = len(measures["t_tokens"]) - 1
         return delta_t / num_tokens
-    def get_bar_plot_data(self) -> dict:
         per_scenario_data = {}
         for i, (cfg_name, data) in enumerate(self.data.items()):
             per_scenario_data[cfg_name] = {
                 "ttft": [self.compute_ttft(d) for d in data["measures"]],
                 "itl": [self.compute_itl(d) for d in data["measures"]],
                 "config": data["metadata"]["config"],
             }
         return per_scenario_data

 import numpy as np
 from typing import Optional
+def make_id(config: dict, keys_to_ignore: list[str]) -> str:
+    keys = sorted(set(config.keys()))
+    return "_".join(str(config[k]) for k in keys if k not in keys_to_ignore)
 class ModelBenchmarkData:
         num_tokens = len(measures["t_tokens"]) - 1
         return delta_t / num_tokens
+    def get_bar_plot_data(self, collapse_on_cache: bool = True, collapse_on_compile_mode: bool = True) -> dict:
+        # Gather data for each scenario
         per_scenario_data = {}
         for i, (cfg_name, data) in enumerate(self.data.items()):
             per_scenario_data[cfg_name] = {
                 "ttft": [self.compute_ttft(d) for d in data["measures"]],
                 "itl": [self.compute_itl(d) for d in data["measures"]],
+                "e2e": [self.compute_e2e_latency(d) for d in data["measures"]],
                 "config": data["metadata"]["config"],
             }
+        # Eventually collapse on cache
+        if collapse_on_cache:
+            collapsed_keys = {}
+            for cfg_name, data in per_scenario_data.items():
+                keys_to_ignore = ["name"]
+                keys_to_ignore += (["use_cache"] if collapse_on_cache else [])
+                keys_to_ignore += (["compile_mode"] if collapse_on_compile_mode else [])
+                cfg_id = make_id(data["config"], keys_to_ignore)
+                cfg_e2e = np.mean(data["e2e"])
+                other_name, other_e2e = collapsed_keys.get(cfg_id, (None, 1e16))
+                if cfg_e2e < other_e2e:
+                    collapsed_keys[cfg_id] = (cfg_name, cfg_e2e)
+            per_scenario_data = {k: per_scenario_data[k] for k, _ in collapsed_keys.values()}
         return per_scenario_data