Spaces:
Sleeping
Sleeping
Deploy nsys-llm-explainer Gradio Space
Browse files- README.md +62 -6
- app.py +263 -0
- requirements.txt +3 -0
- sample_report.json +761 -0
- space_utils.py +376 -0
README.md
CHANGED
|
@@ -1,12 +1,68 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference
|
| 3 |
+
emoji: "📈"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference
|
| 13 |
+
|
| 14 |
+
This folder is a production-ready Hugging Face Space payload for the `nsys-llm-explainer` project.
|
| 15 |
+
|
| 16 |
+
It turns an uploaded `trace.sqlite`, `.db`, or `report.json` into:
|
| 17 |
+
|
| 18 |
+
- Prioritized findings with evidence and recommendations
|
| 19 |
+
- Kernel, NCCL, barrier, and launch-latency summaries
|
| 20 |
+
- NVLink-over-NCCL correlation when GPU metrics are available
|
| 21 |
+
- Markdown preview of the full report
|
| 22 |
+
- Downloadable `report.md`, `report.json`, CSV tables, and a zip bundle
|
| 23 |
+
|
| 24 |
+
## Files
|
| 25 |
+
|
| 26 |
+
- `app.py`: Gradio app entrypoint
|
| 27 |
+
- `space_utils.py`: analysis and artifact helpers
|
| 28 |
+
- `requirements.txt`: Space dependencies
|
| 29 |
+
|
| 30 |
+
## Deploy on Hugging Face Spaces
|
| 31 |
+
|
| 32 |
+
1. Create a new Space using the `Gradio` SDK.
|
| 33 |
+
2. Copy the contents of this folder into the Space repository root.
|
| 34 |
+
3. Keep `requirements.txt` in place so the Space installs the analyzer package and Gradio runtime.
|
| 35 |
+
4. Push the repo. Hugging Face will build the Space automatically.
|
| 36 |
+
5. Open the app and upload a `trace.sqlite` or `report.json`.
|
| 37 |
+
|
| 38 |
+
## Duplicate and pin
|
| 39 |
+
|
| 40 |
+
If you want a reproducible Space, keep the Git dependency pinned to a release tag in `requirements.txt`.
|
| 41 |
+
|
| 42 |
+
If you want the Space to follow the latest `main` branch instead, change:
|
| 43 |
+
|
| 44 |
+
```txt
|
| 45 |
+
git+https://github.com/KOKOSde/nsys-llm-explainer.git@v0.3.0
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
to:
|
| 49 |
+
|
| 50 |
+
```txt
|
| 51 |
+
git+https://github.com/KOKOSde/nsys-llm-explainer.git@main
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Operational notes
|
| 55 |
+
|
| 56 |
+
- The app works with uploaded SQLite exports directly, so there is no need to pre-generate artifacts.
|
| 57 |
+
- If a trace is missing NCCL or GPU metrics tables, the UI still loads and explains which analyses are unavailable.
|
| 58 |
+
- For private traces, use a private Space.
|
| 59 |
+
|
| 60 |
+
## Local run
|
| 61 |
+
|
| 62 |
+
From this repository root:
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
PYTHONPATH=src python3 spaces/hf_space/app.py
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
If you are running the folder standalone, first install the dependencies from `requirements.txt`.
|
app.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, Optional, Sequence, Tuple
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
from space_utils import SpaceBundle, analyze_path, coerce_upload_path, find_local_sample
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Title shown in the browser tab (Blocks(title=...)) and reused in the hero banner.
APP_TITLE = "nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference"

# Custom stylesheet injected into the Gradio app via Blocks(css=...).
# Dark radial/linear gradient page background, a "hero" banner card, pill
# badges, an upload card, and a `.wrap-long` helper that pre-wraps long
# markdown output so report text does not overflow horizontally.
CSS = """
.gradio-container {
  background:
    radial-gradient(circle at top left, rgba(42, 93, 142, 0.35), transparent 30%),
    radial-gradient(circle at top right, rgba(20, 104, 117, 0.22), transparent 26%),
    linear-gradient(180deg, #081018 0%, #0b111a 42%, #090e15 100%);
  color: #e6eef7;
  font-family: "Aptos", "Segoe UI", sans-serif;
}

.hero-card {
  border: 1px solid rgba(115, 145, 180, 0.28);
  border-radius: 22px;
  background: linear-gradient(135deg, rgba(14, 22, 34, 0.95), rgba(10, 14, 20, 0.92));
  box-shadow: 0 24px 70px rgba(0, 0, 0, 0.28);
  padding: 22px 24px;
  margin-bottom: 16px;
}

.hero-kicker {
  text-transform: uppercase;
  letter-spacing: 0.18em;
  color: #8fb4d9;
  font-size: 11px;
  font-weight: 700;
}

.hero-title {
  margin: 10px 0 10px;
  font-size: 34px;
  line-height: 1.05;
  font-weight: 800;
  color: #f3f8ff;
}

.hero-subtitle {
  color: #b2c5d9;
  font-size: 15px;
  line-height: 1.6;
  max-width: 980px;
}

.badge-row {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  margin-top: 16px;
}

.badge {
  display: inline-flex;
  align-items: center;
  padding: 6px 12px;
  border-radius: 999px;
  border: 1px solid rgba(137, 171, 207, 0.28);
  background: rgba(13, 21, 31, 0.82);
  color: #d8e6f5;
  font-size: 12px;
}

.upload-card {
  border: 1px solid rgba(88, 113, 143, 0.26);
  border-radius: 18px;
  background: rgba(10, 16, 24, 0.86);
  padding: 14px;
  margin-bottom: 14px;
}

.section-title {
  color: #f4f8fd;
  font-size: 16px;
  font-weight: 700;
  margin: 0 0 10px 0;
}

.gr-markdown, .prose {
  color: #e8eff7;
}

.wrap-long {
  white-space: pre-wrap;
  word-break: break-word;
}
"""

# Static hero banner rendered once at the top of the page with gr.HTML.
# NOTE(review): the backtick-quoted terms inside .hero-subtitle are markdown
# syntax but this string is rendered as raw HTML, so the backticks will show
# literally — presumably intentional; confirm against the live Space.
HEADER = """
<div class="hero-card">
  <div class="hero-kicker">Cloud ML trace intelligence</div>
  <div class="hero-title">nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference</div>
  <div class="hero-subtitle">
    Upload a `trace.sqlite` or `report.json` and get prioritized findings, NCCL/NVLink correlation, launch storm diagnosis,
    per-process breakdowns, and downloadable analysis artifacts. The same code path powers the CLI, dashboard, and this Space.
  </div>
  <div class="badge-row">
    <span class="badge">SQLite + report.json input</span>
    <span class="badge">Evidence-backed findings</span>
    <span class="badge">CSV + JSON downloads</span>
    <span class="badge">Built for cloud LLM traces</span>
  </div>
</div>
"""
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _empty_outputs(message: str) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
|
| 118 |
+
empty_df = pd.DataFrame(columns=["section", "metric", "value"])
|
| 119 |
+
empty_manifest = pd.DataFrame(columns=["artifact", "purpose", "path"])
|
| 120 |
+
return (
|
| 121 |
+
message,
|
| 122 |
+
message,
|
| 123 |
+
empty_df,
|
| 124 |
+
message,
|
| 125 |
+
message,
|
| 126 |
+
[],
|
| 127 |
+
empty_manifest,
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _bundle_to_outputs(bundle: SpaceBundle) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
|
| 132 |
+
summary_df = pd.DataFrame(bundle.summary_rows)
|
| 133 |
+
manifest_df = pd.DataFrame(bundle.manifest_rows)
|
| 134 |
+
bottleneck = next((row["value"] for row in bundle.summary_rows if row.get("metric") == "Top bottleneck"), "No bottleneck summary available")
|
| 135 |
+
summary_markdown = [
|
| 136 |
+
"### Quick read",
|
| 137 |
+
"",
|
| 138 |
+
"- Source: `{}` (`{}`)".format(bundle.source_path.name, bundle.source_kind),
|
| 139 |
+
"- {}".format(bundle.report.get("generated_at") or "Generated time unavailable"),
|
| 140 |
+
"- {}".format(bottleneck),
|
| 141 |
+
"- Warnings: `{}`".format(len(bundle.report.get("warnings") or [])),
|
| 142 |
+
]
|
| 143 |
+
files = [str(path) for path in bundle.artifact_paths]
|
| 144 |
+
return (
|
| 145 |
+
bundle.status_markdown,
|
| 146 |
+
"\n".join(summary_markdown),
|
| 147 |
+
summary_df,
|
| 148 |
+
bundle.findings_markdown,
|
| 149 |
+
bundle.markdown,
|
| 150 |
+
files,
|
| 151 |
+
manifest_df,
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _resolve_path(uploaded: Any, sample_path: str) -> Optional[Path]:
    """Pick the file to analyze: a user upload wins, else the bundled sample.

    Returns ``None`` when neither an upload nor an existing sample path is
    available, which the caller turns into an upload hint.
    """
    resolved = coerce_upload_path(uploaded)
    if resolved:
        return resolved
    if not sample_path:
        return None
    fallback = Path(sample_path)
    # The sample path travels through gr.State as a plain string; only use it
    # if the file actually exists on disk.
    return fallback if fallback.exists() else None
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _run_analysis(uploaded: Any, sample_path: str) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
    """Resolve the input file and run the analyzer, mapping every failure
    mode onto the placeholder output tuple instead of crashing the Space."""
    target = _resolve_path(uploaded, sample_path)
    if not target:
        # Nothing to analyze yet — show the upload hint in every slot.
        return _empty_outputs(
            "Upload a `trace.sqlite`/`.db` file or a `report.json` to generate the report. "
            "If you are using this Space as a demo, click `Load sample trace` first."
        )
    try:
        return _bundle_to_outputs(analyze_path(target))
    except Exception as exc:
        # Broad catch is deliberate at this UI boundary: any analyzer error is
        # surfaced to the user as a message rather than a stack trace.
        return _empty_outputs("Failed to analyze `{}`: `{}`".format(target.name, exc))
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _build_demo(sample_path: Optional[Path]) -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire its events to the analyzer.

    *sample_path* is the bundled demo trace (or ``None``); when present it
    enables the "Load sample trace" button and the auto-analysis on page load.
    """
    theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate")
    with gr.Blocks(title=APP_TITLE, css=CSS, theme=theme) as demo:
        gr.HTML(HEADER)

        with gr.Row(elem_classes=["upload-card"]):
            with gr.Column(scale=6):
                trace_input = gr.File(
                    label="Upload trace or report",
                    file_count="single",
                    file_types=[".sqlite", ".db", ".json"],
                    type="filepath",
                )
            with gr.Column(scale=2, min_width=180):
                run_button = gr.Button("Analyze trace", variant="primary")
            with gr.Column(scale=2, min_width=180):
                sample_button = gr.Button(
                    "Load sample trace",
                    variant="secondary",
                    visible=bool(sample_path),
                )

        status = gr.Markdown("Upload a trace or report to begin.")
        # gr.State only round-trips simple values reliably, so the sample path
        # is stored as a string ("" when no sample ships with the Space).
        sample_state = gr.State(str(sample_path) if sample_path else "")

        with gr.Tabs():
            with gr.Tab("Summary"):
                gr.Markdown("### Summary")
                summary = gr.Markdown(elem_classes=["wrap-long"])
                summary_table = gr.Dataframe(
                    headers=["section", "metric", "value"],
                    datatype=["str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Key metrics",
                )
            with gr.Tab("Findings"):
                findings = gr.Markdown(elem_classes=["wrap-long"])
            with gr.Tab("Markdown"):
                report_markdown = gr.Markdown(elem_classes=["wrap-long"])
            with gr.Tab("Downloads"):
                gr.Markdown(
                    "### Generated artifacts\n"
                    "The analysis writes `report.md`, `report.json`, CSV tables, and a zip bundle."
                )
                manifest = gr.Dataframe(
                    headers=["artifact", "purpose", "path"],
                    datatype=["str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Artifact manifest",
                )
                downloads = gr.File(
                    label="Download files",
                    file_count="multiple",
                    type="filepath",
                )

        # Every trigger feeds the same component list, in _run_analysis order.
        result_components = [status, summary, summary_table, findings, report_markdown, downloads, manifest]

        run_button.click(
            fn=_run_analysis,
            inputs=[trace_input, sample_state],
            outputs=result_components,
        )
        if sample_path:
            sample_button.click(
                fn=lambda sp: _run_analysis(None, sp),
                inputs=[sample_state],
                outputs=result_components,
            )
        # NOTE(review): demo.load fires unconditionally here; with no sample the
        # state is "" and _run_analysis shows the upload hint. Confirm against
        # the deployed Space that the load event was not meant to be gated on
        # sample_path.
        demo.load(
            fn=lambda sp: _run_analysis(None, sp),
            inputs=[sample_state],
            outputs=result_components,
        )
    return demo
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def main() -> None:
    """Entrypoint: build the UI with the bundled sample (if any) and serve it."""
    app = _build_demo(find_local_sample())
    # queue() enables request queuing so long-running analyses do not time out.
    app.queue()
    app.launch()


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
pandas>=1.5.0
|
| 3 |
+
git+https://github.com/KOKOSde/nsys-llm-explainer.git@v0.3.0
|
sample_report.json
ADDED
|
@@ -0,0 +1,761 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"findings": [
|
| 3 |
+
{
|
| 4 |
+
"evidence": [
|
| 5 |
+
"Top kernel `computeKernel` is 42.6% of total kernel time."
|
| 6 |
+
],
|
| 7 |
+
"recommendation": [
|
| 8 |
+
"Focus optimization effort on this kernel first."
|
| 9 |
+
],
|
| 10 |
+
"severity": "medium",
|
| 11 |
+
"title": "Single kernel is a large share of GPU time"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"evidence": [
|
| 15 |
+
"Top sync-like call `cudaStreamSynchronize` total 0.80 ms across 1 calls.",
|
| 16 |
+
"All sync-like calls total 1.50 ms."
|
| 17 |
+
],
|
| 18 |
+
"recommendation": [
|
| 19 |
+
"Look for `cudaDeviceSynchronize` / stream waits in your serving loop and remove unnecessary barriers.",
|
| 20 |
+
"Prefer async launches and overlap CPU work with GPU execution; avoid per-token synchronization."
|
| 21 |
+
],
|
| 22 |
+
"severity": "medium",
|
| 23 |
+
"title": "CPU\u2194GPU synchronization detected (runtime API)"
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
"generated_at": "2026-03-11T03:52:08.704882+00:00",
|
| 27 |
+
"metrics": {
|
| 28 |
+
"barriers": {
|
| 29 |
+
"barriers": [
|
| 30 |
+
{
|
| 31 |
+
"api_name": "cudaStreamSynchronize",
|
| 32 |
+
"avg_duration_us": 800.0,
|
| 33 |
+
"barrier_kind": "sync_api",
|
| 34 |
+
"count": 1,
|
| 35 |
+
"max_duration_us": 800.0,
|
| 36 |
+
"total_time_ms": 0.8
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"api_name": "cudaDeviceSynchronize",
|
| 40 |
+
"avg_duration_us": 700.0,
|
| 41 |
+
"barrier_kind": "sync_api",
|
| 42 |
+
"count": 1,
|
| 43 |
+
"max_duration_us": 700.0,
|
| 44 |
+
"total_time_ms": 0.7
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"api_name": "cudaMemcpy",
|
| 48 |
+
"avg_duration_us": 600.0,
|
| 49 |
+
"barrier_kind": "blocking_memcpy",
|
| 50 |
+
"count": 1,
|
| 51 |
+
"max_duration_us": 600.0,
|
| 52 |
+
"total_time_ms": 0.6
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"api_name": "cpu_launcher_gap",
|
| 56 |
+
"avg_duration_us": 200.0,
|
| 57 |
+
"barrier_kind": "cpu_launcher_gap",
|
| 58 |
+
"count": 1,
|
| 59 |
+
"max_duration_us": 200.0,
|
| 60 |
+
"total_time_ms": 0.2
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"barriers_by_pid": [
|
| 64 |
+
{
|
| 65 |
+
"api_name": "cudaStreamSynchronize",
|
| 66 |
+
"avg_duration_us": 800.0,
|
| 67 |
+
"barrier_kind": "sync_api",
|
| 68 |
+
"count": 1,
|
| 69 |
+
"max_duration_us": 800.0,
|
| 70 |
+
"pid": 111,
|
| 71 |
+
"total_time_ms": 0.8
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"api_name": "cudaMemcpy",
|
| 75 |
+
"avg_duration_us": 600.0,
|
| 76 |
+
"barrier_kind": "blocking_memcpy",
|
| 77 |
+
"count": 1,
|
| 78 |
+
"max_duration_us": 600.0,
|
| 79 |
+
"pid": 111,
|
| 80 |
+
"total_time_ms": 0.6
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"api_name": "cpu_launcher_gap",
|
| 84 |
+
"avg_duration_us": 200.0,
|
| 85 |
+
"barrier_kind": "cpu_launcher_gap",
|
| 86 |
+
"count": 1,
|
| 87 |
+
"max_duration_us": 200.0,
|
| 88 |
+
"pid": 111,
|
| 89 |
+
"total_time_ms": 0.2
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"api_name": "cudaDeviceSynchronize",
|
| 93 |
+
"avg_duration_us": 700.0,
|
| 94 |
+
"barrier_kind": "sync_api",
|
| 95 |
+
"count": 1,
|
| 96 |
+
"max_duration_us": 700.0,
|
| 97 |
+
"pid": 222,
|
| 98 |
+
"total_time_ms": 0.7
|
| 99 |
+
}
|
| 100 |
+
],
|
| 101 |
+
"launcher_gap_threshold_us": 50.0,
|
| 102 |
+
"notes": [],
|
| 103 |
+
"pids": [
|
| 104 |
+
{
|
| 105 |
+
"barrier_event_count": 3,
|
| 106 |
+
"pid": 111,
|
| 107 |
+
"top_barrier": "cudaStreamSynchronize",
|
| 108 |
+
"top_barrier_kind": "sync_api",
|
| 109 |
+
"total_barrier_time_ms": 1.6
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"barrier_event_count": 1,
|
| 113 |
+
"pid": 222,
|
| 114 |
+
"top_barrier": "cudaDeviceSynchronize",
|
| 115 |
+
"top_barrier_kind": "sync_api",
|
| 116 |
+
"total_barrier_time_ms": 0.7
|
| 117 |
+
}
|
| 118 |
+
],
|
| 119 |
+
"present": true,
|
| 120 |
+
"sql": {
|
| 121 |
+
"runtime_barriers": "SELECT (CAST(r.globalTid / 16777216 AS INT) % 16777216) AS pid, s.value AS api_name, r.start AS start_ns, r.end AS end_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE r.end IS NOT NULL AND r.end > r.start AND (LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ?) ORDER BY pid, start_ns"
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"by_pid": {
|
| 125 |
+
"kernels": {
|
| 126 |
+
"kernel_table": "CUPTI_ACTIVITY_KIND_KERNEL",
|
| 127 |
+
"kernels": [
|
| 128 |
+
{
|
| 129 |
+
"avg_duration_us": 1000.0,
|
| 130 |
+
"call_count": 2,
|
| 131 |
+
"device_id": 0,
|
| 132 |
+
"kernel_name": "computeKernel",
|
| 133 |
+
"pct_of_pid_kernel_time": 50.0,
|
| 134 |
+
"pct_of_total_kernel_time": 32.78688524590164,
|
| 135 |
+
"pid": 111,
|
| 136 |
+
"pid_pct_of_total_kernel_time": 65.57377049180327,
|
| 137 |
+
"pid_total_kernel_time_ms": 4.0,
|
| 138 |
+
"total_time_ms": 2.0
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"avg_duration_us": 2000.0,
|
| 142 |
+
"call_count": 1,
|
| 143 |
+
"device_id": 0,
|
| 144 |
+
"kernel_name": "ncclAllReduceRingKernel",
|
| 145 |
+
"pct_of_pid_kernel_time": 50.0,
|
| 146 |
+
"pct_of_total_kernel_time": 32.78688524590164,
|
| 147 |
+
"pid": 111,
|
| 148 |
+
"pid_pct_of_total_kernel_time": 65.57377049180327,
|
| 149 |
+
"pid_total_kernel_time_ms": 4.0,
|
| 150 |
+
"total_time_ms": 2.0
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"avg_duration_us": 1500.0,
|
| 154 |
+
"call_count": 1,
|
| 155 |
+
"device_id": 0,
|
| 156 |
+
"kernel_name": "ncclBroadcastRingKernel",
|
| 157 |
+
"pct_of_pid_kernel_time": 71.42857142857143,
|
| 158 |
+
"pct_of_total_kernel_time": 24.59016393442623,
|
| 159 |
+
"pid": 222,
|
| 160 |
+
"pid_pct_of_total_kernel_time": 34.42622950819672,
|
| 161 |
+
"pid_total_kernel_time_ms": 2.1,
|
| 162 |
+
"total_time_ms": 1.5
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"avg_duration_us": 600.0,
|
| 166 |
+
"call_count": 1,
|
| 167 |
+
"device_id": 0,
|
| 168 |
+
"kernel_name": "computeKernel",
|
| 169 |
+
"pct_of_pid_kernel_time": 28.57142857142857,
|
| 170 |
+
"pct_of_total_kernel_time": 9.836065573770492,
|
| 171 |
+
"pid": 222,
|
| 172 |
+
"pid_pct_of_total_kernel_time": 34.42622950819672,
|
| 173 |
+
"pid_total_kernel_time_ms": 2.1,
|
| 174 |
+
"total_time_ms": 0.6
|
| 175 |
+
}
|
| 176 |
+
],
|
| 177 |
+
"notes": [],
|
| 178 |
+
"pid_quality": {
|
| 179 |
+
"pid0_fraction": 0.0,
|
| 180 |
+
"pid0_rows": 0,
|
| 181 |
+
"pid_ge_10m_fraction": 0.0,
|
| 182 |
+
"pid_ge_10m_rows": 0,
|
| 183 |
+
"present": true,
|
| 184 |
+
"rows_with_pid": 5
|
| 185 |
+
},
|
| 186 |
+
"pid_source": "globalPid",
|
| 187 |
+
"pids": [
|
| 188 |
+
{
|
| 189 |
+
"kernel_count": 3,
|
| 190 |
+
"pct_of_total_kernel_time": 65.57377049180327,
|
| 191 |
+
"pid": 111,
|
| 192 |
+
"total_kernel_time_ms": 4.0,
|
| 193 |
+
"total_kernel_time_ns": 4000000
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"kernel_count": 2,
|
| 197 |
+
"pct_of_total_kernel_time": 34.42622950819672,
|
| 198 |
+
"pid": 222,
|
| 199 |
+
"total_kernel_time_ms": 2.1,
|
| 200 |
+
"total_kernel_time_ns": 2100000
|
| 201 |
+
}
|
| 202 |
+
],
|
| 203 |
+
"present": true,
|
| 204 |
+
"sql": {
|
| 205 |
+
"kernels": "SELECT (CAST(k.globalPid / 16777216 AS INT) % 16777216) AS pid, s.value AS kernel_name, k.deviceId AS device_id, COUNT(*) AS call_count, SUM(k.end-k.start) AS total_ns, AVG(k.end-k.start) AS avg_ns FROM CUPTI_ACTIVITY_KIND_KERNEL k JOIN StringIds s ON s.id = k.demangledName WHERE ((CAST(k.globalPid / 16777216 AS INT) % 16777216)) IN (?,?) GROUP BY pid, kernel_name, device_id ORDER BY pid, total_ns DESC",
|
| 206 |
+
"pid_quality": "SELECT COUNT(*) AS rows_with_pid, SUM(CASE WHEN (CAST(k.globalPid / 16777216 AS INT) % 16777216) = 0 THEN 1 ELSE 0 END) AS pid0_rows, SUM(CASE WHEN (CAST(k.globalPid / 16777216 AS INT) % 16777216) >= 10000000 THEN 1 ELSE 0 END) AS pid_ge_10m_rows FROM CUPTI_ACTIVITY_KIND_KERNEL k WHERE k.globalPid IS NOT NULL",
|
| 207 |
+
"top_pids": "SELECT (CAST(k.globalPid / 16777216 AS INT) % 16777216) AS pid, SUM(k.end-k.start) AS total_ns, COUNT(*) AS kernel_count FROM CUPTI_ACTIVITY_KIND_KERNEL k WHERE k.globalPid IS NOT NULL GROUP BY pid ORDER BY total_ns DESC LIMIT ?"
|
| 208 |
+
}
|
| 209 |
+
},
|
| 210 |
+
"nvtx": {
|
| 211 |
+
"notes": [
|
| 212 |
+
"No NVTX ranges found."
|
| 213 |
+
],
|
| 214 |
+
"present": false,
|
| 215 |
+
"sql": {}
|
| 216 |
+
},
|
| 217 |
+
"nvtx_kernel_phases": null,
|
| 218 |
+
"sync": {
|
| 219 |
+
"notes": [],
|
| 220 |
+
"pid_source": "globalTid",
|
| 221 |
+
"pids": [
|
| 222 |
+
{
|
| 223 |
+
"pid": 111,
|
| 224 |
+
"sync_total_time_ms": 0.8
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"pid": 222,
|
| 228 |
+
"sync_total_time_ms": 0.7
|
| 229 |
+
}
|
| 230 |
+
],
|
| 231 |
+
"present": true,
|
| 232 |
+
"runtime_table": "CUPTI_ACTIVITY_KIND_RUNTIME",
|
| 233 |
+
"sql": {
|
| 234 |
+
"sync_by_pid": "SELECT (CAST(r.globalTid / 16777216 AS INT) % 16777216) AS pid, s.value AS api_name, COUNT(*) AS call_count, SUM(r.end-r.start) AS total_ns, AVG(r.end-r.start) AS avg_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE ((s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?)) AND (r.globalTid IS NOT NULL) GROUP BY pid, api_name ORDER BY total_ns DESC LIMIT ?"
|
| 235 |
+
},
|
| 236 |
+
"sync_calls": [
|
| 237 |
+
{
|
| 238 |
+
"api_name": "cudaStreamSynchronize",
|
| 239 |
+
"avg_duration_us": 800.0,
|
| 240 |
+
"call_count": 1,
|
| 241 |
+
"pid": 111,
|
| 242 |
+
"total_time_ms": 0.8
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"api_name": "cudaDeviceSynchronize",
|
| 246 |
+
"avg_duration_us": 700.0,
|
| 247 |
+
"call_count": 1,
|
| 248 |
+
"pid": 222,
|
| 249 |
+
"total_time_ms": 0.7
|
| 250 |
+
}
|
| 251 |
+
]
|
| 252 |
+
}
|
| 253 |
+
},
|
| 254 |
+
"gpu_idle": {
|
| 255 |
+
"devices": [
|
| 256 |
+
{
|
| 257 |
+
"busy_ms": 4.5,
|
| 258 |
+
"device_id": 0,
|
| 259 |
+
"idle_ms": 1.0,
|
| 260 |
+
"idle_pct_of_window": 18.181818181818183,
|
| 261 |
+
"window_ms": 5.5
|
| 262 |
+
}
|
| 263 |
+
],
|
| 264 |
+
"gaps": [
|
| 265 |
+
{
|
| 266 |
+
"device_id": 0,
|
| 267 |
+
"gap_end_ns": 4000000,
|
| 268 |
+
"gap_ms": 1.0,
|
| 269 |
+
"gap_start_ns": 3000000
|
| 270 |
+
}
|
| 271 |
+
],
|
| 272 |
+
"notes": [],
|
| 273 |
+
"sql": {
|
| 274 |
+
"events": "SELECT start, end, deviceId AS device_id FROM CUPTI_ACTIVITY_KIND_KERNEL ORDER BY device_id, start"
|
| 275 |
+
},
|
| 276 |
+
"table": "CUPTI_ACTIVITY_KIND_KERNEL"
|
| 277 |
+
},
|
| 278 |
+
"launch_storm": {
|
| 279 |
+
"is_launch_storm": false,
|
| 280 |
+
"launches_per_s": 909.0909090909091,
|
| 281 |
+
"median_kernel_us": 1000.0,
|
| 282 |
+
"notes": [],
|
| 283 |
+
"p50_kernel_us": 1000.0,
|
| 284 |
+
"p90_kernel_us": 1800.0,
|
| 285 |
+
"p99_kernel_us": 1980.0,
|
| 286 |
+
"pct_under_10us": 0.0,
|
| 287 |
+
"pct_under_20us": 0.0,
|
| 288 |
+
"pct_under_5us": 0.0,
|
| 289 |
+
"sql": {
|
| 290 |
+
"tiny_kernels": "SELECT s.value AS kernel_name, COUNT(*) AS call_count, AVG(k.end-k.start) AS avg_dur_ns FROM CUPTI_ACTIVITY_KIND_KERNEL k JOIN StringIds s ON s.id = k.demangledName WHERE (k.end-k.start) <= ? GROUP BY kernel_name ORDER BY call_count DESC LIMIT ?"
|
| 291 |
+
},
|
| 292 |
+
"storm_thresholds": {
|
| 293 |
+
"launches_per_s_threshold_1": 50000.0,
|
| 294 |
+
"launches_per_s_threshold_2": 100000.0,
|
| 295 |
+
"p50_kernel_us_threshold_1": 10.0,
|
| 296 |
+
"p50_kernel_us_threshold_2": 20.0
|
| 297 |
+
},
|
| 298 |
+
"tiny_kernel_us": 5.0,
|
| 299 |
+
"tiny_kernels": [],
|
| 300 |
+
"total_launches": 5,
|
| 301 |
+
"window_s": 0.0055
|
| 302 |
+
},
|
| 303 |
+
"nccl": {
|
| 304 |
+
"event_count": 2,
|
| 305 |
+
"notes": [
|
| 306 |
+
"Using NCCL kernel names as NCCL windows; collective names may be inferred only from kernel names."
|
| 307 |
+
],
|
| 308 |
+
"ops": [
|
| 309 |
+
{
|
| 310 |
+
"avg_duration_us": 2000.0,
|
| 311 |
+
"compute_overlap_ms": 1.0,
|
| 312 |
+
"compute_overlap_pct": 50.0,
|
| 313 |
+
"count": 1,
|
| 314 |
+
"max_duration_ms": 2.0,
|
| 315 |
+
"op_name": "allreduce",
|
| 316 |
+
"raw_name_example": "ncclAllReduceRingKernel",
|
| 317 |
+
"source": "kernel",
|
| 318 |
+
"straggler": "pid:111",
|
| 319 |
+
"straggler_max_ms": 2.0,
|
| 320 |
+
"straggler_total_ms": 2.0,
|
| 321 |
+
"total_time_ms": 2.0
|
| 322 |
+
},
|
| 323 |
+
{
|
| 324 |
+
"avg_duration_us": 1500.0,
|
| 325 |
+
"compute_overlap_ms": 0.6,
|
| 326 |
+
"compute_overlap_pct": 40.0,
|
| 327 |
+
"count": 1,
|
| 328 |
+
"max_duration_ms": 1.5,
|
| 329 |
+
"op_name": "broadcast",
|
| 330 |
+
"raw_name_example": "ncclBroadcastRingKernel",
|
| 331 |
+
"source": "kernel",
|
| 332 |
+
"straggler": "pid:222",
|
| 333 |
+
"straggler_max_ms": 1.5,
|
| 334 |
+
"straggler_total_ms": 1.5,
|
| 335 |
+
"total_time_ms": 1.5
|
| 336 |
+
}
|
| 337 |
+
],
|
| 338 |
+
"pids": [
|
| 339 |
+
{
|
| 340 |
+
"max_duration_ms": 2.0,
|
| 341 |
+
"nccl_event_count": 1,
|
| 342 |
+
"pid": 111,
|
| 343 |
+
"top_nccl_op": "allreduce",
|
| 344 |
+
"total_nccl_time_ms": 2.0
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"max_duration_ms": 1.5,
|
| 348 |
+
"nccl_event_count": 1,
|
| 349 |
+
"pid": 222,
|
| 350 |
+
"top_nccl_op": "broadcast",
|
| 351 |
+
"total_nccl_time_ms": 1.5
|
| 352 |
+
}
|
| 353 |
+
],
|
| 354 |
+
"present": true,
|
| 355 |
+
"source": "kernel",
|
| 356 |
+
"sql": {
|
| 357 |
+
"compute_overlap": "SELECT k.start AS start_ns, k.end AS end_ns, s.value AS kernel_name FROM CUPTI_ACTIVITY_KIND_KERNEL k JOIN StringIds s ON s.id = k.demangledName WHERE k.end IS NOT NULL AND k.end > k.start ORDER BY k.start",
|
| 358 |
+
"nccl_kernels": "SELECT (CAST(k.globalPid / 16777216 AS INT) % 16777216) AS pid, k.deviceId AS device_id, s.value AS kernel_name, k.start AS start_ns, k.end AS end_ns FROM CUPTI_ACTIVITY_KIND_KERNEL k JOIN StringIds s ON s.id = k.demangledName WHERE k.end IS NOT NULL AND k.end > k.start AND (LOWER(s.value) LIKE ?) ORDER BY k.start",
|
| 359 |
+
"nccl_runtime": "SELECT (CAST(r.globalTid / 16777216 AS INT) % 16777216) AS pid, s.value AS api_name, r.start AS start_ns, r.end AS end_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE r.end IS NOT NULL AND r.end > r.start AND (LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ? OR LOWER(s.value) LIKE ?) ORDER BY r.start"
|
| 360 |
+
},
|
| 361 |
+
"windows": [
|
| 362 |
+
{
|
| 363 |
+
"end_ns": 3000000,
|
| 364 |
+
"start_ns": 1000000
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"end_ns": 5500000,
|
| 368 |
+
"start_ns": 4000000
|
| 369 |
+
}
|
| 370 |
+
]
|
| 371 |
+
},
|
| 372 |
+
"nvlink_during_nccl": {
|
| 373 |
+
"capture_instructions": [
|
| 374 |
+
"NVLink counters not found in the SQLite export.",
|
| 375 |
+
"List supported metric sets first: `nsys profile --gpu-metrics-devices=all --gpu-metrics-set=help`.",
|
| 376 |
+
"Then re-capture with GPU Metrics enabled, for example: `sudo nsys profile --trace=nccl,cuda,nvtx,osrt --cuda-trace-scope=process-tree --gpu-metrics-devices=all --gpu-metrics-set=<supported-set> --gpu-metrics-frequency=10000 --cuda-graph-trace=node -o trace <app>`.",
|
| 377 |
+
"Export again with SQLite output: `nsys export --type sqlite --output trace.sqlite --force-overwrite=true --lazy=false trace.nsys-rep`."
|
| 378 |
+
],
|
| 379 |
+
"missing_counters": true,
|
| 380 |
+
"notes": [
|
| 381 |
+
"GPU metric tables were not found in this export."
|
| 382 |
+
],
|
| 383 |
+
"present": false,
|
| 384 |
+
"rows": [],
|
| 385 |
+
"sql": {}
|
| 386 |
+
},
|
| 387 |
+
"nvtx": {
|
| 388 |
+
"instances": [],
|
| 389 |
+
"notes": [
|
| 390 |
+
"No NVTX table found."
|
| 391 |
+
],
|
| 392 |
+
"ranges": [],
|
| 393 |
+
"sql": {},
|
| 394 |
+
"table": null
|
| 395 |
+
},
|
| 396 |
+
"nvtx_coverage_warn_threshold": 0.7,
|
| 397 |
+
"nvtx_kernel_phases": null,
|
| 398 |
+
"nvtx_kernel_time": {
|
| 399 |
+
"notes": [
|
| 400 |
+
"Need kernel + runtime + NVTX tables for NVTX\u2192kernel attribution."
|
| 401 |
+
],
|
| 402 |
+
"present": false,
|
| 403 |
+
"ranges": [],
|
| 404 |
+
"sql": {}
|
| 405 |
+
},
|
| 406 |
+
"nvtx_phases": null,
|
| 407 |
+
"per_pid": {
|
| 408 |
+
"notes": [],
|
| 409 |
+
"pid_source": "globalPid",
|
| 410 |
+
"pids": [
|
| 411 |
+
{
|
| 412 |
+
"launch_storm": {
|
| 413 |
+
"is_launch_storm": false,
|
| 414 |
+
"launches_per_s": 1000.0,
|
| 415 |
+
"median_kernel_us": 1000.0,
|
| 416 |
+
"p50_kernel_us": 1000.0,
|
| 417 |
+
"p90_kernel_us": 2000.0,
|
| 418 |
+
"p99_kernel_us": 2000.0,
|
| 419 |
+
"pct_under_10us": 0.0,
|
| 420 |
+
"pct_under_20us": 0.0,
|
| 421 |
+
"pct_under_5us": 0.0,
|
| 422 |
+
"storm_thresholds": {
|
| 423 |
+
"launches_per_s_threshold_1": 50000.0,
|
| 424 |
+
"launches_per_s_threshold_2": 100000.0,
|
| 425 |
+
"p50_kernel_us_threshold_1": 10.0,
|
| 426 |
+
"p50_kernel_us_threshold_2": 20.0
|
| 427 |
+
},
|
| 428 |
+
"total_launches": 3,
|
| 429 |
+
"window_s": 0.003
|
| 430 |
+
},
|
| 431 |
+
"nvtx": {
|
| 432 |
+
"notes": [
|
| 433 |
+
"No NVTX table."
|
| 434 |
+
],
|
| 435 |
+
"present": false,
|
| 436 |
+
"ranges": [],
|
| 437 |
+
"table": null
|
| 438 |
+
},
|
| 439 |
+
"pid": 111,
|
| 440 |
+
"sync": {
|
| 441 |
+
"notes": [],
|
| 442 |
+
"present": true,
|
| 443 |
+
"sql": "SELECT s.value AS api_name, COUNT(*) AS call_count, SUM(r.end-r.start) AS total_time_ns, AVG(r.end-r.start) AS avg_time_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE ((r.globalTid >> 24) & 16777215) = ? AND ((s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?)) GROUP BY api_name ORDER BY total_time_ns DESC LIMIT 50",
|
| 444 |
+
"sync_calls": [
|
| 445 |
+
{
|
| 446 |
+
"api_name": "cudaStreamSynchronize",
|
| 447 |
+
"avg_duration_us": 800.0,
|
| 448 |
+
"call_count": 1,
|
| 449 |
+
"total_time_ms": 0.8
|
| 450 |
+
}
|
| 451 |
+
],
|
| 452 |
+
"table": "CUPTI_ACTIVITY_KIND_RUNTIME"
|
| 453 |
+
},
|
| 454 |
+
"top_kernels": {
|
| 455 |
+
"kernels": [
|
| 456 |
+
{
|
| 457 |
+
"avg_duration_us": 1000.0,
|
| 458 |
+
"call_count": 2,
|
| 459 |
+
"device_id": 0,
|
| 460 |
+
"kernel_name": "computeKernel",
|
| 461 |
+
"total_time_ms": 2.0
|
| 462 |
+
},
|
| 463 |
+
{
|
| 464 |
+
"avg_duration_us": 2000.0,
|
| 465 |
+
"call_count": 1,
|
| 466 |
+
"device_id": 0,
|
| 467 |
+
"kernel_name": "ncclAllReduceRingKernel",
|
| 468 |
+
"total_time_ms": 2.0
|
| 469 |
+
}
|
| 470 |
+
],
|
| 471 |
+
"table": "CUPTI_ACTIVITY_KIND_KERNEL",
|
| 472 |
+
"tiny_kernels": []
|
| 473 |
+
}
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"launch_storm": {
|
| 477 |
+
"is_launch_storm": false,
|
| 478 |
+
"launches_per_s": 1333.3333333333333,
|
| 479 |
+
"median_kernel_us": 600.0,
|
| 480 |
+
"p50_kernel_us": 600.0,
|
| 481 |
+
"p90_kernel_us": 1500.0,
|
| 482 |
+
"p99_kernel_us": 1500.0,
|
| 483 |
+
"pct_under_10us": 0.0,
|
| 484 |
+
"pct_under_20us": 0.0,
|
| 485 |
+
"pct_under_5us": 0.0,
|
| 486 |
+
"storm_thresholds": {
|
| 487 |
+
"launches_per_s_threshold_1": 50000.0,
|
| 488 |
+
"launches_per_s_threshold_2": 100000.0,
|
| 489 |
+
"p50_kernel_us_threshold_1": 10.0,
|
| 490 |
+
"p50_kernel_us_threshold_2": 20.0
|
| 491 |
+
},
|
| 492 |
+
"total_launches": 2,
|
| 493 |
+
"window_s": 0.0015
|
| 494 |
+
},
|
| 495 |
+
"nvtx": {
|
| 496 |
+
"notes": [
|
| 497 |
+
"No NVTX table."
|
| 498 |
+
],
|
| 499 |
+
"present": false,
|
| 500 |
+
"ranges": [],
|
| 501 |
+
"table": null
|
| 502 |
+
},
|
| 503 |
+
"pid": 222,
|
| 504 |
+
"sync": {
|
| 505 |
+
"notes": [],
|
| 506 |
+
"present": true,
|
| 507 |
+
"sql": "SELECT s.value AS api_name, COUNT(*) AS call_count, SUM(r.end-r.start) AS total_time_ns, AVG(r.end-r.start) AS avg_time_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE ((r.globalTid >> 24) & 16777215) = ? AND ((s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?)) GROUP BY api_name ORDER BY total_time_ns DESC LIMIT 50",
|
| 508 |
+
"sync_calls": [
|
| 509 |
+
{
|
| 510 |
+
"api_name": "cudaDeviceSynchronize",
|
| 511 |
+
"avg_duration_us": 700.0,
|
| 512 |
+
"call_count": 1,
|
| 513 |
+
"total_time_ms": 0.7
|
| 514 |
+
}
|
| 515 |
+
],
|
| 516 |
+
"table": "CUPTI_ACTIVITY_KIND_RUNTIME"
|
| 517 |
+
},
|
| 518 |
+
"top_kernels": {
|
| 519 |
+
"kernels": [
|
| 520 |
+
{
|
| 521 |
+
"avg_duration_us": 1500.0,
|
| 522 |
+
"call_count": 1,
|
| 523 |
+
"device_id": 0,
|
| 524 |
+
"kernel_name": "ncclBroadcastRingKernel",
|
| 525 |
+
"total_time_ms": 1.5
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"avg_duration_us": 600.0,
|
| 529 |
+
"call_count": 1,
|
| 530 |
+
"device_id": 0,
|
| 531 |
+
"kernel_name": "computeKernel",
|
| 532 |
+
"total_time_ms": 0.6
|
| 533 |
+
}
|
| 534 |
+
],
|
| 535 |
+
"table": "CUPTI_ACTIVITY_KIND_KERNEL",
|
| 536 |
+
"tiny_kernels": []
|
| 537 |
+
}
|
| 538 |
+
}
|
| 539 |
+
],
|
| 540 |
+
"present": true,
|
| 541 |
+
"sql": {
|
| 542 |
+
"top_pids": "SELECT ((k.globalPid >> 24) & 16777215) AS pid, SUM(k.end-k.start) AS total_ns, COUNT(*) AS launches FROM CUPTI_ACTIVITY_KIND_KERNEL k WHERE k.end > k.start AND ((k.globalPid >> 24) & 16777215) IS NOT NULL GROUP BY pid ORDER BY total_ns DESC LIMIT ?"
|
| 543 |
+
},
|
| 544 |
+
"top_pids": [
|
| 545 |
+
{
|
| 546 |
+
"kernel_launches": 3,
|
| 547 |
+
"pct_of_total_kernel_time": 65.57377049180327,
|
| 548 |
+
"pid": 111,
|
| 549 |
+
"total_kernel_time_ms": 4.0,
|
| 550 |
+
"total_kernel_time_ns": 4000000
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"kernel_launches": 2,
|
| 554 |
+
"pct_of_total_kernel_time": 34.42622950819672,
|
| 555 |
+
"pid": 222,
|
| 556 |
+
"total_kernel_time_ms": 2.1,
|
| 557 |
+
"total_kernel_time_ns": 2100000
|
| 558 |
+
}
|
| 559 |
+
]
|
| 560 |
+
},
|
| 561 |
+
"pid_attribution": {
|
| 562 |
+
"kernel_pid_count": 2,
|
| 563 |
+
"kernel_pid_source": "globalPid",
|
| 564 |
+
"kernel_pids_sample": [
|
| 565 |
+
111,
|
| 566 |
+
222
|
| 567 |
+
],
|
| 568 |
+
"nvtx_pid_count": 0,
|
| 569 |
+
"nvtx_pid_source": null,
|
| 570 |
+
"nvtx_pids_sample": [],
|
| 571 |
+
"runtime_pid_count": 2,
|
| 572 |
+
"runtime_pid_source": "globalTid",
|
| 573 |
+
"runtime_pids_sample": [
|
| 574 |
+
111,
|
| 575 |
+
222
|
| 576 |
+
]
|
| 577 |
+
},
|
| 578 |
+
"sync": {
|
| 579 |
+
"notes": [],
|
| 580 |
+
"sql": {
|
| 581 |
+
"sync_calls": "SELECT s.value AS api_name, COUNT(*) AS call_count, SUM(r.end - r.start) AS total_time_ns, AVG(r.end-r.start) AS avg_time_ns FROM CUPTI_ACTIVITY_KIND_RUNTIME r JOIN StringIds s ON s.id = r.nameId WHERE (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) OR (s.value LIKE ?) GROUP BY api_name ORDER BY total_time_ns DESC LIMIT ?"
|
| 582 |
+
},
|
| 583 |
+
"sync_calls": [
|
| 584 |
+
{
|
| 585 |
+
"api_name": "cudaStreamSynchronize",
|
| 586 |
+
"avg_duration_us": 800.0,
|
| 587 |
+
"call_count": 1,
|
| 588 |
+
"total_time_ms": 0.8
|
| 589 |
+
},
|
| 590 |
+
{
|
| 591 |
+
"api_name": "cudaDeviceSynchronize",
|
| 592 |
+
"avg_duration_us": 700.0,
|
| 593 |
+
"call_count": 1,
|
| 594 |
+
"total_time_ms": 0.7
|
| 595 |
+
}
|
| 596 |
+
],
|
| 597 |
+
"table": "CUPTI_ACTIVITY_KIND_RUNTIME"
|
| 598 |
+
},
|
| 599 |
+
"top_kernels": {
|
| 600 |
+
"kernels": [
|
| 601 |
+
{
|
| 602 |
+
"avg_duration_us": 866.6666666666666,
|
| 603 |
+
"call_count": 3,
|
| 604 |
+
"device_id": 0,
|
| 605 |
+
"kernel_name": "computeKernel",
|
| 606 |
+
"max_duration_us": 1000.0,
|
| 607 |
+
"min_duration_us": 600.0,
|
| 608 |
+
"p50_duration_us": 1000.0,
|
| 609 |
+
"p90_duration_us": 1000.0,
|
| 610 |
+
"pct_total_kernel_time": 42.62295081967213,
|
| 611 |
+
"total_time_ms": 2.6,
|
| 612 |
+
"total_time_ns": 2600000
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"avg_duration_us": 2000.0,
|
| 616 |
+
"call_count": 1,
|
| 617 |
+
"device_id": 0,
|
| 618 |
+
"kernel_name": "ncclAllReduceRingKernel",
|
| 619 |
+
"max_duration_us": 2000.0,
|
| 620 |
+
"min_duration_us": 2000.0,
|
| 621 |
+
"p50_duration_us": 2000.0,
|
| 622 |
+
"p90_duration_us": 2000.0,
|
| 623 |
+
"pct_total_kernel_time": 32.78688524590164,
|
| 624 |
+
"total_time_ms": 2.0,
|
| 625 |
+
"total_time_ns": 2000000
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"avg_duration_us": 1500.0,
|
| 629 |
+
"call_count": 1,
|
| 630 |
+
"device_id": 0,
|
| 631 |
+
"kernel_name": "ncclBroadcastRingKernel",
|
| 632 |
+
"max_duration_us": 1500.0,
|
| 633 |
+
"min_duration_us": 1500.0,
|
| 634 |
+
"p50_duration_us": 1500.0,
|
| 635 |
+
"p90_duration_us": 1500.0,
|
| 636 |
+
"pct_total_kernel_time": 24.59016393442623,
|
| 637 |
+
"total_time_ms": 1.5,
|
| 638 |
+
"total_time_ns": 1500000
|
| 639 |
+
}
|
| 640 |
+
],
|
| 641 |
+
"notes": [],
|
| 642 |
+
"sql": {
|
| 643 |
+
"agg": "SELECT s.value AS kernel_name, k.deviceId AS device_id, COUNT(*) AS call_count, SUM(k.end - k.start) AS total_time_ns, AVG(k.end - k.start) AS avg_time_ns, MIN(k.end - k.start) AS min_time_ns, MAX(k.end - k.start) AS max_time_ns FROM CUPTI_ACTIVITY_KIND_KERNEL k JOIN StringIds s ON s.id = k.demangledName GROUP BY kernel_name, device_id ORDER BY total_time_ns DESC LIMIT ?",
|
| 644 |
+
"durations": "SELECT (end-start) FROM CUPTI_ACTIVITY_KIND_KERNEL ... ORDER BY",
|
| 645 |
+
"total": "SELECT SUM(end - start) FROM CUPTI_ACTIVITY_KIND_KERNEL"
|
| 646 |
+
},
|
| 647 |
+
"table": "CUPTI_ACTIVITY_KIND_KERNEL",
|
| 648 |
+
"total_kernel_time_ns": 6100000
|
| 649 |
+
}
|
| 650 |
+
},
|
| 651 |
+
"schema": {
|
| 652 |
+
"capabilities": {
|
| 653 |
+
"cuda_graph_table": {
|
| 654 |
+
"present": false,
|
| 655 |
+
"table": null
|
| 656 |
+
},
|
| 657 |
+
"gpu_metrics_table": {
|
| 658 |
+
"present": false,
|
| 659 |
+
"table": null,
|
| 660 |
+
"target_info_table": null
|
| 661 |
+
},
|
| 662 |
+
"has_string_table": true,
|
| 663 |
+
"kernel_table": {
|
| 664 |
+
"has_correlationId": false,
|
| 665 |
+
"has_deviceId": true,
|
| 666 |
+
"has_globalPid": true,
|
| 667 |
+
"has_pid": false,
|
| 668 |
+
"has_processId": false,
|
| 669 |
+
"present": true
|
| 670 |
+
},
|
| 671 |
+
"nvtx_table": {
|
| 672 |
+
"has_end": false,
|
| 673 |
+
"has_globalTid": false,
|
| 674 |
+
"has_text": false,
|
| 675 |
+
"has_textId": false,
|
| 676 |
+
"present": false
|
| 677 |
+
},
|
| 678 |
+
"runtime_table": {
|
| 679 |
+
"has_correlationId": false,
|
| 680 |
+
"has_globalTid": true,
|
| 681 |
+
"has_name": false,
|
| 682 |
+
"has_nameId": true,
|
| 683 |
+
"has_pid": false,
|
| 684 |
+
"has_processId": false,
|
| 685 |
+
"present": true
|
| 686 |
+
}
|
| 687 |
+
},
|
| 688 |
+
"cuda_graph_table": null,
|
| 689 |
+
"gpu_metrics_table": null,
|
| 690 |
+
"kernel_pid_source": "globalPid",
|
| 691 |
+
"kernel_table": "CUPTI_ACTIVITY_KIND_KERNEL",
|
| 692 |
+
"nvtx_pid_source": null,
|
| 693 |
+
"nvtx_table": null,
|
| 694 |
+
"path": "synthetic fixture (raw trace.sqlite not committed)",
|
| 695 |
+
"runtime_pid_source": "globalTid",
|
| 696 |
+
"runtime_table": "CUPTI_ACTIVITY_KIND_RUNTIME",
|
| 697 |
+
"sqlite_version": "3.26.0",
|
| 698 |
+
"string_table": "StringIds",
|
| 699 |
+
"sync_table": "CUPTI_ACTIVITY_KIND_RUNTIME",
|
| 700 |
+
"tables": {
|
| 701 |
+
"CUPTI_ACTIVITY_KIND_KERNEL": {
|
| 702 |
+
"columns": [
|
| 703 |
+
"start",
|
| 704 |
+
"end",
|
| 705 |
+
"deviceId",
|
| 706 |
+
"contextId",
|
| 707 |
+
"streamId",
|
| 708 |
+
"globalPid",
|
| 709 |
+
"demangledName"
|
| 710 |
+
],
|
| 711 |
+
"types": {
|
| 712 |
+
"contextId": "INT",
|
| 713 |
+
"demangledName": "INT",
|
| 714 |
+
"deviceId": "INT",
|
| 715 |
+
"end": "INT",
|
| 716 |
+
"globalPid": "INT",
|
| 717 |
+
"start": "INT",
|
| 718 |
+
"streamId": "INT"
|
| 719 |
+
}
|
| 720 |
+
},
|
| 721 |
+
"CUPTI_ACTIVITY_KIND_RUNTIME": {
|
| 722 |
+
"columns": [
|
| 723 |
+
"start",
|
| 724 |
+
"end",
|
| 725 |
+
"nameId",
|
| 726 |
+
"globalTid"
|
| 727 |
+
],
|
| 728 |
+
"types": {
|
| 729 |
+
"end": "INT",
|
| 730 |
+
"globalTid": "INT",
|
| 731 |
+
"nameId": "INT",
|
| 732 |
+
"start": "INT"
|
| 733 |
+
}
|
| 734 |
+
},
|
| 735 |
+
"StringIds": {
|
| 736 |
+
"columns": [
|
| 737 |
+
"id",
|
| 738 |
+
"value"
|
| 739 |
+
],
|
| 740 |
+
"types": {
|
| 741 |
+
"id": "INTEGER",
|
| 742 |
+
"value": "TEXT"
|
| 743 |
+
}
|
| 744 |
+
}
|
| 745 |
+
},
|
| 746 |
+
"target_info_gpu_metrics_table": null,
|
| 747 |
+
"timestamp_unit_assumed": "ns",
|
| 748 |
+
"timestamp_unit_guess": "ns_likely",
|
| 749 |
+
"timestamp_unit_guess_basis": "kernel_window_ns_ge_1ms"
|
| 750 |
+
},
|
| 751 |
+
"tool": {
|
| 752 |
+
"name": "nsys-llm-explain",
|
| 753 |
+
"version": "0.1.0"
|
| 754 |
+
},
|
| 755 |
+
"trace": {
|
| 756 |
+
"path": "synthetic fixture (raw trace.sqlite not committed)"
|
| 757 |
+
},
|
| 758 |
+
"warnings": [
|
| 759 |
+
"NVLink counters not found. The report cannot correlate NCCL windows with NVLink metrics for this export."
|
| 760 |
+
]
|
| 761 |
+
}
|
space_utils.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
import tempfile
|
| 6 |
+
import zipfile
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _bootstrap_src_path() -> None:
|
| 13 |
+
here = Path(__file__).resolve()
|
| 14 |
+
for candidate in (here.parents[2] / "src", here.parents[1] / "src"):
|
| 15 |
+
if candidate.exists() and str(candidate) not in sys.path:
|
| 16 |
+
sys.path.insert(0, str(candidate))
|
| 17 |
+
return
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
_bootstrap_src_path()
|
| 21 |
+
|
| 22 |
+
from nsys_llm_explainer.queries import TraceDB # type: ignore
|
| 23 |
+
from nsys_llm_explainer.report import AnalysisOutputs, analyze, render_markdown, write_artifacts # type: ignore
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass(frozen=True)
class SpaceBundle:
    """Immutable container for everything the Space UI renders for one
    analyzed trace: parsed report, rendered markdown, and artifact paths."""

    source_path: Path  # path of the uploaded trace/report file
    source_kind: str  # how the report was obtained (e.g. sqlite export vs. json)
    report: Dict[str, Any]  # full machine-readable report payload
    markdown: str  # complete report rendered as markdown
    artifacts_dir: Path  # directory holding report.md/report.json/CSV tables
    artifact_paths: List[Path]  # individual downloadable artifact files
    summary_rows: List[Dict[str, str]]  # rows for the summary table widget
    manifest_rows: List[Dict[str, str]]  # artifact name/purpose/path rows
    findings_markdown: str  # "what to do next" findings section
    status_markdown: str  # short status line shown in the UI
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _coerce_float(value: Any, default: float = 0.0) -> float:
|
| 41 |
+
try:
|
| 42 |
+
return float(value)
|
| 43 |
+
except Exception:
|
| 44 |
+
return float(default)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _safe_text(value: Any, default: str = "-") -> str:
|
| 48 |
+
text = str(value).strip() if value is not None else ""
|
| 49 |
+
return text if text else default
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _safe_trace_name(report: Mapping[str, Any]) -> str:
|
| 53 |
+
trace_path = ((report.get("trace") or {}).get("path") or report.get("_source_name") or "")
|
| 54 |
+
return Path(str(trace_path)).name if trace_path else "unknown"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _top_kernel_row(report: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
|
| 58 |
+
rows = ((report.get("metrics") or {}).get("top_kernels") or {}).get("kernels") or []
|
| 59 |
+
return rows[0] if rows else None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _top_nccl_row(report: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
|
| 63 |
+
rows = ((report.get("metrics") or {}).get("nccl") or {}).get("ops") or []
|
| 64 |
+
return rows[0] if rows else None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _format_ms(value: Any) -> str:
    """Format *value* as milliseconds with three decimal places."""
    return f"{_coerce_float(value):.3f} ms"
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _format_us(value: Any) -> str:
    """Format *value* as microseconds with two decimal places."""
    return f"{_coerce_float(value):.2f} us"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _format_pct(value: Any) -> str:
    """Format *value* as a percentage with one decimal place."""
    return f"{_coerce_float(value):.1f}%"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _bottleneck_sentence(report: Mapping[str, Any]) -> str:
    """Summarize the dominant GPU-time consumer (NCCL op or compute kernel)."""
    metric_block = report.get("metrics") or {}
    kernel_time_ns = (metric_block.get("top_kernels") or {}).get("total_kernel_time_ns")
    gpu_ms = _coerce_float(kernel_time_ns) / 1_000_000.0
    kernel_row = _top_kernel_row(report)
    nccl_row = _top_nccl_row(report)

    # If NCCL's share of total kernel time matches or beats the biggest
    # compute kernel's share, report the NCCL op as the bottleneck.
    if nccl_row and gpu_ms > 0.0:
        share_nccl = (_coerce_float(nccl_row.get("total_time_ms")) / gpu_ms) * 100.0
        share_kernel = _coerce_float(kernel_row.get("pct_total_kernel_time")) if kernel_row else 0.0
        if share_nccl >= share_kernel:
            op_label = str(nccl_row.get("op_name") or "NCCL")
            return f"{op_label} dominates {share_nccl:.1f}% of GPU time"
    if kernel_row:
        kernel_label = str(kernel_row.get("kernel_name") or "Top kernel")
        kernel_share = _coerce_float(kernel_row.get("pct_total_kernel_time"))
        return f"{kernel_label} dominates {kernel_share:.1f}% of GPU time"
    return "No dominant GPU bottleneck detected from available metrics"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _summary_rows(report: Mapping[str, Any]) -> List[Dict[str, str]]:
    """Build the flat (section, metric, value) rows shown in the summary table.

    Rows are grouped into "Overview", optional "Evidence" (only when the
    backing data exists), and "Capabilities" sections.
    """
    metrics = report.get("metrics") or {}
    timeline = metrics.get("timeline") or {}
    # Prefer the timeline's GPU total; fall back to summed kernel time when absent.
    gpu_total_ms = _coerce_float(timeline.get("total_gpu_time_ms"))
    if gpu_total_ms <= 0:
        gpu_total_ms = _coerce_float((metrics.get("top_kernels") or {}).get("total_kernel_time_ns")) / 1_000_000.0
    # Prefer the timeline's CPU total; fall back to summed sync-call time.
    cpu_total_ms = _coerce_float(timeline.get("total_cpu_time_ms"))
    if cpu_total_ms <= 0:
        sync_rows = (metrics.get("sync") or {}).get("sync_calls") or []
        cpu_total_ms = sum(_coerce_float(row.get("total_time_ms")) for row in sync_rows)

    warnings = report.get("warnings") or []
    report_version = _safe_text((report.get("tool") or {}).get("version"), default="unknown")
    top_kernel = _top_kernel_row(report)
    top_nccl = _top_nccl_row(report)
    nvlink = (metrics.get("nvlink_during_nccl") or {}).get("rows") or []
    nvlink_row = nvlink[0] if nvlink else None
    # Which optional trace capabilities this export actually contains.
    capability_checks = {
        "Kernel table": bool((metrics.get("top_kernels") or {}).get("present")),
        "Runtime table": bool((metrics.get("sync") or {}).get("present")),
        "NVTX ranges": bool((metrics.get("nvtx") or {}).get("present")),
        "GPU metrics": bool((metrics.get("nvlink_during_nccl") or {}).get("present")),
        "Per-process breakdown": bool((metrics.get("per_pid") or {}).get("present")),
    }

    rows: List[Dict[str, str]] = [
        {"section": "Overview", "metric": "Trace", "value": _safe_trace_name(report)},
        {"section": "Overview", "metric": "Tool version", "value": report_version},
        {"section": "Overview", "metric": "Generated at (UTC)", "value": _safe_text(report.get("generated_at"))},
        {"section": "Overview", "metric": "Total GPU time", "value": _format_ms(gpu_total_ms)},
        {"section": "Overview", "metric": "Total CPU time", "value": _format_ms(cpu_total_ms)},
        {"section": "Overview", "metric": "Top bottleneck", "value": _bottleneck_sentence(report)},
        {"section": "Overview", "metric": "Warnings", "value": str(len(warnings))},
    ]
    # Evidence rows appear only when the underlying metric rows are present.
    if top_kernel:
        rows.extend(
            [
                {"section": "Evidence", "metric": "Top kernel", "value": _safe_text(top_kernel.get("kernel_name"))},
                {"section": "Evidence", "metric": "Top kernel time", "value": _format_ms(top_kernel.get("total_time_ms"))},
                {"section": "Evidence", "metric": "Top kernel share", "value": _format_pct(top_kernel.get("pct_total_kernel_time"))},
            ]
        )
    if top_nccl:
        rows.extend(
            [
                {"section": "Evidence", "metric": "Top NCCL op", "value": _safe_text(top_nccl.get("op_name"))},
                {"section": "Evidence", "metric": "Top NCCL time", "value": _format_ms(top_nccl.get("total_time_ms"))},
                {"section": "Evidence", "metric": "Top NCCL overlap", "value": _format_pct(top_nccl.get("compute_overlap_pct"))},
            ]
        )
    if nvlink_row:
        # NVLink values are reported in whatever units the export used,
        # hence the deliberately vague "export units" suffix.
        rows.extend(
            [
                {"section": "Evidence", "metric": "NVLink metric(s)", "value": _safe_text(nvlink_row.get("metric_names"))},
                {
                    "section": "Evidence",
                    "metric": "NVLink during NCCL",
                    "value": "{:.2f} export units".format(_coerce_float(nvlink_row.get("avg_metric_during_nccl"), 0.0)),
                },
                {
                    "section": "Evidence",
                    "metric": "NVLink outside NCCL",
                    "value": "{:.2f} export units".format(_coerce_float(nvlink_row.get("avg_metric_outside_nccl"), 0.0)),
                },
                {
                    "section": "Evidence",
                    "metric": "NVLink correlation",
                    "value": "{:.3f}".format(_coerce_float(nvlink_row.get("nccl_activity_correlation"), 0.0)),
                },
            ]
        )
    for label, present in capability_checks.items():
        rows.append({"section": "Capabilities", "metric": label, "value": "present" if present else "missing"})
    return rows
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _findings_markdown(report: Mapping[str, Any]) -> str:
    """Render the report's findings and warnings as a markdown action list."""
    findings = report.get("findings") or []
    warning_items = report.get("warnings") or []

    out: List[str] = ["## What to do next", ""]
    if not findings:
        out.append("No findings were generated for this trace.")
    else:
        for entry in findings:
            level = _safe_text(entry.get("severity"), default="unknown").upper()
            heading = _safe_text(entry.get("title"), default="Untitled finding")
            out.append(f"### [{level}] {heading}")
            evidence_items = entry.get("evidence") or []
            # A finding may carry one recommendation string or a list of them.
            recs = entry.get("recommendation") or entry.get("recommendations") or []
            if evidence_items:
                out.append("Evidence:")
                out.extend(f"- {item}" for item in evidence_items)
            if recs:
                out.append("Recommendation:")
                if isinstance(recs, (list, tuple)):
                    out.extend(f"- {item}" for item in recs)
                else:
                    out.append(f"- {recs}")
            out.append("")
    if warning_items:
        out.extend(["## Warnings", ""])
        out.extend(f"- {warning}" for warning in warning_items)
    return "\n".join(out).strip()
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _artifact_manifest(out_dir: Path) -> List[Dict[str, str]]:
|
| 208 |
+
purpose_map = {
|
| 209 |
+
"report.md": "Human-readable report",
|
| 210 |
+
"report.json": "Machine-readable report",
|
| 211 |
+
"kernels.csv": "Top kernels",
|
| 212 |
+
"barriers.csv": "CPU/GPU barriers",
|
| 213 |
+
"nccl_ops.csv": "Top NCCL ops",
|
| 214 |
+
"nccl_rank_skew.csv": "Per-rank NCCL skew",
|
| 215 |
+
"nccl_by_pid.csv": "NCCL per PID",
|
| 216 |
+
"nvlink_during_nccl.csv": "NVLink correlation rows",
|
| 217 |
+
"nvlink_timeseries.csv": "NVLink correlation timeseries",
|
| 218 |
+
"timeline_events.csv": "Timeline events",
|
| 219 |
+
"copy_engine_events.csv": "Copy engine events",
|
| 220 |
+
"launch_latency_rows.csv": "Launch latency rows",
|
| 221 |
+
"launch_latency_histogram.csv": "Launch latency histogram",
|
| 222 |
+
"stream_overlap.csv": "Stream overlap summary",
|
| 223 |
+
"phase_split.csv": "Phase split",
|
| 224 |
+
"roofline.csv": "Roofline rows",
|
| 225 |
+
"gpu_idle_gaps.csv": "GPU idle gaps",
|
| 226 |
+
"kernels_by_pid.csv": "Per-PID kernels",
|
| 227 |
+
"sync_by_pid.csv": "Per-PID sync calls",
|
| 228 |
+
"nvtx_by_pid.csv": "Per-PID NVTX ranges",
|
| 229 |
+
"nvtx_ranges.csv": "NVTX ranges",
|
| 230 |
+
"bundle.zip": "Download all artifacts as a zip",
|
| 231 |
+
}
|
| 232 |
+
rows: List[Dict[str, str]] = []
|
| 233 |
+
for name, purpose in purpose_map.items():
|
| 234 |
+
path = out_dir / name
|
| 235 |
+
if not path.exists():
|
| 236 |
+
path = out_dir / "tables" / name
|
| 237 |
+
if path.exists():
|
| 238 |
+
rows.append({"artifact": name, "purpose": purpose, "path": str(path)})
|
| 239 |
+
return rows
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def _zip_artifacts(out_dir: Path) -> Path:
|
| 243 |
+
zip_path = out_dir / "bundle.zip"
|
| 244 |
+
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
| 245 |
+
for path in sorted(out_dir.rglob("*")):
|
| 246 |
+
if path.is_file() and path != zip_path:
|
| 247 |
+
zf.write(path, arcname=path.relative_to(out_dir).as_posix())
|
| 248 |
+
return zip_path
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _normalize_report_for_artifacts(report: Mapping[str, Any]) -> Dict[str, Any]:
|
| 252 |
+
normalized: Dict[str, Any] = dict(report)
|
| 253 |
+
metrics: Dict[str, Any] = dict(normalized.get("metrics") or {})
|
| 254 |
+
|
| 255 |
+
metrics.setdefault("top_kernels", {"present": False, "kernels": []})
|
| 256 |
+
metrics.setdefault("barriers", {"present": False, "barriers": []})
|
| 257 |
+
metrics.setdefault("nccl", {"present": False, "ops": [], "rank_rows": [], "pids": []})
|
| 258 |
+
metrics.setdefault("nvlink_during_nccl", {"present": False, "rows": [], "timeseries": []})
|
| 259 |
+
metrics.setdefault("timeline", {"present": False, "events": []})
|
| 260 |
+
metrics.setdefault("copy_engine", {"present": False, "events": []})
|
| 261 |
+
metrics.setdefault("launch_latency", {"present": False, "rows": [], "histogram": []})
|
| 262 |
+
metrics.setdefault("stream_overlap", {"present": False, "summary": []})
|
| 263 |
+
metrics.setdefault("phase_split", {"present": False, "rows": []})
|
| 264 |
+
metrics.setdefault("roofline", {"present": False, "rows": []})
|
| 265 |
+
metrics.setdefault("gpu_idle", {"present": False, "gaps": []})
|
| 266 |
+
metrics.setdefault("nvtx", {"present": False, "ranges": []})
|
| 267 |
+
|
| 268 |
+
by_pid = dict(metrics.get("by_pid") or {})
|
| 269 |
+
by_pid.setdefault("kernels", {"kernels": []})
|
| 270 |
+
by_pid.setdefault("sync", {"sync_calls": []})
|
| 271 |
+
by_pid.setdefault("nvtx", {"present": False, "ranges": []})
|
| 272 |
+
metrics["by_pid"] = by_pid
|
| 273 |
+
|
| 274 |
+
normalized["metrics"] = metrics
|
| 275 |
+
return normalized
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def _analyze_sqlite(path: Path) -> Tuple[str, Dict[str, Any], str]:
    """Run the full analyzer on an Nsight Systems SQLite export at *path*."""
    db = TraceDB.open(path)
    try:
        outputs = analyze(
            db,
            phase_map_path=None,
            kernel_limit=50,
            compute_kernel_percentiles=True,
            compute_nvtx_kernel_map=True,
        )
        return "sqlite", dict(outputs.report), str(outputs.markdown)
    finally:
        # Always release the SQLite handle, even if analysis raises.
        db.close()


def _load_json_report(path: Path) -> Tuple[str, Dict[str, Any], str]:
    """Load a pre-computed ``report.json`` and render its markdown view.

    Raises ``ValueError`` if the JSON root is not an object.
    """
    report = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(report, dict):
        raise ValueError("Input JSON root must be an object.")
    try:
        markdown = render_markdown(report)
    except Exception:
        # Keep the Space usable even when a foreign/partial report JSON
        # cannot be rendered; the raw report remains downloadable.
        markdown = "# Nsight Systems LLM Hotspot Report\n\nJSON loaded, but markdown rendering failed for this input."
    return "json", report, markdown


def _load_report(path: Path) -> Tuple[str, Dict[str, Any], str]:
    """Dispatch *path* to the appropriate loader.

    Returns ``(source_kind, report_dict, markdown)`` where ``source_kind``
    is ``"sqlite"`` or ``"json"``. The file extension decides first; for
    unknown extensions the SQLite magic header is sniffed before falling
    back to JSON. Previously both branches were copy-pasted inline; they
    now share single helper implementations.

    Raises ``ValueError`` for JSON inputs whose root is not an object.
    """
    suffix = path.suffix.lower()
    if suffix in (".sqlite", ".db"):
        return _analyze_sqlite(path)
    if suffix == ".json":
        return _load_json_report(path)
    # Unknown extension: sniff the SQLite magic header, else treat as JSON.
    if path.read_bytes()[:32].startswith(b"SQLite format 3"):
        return _analyze_sqlite(path)
    return _load_json_report(path)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def analyze_path(path: Path) -> SpaceBundle:
    """Analyze an uploaded trace/report and materialize all Space artifacts.

    Loads *path* (SQLite trace or JSON report), normalizes the report so the
    artifact writer has every table key it expects, writes the artifact tree
    into a fresh temp directory, zips it, and returns a fully populated
    ``SpaceBundle`` for the UI.
    """
    kind, raw_report, markdown = _load_report(path)
    report = _normalize_report_for_artifacts(raw_report)

    out_dir = Path(tempfile.mkdtemp(prefix="nsys-llm-explainer-space-")) / path.stem
    write_artifacts(AnalysisOutputs(report=report, markdown=markdown), out_dir)
    _zip_artifacts(out_dir)

    # Stable, relative-path ordering so the UI file list is deterministic.
    files = sorted(
        (p for p in out_dir.rglob("*") if p.is_file()),
        key=lambda p: p.relative_to(out_dir).as_posix(),
    )
    status = "Loaded `{}` as `{}` and wrote artifacts to `{}`.".format(path.name, kind, out_dir)
    return SpaceBundle(
        source_path=path,
        source_kind=kind,
        report=report,
        markdown=markdown,
        artifacts_dir=out_dir,
        artifact_paths=files,
        summary_rows=_summary_rows(report),
        manifest_rows=_artifact_manifest(out_dir),
        findings_markdown=_findings_markdown(report),
        status_markdown=status,
    )
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def find_local_sample() -> Optional[Path]:
    """Locate a bundled sample report, preferring the Space-local copy.

    Checks a fixed list of well-known locations (Space directory first,
    then repo-level example directories) and returns the first that
    exists, or ``None`` when no sample ships with this checkout.
    """
    base = Path(__file__).resolve()
    candidates = (
        base.parent / "sample_report.json",
        base.parents[2] / "examples" / "synthetic" / "report.json",
        base.parents[2] / "examples" / "a100_vllm" / "report.json",
        base.parents[1] / "examples" / "synthetic" / "report.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def coerce_upload_path(uploaded: Any) -> Optional[Path]:
    """Best-effort conversion of a Gradio upload value to an existing ``Path``.

    Gradio versions differ in what a File/UploadButton component yields:
    a filesystem path string, a ``Path``, a tempfile wrapper exposing the
    location as ``.name``, or a list of any of those. Returns the path of
    the first usable entry, or ``None`` when nothing points at an existing
    file. All inputs the previous implementation resolved still resolve to
    the same path; wrapper objects (and wrappers inside lists) are new.
    """
    if uploaded is None:
        return None
    if isinstance(uploaded, (str, Path)):
        path = Path(uploaded)
        return path if path.exists() else None
    # Older Gradio file wrappers expose the temp file location as `.name`.
    name = getattr(uploaded, "name", None)
    if isinstance(name, str):
        path = Path(name)
        return path if path.exists() else None
    if isinstance(uploaded, Sequence) and not isinstance(uploaded, (str, bytes)) and uploaded:
        # Recurse so list entries of any supported shape are handled.
        return coerce_upload_path(uploaded[0])
    return None
|