from __future__ import annotations import os from pathlib import Path from typing import Any, Optional, Sequence, Tuple import pandas as pd import gradio as gr from space_utils import SpaceBundle, analyze_path, coerce_upload_path, find_local_sample APP_TITLE = "nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference" CSS = """ .gradio-container { background: radial-gradient(circle at top left, rgba(42, 93, 142, 0.35), transparent 30%), radial-gradient(circle at top right, rgba(20, 104, 117, 0.22), transparent 26%), linear-gradient(180deg, #081018 0%, #0b111a 42%, #090e15 100%); color: #e6eef7; font-family: "Aptos", "Segoe UI", sans-serif; } .hero-card { border: 1px solid rgba(115, 145, 180, 0.28); border-radius: 22px; background: linear-gradient(135deg, rgba(14, 22, 34, 0.95), rgba(10, 14, 20, 0.92)); box-shadow: 0 24px 70px rgba(0, 0, 0, 0.28); padding: 22px 24px; margin-bottom: 16px; } .hero-kicker { text-transform: uppercase; letter-spacing: 0.18em; color: #8fb4d9; font-size: 11px; font-weight: 700; } .hero-title { margin: 10px 0 10px; font-size: 34px; line-height: 1.05; font-weight: 800; color: #f3f8ff; } .hero-subtitle { color: #b2c5d9; font-size: 15px; line-height: 1.6; max-width: 980px; } .badge-row { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 16px; } .badge { display: inline-flex; align-items: center; padding: 6px 12px; border-radius: 999px; border: 1px solid rgba(137, 171, 207, 0.28); background: rgba(13, 21, 31, 0.82); color: #d8e6f5; font-size: 12px; } .upload-card { border: 1px solid rgba(88, 113, 143, 0.26); border-radius: 18px; background: rgba(10, 16, 24, 0.86); padding: 14px; margin-bottom: 14px; } .section-title { color: #f4f8fd; font-size: 16px; font-weight: 700; margin: 0 0 10px 0; } .gr-markdown, .prose { color: #e8eff7; } .wrap-long { white-space: pre-wrap; word-break: break-word; } """ HEADER = """
Cloud ML trace intelligence
nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference
Upload a `trace.sqlite` or `report.json` and get prioritized findings, NCCL/NVLink correlation, launch storm diagnosis, per-process breakdowns, and downloadable analysis artifacts. The same code path powers the CLI, dashboard, and this Space.
SQLite + report.json input Evidence-backed findings CSV + JSON downloads Built for cloud LLM traces
""" def _empty_outputs(message: str) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]: empty_df = pd.DataFrame(columns=["section", "metric", "value"]) empty_manifest = pd.DataFrame(columns=["artifact", "purpose", "path"]) return ( message, message, empty_df, message, message, [], empty_manifest, ) def _bundle_to_outputs(bundle: SpaceBundle) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]: summary_df = pd.DataFrame(bundle.summary_rows) manifest_df = pd.DataFrame(bundle.manifest_rows) bottleneck = next((row["value"] for row in bundle.summary_rows if row.get("metric") == "Top bottleneck"), "No bottleneck summary available") summary_markdown = [ "### Quick read", "", "- Source: `{}` (`{}`)".format(bundle.source_path.name, bundle.source_kind), "- {}".format(bundle.report.get("generated_at") or "Generated time unavailable"), "- {}".format(bottleneck), "- Warnings: `{}`".format(len(bundle.report.get("warnings") or [])), ] files = [str(path) for path in bundle.artifact_paths] return ( bundle.status_markdown, "\n".join(summary_markdown), summary_df, bundle.findings_markdown, bundle.markdown, files, manifest_df, ) def _resolve_path(uploaded: Any, sample_path: str) -> Optional[Path]: uploaded_path = coerce_upload_path(uploaded) if uploaded_path: return uploaded_path if sample_path: candidate = Path(sample_path) if candidate.exists(): return candidate return None def _run_analysis(uploaded, sample_path): path = _resolve_path(uploaded, sample_path) if not path: return _empty_outputs( "Upload a `trace.sqlite`/`.db` file or a `report.json` to generate the report. " "If you are using this Space as a demo, click `Load sample trace` first." ) try: bundle = analyze_path(path) return _bundle_to_outputs(bundle) except Exception as exc: message = "Failed to analyze `{}`: `{}`".format(path.name, exc) return _empty_outputs(message) def _build_demo(sample_path: Optional[Path]) -> gr.Blocks: with gr.Blocks(title=APP_TITLE, css=CSS, theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo: gr.HTML(HEADER) with gr.Row(elem_classes=["upload-card"]): with gr.Column(scale=6): upload = gr.File( label="Upload trace or report", file_count="single", file_types=[".sqlite", ".db", ".json"], type="filepath", ) with gr.Column(scale=2, min_width=180): analyze_btn = gr.Button("Analyze trace", variant="primary") with gr.Column(scale=2, min_width=180): sample_btn = gr.Button( "Load sample trace", variant="secondary", visible=bool(sample_path), ) status = gr.Markdown("Upload a trace or report to begin.") sample_state = gr.State(str(sample_path) if sample_path else "") with gr.Tabs(): with gr.Tab("Summary"): gr.Markdown("### Summary") summary = gr.Markdown(elem_classes=["wrap-long"]) summary_table = gr.Dataframe( headers=["section", "metric", "value"], datatype=["str", "str", "str"], interactive=False, wrap=True, label="Key metrics", ) with gr.Tab("Findings"): findings = gr.Markdown(elem_classes=["wrap-long"]) with gr.Tab("Markdown"): report_markdown = gr.Markdown(elem_classes=["wrap-long"]) with gr.Tab("Downloads"): gr.Markdown( "### Generated artifacts\n" "The analysis writes `report.md`, `report.json`, CSV tables, and a zip bundle." ) manifest = gr.Dataframe( headers=["artifact", "purpose", "path"], datatype=["str", "str", "str"], interactive=False, wrap=True, label="Artifact manifest", ) downloads = gr.File( label="Download files", file_count="multiple", type="filepath", ) analyze_btn.click( fn=_run_analysis, inputs=[upload, sample_state], outputs=[status, summary, summary_table, findings, report_markdown, downloads, manifest], ) if sample_path: sample_btn.click( fn=lambda sp: _run_analysis(None, sp), inputs=[sample_state], outputs=[status, summary, summary_table, findings, report_markdown, downloads, manifest], ) demo.load( fn=lambda sp: _run_analysis(None, sp), inputs=[sample_state], outputs=[status, summary, summary_table, findings, report_markdown, downloads, manifest], ) return demo def main() -> None: demo = _build_demo(find_local_sample()) demo.queue() demo.launch( server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=True, ) if __name__ == "__main__": main()