# Source: Hugging Face Space "KokosDev/nsys-llm-explainer".
# Page metadata captured with this listing: Space status "Sleeping"; file size 8,390 bytes.

from __future__ import annotations

import os
from pathlib import Path
from typing import Any, Optional, Sequence, Tuple

import pandas as pd
import gradio as gr

from space_utils import SpaceBundle, analyze_path, coerce_upload_path, find_local_sample


# Page title handed to gr.Blocks(title=...) in _build_demo.
APP_TITLE = "nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference"

# Custom stylesheet handed to gr.Blocks(css=...) in _build_demo. It themes the Gradio
# container and defines the classes referenced by HEADER (hero-*, badge-row, badge) and
# by elem_classes in _build_demo (upload-card, wrap-long).
CSS = """
.gradio-container {
  background:
    radial-gradient(circle at top left, rgba(42, 93, 142, 0.35), transparent 30%),
    radial-gradient(circle at top right, rgba(20, 104, 117, 0.22), transparent 26%),
    linear-gradient(180deg, #081018 0%, #0b111a 42%, #090e15 100%);
  color: #e6eef7;
  font-family: "Aptos", "Segoe UI", sans-serif;
}

.hero-card {
  border: 1px solid rgba(115, 145, 180, 0.28);
  border-radius: 22px;
  background: linear-gradient(135deg, rgba(14, 22, 34, 0.95), rgba(10, 14, 20, 0.92));
  box-shadow: 0 24px 70px rgba(0, 0, 0, 0.28);
  padding: 22px 24px;
  margin-bottom: 16px;
}

.hero-kicker {
  text-transform: uppercase;
  letter-spacing: 0.18em;
  color: #8fb4d9;
  font-size: 11px;
  font-weight: 700;
}

.hero-title {
  margin: 10px 0 10px;
  font-size: 34px;
  line-height: 1.05;
  font-weight: 800;
  color: #f3f8ff;
}

.hero-subtitle {
  color: #b2c5d9;
  font-size: 15px;
  line-height: 1.6;
  max-width: 980px;
}

.badge-row {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  margin-top: 16px;
}

.badge {
  display: inline-flex;
  align-items: center;
  padding: 6px 12px;
  border-radius: 999px;
  border: 1px solid rgba(137, 171, 207, 0.28);
  background: rgba(13, 21, 31, 0.82);
  color: #d8e6f5;
  font-size: 12px;
}

.upload-card {
  border: 1px solid rgba(88, 113, 143, 0.26);
  border-radius: 18px;
  background: rgba(10, 16, 24, 0.86);
  padding: 14px;
  margin-bottom: 14px;
}

.section-title {
  color: #f4f8fd;
  font-size: 16px;
  font-weight: 700;
  margin: 0 0 10px 0;
}

.gr-markdown, .prose {
  color: #e8eff7;
}

.wrap-long {
  white-space: pre-wrap;
  word-break: break-word;
}
"""

# Hero banner rendered through gr.HTML at the top of the page. The markup is raw HTML,
# not Markdown, so inline code uses <code> tags; backticks would be displayed literally.
HEADER = """
<div class="hero-card">
  <div class="hero-kicker">Cloud ML trace intelligence</div>
  <div class="hero-title">nsys-llm-explainer — Instant Nsight Trace Analyzer for Cloud LLM Inference</div>
  <div class="hero-subtitle">
    Upload a <code>trace.sqlite</code> or <code>report.json</code> and get prioritized findings, NCCL/NVLink correlation, launch storm diagnosis,
    per-process breakdowns, and downloadable analysis artifacts. The same code path powers the CLI, dashboard, and this Space.
  </div>
  <div class="badge-row">
    <span class="badge">SQLite + report.json input</span>
    <span class="badge">Evidence-backed findings</span>
    <span class="badge">CSV + JSON downloads</span>
    <span class="badge">Built for cloud LLM traces</span>
  </div>
</div>
"""


def _empty_outputs(message: str) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
    """Build the seven-slot output tuple used when there is no report to show.

    ``message`` fills every text slot (status, summary, findings, full report).
    The two table slots receive empty frames that carry only their column
    headers, and the download list is empty.
    """
    no_metrics = pd.DataFrame(columns=["section", "metric", "value"])
    no_artifacts = pd.DataFrame(columns=["artifact", "purpose", "path"])
    no_files: list[str] = []
    # Slot order must match the ``outputs=[...]`` lists wired up in ``_build_demo``.
    return message, message, no_metrics, message, message, no_files, no_artifacts


def _bundle_to_outputs(bundle: SpaceBundle) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
    """Convert an analysis bundle into the seven values the UI outputs expect.

    Returns, in order: the bundle's status markdown, a short "Quick read"
    markdown summary, the summary rows as a DataFrame, the findings markdown,
    the full report markdown, the artifact paths as strings, and the manifest
    rows as a DataFrame.

    Reads ``summary_rows``, ``manifest_rows``, ``source_path``, ``source_kind``,
    ``report``, ``artifact_paths``, ``status_markdown``, ``findings_markdown``
    and ``markdown`` from ``bundle``. Rows are treated as mappings.
    """
    summary_rows = bundle.summary_rows or []
    manifest_rows = bundle.manifest_rows or []

    # pd.DataFrame([]) has no columns at all, so fall back to the same explicit
    # headers used by _empty_outputs and the gr.Dataframe components.
    if summary_rows:
        summary_df = pd.DataFrame(summary_rows)
    else:
        summary_df = pd.DataFrame(columns=["section", "metric", "value"])
    if manifest_rows:
        manifest_df = pd.DataFrame(manifest_rows)
    else:
        manifest_df = pd.DataFrame(columns=["artifact", "purpose", "path"])

    # First row labelled "Top bottleneck" wins; a row without a usable value
    # falls back to the placeholder instead of raising KeyError.
    bottleneck = next(
        (row.get("value") for row in summary_rows if row.get("metric") == "Top bottleneck"),
        None,
    )
    if bottleneck in (None, ""):
        bottleneck = "No bottleneck summary available"

    warnings = bundle.report.get("warnings") or []
    summary_markdown = [
        "### Quick read",
        "",
        "- Source: `{}` (`{}`)".format(bundle.source_path.name, bundle.source_kind),
        "- {}".format(bundle.report.get("generated_at") or "Generated time unavailable"),
        "- {}".format(bottleneck),
        "- Warnings: `{}`".format(len(warnings)),
    ]
    files = [str(path) for path in bundle.artifact_paths]
    return (
        bundle.status_markdown,
        "\n".join(summary_markdown),
        summary_df,
        bundle.findings_markdown,
        bundle.markdown,
        files,
        manifest_df,
    )


def _resolve_path(uploaded: Any, sample_path: str) -> Optional[Path]:
    """Pick the file to analyze, preferring an upload over the sample.

    Returns whatever ``coerce_upload_path`` yields for ``uploaded`` when that
    is truthy. Otherwise returns ``Path(sample_path)`` if ``sample_path`` is
    non-empty and exists on disk, and ``None`` in every remaining case.
    """
    from_upload = coerce_upload_path(uploaded)
    if from_upload:
        return from_upload
    if not sample_path:
        return None
    fallback = Path(sample_path)
    return fallback if fallback.exists() else None


def _run_analysis(uploaded: Any, sample_path: str) -> Tuple[Any, str, pd.DataFrame, str, str, list[str], pd.DataFrame]:
    """Event handler: analyze the uploaded file (or the sample) for the UI.

    Args:
        uploaded: Value of the upload component; ``None`` when the caller
            wants the sample to be used.
        sample_path: Path of the bundled sample as a string, or ``""``.

    Returns:
        The seven output values in the order the event ``outputs`` lists use.
        When no input can be resolved, or the analysis raises, every text
        slot carries an explanatory message and the tables are empty.
    """
    # Local import keeps this block self-contained; the file has no module-level logger.
    import logging

    path = _resolve_path(uploaded, sample_path)
    if not path:
        return _empty_outputs(
            "Upload a `trace.sqlite`/`.db` file or a `report.json` to generate the report. "
            "If you are using this Space as a demo, click `Load sample trace` first."
        )
    try:
        bundle = analyze_path(path)
        return _bundle_to_outputs(bundle)
    except Exception as exc:
        # UI boundary: the user gets a short message, while the full traceback
        # goes to the server log instead of being discarded.
        logging.getLogger(__name__).exception("Analysis failed for %s", path)
        message = "Failed to analyze `{}`: `{}`".format(path.name, exc)
        return _empty_outputs(message)


def _build_demo(sample_path: Optional[Path]) -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire its events.

    Layout: the HTML header, an upload row with the two action buttons, a
    status line, and four tabs (Summary, Findings, Markdown, Downloads).

    When ``sample_path`` is truthy the "Load sample trace" button is visible
    and the sample is analyzed both on button click and on page load.
    Otherwise the button is hidden and only the "Analyze trace" button is wired.
    """
    has_sample = bool(sample_path)
    sample_value = str(sample_path) if sample_path else ""
    soft_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate")

    with gr.Blocks(title=APP_TITLE, css=CSS, theme=soft_theme) as demo:
        gr.HTML(HEADER)

        # Top strip: file picker followed by the two action buttons.
        with gr.Row(elem_classes=["upload-card"]):
            with gr.Column(scale=6):
                upload = gr.File(
                    label="Upload trace or report",
                    file_count="single",
                    file_types=[".sqlite", ".db", ".json"],
                    type="filepath",
                )
            with gr.Column(scale=2, min_width=180):
                analyze_btn = gr.Button("Analyze trace", variant="primary")
            with gr.Column(scale=2, min_width=180):
                sample_btn = gr.Button("Load sample trace", variant="secondary", visible=has_sample)

        status = gr.Markdown("Upload a trace or report to begin.")
        # Carries the sample location into the handlers; "" means no sample.
        sample_state = gr.State(sample_value)

        with gr.Tabs():
            with gr.Tab("Summary"):
                gr.Markdown("### Summary")
                summary = gr.Markdown(elem_classes=["wrap-long"])
                summary_table = gr.Dataframe(
                    headers=["section", "metric", "value"],
                    datatype=["str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Key metrics",
                )
            with gr.Tab("Findings"):
                findings = gr.Markdown(elem_classes=["wrap-long"])
            with gr.Tab("Markdown"):
                report_markdown = gr.Markdown(elem_classes=["wrap-long"])
            with gr.Tab("Downloads"):
                gr.Markdown(
                    "### Generated artifacts\n"
                    "The analysis writes `report.md`, `report.json`, CSV tables, and a zip bundle."
                )
                manifest = gr.Dataframe(
                    headers=["artifact", "purpose", "path"],
                    datatype=["str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Artifact manifest",
                )
                downloads = gr.File(label="Download files", file_count="multiple", type="filepath")

        # Every handler returns the same seven values, in exactly this order.
        result_slots = [status, summary, summary_table, findings, report_markdown, downloads, manifest]

        analyze_btn.click(fn=_run_analysis, inputs=[upload, sample_state], outputs=result_slots)
        if has_sample:
            # Both sample triggers ignore the upload box and analyze the sample.
            sample_btn.click(
                fn=lambda sp: _run_analysis(None, sp),
                inputs=[sample_state],
                outputs=result_slots,
            )
            demo.load(
                fn=lambda sp: _run_analysis(None, sp),
                inputs=[sample_state],
                outputs=result_slots,
            )
    return demo


def main() -> None:
    """Build the demo, enable its queue, and serve it on all interfaces.

    The port is read from the ``PORT`` environment variable and defaults to
    7860 when the variable is unset or empty.

    Raises:
        ValueError: If ``PORT`` is set to a non-integer value.
    """
    demo = _build_demo(find_local_sample())
    demo.queue()
    # ``PORT=`` (set but empty) would make int() raise, so treat it like unset.
    port = int(os.getenv("PORT") or "7860")
    # ``share`` is deliberately not forced to True: it requests a public tunnel
    # to an app that accepts file uploads. Leaving it unset keeps Gradio's
    # default. NOTE(review): recent Gradio versions can still opt in through
    # the GRADIO_SHARE environment variable; confirm for the pinned version.
    demo.launch(
        server_name="0.0.0.0",
        server_port=port,
    )


# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()