# Scraped page header (not part of the program): HuggingFace Spaces · NinjainPJs / EvalPulse · Sleeping · File size: 36,493 Bytes

"""EvalPulse Demo Dashboard — self-contained HuggingFace Spaces deployment.

Runs entirely on synthetic data. No external dependencies on evalpulse or
dashboard packages.
"""

from __future__ import annotations

import random
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone

import gradio as gr
import plotly.graph_objects as go

# ── Lightweight EvalRecord (replaces pydantic model) ─────────────────

UTC = timezone.utc


def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(UTC)


@dataclass
class EvalRecord:
    """One evaluated LLM interaction, as consumed by the demo dashboard.

    Every field has a default, so ``EvalRecord()`` is valid. Fields typed
    ``... | None`` default to ``None``; the dashboard builders skip or
    zero-fill those values.
    """

    # -- Request / response metadata --
    app_name: str = "default"
    timestamp: datetime = field(default_factory=_utc_now)  # aware UTC by default
    query: str = ""
    context: str | None = None
    response: str = ""
    model_name: str = "unknown"
    latency_ms: int = 0
    tags: list[str] = field(default_factory=list)

    # -- Hallucination --
    hallucination_score: float = 0.0
    hallucination_method: str = "none"
    flagged_claims: list[str] = field(default_factory=list)

    # -- Drift --
    embedding_vector: list[float] = field(default_factory=list)
    drift_score: float | None = None

    # -- RAG quality --
    faithfulness_score: float | None = None
    context_relevance: float | None = None
    answer_relevancy: float | None = None
    groundedness_score: float | None = None

    # -- Response quality --
    sentiment_score: float = 0.5
    toxicity_score: float = 0.0
    response_length: int = 0
    language_detected: str = "en"
    is_denial: bool = False

    # -- Composite --
    health_score: int = 0


# ── Demo data generator ─────────────────────────────────────────────


def _clamp01(value: float) -> float:
    """Clamp *value* into the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, value))


def _round_or_none(value: float | None, ndigits: int = 4) -> float | None:
    """Round *value*, passing ``None`` through (a 0.0 score stays 0.0)."""
    return None if value is None else round(value, ndigits)


def generate_demo_records(n: int = 200, seed: int | None = 42) -> list[EvalRecord]:
    """Generate N synthetic EvalRecords with realistic distributions.

    Simulates an LLM app with:
    - Generally good performance (health 70-95)
    - Occasional hallucination spikes
    - Gradual drift over time
    - Some toxic/denial responses

    Args:
        n: Number of records to create. Records are spaced one hour apart,
            the oldest ``n`` hours before now. ``n <= 0`` yields ``[]``.
        seed: Seed for the private random generator. The default (42)
            reproduces the same scores on every call; ``None`` seeds from
            the operating system.

    Returns:
        The records in chronological order (oldest first).
    """
    # A private generator yields the same sequence as seeding the module-level
    # one, without resetting random state for the rest of the process.
    rng = random.Random(seed)
    records: list[EvalRecord] = []
    now = datetime.now(UTC)

    queries = [
        "What is machine learning?",
        "Explain neural networks",
        "How does RAG work?",
        "What is Python used for?",
        "Describe transformer architecture",
        "What are embeddings?",
        "How do LLMs handle context?",
        "What is fine-tuning?",
        "Explain attention mechanism",
        "What is prompt engineering?",
    ]

    models = ["llama-3.1-70b", "gpt-4o-mini", "gemini-flash"]

    # NOTE: the order of rng calls below determines the generated values;
    # reordering them changes the demo data.
    for i in range(n):
        ts = now - timedelta(hours=n - i)
        query = rng.choice(queries)
        model = rng.choice(models)

        # Simulate drift: later responses drift slightly
        drift_factor = i / n * 0.1

        # Base scores
        halluc = _clamp01(rng.gauss(0.12, 0.08) + drift_factor * 0.5)
        drift = _clamp01(rng.gauss(0.05, 0.03) + drift_factor)
        sentiment = _clamp01(rng.gauss(0.7, 0.1))
        toxicity = _clamp01(abs(rng.gauss(0.02, 0.02)))

        is_denial = rng.random() < 0.05
        length = rng.randint(20, 200)

        # RAG scores (70% of calls are RAG)
        is_rag = rng.random() < 0.7
        faith = ctx_rel = ans_rel = ground = None
        context = None

        if is_rag:
            faith = _clamp01(rng.gauss(0.75, 0.1))
            ctx_rel = _clamp01(rng.gauss(0.8, 0.08))
            ans_rel = _clamp01(rng.gauss(0.78, 0.09))
            ground = 0.4 * faith + 0.3 * ctx_rel + 0.3 * ans_rel
            context = f"Context for: {query}"

        # Compute health score as a weighted mean. Weights are collected next
        # to their components so numerator and denominator always agree, even
        # when a RAG score is exactly 0.0.
        weighted = [(1 - halluc) * 0.35, (1 - drift) * 0.25]
        weights = [0.35, 0.25]
        if ground is not None:
            weighted.append(ground * 0.20)
            weights.append(0.20)
        quality = (1 - toxicity) * 0.5 + sentiment * 0.4 + 0.1
        weighted.append(quality * 0.15)
        weights.append(0.15)
        health = int(sum(weighted) / sum(weights) * 100)
        health = max(0, min(100, health))

        record = EvalRecord(
            app_name="demo-app",
            timestamp=ts,
            query=query,
            context=context,
            response=f"Demo response for: {query}",
            model_name=model,
            latency_ms=rng.randint(50, 500),
            tags=["demo"],
            hallucination_score=round(halluc, 4),
            hallucination_method="embedding",
            drift_score=round(drift, 4),
            # Explicit None checks: a clamped score of 0.0 is a real value.
            faithfulness_score=_round_or_none(faith),
            context_relevance=_round_or_none(ctx_rel),
            answer_relevancy=_round_or_none(ans_rel),
            groundedness_score=_round_or_none(ground),
            sentiment_score=round(sentiment, 4),
            toxicity_score=round(toxicity, 4),
            response_length=length,
            language_detected="en",
            is_denial=is_denial,
            health_score=health,
        )
        records.append(record)

    return records


# ── Chart helpers (inlined from dashboard/charts.py) ─────────────────

# Colour palette shared by the chart helpers (hex strings).
_BG = "#0a0e1a"
_SURFACE = "#111827"
_BORDER = "#1e293b"
_TEXT = "#e2e8f0"
_TEXT_DIM = "#64748b"
_CYAN = "#06d6a0"
_AMBER = "#f59e0b"
_RED = "#ef4444"
_BLUE = "#3b82f6"
_PURPLE = "#a78bfa"
_PINK = "#f472b6"

# Layout defaults that _apply_layout merges into every chart-helper figure:
# transparent backgrounds, monospace font, faint grid lines.
_LAYOUT_BASE: dict = {
    "paper_bgcolor": "rgba(0,0,0,0)",
    "plot_bgcolor": "rgba(0,0,0,0)",
    "font": {"family": "JetBrains Mono, monospace", "color": _TEXT, "size": 11},
    "margin": {"l": 48, "r": 24, "t": 48, "b": 40},
    "xaxis": {
        "gridcolor": "rgba(255,255,255,0.04)",
        "zerolinecolor": "rgba(255,255,255,0.06)",
        "tickfont": {"size": 10, "color": _TEXT_DIM},
    },
    "yaxis": {
        "gridcolor": "rgba(255,255,255,0.04)",
        "zerolinecolor": "rgba(255,255,255,0.06)",
        "tickfont": {"size": 10, "color": _TEXT_DIM},
    },
    "legend": {
        "font": {"size": 10, "color": _TEXT_DIM},
        "bgcolor": "rgba(0,0,0,0)",
    },
}


def _apply_layout(fig: go.Figure, height: int = 320, **kwargs) -> go.Figure:
    """Apply the shared base layout to *fig* and return the same figure.

    Args:
        fig: Figure to update in place.
        height: Figure height passed to the layout.
        **kwargs: Extra layout settings. A key that also exists in the base
            layout (e.g. ``xaxis``) replaces the base entry wholesale.
    """
    merged = dict(_LAYOUT_BASE)
    merged["height"] = height
    for key, setting in kwargs.items():
        merged[key] = setting
    fig.update_layout(**merged)
    return fig


def empty_figure(title: str = "", message: str = "No data available") -> go.Figure:
    """Create an empty figure with a centred, italic placeholder message.

    Args:
        title: Optional figure title. Rendered only when non-empty, so callers
            passing ``""`` get a figure without any title.
        message: Placeholder text shown in the middle of the plot area.

    Returns:
        A figure with both axes hidden and *message* as its only annotation.
    """
    overrides: dict = dict(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        annotations=[
            dict(
                text=f"<i>{message}</i>",
                # "paper" coordinates place the text relative to the whole
                # plotting area rather than to (hidden) axis values.
                xref="paper",
                yref="paper",
                x=0.5,
                y=0.5,
                showarrow=False,
                font=dict(size=13, color=_TEXT_DIM),
            )
        ],
    )
    if title:
        # Same title styling radar_chart uses for populated figures.
        overrides["title"] = dict(
            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
        )

    fig = go.Figure()
    _apply_layout(fig, height=260, **overrides)
    return fig


def health_gauge_chart(score: int | None = None) -> go.Figure:
    """Build the 0-100 health gauge.

    Args:
        score: Health score to display. ``None`` returns a placeholder figure.

    Returns:
        A gauge indicator whose bar and number use cyan for scores >= 75,
        amber for scores >= 40 and red otherwise.
    """
    if score is None:
        return empty_figure("", "Awaiting first evaluation")

    bar_color = _CYAN if score >= 75 else _AMBER if score >= 40 else _RED

    number_style = {
        "font": {"size": 48, "color": bar_color, "family": "JetBrains Mono, monospace"},
        "suffix": "",
    }
    gauge_axis = {
        "range": [0, 100],
        "tickcolor": _TEXT_DIM,
        "tickfont": {"size": 9, "color": _TEXT_DIM},
        "dtick": 25,
    }
    # Faint background bands matching the red / amber / cyan score ranges.
    bands = [
        {"range": [0, 40], "color": "rgba(239,68,68,0.08)"},
        {"range": [40, 75], "color": "rgba(245,158,11,0.06)"},
        {"range": [75, 100], "color": "rgba(6,214,160,0.06)"},
    ]
    gauge_style = {
        "axis": gauge_axis,
        "bgcolor": "rgba(255,255,255,0.03)",
        "bordercolor": "rgba(255,255,255,0.08)",
        "bar": {"color": bar_color, "thickness": 0.75},
        "steps": bands,
    }

    indicator = go.Indicator(
        mode="gauge+number",
        value=score,
        number=number_style,
        gauge=gauge_style,
    )
    fig = go.Figure(indicator)
    _apply_layout(fig, height=220, margin={"l": 24, "r": 24, "t": 16, "b": 8})
    return fig


def radar_chart(
    categories: list[str],
    values: list[float],
    title: str = "",
) -> go.Figure:
    """Build a radar (spider) chart for multi-dimensional scores.

    Args:
        categories: Axis labels, one per dimension.
        values: Score per category; the radial axis is fixed to 0..1.
        title: Title shown top-left of the chart.

    Returns:
        The radar figure, or a placeholder figure when either list is empty.
    """
    if not (categories and values):
        return empty_figure(title, "No RAG data yet")

    # Repeat the first point at the end so the outline closes.
    theta = categories + [categories[0]]
    radii = values + [values[0]]

    # Translucent fill derived from the cyan accent's hex components.
    red, green, blue = (int(_CYAN[pos : pos + 2], 16) for pos in (1, 3, 5))
    fill_rgba = f"rgba({red},{green},{blue},0.12)"

    outline = go.Scatterpolar(
        r=radii,
        theta=theta,
        fill="toself",
        fillcolor=fill_rgba,
        line={"color": _CYAN, "width": 2},
        marker={"size": 5, "color": _CYAN},
    )
    fig = go.Figure()
    fig.add_trace(outline)

    _apply_layout(fig, height=340)

    polar_style = {
        "bgcolor": "rgba(0,0,0,0)",
        "radialaxis": {
            "visible": True,
            "range": [0, 1],
            "gridcolor": "rgba(255,255,255,0.06)",
            "tickfont": {"size": 8, "color": _TEXT_DIM},
        },
        "angularaxis": {
            "gridcolor": "rgba(255,255,255,0.06)",
            "tickfont": {"size": 10, "color": _TEXT},
        },
    }
    title_style = {
        "text": title,
        "font": {"size": 12, "color": _TEXT_DIM},
        "x": 0,
        "xanchor": "left",
    }
    fig.update_layout(polar=polar_style, title=title_style)
    return fig


# ── Plotly dark theme for dashboard figures ──────────────────────────

# Layout defaults merged into every dashboard-tab figure by _dark().
_DARK_LAYOUT: dict = {
    "paper_bgcolor": "rgba(0,0,0,0)",
    "plot_bgcolor": "rgba(0,0,0,0)",
    "font": {"family": "JetBrains Mono, monospace", "color": "#94a3b8", "size": 11},
    "autosize": True,
    "margin": {"l": 50, "r": 20, "t": 44, "b": 40},
    "xaxis": {
        "gridcolor": "rgba(255,255,255,0.04)",
        "tickfont": {"size": 10, "color": "#475569"},
    },
    "yaxis": {
        "gridcolor": "rgba(255,255,255,0.04)",
        "tickfont": {"size": 10, "color": "#475569"},
    },
    "legend": {"font": {"size": 10, "color": "#64748b"}, "bgcolor": "rgba(0,0,0,0)"},
}


def _dark(fig: go.Figure, **kw) -> go.Figure:
    """Apply the dark dashboard theme to *fig* and return the same figure.

    Keyword arguments are extra layout settings; a key that also exists in
    the theme defaults (e.g. ``yaxis``) replaces the default entry wholesale.
    """
    themed = dict(_DARK_LAYOUT)
    themed.update(kw)
    fig.update_layout(**themed)
    return fig


# ── Data layer (demo-only) ───────────────────────────────────────────

# Module-level cache; filled by the first _fetch_records() call.
_DEMO_RECORDS: list[EvalRecord] | None = None


def _fetch_records(limit: int = 500) -> list[EvalRecord]:
    """Return at most *limit* demo records, generating 200 on first use.

    The returned list is a fresh slice of the cache, but the record objects
    themselves are shared between calls.
    """
    global _DEMO_RECORDS
    if _DEMO_RECORDS is not None:
        return _DEMO_RECORDS[:limit]
    _DEMO_RECORDS = generate_demo_records(200)
    return _DEMO_RECORDS[:limit]


def _fetch_alerts(limit: int = 20) -> list:
    """Return the alerts to display; always empty in demo mode.

    *limit* is accepted for signature parity with ``_fetch_records`` and is
    not used.
    """
    no_alerts: list = []
    return no_alerts


# ── KPI card HTML helper ────────────────────────────────────────────


def _kpi_card(label: str, value: str, sub: str, color: str) -> str:
    """Render one KPI card as an inline-styled HTML snippet.

    Args:
        label: Small upper-case caption at the top of the card.
        value: Large headline value, drawn in *color*.
        sub: Small status line under the value.
        color: CSS colour used for the top border and the value text.

    Returns:
        The HTML markup. Arguments are inserted verbatim (no escaping).
    """
    card_template = """<div style="
        background:linear-gradient(145deg,#111827,#0f172a);
        border:1px solid #1e293b;
        border-radius:14px;
        padding:18px 20px;
        border-top:2.5px solid {color};
        min-height:90px;
        min-width:0;
        width:100%;
        box-sizing:border-box;
        overflow:hidden;
    ">
        <div style="
            font-family:'JetBrains Mono',monospace;
            font-size:0.62em;font-weight:600;
            text-transform:uppercase;letter-spacing:1.5px;
            color:#64748b;margin-bottom:8px;
        ">{label}</div>
        <div style="
            font-family:'Outfit',sans-serif;
            font-size:1.8em;font-weight:700;
            color:{color};line-height:1;margin-bottom:5px;
        ">{value}</div>
        <div style="
            font-family:'JetBrains Mono',monospace;
            font-size:0.68em;color:#475569;
        ">{sub}</div>
    </div>"""
    return card_template.format(label=label, value=value, sub=sub, color=color)


# ── Tab 1: Overview ─────────────────────────────────────────────────


def build_overview():
    """Build every output of the Overview tab.

    Returns:
        A 7-tuple: four KPI-card HTML strings (health, hallucination, drift,
        evaluation count), the health gauge figure, the health trend figure,
        and the alert table rows.
    """
    records = _fetch_records(500)
    alerts = _fetch_alerts(20)

    if not records:
        return (
            _kpi_card("Health Score", "---", "no data", "#06d6a0"),
            _kpi_card("Hallucination", "---", "no data", "#f59e0b"),
            _kpi_card("Drift", "---", "no data", "#3b82f6"),
            _kpi_card("Evaluations", "0", "", "#a78bfa"),
            health_gauge_chart(None),
            empty_figure("", "No evaluations yet"),
            [["No alerts yet", "", "", "", "", ""]],
        )

    # ── Aggregates ──
    total = len(records)
    avg_health = int(sum(rec.health_score for rec in records) / total)
    avg_halluc = sum(rec.hallucination_score for rec in records) / total
    drift_vals = [rec.drift_score for rec in records if rec.drift_score is not None]
    avg_drift = sum(drift_vals) / len(drift_vals) if drift_vals else None

    # Status label: first floor the average reaches, else CRITICAL.
    h_sub = "CRITICAL"
    for floor, status in ((90, "HEALTHY"), (75, "MONITORING"), (60, "DEGRADING")):
        if avg_health >= floor:
            h_sub = status
            break

    if avg_drift is None:
        d_val, d_sub = "...", "BUILDING BASELINE"
    else:
        d_val = f"{avg_drift:.3f}"
        d_sub = "STABLE" if avg_drift < 0.15 else "DRIFTING"

    # ── Health trend ──
    chronological = sorted(records, key=lambda rec: rec.timestamp)
    times = [rec.timestamp.strftime("%m-%d %H:%M") for rec in chronological]
    scores = [rec.health_score for rec in chronological]

    # Lower bound of the y-axis: 10 points below the worst score, floored at 0.
    min_score = max(0, min(scores) - 10)
    fill_mode = "tonexty" if min_score > 30 else "none"

    trend = go.Figure()
    trend.add_trace(
        go.Scatter(
            x=times,
            y=scores,
            mode="lines",
            name="Health Score",
            line={"color": "#06d6a0", "width": 2, "shape": "spline"},
            fill=fill_mode,
            fillcolor="rgba(6,214,160,0.06)",
        )
    )
    # Only show threshold lines if they're within visible range
    for level, caption, colour in (
        (75, "Warning: 75", "#f59e0b"),
        (40, "Critical: 40", "#ef4444"),
    ):
        if min_score <= level:
            trend.add_hline(
                y=level,
                line_dash="dot",
                line_color=colour,
                line_width=1,
                annotation_text=caption,
                annotation_font_size=9,
                annotation_font_color=colour,
            )
    _dark(
        trend,
        title="Health Score Trend",
        yaxis={"range": [min_score, 105], **_DARK_LAYOUT["yaxis"]},
        height=350,
    )

    # ── Alert table ──
    if alerts:
        alert_rows = [
            [
                alert.timestamp.strftime("%Y-%m-%d %H:%M"),
                alert.severity.upper(),
                alert.metric,
                f"{alert.value:.4f}",
                f"{alert.threshold:.4f}",
                alert.message,
            ]
            for alert in alerts[:20]
        ]
    else:
        alert_rows = [["---", "", "", "", "", "No alerts triggered"]]

    health_card = _kpi_card("Health Score", str(avg_health), h_sub, "#06d6a0")
    halluc_card = _kpi_card(
        "Hallucination", f"{avg_halluc:.1%}", f"avg of {total}", "#f59e0b"
    )
    drift_card = _kpi_card("Drift", d_val, d_sub, "#3b82f6")
    count_card = _kpi_card("Evaluations", f"{total:,}", "total tracked", "#a78bfa")

    return (
        health_card,
        halluc_card,
        drift_card,
        count_card,
        health_gauge_chart(avg_health),
        trend,
        alert_rows,
    )


# ── Tab 2: Hallucination ────────────────────────────────────────────


def build_hallucination():
    """Build every output of the Hallucination tab.

    Returns:
        A 4-tuple: score-over-time figure, score histogram, average-per-model
        bar chart, and table rows for the ten highest-scoring records.
    """
    records = _fetch_records(500)
    if not records:
        placeholder = empty_figure("", "No data yet")
        return placeholder, placeholder, placeholder, [["No data", "", "", "", ""]]

    chronological = sorted(records, key=lambda rec: rec.timestamp)
    times = [rec.timestamp.strftime("%m-%d %H:%M") for rec in chronological]
    h_scores = [rec.hallucination_score for rec in chronological]

    # ── Score over time, with the 0.3 threshold marked ──
    rate = go.Figure()
    rate.add_trace(
        go.Scatter(
            x=times,
            y=h_scores,
            mode="lines",
            line={"color": "#ef4444", "width": 2, "shape": "spline"},
            fill="tozeroy",
            fillcolor="rgba(239,68,68,0.08)",
        )
    )
    rate.add_hline(
        y=0.3,
        line_dash="dot",
        line_color="#f59e0b",
        annotation_text="Threshold 0.3",
        annotation_font_size=9,
        annotation_font_color="#f59e0b",
    )
    _dark(
        rate,
        title="Hallucination Score Over Time",
        yaxis={"range": [0, 1.05], **_DARK_LAYOUT["yaxis"]},
        height=350,
    )

    # ── Score distribution ──
    histogram = go.Histogram(
        x=h_scores,
        nbinsx=25,
        marker_color="#ef4444",
        opacity=0.7,
        marker_line_width=0,
    )
    dist = go.Figure(histogram)
    dist.add_vline(x=0.3, line_dash="dot", line_color="#f59e0b")
    _dark(dist, title="Score Distribution", height=300, bargap=0.05)

    # ── Average per model (bars turn red above the 0.3 threshold) ──
    by_model: dict[str, list[float]] = defaultdict(list)
    for rec in records:
        by_model[rec.model_name].append(rec.hallucination_score)
    model_names = list(by_model)
    avgs = [sum(vals) / len(vals) for vals in by_model.values()]
    bar_colours = ["#ef4444" if avg > 0.3 else "#06d6a0" for avg in avgs]
    model_fig = go.Figure(
        go.Bar(
            x=model_names,
            y=avgs,
            marker_color=bar_colours,
            marker_line_width=0,
        )
    )
    _dark(model_fig, title="Avg Hallucination by Model", height=300)

    # ── Ten worst offenders ──
    worst = sorted(records, key=lambda rec: rec.hallucination_score, reverse=True)[:10]
    rows = []
    for rec in worst:
        flagged = ", ".join(rec.flagged_claims[:2]) if rec.flagged_claims else ""
        rows.append(
            [
                rec.timestamp.strftime("%H:%M:%S"),
                rec.query[:50],
                rec.response[:60],
                f"{rec.hallucination_score:.3f}",
                flagged,
            ]
        )

    return rate, dist, model_fig, rows


# ── Tab 3: Drift ────────────────────────────────────────────────────


def build_drift():
    """Build every output of the Semantic Drift tab.

    Returns:
        A 3-tuple ``(drift_figure, embedding_figure, status_text)``:
        drift score over time, a scatter of the first two embedding
        components coloured by hallucination score, and a short status string
        derived from the average drift score.
    """
    records = _fetch_records(500)
    if not records:
        e = empty_figure("", "No data yet")
        return e, e, "No data"

    sorted_recs = sorted(records, key=lambda r: r.timestamp)
    drift_recs = [r for r in sorted_recs if r.drift_score is not None]

    # ── Embedding scatter (needs at least three usable vectors) ──
    emb_recs = [
        r for r in sorted_recs if r.embedding_vector and len(r.embedding_vector) > 2
    ]
    if len(emb_recs) >= 3:
        embed = go.Figure(
            go.Scatter(
                x=[r.embedding_vector[0] for r in emb_recs],
                y=[r.embedding_vector[1] for r in emb_recs],
                mode="markers",
                marker=dict(
                    size=8,
                    color=[r.hallucination_score for r in emb_recs],
                    colorscale=[
                        [0, "#06d6a0"],
                        [0.5, "#f59e0b"],
                        [1, "#ef4444"],
                    ],
                    showscale=True,
                    colorbar=dict(
                        # Nested title form: the flat `titlefont` attribute is
                        # deprecated and rejected by newer Plotly releases.
                        title=dict(
                            text="Halluc",
                            font=dict(size=10, color="#64748b"),
                        ),
                        tickfont=dict(size=9, color="#64748b"),
                    ),
                    line=dict(width=0),
                ),
                text=[r.query[:30] for r in emb_recs],
                hovertemplate="%{text}<br>Halluc: %{marker.color:.3f}<extra></extra>",
            )
        )
        _dark(embed, title="Response Embedding Space", height=350)
    else:
        embed = empty_figure("", "Need more data for visualization")

    if not drift_recs:
        return (
            empty_figure("", "Building baseline (need 10+ evaluations)"),
            embed,
            "Building baseline...",
        )

    # ── Drift over time, with the 0.15 threshold marked ──
    times = [r.timestamp.strftime("%m-%d %H:%M") for r in drift_recs]
    scores = [r.drift_score for r in drift_recs]

    dfig = go.Figure()
    dfig.add_trace(
        go.Scatter(
            x=times,
            y=scores,
            mode="lines",
            line=dict(color="#a78bfa", width=2, shape="spline"),
            fill="tozeroy",
            fillcolor="rgba(167,139,250,0.08)",
        )
    )
    dfig.add_hline(
        y=0.15,
        line_dash="dot",
        line_color="#ef4444",
        annotation_text="Threshold 0.15",
        annotation_font_size=9,
        annotation_font_color="#ef4444",
    )
    # 20% headroom above the peak, but never a y-axis shorter than 0..0.3 so
    # the threshold line stays visible.
    y_max = max(max(scores) * 1.2, 0.3)
    _dark(
        dfig,
        title="Drift Score Over Time",
        yaxis=dict(range=[0, y_max], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    # ── Status text from the average drift ──
    avg = sum(scores) / len(scores)
    if avg < 0.1:
        st = "Stable"
    elif avg < 0.2:
        st = "Minor drift"
    else:
        st = "Significant drift!"

    return dfig, embed, st


# ── Tab 4: RAG & Quality ────────────────────────────────────────────


def build_rag_quality():
    """Build every output of the RAG & Quality tab.

    Returns:
        A 4-tuple: sentiment/toxicity over time, RAG metrics over time, RAG
        radar of per-metric averages, and a language-distribution bar chart
        whose title also reports the denial count.
    """
    records = _fetch_records(500)
    if not records:
        placeholder = empty_figure("", "No data yet")
        return placeholder, placeholder, placeholder, placeholder

    chronological = sorted(records, key=lambda rec: rec.timestamp)
    times = [rec.timestamp.strftime("%m-%d %H:%M") for rec in chronological]

    # ── Response quality over time ──
    qfig = go.Figure()
    for trace_name, attr, colour in (
        ("Sentiment", "sentiment_score", "#3b82f6"),
        ("Toxicity", "toxicity_score", "#ef4444"),
    ):
        qfig.add_trace(
            go.Scatter(
                x=times,
                y=[getattr(rec, attr) for rec in chronological],
                mode="lines",
                name=trace_name,
                line={"color": colour, "width": 2, "shape": "spline"},
            )
        )
    _dark(
        qfig,
        title="Quality Metrics Over Time",
        yaxis={"range": [0, 1.05], **_DARK_LAYOUT["yaxis"]},
        height=350,
    )

    # ── RAG metrics: only records that carry a groundedness score ──
    rag_recs = [rec for rec in chronological if rec.groundedness_score is not None]
    if not rag_recs:
        rfig = empty_figure("", "No RAG calls yet")
        radar = empty_figure("", "No RAG data")
    else:
        rt = [rec.timestamp.strftime("%m-%d %H:%M") for rec in rag_recs]
        rag_series = (
            (
                "Faithfulness",
                "faithfulness_score",
                {"color": "#06d6a0", "width": 2, "shape": "spline"},
            ),
            (
                "Context Relevance",
                "context_relevance",
                {"color": "#3b82f6", "width": 2, "shape": "spline"},
            ),
            (
                "Groundedness",
                "groundedness_score",
                {"color": "#a78bfa", "width": 2, "dash": "dash"},
            ),
        )
        rfig = go.Figure()
        for trace_name, attr, line_style in rag_series:
            rfig.add_trace(
                go.Scatter(
                    x=rt,
                    # Missing (None) metrics are plotted as 0.
                    y=[getattr(rec, attr) or 0 for rec in rag_recs],
                    mode="lines",
                    name=trace_name,
                    line=line_style,
                )
            )
        _dark(
            rfig,
            title="RAG Quality Metrics",
            yaxis={"range": [0, 1.05], **_DARK_LAYOUT["yaxis"]},
            height=350,
        )

        rag_count = len(rag_recs)

        def _mean(attr: str) -> float:
            # Average over all RAG records, counting a missing value as 0.
            return sum(getattr(rec, attr) or 0 for rec in rag_recs) / rag_count

        radar = radar_chart(
            ["Faithfulness", "Context Relevance", "Answer Relevancy", "Groundedness"],
            [
                _mean("faithfulness_score"),
                _mean("context_relevance"),
                _mean("answer_relevancy"),
                _mean("groundedness_score"),
            ],
            title="RAG Quality Radar",
        )

    # ── Language distribution and denial count ──
    lang_counts: dict[str, int] = defaultdict(int)
    for rec in records:
        lang_counts[rec.language_detected] += 1
    denials = sum(1 for rec in records if rec.is_denial)

    bfig = go.Figure(
        go.Bar(
            x=list(lang_counts),
            y=list(lang_counts.values()),
            marker_color="#3b82f6",
            marker_line_width=0,
        )
    )
    _dark(
        bfig,
        title=f"Language Distribution  |  Denials: {denials}/{len(records)}",
        height=300,
    )

    return qfig, rfig, radar, bfig


# ── CSS ─────────────────────────────────────────────────────────────

# Stylesheet passed to gr.Blocks(css=...) in create_app(). In order, it covers:
# web-font import and page background, full-width/overflow rules for the
# container and Plotly charts, the `.ep-hdr` header banner, tab navigation,
# tables, buttons, dark overrides for Gradio component backgrounds, and the
# `.ep-ftr` footer. The `.ep-*` classes are the ones used by the HTML snippets
# in create_app(); the remaining selectors target Gradio/Plotly class names.
THEME_CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;600;700&family=Outfit:wght@300;400;500;600;700;800&display=swap');

body, .gradio-container {
    background: #060a14 !important;
    color: #e2e8f0 !important;
    font-family: 'Outfit', sans-serif !important;
}
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 20px !important;
    box-sizing: border-box !important;
    overflow-x: hidden !important;
}
.main, .wrap, .contain {
    max-width: 100% !important;
    width: 100% !important;
    overflow-x: hidden !important;
}
.app {
    max-width: 100% !important;
    overflow-x: hidden !important;
}
/* Plotly charts should not overflow */
.js-plotly-plot, .plotly, .plot-container, .svg-container {
    max-width: 100% !important;
    width: 100% !important;
    overflow: hidden !important;
}
.js-plotly-plot .main-svg, .js-plotly-plot .svg-container {
    max-width: 100% !important;
    width: 100% !important;
}
.plot-container.plotly {
    width: 100% !important;
}
/* Gradio plot wrapper */
.gr-plot, .plot-padding {
    max-width: 100% !important;
    overflow: hidden !important;
}

::-webkit-scrollbar { width: 6px; }
::-webkit-scrollbar-track { background: #0a0e1a; }
::-webkit-scrollbar-thumb { background: #1e293b; border-radius: 3px; }

.ep-hdr {
    position: relative;
    padding: 24px 32px;
    margin: 0 -20px 20px -20px;
    background: linear-gradient(135deg, #0a0e1a 0%, #111827 50%, #0f172a 100%);
    border-bottom: 1px solid rgba(6,214,160,0.15);
    overflow: hidden;
    box-sizing: border-box;
}
.ep-hdr::before {
    content:'';position:absolute;inset:0;
    background:
        radial-gradient(ellipse 600px 300px at 15% 50%,rgba(6,214,160,0.06),transparent 70%),
        radial-gradient(ellipse 400px 200px at 85% 30%,rgba(59,130,246,0.04),transparent 70%);
    pointer-events:none;
}
.ep-hdr-in { position:relative;display:flex;align-items:center;justify-content:space-between;z-index:1; }
.ep-brand { display:flex;align-items:center;gap:14px; }
.ep-logo {
    width:40px;height:40px;border-radius:10px;
    background:linear-gradient(135deg,#06d6a0,#3b82f6);
    display:flex;align-items:center;justify-content:center;
    font-size:18px;font-weight:700;color:#060a14;
    font-family:'JetBrains Mono',monospace;
    box-shadow:0 0 20px rgba(6,214,160,0.3);
}
.ep-t { font-family:'Outfit';font-size:1.6em;font-weight:700;letter-spacing:-0.5px;color:#f1f5f9!important;margin:0!important; }
.ep-st { font-family:'JetBrains Mono';font-size:0.7em;color:#64748b!important;margin:3px 0 0!important;letter-spacing:0.5px;text-transform:uppercase; }
.ep-live { display:flex;align-items:center;gap:8px;font-family:'JetBrains Mono';font-size:0.72em;color:#06d6a0;letter-spacing:0.3px; }
.ep-dot {
    width:7px;height:7px;border-radius:50%;background:#06d6a0;
    box-shadow:0 0 8px rgba(6,214,160,0.6);
    animation:pdot 2s ease-in-out infinite;
}
@keyframes pdot { 0%,100%{opacity:1} 50%{opacity:0.4} }

.tab-nav { background:transparent!important;border:none!important;gap:4px!important;padding:0 0 14px!important;border-bottom:1px solid #1e293b!important;margin-bottom:18px!important; }
.tab-nav button {
    font-family:'JetBrains Mono',monospace!important;font-size:0.76em!important;font-weight:500!important;
    letter-spacing:0.5px!important;text-transform:uppercase!important;color:#64748b!important;
    background:transparent!important;border:1px solid transparent!important;border-radius:8px!important;
    padding:8px 18px!important;transition:all 0.2s!important;
}
.tab-nav button:hover { color:#e2e8f0!important;background:rgba(255,255,255,0.03)!important; }
.tab-nav button.selected { color:#06d6a0!important;background:rgba(6,214,160,0.08)!important;border-color:rgba(6,214,160,0.2)!important; }
.tabitem { border:none!important;background:transparent!important;padding:0!important; }

table { background:#111827!important;border:1px solid #1e293b!important;border-radius:10px!important;overflow:hidden!important; }
table thead th {
    background:#0f172a!important;color:#64748b!important;
    font-family:'JetBrains Mono',monospace!important;font-size:0.7em!important;
    font-weight:600!important;letter-spacing:0.8px!important;text-transform:uppercase!important;
    padding:10px 14px!important;border-bottom:1px solid #1e293b!important;
}
table tbody td {
    background:#111827!important;color:#cbd5e1!important;
    font-family:'JetBrains Mono',monospace!important;font-size:0.78em!important;
    padding:8px 14px!important;border-bottom:1px solid rgba(30,41,59,0.5)!important;
}
table tbody tr:hover td { background:rgba(6,214,160,0.03)!important; }

button.primary, button.secondary {
    font-family:'JetBrains Mono',monospace!important;font-size:0.74em!important;
    letter-spacing:0.4px!important;border-radius:8px!important;
}
button.primary { background:rgba(6,214,160,0.12)!important;color:#06d6a0!important;border:1px solid rgba(6,214,160,0.25)!important; }
button.primary:hover { background:rgba(6,214,160,0.2)!important; }
button.secondary { background:rgba(59,130,246,0.1)!important;color:#3b82f6!important;border:1px solid rgba(59,130,246,0.2)!important; }
button.secondary:hover { background:rgba(59,130,246,0.18)!important; }

.gr-row {
    gap:14px!important;
    flex-wrap: wrap !important;
    max-width: 100% !important;
    overflow: hidden !important;
}
/* Remove all white backgrounds from Gradio components */
.gr-block, .block:not(.gr-group) { border:none!important;background:transparent!important; }
.gr-padded { padding:0!important; }
.label-wrap { background:#0a0e1a!important;border:1px solid #1e293b!important;border-radius:8px!important;padding:4px 10px!important; }
.label-wrap span { color:#64748b!important;font-family:'JetBrains Mono',monospace!important;font-size:0.72em!important;letter-spacing:0.5px!important; }
/* Plot containers */
.gr-plot, .plot-wrap, .gradio-plot { background:transparent!important;border:none!important; }
div[class*="plot"] { background:transparent!important; }
/* All panel/group/box backgrounds */
.panel, .gr-panel, .gr-box, .gr-form, .gr-input-label, .gr-check-radio { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
/* File download component */
.file-preview, .upload-button { background:#111827!important;border-color:#1e293b!important;color:#94a3b8!important; }
/* Inputs and textboxes */
input, textarea, select, .gr-input { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
/* Any remaining white wrapper divs */
.contain > div, .wrap > div { background:transparent!important; }
/* Markdown text areas */
.prose, .markdown-text, .md { background:transparent!important;color:#94a3b8!important; }
/* Accordion headers */
.accordion { background:#111827!important;border-color:#1e293b!important; }
/* Prevent dataframes from causing horizontal scroll */
.dataframe, .table-wrap, .svelte-table {
    max-width: 100% !important;
    overflow-x: auto !important;
    overflow-y: hidden !important;
}
/* KPI card row in HTML shouldn't overflow */
div[style*="display:flex"] {
    flex-wrap: wrap !important;
    max-width: 100% !important;
}

.ep-ftr {
    margin-top:28px;padding:14px 0;border-top:1px solid #1e293b;
    text-align:center;font-family:'JetBrains Mono',monospace;
    font-size:0.68em;color:#334155;letter-spacing:0.3px;
}
.ep-ftr a { color:#475569;text-decoration:none; }
.ep-ftr a:hover { color:#06d6a0; }

.markdown-text h4 { color:#94a3b8!important;font-family:'Outfit',sans-serif!important; }
.markdown-text p, .markdown-text { color:#94a3b8!important; }

@media(max-width:768px) { .ep-hdr-in{flex-direction:column;gap:10px;align-items:flex-start;} }
"""


# ── App ─────────────────────────────────────────────────────────────


def create_app() -> gr.Blocks:
    """Assemble the four-tab dashboard UI and wire its event handlers.

    Each tab has a Refresh button bound to its ``build_*`` function. All four
    builders are also registered as page-load handlers, so every tab is
    populated when the page opens rather than only the Overview tab.

    Returns:
        The configured (not yet launched) Gradio Blocks app.
    """
    with gr.Blocks(title="EvalPulse Dashboard", css=THEME_CSS) as app:
        gr.HTML("""
        <div class="ep-hdr"><div class="ep-hdr-in">
            <div class="ep-brand">
                <div class="ep-logo">EP</div>
                <div><div class="ep-t">EvalPulse</div>
                <div class="ep-st">LLM Evaluation &amp; Drift Monitor</div></div>
            </div>
            <div class="ep-live"><div class="ep-dot"></div>DEMO MODE</div>
        </div></div>
        """)

        with gr.Tabs():
            with gr.TabItem("Overview"):
                with gr.Row():
                    hc = gr.HTML("Loading...")
                    hac = gr.HTML("Loading...")
                    dc = gr.HTML("Loading...")
                    tc = gr.HTML("Loading...")
                with gr.Row():
                    hg = gr.Plot(label="Health Gauge")
                    ht = gr.Plot(label="Health Trend")
                gr.Markdown("#### Recent Alerts")
                at = gr.Dataframe(
                    headers=[
                        "Time",
                        "Severity",
                        "Metric",
                        "Value",
                        "Threshold",
                        "Message",
                    ],
                    interactive=False,
                )
                # Output order must match the tuple build_overview returns.
                overview_outputs = [hc, hac, dc, tc, hg, ht, at]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_overview, outputs=overview_outputs
                )

            with gr.TabItem("Hallucination"):
                hr = gr.Plot()
                with gr.Row():
                    hd = gr.Plot()
                    hm = gr.Plot()
                gr.Markdown("#### Highest Hallucination Responses")
                htb = gr.Dataframe(
                    headers=["Time", "Query", "Response", "Score", "Flagged"],
                    interactive=False,
                )
                hallucination_outputs = [hr, hd, hm, htb]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_hallucination, outputs=hallucination_outputs
                )

            with gr.TabItem("Semantic Drift"):
                ds = gr.Markdown("Loading...")
                dp = gr.Plot()
                de = gr.Plot()
                # build_drift returns (drift figure, embedding figure, status).
                drift_outputs = [dp, de, ds]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_drift, outputs=drift_outputs
                )

            with gr.TabItem("RAG & Quality"):
                qp = gr.Plot()
                with gr.Row():
                    rp = gr.Plot()
                    rr = gr.Plot()
                bp = gr.Plot()
                rag_outputs = [qp, rp, rr, bp]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_rag_quality, outputs=rag_outputs
                )

        gr.HTML("""
        <div class="ep-ftr">
            EvalPulse v0.1.0 &middot; Open Source LLM Evaluation &amp; Drift Monitoring
            &middot; <a href="https://github.com/ninjacode911/Project-EvalPulse">GitHub</a>
        </div>
        """)

        # Populate every tab on page load; previously only Overview was
        # loaded, leaving the other tabs blank until Refresh was clicked.
        app.load(fn=build_overview, outputs=overview_outputs)
        app.load(fn=build_hallucination, outputs=hallucination_outputs)
        app.load(fn=build_drift, outputs=drift_outputs)
        app.load(fn=build_rag_quality, outputs=rag_outputs)

    return app


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces at port 7860.
    dashboard = create_app()
    dashboard.launch(server_name="0.0.0.0", server_port=7860)