"""EvalPulse Demo Dashboard — self-contained HuggingFace Spaces deployment.

Runs entirely on synthetic data. No external dependencies on evalpulse
or dashboard packages.
"""

from __future__ import annotations

import random
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone

import gradio as gr
import plotly.graph_objects as go

# ── Lightweight EvalRecord (replaces pydantic model) ─────────────────

UTC = timezone.utc


@dataclass
class EvalRecord:
    """Minimal evaluation record for demo purposes."""

    app_name: str = "default"
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    query: str = ""
    context: str | None = None
    response: str = ""
    model_name: str = "unknown"
    latency_ms: int = 0
    tags: list[str] = field(default_factory=list)

    # Hallucination
    hallucination_score: float = 0.0
    hallucination_method: str = "none"
    flagged_claims: list[str] = field(default_factory=list)

    # Drift
    embedding_vector: list[float] = field(default_factory=list)
    drift_score: float | None = None

    # RAG Quality
    faithfulness_score: float | None = None
    context_relevance: float | None = None
    answer_relevancy: float | None = None
    groundedness_score: float | None = None

    # Response Quality
    sentiment_score: float = 0.5
    toxicity_score: float = 0.0
    response_length: int = 0
    language_detected: str = "en"
    is_denial: bool = False

    # Composite
    health_score: int = 0


# ── Demo data generator ─────────────────────────────────────────────


def _clamp01(value: float) -> float:
    """Clamp *value* into the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, value))


def _round_or_none(value: float | None) -> float | None:
    """Round *value* to 4 decimals, passing ``None`` through unchanged.

    The check is ``is None`` rather than truthiness so that a legitimate
    score of exactly 0.0 is kept instead of being turned into ``None``.
    """
    return None if value is None else round(value, 4)


def generate_demo_records(n: int = 200, *, seed: int = 42) -> list[EvalRecord]:
    """Generate N synthetic EvalRecords with realistic distributions.

    Simulates an LLM app with:
    - Generally good performance (health 70-95)
    - Occasional hallucination spikes
    - Gradual drift over time
    - Some toxic/denial responses

    Args:
        n: Number of records to create. Records are spaced one hour apart
            and end one hour before the current UTC time. ``n <= 0``
            yields an empty list.
        seed: Seed for the private random generator. The default keeps
            the metric values identical from call to call.

    Returns:
        The records in chronological order (oldest first).
    """
    # A private generator produces the same sequence as seeding the module
    # RNG, without reseeding the process-wide ``random`` state.
    rng = random.Random(seed)
    now = datetime.now(UTC)

    queries = [
        "What is machine learning?",
        "Explain neural networks",
        "How does RAG work?",
        "What is Python used for?",
        "Describe transformer architecture",
        "What are embeddings?",
        "How do LLMs handle context?",
        "What is fine-tuning?",
        "Explain attention mechanism",
        "What is prompt engineering?",
    ]
    models = ["llama-3.1-70b", "gpt-4o-mini", "gemini-flash"]

    records: list[EvalRecord] = []
    for i in range(n):
        ts = now - timedelta(hours=n - i)
        query = rng.choice(queries)
        model = rng.choice(models)

        # Simulate drift: later responses drift slightly
        drift_factor = i / n * 0.1

        # Base scores
        halluc = _clamp01(rng.gauss(0.12, 0.08) + drift_factor * 0.5)
        drift = _clamp01(rng.gauss(0.05, 0.03) + drift_factor)
        sentiment = _clamp01(rng.gauss(0.7, 0.1))
        toxicity = _clamp01(abs(rng.gauss(0.02, 0.02)))
        is_denial = rng.random() < 0.05
        length = rng.randint(20, 200)

        # RAG scores (70% of calls are RAG)
        is_rag = rng.random() < 0.7
        faith = ctx_rel = ans_rel = ground = None
        context = None
        if is_rag:
            faith = _clamp01(rng.gauss(0.75, 0.1))
            ctx_rel = _clamp01(rng.gauss(0.8, 0.08))
            ans_rel = _clamp01(rng.gauss(0.78, 0.09))
            ground = 0.4 * faith + 0.3 * ctx_rel + 0.3 * ans_rel
            context = f"Context for: {query}"

        # Compute health score as a weighted mean. Each value is stored
        # next to its weight so the numerator and the denominator always
        # cover the same set of components, even when ground == 0.0.
        quality = (1 - toxicity) * 0.5 + sentiment * 0.4 + 0.1
        weighted = [(1 - halluc, 0.35), (1 - drift, 0.25)]
        if ground is not None:
            weighted.append((ground, 0.20))
        weighted.append((quality, 0.15))
        total = sum(value * weight for value, weight in weighted)
        weight_sum = sum(weight for _, weight in weighted)
        health = max(0, min(100, int(total / weight_sum * 100)))

        records.append(
            EvalRecord(
                app_name="demo-app",
                timestamp=ts,
                query=query,
                context=context,
                response=f"Demo response for: {query}",
                model_name=model,
                latency_ms=rng.randint(50, 500),
                tags=["demo"],
                hallucination_score=round(halluc, 4),
                hallucination_method="embedding",
                drift_score=round(drift, 4),
                faithfulness_score=_round_or_none(faith),
                context_relevance=_round_or_none(ctx_rel),
                answer_relevancy=_round_or_none(ans_rel),
                groundedness_score=_round_or_none(ground),
                sentiment_score=round(sentiment, 4),
                toxicity_score=round(toxicity, 4),
                response_length=length,
                language_detected="en",
                is_denial=is_denial,
                health_score=health,
            )
        )
    return records


# ── Chart helpers (inlined from dashboard/charts.py) ─────────────────

_BG = "#0a0e1a"
_SURFACE = "#111827"
_BORDER = "#1e293b"
_TEXT = "#e2e8f0"
_TEXT_DIM = "#64748b"
_CYAN = "#06d6a0"
_AMBER = "#f59e0b"
_RED = "#ef4444"
_BLUE = "#3b82f6"
_PURPLE = "#a78bfa"
_PINK = "#f472b6"

_LAYOUT_BASE: dict = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="JetBrains Mono, monospace", color=_TEXT, size=11),
    margin=dict(l=48, r=24, t=48, b=40),
    xaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        zerolinecolor="rgba(255,255,255,0.06)",
        tickfont=dict(size=10, color=_TEXT_DIM),
    ),
    yaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        zerolinecolor="rgba(255,255,255,0.06)",
        tickfont=dict(size=10, color=_TEXT_DIM),
    ),
    legend=dict(
        font=dict(size=10, color=_TEXT_DIM),
        bgcolor="rgba(0,0,0,0)",
    ),
)


def _apply_layout(fig: go.Figure, height: int = 320, **kwargs) -> go.Figure:
    """Apply the shared base layout to *fig* and return it.

    Args:
        fig: Figure to update in place.
        height: Figure height passed to the layout.
        **kwargs: Extra layout entries. A key that also exists in the base
            layout (for example ``xaxis`` or ``margin``) replaces the base
            entry as a whole; the two dicts are not merged key by key.

    Returns:
        The same figure object, for chaining.
    """
    fig.update_layout(**{**_LAYOUT_BASE, "height": height, **kwargs})
    return fig


def empty_figure(title: str = "", message: str = "No data available") -> go.Figure:
    """Create an empty figure with a message.

    Args:
        title: Optional heading. Shown top-left when non-empty; an empty
            string adds no title.
        message: Text centred in the plot area.

    Returns:
        A figure with hidden axes carrying only the message annotation.
    """
    extras: dict = {}
    if title:
        extras["title"] = dict(
            text=title,
            font=dict(size=12, color=_TEXT_DIM),
            x=0,
            xanchor="left",
        )

    fig = go.Figure()
    _apply_layout(
        fig,
        height=260,
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        annotations=[
            dict(
                text=f"{message}",
                xref="paper",
                yref="paper",
                x=0.5,
                y=0.5,
                showarrow=False,
                font=dict(size=13, color=_TEXT_DIM),
            )
        ],
        **extras,
    )
    return fig
def health_gauge_chart(score: int | None = None) -> go.Figure:
    """Create a health score gauge chart (0-100).

    Args:
        score: Health score to display. ``None`` returns a placeholder
            figure reading "Awaiting first evaluation".

    Returns:
        A gauge indicator coloured by band: 75 and above uses the cyan
        accent, 40 to 74 amber, anything lower red.
    """
    if score is None:
        return empty_figure("", "Awaiting first evaluation")

    if score >= 75:
        bar_color = _CYAN
    elif score >= 40:
        bar_color = _AMBER
    else:
        bar_color = _RED

    indicator = go.Indicator(
        mode="gauge+number",
        value=score,
        number=dict(
            font=dict(size=48, color=bar_color, family="JetBrains Mono, monospace"),
            suffix="",
        ),
        gauge=dict(
            axis=dict(
                range=[0, 100],
                tickcolor=_TEXT_DIM,
                tickfont=dict(size=9, color=_TEXT_DIM),
                dtick=25,
            ),
            bgcolor="rgba(255,255,255,0.03)",
            bordercolor="rgba(255,255,255,0.08)",
            bar=dict(color=bar_color, thickness=0.75),
            # Faint background bands matching the colour thresholds above.
            steps=[
                dict(range=[0, 40], color="rgba(239,68,68,0.08)"),
                dict(range=[40, 75], color="rgba(245,158,11,0.06)"),
                dict(range=[75, 100], color="rgba(6,214,160,0.06)"),
            ],
        ),
    )
    fig = go.Figure(indicator)
    _apply_layout(fig, height=220, margin=dict(l=24, r=24, t=16, b=8))
    return fig


def _hex_to_rgba(hex_color: str, alpha: float) -> str:
    """Convert a ``#rrggbb`` colour into an ``rgba(r,g,b,alpha)`` string."""
    red, green, blue = (int(hex_color[i : i + 2], 16) for i in (1, 3, 5))
    return f"rgba({red},{green},{blue},{alpha})"


def radar_chart(
    categories: list[str],
    values: list[float],
    title: str = "",
) -> go.Figure:
    """Create a radar/spider chart for multi-dimensional scores.

    Args:
        categories: Axis labels, one per score. Any sequence is accepted.
        values: Scores plotted on a fixed 0-1 radial axis.
        title: Optional chart title, shown top-left.

    Returns:
        The radar figure, or a "No RAG data yet" placeholder when either
        input is empty.
    """
    if not categories or not values:
        return empty_figure(title, "No RAG data yet")

    # Pair labels with scores. zip() stops at the shorter input, so a
    # length mismatch drops the unmatched tail instead of producing a
    # trace whose r and theta arrays disagree.
    pairs = list(zip(categories, values))
    cats = [label for label, _ in pairs]
    vals = [score for _, score in pairs]

    # Close the polygon by repeating the first point at the end.
    cats.append(cats[0])
    vals.append(vals[0])

    fig = go.Figure()
    fig.add_trace(
        go.Scatterpolar(
            r=vals,
            theta=cats,
            fill="toself",
            fillcolor=_hex_to_rgba(_CYAN, 0.12),
            line=dict(color=_CYAN, width=2),
            marker=dict(size=5, color=_CYAN),
        )
    )
    _apply_layout(fig, height=340)
    fig.update_layout(
        polar=dict(
            bgcolor="rgba(0,0,0,0)",
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor="rgba(255,255,255,0.06)",
                tickfont=dict(size=8, color=_TEXT_DIM),
            ),
            angularaxis=dict(
                gridcolor="rgba(255,255,255,0.06)",
                tickfont=dict(size=10, color=_TEXT),
            ),
        ),
        title=dict(
            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
        ),
    )
    return fig
# ── Plotly dark theme for dashboard figures ──────────────────────────

_DARK_LAYOUT: dict = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="JetBrains Mono, monospace", color="#94a3b8", size=11),
    autosize=True,
    margin=dict(l=50, r=20, t=44, b=40),
    xaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        tickfont=dict(size=10, color="#475569"),
    ),
    yaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        tickfont=dict(size=10, color="#475569"),
    ),
    legend=dict(font=dict(size=10, color="#64748b"), bgcolor="rgba(0,0,0,0)"),
)


def _dark(fig: go.Figure, **kw) -> go.Figure:
    """Apply dark theme to a Plotly figure.

    Args:
        fig: Figure to update in place.
        **kw: Extra layout entries. A key that also exists in the theme
            replaces the theme entry as a whole.

    Returns:
        The same figure object, for chaining.
    """
    fig.update_layout(**{**_DARK_LAYOUT, **kw})
    return fig


# ── Data layer (demo-only) ───────────────────────────────────────────

# Lazily populated cache of synthetic records; filled on first fetch.
_DEMO_RECORDS: list[EvalRecord] | None = None


def _fetch_records(limit: int = 500) -> list[EvalRecord]:
    """Return cached demo records (generated once on first call).

    Args:
        limit: Maximum number of records to return. The cache holds 200
            records, so larger limits return all of them. Zero or a
            negative value returns an empty list.

    Returns:
        A new list holding the first ``limit`` cached records.
    """
    global _DEMO_RECORDS
    if _DEMO_RECORDS is None:
        _DEMO_RECORDS = generate_demo_records(200)
    # Clamp at zero: a raw negative bound would slice from the end and
    # return "all but the last k" records instead of none.
    return _DEMO_RECORDS[: max(limit, 0)]


def _fetch_alerts(limit: int = 20) -> list:
    """No real alerts in demo mode.

    ``limit`` is accepted but has no effect; the result is always an
    empty list.
    """
    return []


# ── KPI card HTML helper ────────────────────────────────────────────


def _kpi_card(label: str, value: str, sub: str, color: str) -> str:
    """Return the markup for one KPI card.

    NOTE(review): the template body is empty in the text under review, so
    none of the four arguments is interpolated and the result is a lone
    newline. The markup was presumably lost upstream — confirm against
    the deployed file before relying on this output.
    """
    return f"""
"""
# ── Tab 1: Overview ─────────────────────────────────────────────────


def build_overview():
    """Assemble every output of the Overview tab.

    Returns:
        A 7-tuple: four KPI card strings (health, hallucination, drift,
        evaluation count), the health gauge figure, the health trend
        figure, and the alert table rows (six columns per row).
    """
    records = _fetch_records(500)
    alerts = _fetch_alerts(20)

    # Empty state: placeholder cards, placeholder figures, one filler row.
    if not records:
        return (
            _kpi_card("Health Score", "---", "no data", "#06d6a0"),
            _kpi_card("Hallucination", "---", "no data", "#f59e0b"),
            _kpi_card("Drift", "---", "no data", "#3b82f6"),
            _kpi_card("Evaluations", "0", "", "#a78bfa"),
            health_gauge_chart(None),
            empty_figure("", "No evaluations yet"),
            [["No alerts yet", "", "", "", "", ""]],
        )

    # Headline averages across all fetched records.
    avg_health = int(sum(rec.health_score for rec in records) / len(records))
    avg_halluc = sum(rec.hallucination_score for rec in records) / len(records)
    known_drift = [
        rec.drift_score for rec in records if rec.drift_score is not None
    ]
    avg_drift = sum(known_drift) / len(known_drift) if known_drift else None

    # Health status label: first floor the average reaches, top down.
    h_sub = "CRITICAL"
    for floor, label in ((90, "HEALTHY"), (75, "MONITORING"), (60, "DEGRADING")):
        if avg_health >= floor:
            h_sub = label
            break

    # Drift card: value and status both depend on whether drift is known.
    if avg_drift is None:
        d_val = "..."
        d_sub = "BUILDING BASELINE"
    else:
        d_val = f"{avg_drift:.3f}"
        d_sub = "STABLE" if avg_drift < 0.15 else "DRIFTING"

    # Health trend over time, oldest record first.
    chronological = sorted(records, key=lambda rec: rec.timestamp)
    times = [rec.timestamp.strftime("%m-%d %H:%M") for rec in chronological]
    scores = [rec.health_score for rec in chronological]
    min_score = max(0, min(scores) - 10)

    trend = go.Figure()
    trend.add_trace(
        go.Scatter(
            x=times,
            y=scores,
            mode="lines",
            name="Health Score",
            line=dict(color="#06d6a0", width=2, shape="spline"),
            fill="tonexty" if min_score > 30 else "none",
            fillcolor="rgba(6,214,160,0.06)",
        )
    )

    # Only show threshold lines if they're within visible range
    thresholds = (
        (75, "#f59e0b", "Warning: 75"),
        (40, "#ef4444", "Critical: 40"),
    )
    for level, line_color, caption in thresholds:
        if min_score <= level:
            trend.add_hline(
                y=level,
                line_dash="dot",
                line_color=line_color,
                line_width=1,
                annotation_text=caption,
                annotation_font_size=9,
                annotation_font_color=line_color,
            )

    _dark(
        trend,
        title="Health Score Trend",
        yaxis=dict(range=[min_score, 105], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    # Alert table: one row per alert, or a single filler row when empty.
    if alerts:
        alert_rows = [
            [
                alert.timestamp.strftime("%Y-%m-%d %H:%M"),
                alert.severity.upper(),
                alert.metric,
                f"{alert.value:.4f}",
                f"{alert.threshold:.4f}",
                alert.message,
            ]
            for alert in alerts[:20]
        ]
    else:
        alert_rows = [["---", "", "", "", "", "No alerts triggered"]]

    return (
        _kpi_card("Health Score", str(avg_health), h_sub, "#06d6a0"),
        _kpi_card(
            "Hallucination", f"{avg_halluc:.1%}", f"avg of {len(records)}", "#f59e0b"
        ),
        _kpi_card("Drift", d_val, d_sub, "#3b82f6"),
        _kpi_card("Evaluations", f"{len(records):,}", "total tracked", "#a78bfa"),
        health_gauge_chart(avg_health),
        trend,
        alert_rows,
    )
# ── Tab 2: Hallucination ────────────────────────────────────────────


def build_hallucination():
    """Assemble every output of the Hallucination tab.

    Returns:
        A 4-tuple: the score-over-time figure, the score histogram, the
        per-model average bar chart, and table rows for the ten
        highest-scoring records (five columns per row).
    """
    records = _fetch_records(500)

    # Empty state: one shared placeholder figure and a single filler row.
    if not records:
        placeholder = empty_figure("", "No data yet")
        return placeholder, placeholder, placeholder, [["No data", "", "", "", ""]]

    chronological = sorted(records, key=lambda rec: rec.timestamp)
    times = [rec.timestamp.strftime("%m-%d %H:%M") for rec in chronological]
    h_scores = [rec.hallucination_score for rec in chronological]

    # Score over time, with the 0.3 threshold marked.
    rate = go.Figure()
    rate.add_trace(
        go.Scatter(
            x=times,
            y=h_scores,
            mode="lines",
            line=dict(color="#ef4444", width=2, shape="spline"),
            fill="tozeroy",
            fillcolor="rgba(239,68,68,0.08)",
        )
    )
    rate.add_hline(
        y=0.3,
        line_dash="dot",
        line_color="#f59e0b",
        annotation_text="Threshold 0.3",
        annotation_font_size=9,
        annotation_font_color="#f59e0b",
    )
    _dark(
        rate,
        title="Hallucination Score Over Time",
        yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    # Histogram of the same scores.
    histogram = go.Histogram(
        x=h_scores,
        nbinsx=25,
        marker_color="#ef4444",
        opacity=0.7,
        marker_line_width=0,
    )
    dist = go.Figure(histogram)
    dist.add_vline(x=0.3, line_dash="dot", line_color="#f59e0b")
    _dark(dist, title="Score Distribution", height=300, bargap=0.05)

    # Mean score per model; bars above the 0.3 threshold are drawn red.
    scores_by_model: dict[str, list[float]] = defaultdict(list)
    for rec in records:
        scores_by_model[rec.model_name].append(rec.hallucination_score)
    model_names = list(scores_by_model.keys())
    avgs = [sum(vals) / len(vals) for vals in scores_by_model.values()]
    bar_colors = ["#ef4444" if avg > 0.3 else "#06d6a0" for avg in avgs]
    model_fig = go.Figure(
        go.Bar(
            x=model_names,
            y=avgs,
            marker_color=bar_colors,
            marker_line_width=0,
        )
    )
    _dark(model_fig, title="Avg Hallucination by Model", height=300)

    # Table of the ten worst offenders, highest score first.
    worst = sorted(records, key=lambda rec: rec.hallucination_score, reverse=True)
    rows = []
    for rec in worst[:10]:
        claims = ", ".join(rec.flagged_claims[:2]) if rec.flagged_claims else ""
        rows.append(
            [
                rec.timestamp.strftime("%H:%M:%S"),
                rec.query[:50],
                rec.response[:60],
                f"{rec.hallucination_score:.3f}",
                claims,
            ]
        )

    return rate, dist, model_fig, rows
[sum(v) / len(v) for v in ms.values()] model_fig = go.Figure( go.Bar( x=model_names, y=avgs, marker_color=["#ef4444" if a > 0.3 else "#06d6a0" for a in avgs], marker_line_width=0, ) ) _dark(model_fig, title="Avg Hallucination by Model", height=300) top = sorted(records, key=lambda r: r.hallucination_score, reverse=True)[:10] rows = [ [ r.timestamp.strftime("%H:%M:%S"), r.query[:50], r.response[:60], f"{r.hallucination_score:.3f}", ", ".join(r.flagged_claims[:2]) if r.flagged_claims else "", ] for r in top ] return rate, dist, model_fig, rows # ── Tab 3: Drift ──────────────────────────────────────────────────── def build_drift(): records = _fetch_records(500) if not records: e = empty_figure("", "No data yet") return e, e, "No data" sorted_recs = sorted(records, key=lambda r: r.timestamp) drift_recs = [r for r in sorted_recs if r.drift_score is not None] emb_recs = [ r for r in sorted_recs if r.embedding_vector and len(r.embedding_vector) > 2 ] if len(emb_recs) >= 3: embed = go.Figure( go.Scatter( x=[r.embedding_vector[0] for r in emb_recs], y=[r.embedding_vector[1] for r in emb_recs], mode="markers", marker=dict( size=8, color=[r.hallucination_score for r in emb_recs], colorscale=[ [0, "#06d6a0"], [0.5, "#f59e0b"], [1, "#ef4444"], ], showscale=True, colorbar=dict( title="Halluc", tickfont=dict(size=9, color="#64748b"), titlefont=dict(size=10, color="#64748b"), ), line=dict(width=0), ), text=[r.query[:30] for r in emb_recs], hovertemplate="%{text}