"""EvalPulse Demo Dashboard — self-contained HuggingFace Spaces deployment.

Runs entirely on synthetic data. No external dependencies on evalpulse or
dashboard packages.
"""

from __future__ import annotations

import random
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone

import gradio as gr
import plotly.graph_objects as go

# ── Lightweight EvalRecord (replaces pydantic model) ─────────────────

UTC = timezone.utc


@dataclass
class EvalRecord:
    """Minimal evaluation record for demo purposes."""

    app_name: str = "default"
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    query: str = ""
    context: str | None = None
    response: str = ""
    model_name: str = "unknown"
    latency_ms: int = 0
    tags: list[str] = field(default_factory=list)

    # Hallucination
    hallucination_score: float = 0.0
    hallucination_method: str = "none"
    flagged_claims: list[str] = field(default_factory=list)

    # Drift
    embedding_vector: list[float] = field(default_factory=list)
    drift_score: float | None = None

    # RAG Quality (None when the call did not use retrieval)
    faithfulness_score: float | None = None
    context_relevance: float | None = None
    answer_relevancy: float | None = None
    groundedness_score: float | None = None

    # Response Quality
    sentiment_score: float = 0.5
    toxicity_score: float = 0.0
    response_length: int = 0
    language_detected: str = "en"
    is_denial: bool = False

    # Composite
    health_score: int = 0


# ── Demo data generator ─────────────────────────────────────────────


def _clamp01(value: float) -> float:
    """Clamp *value* into the closed interval [0, 1]."""
    return max(0.0, min(1.0, value))


def _round_or_none(value: float | None) -> float | None:
    """Round to 4 decimals, passing ``None`` through (0.0 stays 0.0)."""
    return None if value is None else round(value, 4)


def generate_demo_records(n: int = 200) -> list[EvalRecord]:
    """Generate N synthetic EvalRecords with realistic distributions.

    Simulates an LLM app with:
    - Generally good performance (health 70-95)
    - Occasional hallucination spikes
    - Gradual drift over time
    - Some toxic/denial responses

    Records are spaced one hour apart, ending one hour before "now". The
    scores are drawn from a private RNG seeded with 42, so they are the same
    on every call and the process-wide ``random`` state is left untouched.
    """
    rng = random.Random(42)
    records: list[EvalRecord] = []
    now = datetime.now(UTC)

    queries = [
        "What is machine learning?",
        "Explain neural networks",
        "How does RAG work?",
        "What is Python used for?",
        "Describe transformer architecture",
        "What are embeddings?",
        "How do LLMs handle context?",
        "What is fine-tuning?",
        "Explain attention mechanism",
        "What is prompt engineering?",
    ]
    models = ["llama-3.1-70b", "gpt-4o-mini", "gemini-flash"]

    # NOTE: the order of RNG draws below is part of the output; reordering
    # them changes every generated record.
    for i in range(n):
        ts = now - timedelta(hours=n - i)
        query = rng.choice(queries)
        model = rng.choice(models)

        # Simulate drift: later responses drift slightly
        drift_factor = i / n * 0.1

        # Base scores
        halluc = _clamp01(rng.gauss(0.12, 0.08) + drift_factor * 0.5)
        drift = _clamp01(rng.gauss(0.05, 0.03) + drift_factor)
        sentiment = _clamp01(rng.gauss(0.7, 0.1))
        toxicity = _clamp01(abs(rng.gauss(0.02, 0.02)))
        is_denial = rng.random() < 0.05
        length = rng.randint(20, 200)

        # RAG scores (70% of calls are RAG)
        is_rag = rng.random() < 0.7
        faith = ctx_rel = ans_rel = ground = context = None
        if is_rag:
            faith = _clamp01(rng.gauss(0.75, 0.1))
            ctx_rel = _clamp01(rng.gauss(0.8, 0.08))
            ans_rel = _clamp01(rng.gauss(0.78, 0.09))
            ground = 0.4 * faith + 0.3 * ctx_rel + 0.3 * ans_rel
            context = f"Context for: {query}"

        # Compute health score as a weighted mean. Weights and weighted
        # terms are built together so the denominator always matches the
        # terms actually included (a groundedness of 0.0 still counts).
        weights = [0.35, 0.25]
        weighted = [(1 - halluc) * 0.35, (1 - drift) * 0.25]
        if ground is not None:
            weights.append(0.20)
            weighted.append(ground * 0.20)
        quality = (1 - toxicity) * 0.5 + sentiment * 0.4 + 0.1
        weights.append(0.15)
        weighted.append(quality * 0.15)
        health = int(sum(weighted) / sum(weights) * 100)
        health = max(0, min(100, health))

        records.append(
            EvalRecord(
                app_name="demo-app",
                timestamp=ts,
                query=query,
                context=context,
                response=f"Demo response for: {query}",
                model_name=model,
                latency_ms=rng.randint(50, 500),
                tags=["demo"],
                hallucination_score=round(halluc, 4),
                hallucination_method="embedding",
                drift_score=round(drift, 4),
                faithfulness_score=_round_or_none(faith),
                context_relevance=_round_or_none(ctx_rel),
                answer_relevancy=_round_or_none(ans_rel),
                groundedness_score=_round_or_none(ground),
                sentiment_score=round(sentiment, 4),
                toxicity_score=round(toxicity, 4),
                response_length=length,
                language_detected="en",
                is_denial=is_denial,
                health_score=health,
            )
        )

    return records


# ── Chart helpers (inlined from dashboard/charts.py) ─────────────────

_BG = "#0a0e1a"
_SURFACE = "#111827"
_BORDER = "#1e293b"
_TEXT = "#e2e8f0"
_TEXT_DIM = "#64748b"
_CYAN = "#06d6a0"
_AMBER = "#f59e0b"
_RED = "#ef4444"
_BLUE = "#3b82f6"
_PURPLE = "#a78bfa"
_PINK = "#f472b6"

_LAYOUT_BASE: dict = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="JetBrains Mono, monospace", color=_TEXT, size=11),
    margin=dict(l=48, r=24, t=48, b=40),
    xaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        zerolinecolor="rgba(255,255,255,0.06)",
        tickfont=dict(size=10, color=_TEXT_DIM),
    ),
    yaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        zerolinecolor="rgba(255,255,255,0.06)",
        tickfont=dict(size=10, color=_TEXT_DIM),
    ),
    legend=dict(
        font=dict(size=10, color=_TEXT_DIM),
        bgcolor="rgba(0,0,0,0)",
    ),
)


def _apply_layout(fig: go.Figure, height: int = 320, **kwargs) -> go.Figure:
    """Apply ``_LAYOUT_BASE`` plus *height* to *fig*; *kwargs* override keys."""
    layout = {**_LAYOUT_BASE, "height": height}
    layout.update(kwargs)
    fig.update_layout(**layout)
    return fig


def _hex_to_rgba(hex_color: str, alpha: float) -> str:
    """Convert ``#rrggbb`` into an ``rgba(r,g,b,alpha)`` CSS colour string."""
    red, green, blue = (int(hex_color[i : i + 2], 16) for i in (1, 3, 5))
    return f"rgba({red},{green},{blue},{alpha})"


def empty_figure(title: str = "", message: str = "No data available") -> go.Figure:
    """Create an empty figure with a message.

    Both axes are hidden and *message* is centred in the plot area. A
    non-empty *title* is shown top-left; an empty one adds nothing.
    """
    extra: dict = {}
    if title:
        extra["title"] = dict(
            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
        )

    fig = go.Figure()
    _apply_layout(
        fig,
        height=260,
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        annotations=[
            dict(
                text=f"{message}",
                xref="paper",
                yref="paper",
                x=0.5,
                y=0.5,
                showarrow=False,
                font=dict(size=13, color=_TEXT_DIM),
            )
        ],
        **extra,
    )
    return fig


def health_gauge_chart(score: int | None = None) -> go.Figure:
    """Create a health score gauge chart (0-100).

    Returns a placeholder figure when *score* is ``None``. The bar is cyan
    at 75 and above, amber from 40 to 74, and red below 40.
    """
    if score is None:
        return empty_figure("", "Awaiting first evaluation")

    if score >= 75:
        bar_color = _CYAN
    elif score >= 40:
        bar_color = _AMBER
    else:
        bar_color = _RED

    fig = go.Figure(
        go.Indicator(
            mode="gauge+number",
            value=score,
            number=dict(
                font=dict(
                    size=48, color=bar_color, family="JetBrains Mono, monospace"
                ),
                suffix="",
            ),
            gauge=dict(
                axis=dict(
                    range=[0, 100],
                    tickcolor=_TEXT_DIM,
                    tickfont=dict(size=9, color=_TEXT_DIM),
                    dtick=25,
                ),
                bgcolor="rgba(255,255,255,0.03)",
                bordercolor="rgba(255,255,255,0.08)",
                bar=dict(color=bar_color, thickness=0.75),
                steps=[
                    dict(range=[0, 40], color="rgba(239,68,68,0.08)"),
                    dict(range=[40, 75], color="rgba(245,158,11,0.06)"),
                    dict(range=[75, 100], color="rgba(6,214,160,0.06)"),
                ],
            ),
        )
    )
    _apply_layout(fig, height=220, margin=dict(l=24, r=24, t=16, b=8))
    return fig


def radar_chart(
    categories: list[str],
    values: list[float],
    title: str = "",
) -> go.Figure:
    """Create a radar/spider chart for multi-dimensional scores.

    Returns a placeholder figure when either list is empty. The radial axis
    is fixed to the range [0, 1].
    """
    if not categories or not values:
        return empty_figure(title, "No RAG data yet")

    # Close the polygon by repeating the first point at the end.
    cats = categories + [categories[0]]
    vals = values + [values[0]]

    fig = go.Figure()
    fig.add_trace(
        go.Scatterpolar(
            r=vals,
            theta=cats,
            fill="toself",
            fillcolor=_hex_to_rgba(_CYAN, 0.12),
            line=dict(color=_CYAN, width=2),
            marker=dict(size=5, color=_CYAN),
        )
    )
    _apply_layout(fig, height=340)
    fig.update_layout(
        polar=dict(
            bgcolor="rgba(0,0,0,0)",
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor="rgba(255,255,255,0.06)",
                tickfont=dict(size=8, color=_TEXT_DIM),
            ),
            angularaxis=dict(
                gridcolor="rgba(255,255,255,0.06)",
                tickfont=dict(size=10, color=_TEXT),
            ),
        ),
        title=dict(
            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
        ),
    )
    return fig


# ── Plotly dark theme for dashboard figures ──────────────────────────

_DARK_LAYOUT: dict = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(family="JetBrains Mono, monospace", color="#94a3b8", size=11),
    autosize=True,
    margin=dict(l=50, r=20, t=44, b=40),
    xaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        tickfont=dict(size=10, color="#475569"),
    ),
    yaxis=dict(
        gridcolor="rgba(255,255,255,0.04)",
        tickfont=dict(size=10, color="#475569"),
    ),
    legend=dict(font=dict(size=10, color="#64748b"), bgcolor="rgba(0,0,0,0)"),
)


def _dark(fig: go.Figure, **kw) -> go.Figure:
    """Apply dark theme to a Plotly figure; *kw* overrides theme keys."""
    layout = {**_DARK_LAYOUT, **kw}
    fig.update_layout(**layout)
    return fig


# ── Data layer (demo-only) ───────────────────────────────────────────

_DEMO_RECORDS: list[EvalRecord] | None = None


def _fetch_records(limit: int = 500) -> list[EvalRecord]:
    """Return cached demo records (generated once on first call).

    At most *limit* records are returned, taken from the start of the cache.
    """
    global _DEMO_RECORDS
    if _DEMO_RECORDS is None:
        _DEMO_RECORDS = generate_demo_records(200)
    return _DEMO_RECORDS[:limit]


def _fetch_alerts(limit: int = 20) -> list:
    """No real alerts in demo mode."""
    return []


def _sorted_by_time(records: list[EvalRecord]) -> list[EvalRecord]:
    """Return *records* ordered oldest-first by timestamp."""
    return sorted(records, key=lambda r: r.timestamp)


def _time_labels(records: list[EvalRecord]) -> list[str]:
    """Format each record's timestamp as the ``MM-DD HH:MM`` x-axis label."""
    return [r.timestamp.strftime("%m-%d %H:%M") for r in records]


# ── KPI card HTML helper ────────────────────────────────────────────


def _kpi_card(label: str, value: str, sub: str, color: str) -> str:
    """Return the text fragment for one KPI card: label, value, sub-label.

    NOTE(review): *color* is not used by the string below, and the result
    carries no markup even though it is rendered through ``gr.HTML``. This
    looks like HTML that was lost upstream; confirm against the original.
    """
    return f"\n{label}\n{value}\n{sub}\n"


# ── Tab 1: Overview ─────────────────────────────────────────────────


def build_overview() -> tuple:
    """Build the Overview tab content.

    Returns a 7-tuple: four KPI card strings (health, hallucination, drift,
    evaluation count), the health gauge figure, the health trend figure, and
    the alert table rows.
    """
    records = _fetch_records(500)
    alerts = _fetch_alerts(20)

    if not records:
        return (
            _kpi_card("Health Score", "---", "no data", "#06d6a0"),
            _kpi_card("Hallucination", "---", "no data", "#f59e0b"),
            _kpi_card("Drift", "---", "no data", "#3b82f6"),
            _kpi_card("Evaluations", "0", "", "#a78bfa"),
            health_gauge_chart(None),
            empty_figure("", "No evaluations yet"),
            [["No alerts yet", "", "", "", "", ""]],
        )

    total = len(records)
    avg_health = int(sum(r.health_score for r in records) / total)
    avg_halluc = sum(r.hallucination_score for r in records) / total
    drift_vals = [r.drift_score for r in records if r.drift_score is not None]
    avg_drift = sum(drift_vals) / len(drift_vals) if drift_vals else None

    if avg_health >= 90:
        h_sub = "HEALTHY"
    elif avg_health >= 75:
        h_sub = "MONITORING"
    elif avg_health >= 60:
        h_sub = "DEGRADING"
    else:
        h_sub = "CRITICAL"

    if avg_drift is None:
        d_val, d_sub = "...", "BUILDING BASELINE"
    else:
        d_val = f"{avg_drift:.3f}"
        d_sub = "STABLE" if avg_drift < 0.15 else "DRIFTING"

    sorted_recs = _sorted_by_time(records)
    times = _time_labels(sorted_recs)
    scores = [r.health_score for r in sorted_recs]

    trend = go.Figure()
    # Lower bound of the y-axis: 10 points below the worst score, floored at 0.
    min_score = max(0, min(scores) - 10)
    trend.add_trace(
        go.Scatter(
            x=times,
            y=scores,
            mode="lines",
            name="Health Score",
            line=dict(color="#06d6a0", width=2, shape="spline"),
            fill="tonexty" if min_score > 30 else "none",
            fillcolor="rgba(6,214,160,0.06)",
        )
    )
    # Only show threshold lines if they're within visible range
    if min_score <= 75:
        trend.add_hline(
            y=75,
            line_dash="dot",
            line_color="#f59e0b",
            line_width=1,
            annotation_text="Warning: 75",
            annotation_font_size=9,
            annotation_font_color="#f59e0b",
        )
    if min_score <= 40:
        trend.add_hline(
            y=40,
            line_dash="dot",
            line_color="#ef4444",
            line_width=1,
            annotation_text="Critical: 40",
            annotation_font_size=9,
            annotation_font_color="#ef4444",
        )
    _dark(
        trend,
        title="Health Score Trend",
        yaxis=dict(range=[min_score, 105], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    if alerts:
        alert_rows = [
            [
                a.timestamp.strftime("%Y-%m-%d %H:%M"),
                a.severity.upper(),
                a.metric,
                f"{a.value:.4f}",
                f"{a.threshold:.4f}",
                a.message,
            ]
            for a in alerts[:20]
        ]
    else:
        alert_rows = [["---", "", "", "", "", "No alerts triggered"]]

    return (
        _kpi_card("Health Score", str(avg_health), h_sub, "#06d6a0"),
        _kpi_card("Hallucination", f"{avg_halluc:.1%}", f"avg of {total}", "#f59e0b"),
        _kpi_card("Drift", d_val, d_sub, "#3b82f6"),
        _kpi_card("Evaluations", f"{total:,}", "total tracked", "#a78bfa"),
        health_gauge_chart(avg_health),
        trend,
        alert_rows,
    )


# ── Tab 2: Hallucination ────────────────────────────────────────────


def build_hallucination() -> tuple:
    """Build the Hallucination tab content.

    Returns a 4-tuple: score-over-time figure, score histogram, per-model
    average bar chart, and table rows for the 10 highest-scoring records.
    """
    records = _fetch_records(500)
    if not records:
        e = empty_figure("", "No data yet")
        return e, e, e, [["No data", "", "", "", ""]]

    sorted_recs = _sorted_by_time(records)
    times = _time_labels(sorted_recs)
    h_scores = [r.hallucination_score for r in sorted_recs]

    rate = go.Figure()
    rate.add_trace(
        go.Scatter(
            x=times,
            y=h_scores,
            mode="lines",
            line=dict(color="#ef4444", width=2, shape="spline"),
            fill="tozeroy",
            fillcolor="rgba(239,68,68,0.08)",
        )
    )
    rate.add_hline(
        y=0.3,
        line_dash="dot",
        line_color="#f59e0b",
        annotation_text="Threshold 0.3",
        annotation_font_size=9,
        annotation_font_color="#f59e0b",
    )
    _dark(
        rate,
        title="Hallucination Score Over Time",
        yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    dist = go.Figure(
        go.Histogram(
            x=h_scores,
            nbinsx=25,
            marker_color="#ef4444",
            opacity=0.7,
            marker_line_width=0,
        )
    )
    dist.add_vline(x=0.3, line_dash="dot", line_color="#f59e0b")
    _dark(dist, title="Score Distribution", height=300, bargap=0.05)

    # Group scores per model, then average each group.
    ms: dict[str, list[float]] = defaultdict(list)
    for r in records:
        ms[r.model_name].append(r.hallucination_score)
    model_names = list(ms)
    avgs = [sum(v) / len(v) for v in ms.values()]
    model_fig = go.Figure(
        go.Bar(
            x=model_names,
            y=avgs,
            # Red when a model's average exceeds the 0.3 threshold.
            marker_color=["#ef4444" if a > 0.3 else "#06d6a0" for a in avgs],
            marker_line_width=0,
        )
    )
    _dark(model_fig, title="Avg Hallucination by Model", height=300)

    top = sorted(records, key=lambda r: r.hallucination_score, reverse=True)[:10]
    rows = [
        [
            r.timestamp.strftime("%H:%M:%S"),
            r.query[:50],
            r.response[:60],
            f"{r.hallucination_score:.3f}",
            ", ".join(r.flagged_claims[:2]) if r.flagged_claims else "",
        ]
        for r in top
    ]
    return rate, dist, model_fig, rows


# ── Tab 3: Drift ────────────────────────────────────────────────────


def build_drift() -> tuple:
    """Build the Semantic Drift tab content.

    Returns a 3-tuple: drift-over-time figure, embedding scatter figure, and
    a status string.
    """
    records = _fetch_records(500)
    if not records:
        e = empty_figure("", "No data yet")
        return e, e, "No data"

    sorted_recs = _sorted_by_time(records)
    drift_recs = [r for r in sorted_recs if r.drift_score is not None]
    emb_recs = [
        r for r in sorted_recs if r.embedding_vector and len(r.embedding_vector) > 2
    ]

    if len(emb_recs) >= 3:
        embed = go.Figure(
            go.Scatter(
                # Only the first two embedding components are plotted.
                x=[r.embedding_vector[0] for r in emb_recs],
                y=[r.embedding_vector[1] for r in emb_recs],
                mode="markers",
                marker=dict(
                    size=8,
                    color=[r.hallucination_score for r in emb_recs],
                    colorscale=[
                        [0, "#06d6a0"],
                        [0.5, "#f59e0b"],
                        [1, "#ef4444"],
                    ],
                    showscale=True,
                    colorbar=dict(
                        # ``title.font`` replaces the deprecated ``titlefont``
                        # attribute, which newer Plotly releases reject.
                        title=dict(
                            text="Halluc",
                            font=dict(size=10, color="#64748b"),
                        ),
                        tickfont=dict(size=9, color="#64748b"),
                    ),
                    line=dict(width=0),
                ),
                text=[r.query[:30] for r in emb_recs],
                # Plotly hover text breaks lines on <br>, not on "\n".
                hovertemplate="%{text}<br>Halluc: %{marker.color:.3f}",
            )
        )
        _dark(embed, title="Response Embedding Space", height=350)
    else:
        embed = empty_figure("", "Need more data for visualization")

    if not drift_recs:
        return (
            empty_figure("", "Building baseline (need 10+ evaluations)"),
            embed,
            "Building baseline...",
        )

    times = _time_labels(drift_recs)
    scores = [r.drift_score for r in drift_recs]

    dfig = go.Figure()
    dfig.add_trace(
        go.Scatter(
            x=times,
            y=scores,
            mode="lines",
            line=dict(color="#a78bfa", width=2, shape="spline"),
            fill="tozeroy",
            fillcolor="rgba(167,139,250,0.08)",
        )
    )
    dfig.add_hline(
        y=0.15,
        line_dash="dot",
        line_color="#ef4444",
        annotation_text="Threshold 0.15",
        annotation_font_size=9,
        annotation_font_color="#ef4444",
    )
    # Keep at least 0.3 of headroom so the 0.15 threshold stays visible.
    y_max = max(max(scores) * 1.2, 0.3)
    _dark(
        dfig,
        title="Drift Score Over Time",
        yaxis=dict(range=[0, y_max], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    avg = sum(scores) / len(scores)
    if avg < 0.1:
        st = "Stable"
    elif avg < 0.2:
        st = "Minor drift"
    else:
        st = "Significant drift!"

    return dfig, embed, st


# ── Tab 4: RAG & Quality ────────────────────────────────────────────


def build_rag_quality() -> tuple:
    """Build the RAG & Quality tab content.

    Returns a 4-tuple: sentiment/toxicity figure, RAG metrics figure, RAG
    radar figure, and the language-distribution bar chart.
    """
    records = _fetch_records(500)
    if not records:
        e = empty_figure("", "No data yet")
        return e, e, e, e

    sorted_recs = _sorted_by_time(records)
    times = _time_labels(sorted_recs)

    qfig = go.Figure()
    qfig.add_trace(
        go.Scatter(
            x=times,
            y=[r.sentiment_score for r in sorted_recs],
            mode="lines",
            name="Sentiment",
            line=dict(color="#3b82f6", width=2, shape="spline"),
        )
    )
    qfig.add_trace(
        go.Scatter(
            x=times,
            y=[r.toxicity_score for r in sorted_recs],
            mode="lines",
            name="Toxicity",
            line=dict(color="#ef4444", width=2, shape="spline"),
        )
    )
    _dark(
        qfig,
        title="Quality Metrics Over Time",
        yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
        height=350,
    )

    # A record counts as a RAG call when it has a groundedness score.
    rag_recs = [r for r in sorted_recs if r.groundedness_score is not None]
    if rag_recs:
        rt = _time_labels(rag_recs)
        rfig = go.Figure()
        rfig.add_trace(
            go.Scatter(
                x=rt,
                y=[r.faithfulness_score or 0 for r in rag_recs],
                mode="lines",
                name="Faithfulness",
                line=dict(color="#06d6a0", width=2, shape="spline"),
            )
        )
        rfig.add_trace(
            go.Scatter(
                x=rt,
                y=[r.context_relevance or 0 for r in rag_recs],
                mode="lines",
                name="Context Relevance",
                line=dict(color="#3b82f6", width=2, shape="spline"),
            )
        )
        rfig.add_trace(
            go.Scatter(
                x=rt,
                y=[r.groundedness_score or 0 for r in rag_recs],
                mode="lines",
                name="Groundedness",
                line=dict(color="#a78bfa", width=2, dash="dash"),
            )
        )
        _dark(
            rfig,
            title="RAG Quality Metrics",
            yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
            height=350,
        )

        count = len(rag_recs)
        af = sum(r.faithfulness_score or 0 for r in rag_recs) / count
        ac = sum(r.context_relevance or 0 for r in rag_recs) / count
        aa = sum(r.answer_relevancy or 0 for r in rag_recs) / count
        ag = sum(r.groundedness_score or 0 for r in rag_recs) / count
        radar = radar_chart(
            ["Faithfulness", "Context Relevance", "Answer Relevancy", "Groundedness"],
            [af, ac, aa, ag],
            title="RAG Quality Radar",
        )
    else:
        rfig = empty_figure("", "No RAG calls yet")
        radar = empty_figure("", "No RAG data")

    lang: dict[str, int] = defaultdict(int)
    denials = 0
    for r in records:
        lang[r.language_detected] += 1
        if r.is_denial:
            denials += 1

    bfig = go.Figure(
        go.Bar(
            x=list(lang.keys()),
            y=list(lang.values()),
            marker_color="#3b82f6",
            marker_line_width=0,
        )
    )
    _dark(
        bfig,
        title=f"Language Distribution | Denials: {denials}/{len(records)}",
        height=300,
    )

    return qfig, rfig, radar, bfig


# ── CSS ─────────────────────────────────────────────────────────────

THEME_CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;600;700&family=Outfit:wght@300;400;500;600;700;800&display=swap');
body, .gradio-container { background: #060a14 !important; color: #e2e8f0 !important; font-family: 'Outfit', sans-serif !important; }
.gradio-container { max-width: 100% !important; width: 100% !important; margin: 0 !important; padding: 0 20px !important; box-sizing: border-box !important; overflow-x: hidden !important; }
.main, .wrap, .contain { max-width: 100% !important; width: 100% !important; overflow-x: hidden !important; }
.app { max-width: 100% !important; overflow-x: hidden !important; }
/* Plotly charts should not overflow */
.js-plotly-plot, .plotly, .plot-container, .svg-container { max-width: 100% !important; width: 100% !important; overflow: hidden !important; }
.js-plotly-plot .main-svg, .js-plotly-plot .svg-container { max-width: 100% !important; width: 100% !important; }
.plot-container.plotly { width: 100% !important; }
/* Gradio plot wrapper */
.gr-plot, .plot-padding { max-width: 100% !important; overflow: hidden !important; }
::-webkit-scrollbar { width: 6px; }
::-webkit-scrollbar-track { background: #0a0e1a; }
::-webkit-scrollbar-thumb { background: #1e293b; border-radius: 3px; }
.ep-hdr { position: relative; padding: 24px 32px; margin: 0 -20px 20px -20px; background: linear-gradient(135deg, #0a0e1a 0%, #111827 50%, #0f172a 100%); border-bottom: 1px solid rgba(6,214,160,0.15); overflow: hidden; box-sizing: border-box; }
.ep-hdr::before { content:'';position:absolute;inset:0; background: radial-gradient(ellipse 600px 300px at 15% 50%,rgba(6,214,160,0.06),transparent 70%), radial-gradient(ellipse 400px 200px at 85% 30%,rgba(59,130,246,0.04),transparent 70%); pointer-events:none; }
.ep-hdr-in { position:relative;display:flex;align-items:center;justify-content:space-between;z-index:1; }
.ep-brand { display:flex;align-items:center;gap:14px; }
.ep-logo { width:40px;height:40px;border-radius:10px; background:linear-gradient(135deg,#06d6a0,#3b82f6); display:flex;align-items:center;justify-content:center; font-size:18px;font-weight:700;color:#060a14; font-family:'JetBrains Mono',monospace; box-shadow:0 0 20px rgba(6,214,160,0.3); }
.ep-t { font-family:'Outfit';font-size:1.6em;font-weight:700;letter-spacing:-0.5px;color:#f1f5f9!important;margin:0!important; }
.ep-st { font-family:'JetBrains Mono';font-size:0.7em;color:#64748b!important;margin:3px 0 0!important;letter-spacing:0.5px;text-transform:uppercase; }
.ep-live { display:flex;align-items:center;gap:8px;font-family:'JetBrains Mono';font-size:0.72em;color:#06d6a0;letter-spacing:0.3px; }
.ep-dot { width:7px;height:7px;border-radius:50%;background:#06d6a0; box-shadow:0 0 8px rgba(6,214,160,0.6); animation:pdot 2s ease-in-out infinite; }
@keyframes pdot { 0%,100%{opacity:1} 50%{opacity:0.4} }
.tab-nav { background:transparent!important;border:none!important;gap:4px!important;padding:0 0 14px!important;border-bottom:1px solid #1e293b!important;margin-bottom:18px!important; }
.tab-nav button { font-family:'JetBrains Mono',monospace!important;font-size:0.76em!important;font-weight:500!important; letter-spacing:0.5px!important;text-transform:uppercase!important;color:#64748b!important; background:transparent!important;border:1px solid transparent!important;border-radius:8px!important; padding:8px 18px!important;transition:all 0.2s!important; }
.tab-nav button:hover { color:#e2e8f0!important;background:rgba(255,255,255,0.03)!important; }
.tab-nav button.selected { color:#06d6a0!important;background:rgba(6,214,160,0.08)!important;border-color:rgba(6,214,160,0.2)!important; }
.tabitem { border:none!important;background:transparent!important;padding:0!important; }
table { background:#111827!important;border:1px solid #1e293b!important;border-radius:10px!important;overflow:hidden!important; }
table thead th { background:#0f172a!important;color:#64748b!important; font-family:'JetBrains Mono',monospace!important;font-size:0.7em!important; font-weight:600!important;letter-spacing:0.8px!important;text-transform:uppercase!important; padding:10px 14px!important;border-bottom:1px solid #1e293b!important; }
table tbody td { background:#111827!important;color:#cbd5e1!important; font-family:'JetBrains Mono',monospace!important;font-size:0.78em!important; padding:8px 14px!important;border-bottom:1px solid rgba(30,41,59,0.5)!important; }
table tbody tr:hover td { background:rgba(6,214,160,0.03)!important; }
button.primary, button.secondary { font-family:'JetBrains Mono',monospace!important;font-size:0.74em!important; letter-spacing:0.4px!important;border-radius:8px!important; }
button.primary { background:rgba(6,214,160,0.12)!important;color:#06d6a0!important;border:1px solid rgba(6,214,160,0.25)!important; }
button.primary:hover { background:rgba(6,214,160,0.2)!important; }
button.secondary { background:rgba(59,130,246,0.1)!important;color:#3b82f6!important;border:1px solid rgba(59,130,246,0.2)!important; }
button.secondary:hover { background:rgba(59,130,246,0.18)!important; }
.gr-row { gap:14px!important; flex-wrap: wrap !important; max-width: 100% !important; overflow: hidden !important; }
/* Remove all white backgrounds from Gradio components */
.gr-block, .block:not(.gr-group) { border:none!important;background:transparent!important; }
.gr-padded { padding:0!important; }
.label-wrap { background:#0a0e1a!important;border:1px solid #1e293b!important;border-radius:8px!important;padding:4px 10px!important; }
.label-wrap span { color:#64748b!important;font-family:'JetBrains Mono',monospace!important;font-size:0.72em!important;letter-spacing:0.5px!important; }
/* Plot containers */
.gr-plot, .plot-wrap, .gradio-plot { background:transparent!important;border:none!important; }
div[class*="plot"] { background:transparent!important; }
/* All panel/group/box backgrounds */
.panel, .gr-panel, .gr-box, .gr-form, .gr-input-label, .gr-check-radio { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
/* File download component */
.file-preview, .upload-button { background:#111827!important;border-color:#1e293b!important;color:#94a3b8!important; }
/* Inputs and textboxes */
input, textarea, select, .gr-input { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
/* Any remaining white wrapper divs */
.contain > div, .wrap > div { background:transparent!important; }
/* Markdown text areas */
.prose, .markdown-text, .md { background:transparent!important;color:#94a3b8!important; }
/* Accordion headers */
.accordion { background:#111827!important;border-color:#1e293b!important; }
/* Prevent dataframes from causing horizontal scroll */
.dataframe, .table-wrap, .svelte-table { max-width: 100% !important; overflow-x: auto !important; overflow-y: hidden !important; }
/* KPI card row in HTML shouldn't overflow */
div[style*="display:flex"] { flex-wrap: wrap !important; max-width: 100% !important; }
.ep-ftr { margin-top:28px;padding:14px 0;border-top:1px solid #1e293b; text-align:center;font-family:'JetBrains Mono',monospace; font-size:0.68em;color:#334155;letter-spacing:0.3px; }
.ep-ftr a { color:#475569;text-decoration:none; }
.ep-ftr a:hover { color:#06d6a0; }
.markdown-text h4 { color:#94a3b8!important;font-family:'Outfit',sans-serif!important; }
.markdown-text p, .markdown-text { color:#94a3b8!important; }
@media(max-width:768px) { .ep-hdr-in{flex-direction:column;gap:10px;align-items:flex-start;} }
"""


# ── App ─────────────────────────────────────────────────────────────


def create_app() -> gr.Blocks:
    """Assemble the four-tab Gradio dashboard and wire up its callbacks.

    Each tab has a Refresh button bound to its ``build_*`` function, and
    every tab is also populated once when the page loads.
    """
    with gr.Blocks(title="EvalPulse Dashboard", css=THEME_CSS) as app:
        # NOTE(review): the header and footer strings below are plain text,
        # although THEME_CSS defines .ep-hdr/.ep-ftr classes for them. The
        # markup looks lost upstream; confirm against the original.
        gr.HTML("\nEvalPulse\nLLM Evaluation & Drift Monitor\nDEMO MODE\n")

        with gr.Tabs():
            with gr.TabItem("Overview"):
                with gr.Row():
                    hc = gr.HTML("Loading...")
                    hac = gr.HTML("Loading...")
                    dc = gr.HTML("Loading...")
                    tc = gr.HTML("Loading...")
                with gr.Row():
                    hg = gr.Plot(label="Health Gauge")
                    ht = gr.Plot(label="Health Trend")
                gr.Markdown("#### Recent Alerts")
                at = gr.Dataframe(
                    headers=[
                        "Time",
                        "Severity",
                        "Metric",
                        "Value",
                        "Threshold",
                        "Message",
                    ],
                    interactive=False,
                )
                overview_outputs = [hc, hac, dc, tc, hg, ht, at]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_overview, outputs=overview_outputs
                )

            with gr.TabItem("Hallucination"):
                hr = gr.Plot()
                with gr.Row():
                    hd = gr.Plot()
                    hm = gr.Plot()
                gr.Markdown("#### Highest Hallucination Responses")
                htb = gr.Dataframe(
                    headers=["Time", "Query", "Response", "Score", "Flagged"],
                    interactive=False,
                )
                halluc_outputs = [hr, hd, hm, htb]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_hallucination, outputs=halluc_outputs
                )

            with gr.TabItem("Semantic Drift"):
                ds = gr.Markdown("Loading...")
                dp = gr.Plot()
                de = gr.Plot()
                drift_outputs = [dp, de, ds]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_drift, outputs=drift_outputs
                )

            with gr.TabItem("RAG & Quality"):
                qp = gr.Plot()
                with gr.Row():
                    rp = gr.Plot()
                    rr = gr.Plot()
                bp = gr.Plot()
                rag_outputs = [qp, rp, rr, bp]
                gr.Button("Refresh", variant="primary", size="sm").click(
                    fn=build_rag_quality, outputs=rag_outputs
                )

        gr.HTML(
            "\nEvalPulse v0.1.0 · Open Source LLM Evaluation & Drift Monitoring"
            " · GitHub\n"
        )

        # Populate every tab on page load. Without the last three handlers
        # those tabs stay empty (and the drift status stays "Loading...")
        # until the user presses Refresh.
        app.load(fn=build_overview, outputs=overview_outputs)
        app.load(fn=build_hallucination, outputs=halluc_outputs)
        app.load(fn=build_drift, outputs=drift_outputs)
        app.load(fn=build_rag_quality, outputs=rag_outputs)

    return app


if __name__ == "__main__":
    create_app().launch(server_name="0.0.0.0", server_port=7860)