Spaces:

NinjainPJs
/

EvalPulse

Sleeping

App Files Files Community

EvalPulse / app.py

NinjainPJs

Upload folder using huggingface_hub

4248ef1 verified 3 months ago

raw

history blame contribute delete

36.5 kB

	"""EvalPulse Demo Dashboard — self-contained HuggingFace Spaces deployment.

	Runs entirely on synthetic data. No external dependencies on evalpulse or
	dashboard packages.
	"""

	from __future__ import annotations

	import random
	from collections import defaultdict
	from dataclasses import dataclass, field
	from datetime import datetime, timedelta, timezone

	import gradio as gr
	import plotly.graph_objects as go

	# ── Lightweight EvalRecord (replaces pydantic model) ─────────────────

	UTC = timezone.utc


	@dataclass
	class EvalRecord:
	    """Minimal evaluation record for demo purposes.

	    One instance describes a single evaluated LLM call. Every field has a
	    default, so ``EvalRecord()`` is valid; optional metrics default to
	    ``None``.
	    """

	    # Call metadata
	    app_name: str = "default"
	    # Timezone-aware creation time (UTC) unless the caller supplies one.
	    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
	    query: str = ""
	    context: str | None = None
	    response: str = ""
	    model_name: str = "unknown"
	    latency_ms: int = 0
	    tags: list[str] = field(default_factory=list)

	    # Hallucination
	    hallucination_score: float = 0.0
	    hallucination_method: str = "none"
	    flagged_claims: list[str] = field(default_factory=list)

	    # Drift
	    embedding_vector: list[float] = field(default_factory=list)
	    drift_score: float | None = None

	    # RAG Quality
	    faithfulness_score: float | None = None
	    context_relevance: float | None = None
	    answer_relevancy: float | None = None
	    groundedness_score: float | None = None

	    # Response Quality
	    sentiment_score: float = 0.5
	    toxicity_score: float = 0.0
	    response_length: int = 0
	    language_detected: str = "en"
	    is_denial: bool = False

	    # Composite
	    health_score: int = 0


	# ── Demo data generator ─────────────────────────────────────────────


	def generate_demo_records(n: int = 200) -> list[EvalRecord]:
	    """Generate ``n`` synthetic EvalRecords with realistic distributions.

	    Simulates an LLM app with:
	    - Generally good performance (health 70-95)
	    - Occasional hallucination spikes
	    - Gradual drift over time
	    - Some toxic/denial responses

	    Records are spaced one hour apart and end one hour before "now". The
	    random values are reproducible (fixed seed 42); only the timestamps
	    depend on the time of the call.

	    Args:
	        n: Number of records to generate.

	    Returns:
	        The records in chronological order (oldest first).
	    """
	    # A private generator keeps the data reproducible without re-seeding
	    # (and thereby clobbering) the process-wide ``random`` state.
	    rng = random.Random(42)
	    now = datetime.now(UTC)

	    queries = [
	        "What is machine learning?",
	        "Explain neural networks",
	        "How does RAG work?",
	        "What is Python used for?",
	        "Describe transformer architecture",
	        "What are embeddings?",
	        "How do LLMs handle context?",
	        "What is fine-tuning?",
	        "Explain attention mechanism",
	        "What is prompt engineering?",
	    ]
	    models = ["llama-3.1-70b", "gpt-4o-mini", "gemini-flash"]

	    def clamp(value: float) -> float:
	        """Clip ``value`` into the closed interval [0, 1]."""
	        return max(0.0, min(1.0, value))

	    def rounded(value: float | None) -> float | None:
	        """Round to 4 places, keeping ``None`` (but not 0.0) as ``None``."""
	        return None if value is None else round(value, 4)

	    records: list[EvalRecord] = []
	    for i in range(n):
	        ts = now - timedelta(hours=n - i)
	        query = rng.choice(queries)
	        model = rng.choice(models)

	        # Simulate drift: later responses drift slightly
	        drift_factor = i / n * 0.1

	        # Base scores
	        halluc = clamp(rng.gauss(0.12, 0.08) + drift_factor * 0.5)
	        drift = clamp(rng.gauss(0.05, 0.03) + drift_factor)
	        sentiment = clamp(rng.gauss(0.7, 0.1))
	        toxicity = clamp(abs(rng.gauss(0.02, 0.02)))

	        is_denial = rng.random() < 0.05
	        length = rng.randint(20, 200)

	        # RAG scores (70% of calls are RAG); non-RAG calls keep None.
	        faith = ctx_rel = ans_rel = ground = context = None
	        if rng.random() < 0.7:
	            faith = clamp(rng.gauss(0.75, 0.1))
	            ctx_rel = clamp(rng.gauss(0.8, 0.08))
	            ans_rel = clamp(rng.gauss(0.78, 0.09))
	            ground = 0.4 * faith + 0.3 * ctx_rel + 0.3 * ans_rel
	            context = f"Context for: {query}"

	        # Health score: weighted mean of the available components. The
	        # groundedness term (and its weight) is included only for RAG calls,
	        # decided with ``is not None`` so a legitimate 0.0 is still counted.
	        weighted = [(1 - halluc, 0.35), (1 - drift, 0.25)]
	        if ground is not None:
	            weighted.append((ground, 0.20))
	        quality = (1 - toxicity) * 0.5 + sentiment * 0.4 + 0.1
	        weighted.append((quality, 0.15))
	        health = int(
	            sum(score * weight for score, weight in weighted)
	            / sum(weight for _, weight in weighted)
	            * 100
	        )
	        health = max(0, min(100, health))

	        records.append(
	            EvalRecord(
	                app_name="demo-app",
	                timestamp=ts,
	                query=query,
	                context=context,
	                response=f"Demo response for: {query}",
	                model_name=model,
	                latency_ms=rng.randint(50, 500),
	                tags=["demo"],
	                hallucination_score=round(halluc, 4),
	                hallucination_method="embedding",
	                drift_score=round(drift, 4),
	                faithfulness_score=rounded(faith),
	                context_relevance=rounded(ctx_rel),
	                answer_relevancy=rounded(ans_rel),
	                groundedness_score=rounded(ground),
	                sentiment_score=round(sentiment, 4),
	                toxicity_score=round(toxicity, 4),
	                response_length=length,
	                language_detected="en",
	                is_denial=is_denial,
	                health_score=health,
	            )
	        )

	    return records


	# ── Chart helpers (inlined from dashboard/charts.py) ─────────────────

	# Colour palette shared by the chart helpers below.
	_BG = "#0a0e1a"
	_SURFACE = "#111827"
	_BORDER = "#1e293b"
	_TEXT = "#e2e8f0"
	_TEXT_DIM = "#64748b"
	_CYAN = "#06d6a0"
	_AMBER = "#f59e0b"
	_RED = "#ef4444"
	_BLUE = "#3b82f6"
	_PURPLE = "#a78bfa"
	_PINK = "#f472b6"

	# Default Plotly layout applied by _apply_layout: transparent backgrounds,
	# monospace font, faint grid lines and dimmed tick/legend text.
	_LAYOUT_BASE: dict = {
	    "paper_bgcolor": "rgba(0,0,0,0)",
	    "plot_bgcolor": "rgba(0,0,0,0)",
	    "font": {"family": "JetBrains Mono, monospace", "color": _TEXT, "size": 11},
	    "margin": {"l": 48, "r": 24, "t": 48, "b": 40},
	    "xaxis": {
	        "gridcolor": "rgba(255,255,255,0.04)",
	        "zerolinecolor": "rgba(255,255,255,0.06)",
	        "tickfont": {"size": 10, "color": _TEXT_DIM},
	    },
	    "yaxis": {
	        "gridcolor": "rgba(255,255,255,0.04)",
	        "zerolinecolor": "rgba(255,255,255,0.06)",
	        "tickfont": {"size": 10, "color": _TEXT_DIM},
	    },
	    "legend": {
	        "font": {"size": 10, "color": _TEXT_DIM},
	        "bgcolor": "rgba(0,0,0,0)",
	    },
	}


	def _apply_layout(fig: go.Figure, height: int = 320, **kwargs) -> go.Figure:
	    """Apply the shared base layout to ``fig`` and return it.

	    Args:
	        fig: Figure to update in place.
	        height: Figure height passed to the layout.
	        **kwargs: Extra layout properties. A key that also exists in
	            ``_LAYOUT_BASE`` (e.g. ``xaxis``, ``margin``) replaces the base
	            value wholesale; the merge is shallow.

	    Returns:
	        The same ``fig`` object, for convenience.
	    """
	    layout = {**_LAYOUT_BASE, "height": height}
	    layout.update(kwargs)
	    fig.update_layout(**layout)
	    return fig


	def empty_figure(title: str = "", message: str = "No data available") -> go.Figure:
	    """Create an empty placeholder figure showing a centred message.

	    Args:
	        title: Optional figure title. An empty string (the default) adds no
	            title.
	        message: Text shown in italics in the middle of the plot area.

	    Returns:
	        A figure with hidden axes and a single annotation.
	    """
	    extra: dict = {}
	    if title:
	        # Previously this argument was accepted but never used.
	        extra["title"] = dict(
	            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
	        )

	    fig = go.Figure()
	    _apply_layout(
	        fig,
	        height=260,
	        xaxis=dict(visible=False),
	        yaxis=dict(visible=False),
	        annotations=[
	            dict(
	                text=f"<i>{message}</i>",
	                xref="paper",
	                yref="paper",
	                x=0.5,
	                y=0.5,
	                showarrow=False,
	                font=dict(size=13, color=_TEXT_DIM),
	            )
	        ],
	        **extra,
	    )
	    return fig


	def health_gauge_chart(score: int | None = None) -> go.Figure:
	    """Create a health score gauge chart (0-100).

	    Args:
	        score: Health score to display, or ``None`` when nothing has been
	            evaluated yet.

	    Returns:
	        A gauge indicator coloured cyan (>= 75), amber (>= 40) or red
	        (below 40); a placeholder figure when ``score`` is ``None``.
	    """
	    if score is None:
	        return empty_figure("", "Awaiting first evaluation")

	    # Colour bands match the ``steps`` ranges drawn on the gauge below.
	    if score >= 75:
	        bar_color = _CYAN
	    elif score >= 40:
	        bar_color = _AMBER
	    else:
	        bar_color = _RED

	    indicator = go.Indicator(
	        mode="gauge+number",
	        value=score,
	        number=dict(
	            font=dict(size=48, color=bar_color, family="JetBrains Mono, monospace"),
	            suffix="",
	        ),
	        gauge=dict(
	            axis=dict(
	                range=[0, 100],
	                tickcolor=_TEXT_DIM,
	                tickfont=dict(size=9, color=_TEXT_DIM),
	                dtick=25,
	            ),
	            bgcolor="rgba(255,255,255,0.03)",
	            bordercolor="rgba(255,255,255,0.08)",
	            bar=dict(color=bar_color, thickness=0.75),
	            steps=[
	                dict(range=[0, 40], color="rgba(239,68,68,0.08)"),
	                dict(range=[40, 75], color="rgba(245,158,11,0.06)"),
	                dict(range=[75, 100], color="rgba(6,214,160,0.06)"),
	            ],
	        ),
	    )
	    fig = go.Figure(indicator)
	    _apply_layout(fig, height=220, margin=dict(l=24, r=24, t=16, b=8))
	    return fig


	def radar_chart(
	    categories: list[str],
	    values: list[float],
	    title: str = "",
	) -> go.Figure:
	    """Create a radar/spider chart for multi-dimensional scores.

	    Args:
	        categories: Axis labels, one per score.
	        values: Scores matching ``categories``; the radial axis is fixed to
	            the range 0-1.
	        title: Chart title.

	    Returns:
	        The radar figure, or a placeholder figure when either list is empty.
	    """
	    if not categories or not values:
	        return empty_figure(title, "No RAG data yet")

	    # Close the polygon by repeating the first point at the end.
	    cats = categories + [categories[0]]
	    vals = values + [values[0]]

	    # Translucent fill derived from the "#rrggbb" accent colour.
	    red, green, blue = (int(_CYAN[i : i + 2], 16) for i in (1, 3, 5))

	    fig = go.Figure()
	    fig.add_trace(
	        go.Scatterpolar(
	            r=vals,
	            theta=cats,
	            fill="toself",
	            fillcolor=f"rgba({red},{green},{blue},0.12)",
	            line=dict(color=_CYAN, width=2),
	            marker=dict(size=5, color=_CYAN),
	        )
	    )

	    _apply_layout(fig, height=340)
	    fig.update_layout(
	        polar=dict(
	            bgcolor="rgba(0,0,0,0)",
	            radialaxis=dict(
	                visible=True,
	                range=[0, 1],
	                gridcolor="rgba(255,255,255,0.06)",
	                tickfont=dict(size=8, color=_TEXT_DIM),
	            ),
	            angularaxis=dict(
	                gridcolor="rgba(255,255,255,0.06)",
	                tickfont=dict(size=10, color=_TEXT),
	            ),
	        ),
	        title=dict(
	            text=title, font=dict(size=12, color=_TEXT_DIM), x=0, xanchor="left"
	        ),
	    )
	    return fig


	# ── Plotly dark theme for dashboard figures ──────────────────────────

	# Base layout for the tab figures; merged with per-figure overrides in _dark.
	_DARK_LAYOUT: dict = {
	    "paper_bgcolor": "rgba(0,0,0,0)",
	    "plot_bgcolor": "rgba(0,0,0,0)",
	    "font": {"family": "JetBrains Mono, monospace", "color": "#94a3b8", "size": 11},
	    "autosize": True,
	    "margin": {"l": 50, "r": 20, "t": 44, "b": 40},
	    "xaxis": {
	        "gridcolor": "rgba(255,255,255,0.04)",
	        "tickfont": {"size": 10, "color": "#475569"},
	    },
	    "yaxis": {
	        "gridcolor": "rgba(255,255,255,0.04)",
	        "tickfont": {"size": 10, "color": "#475569"},
	    },
	    "legend": {
	        "font": {"size": 10, "color": "#64748b"},
	        "bgcolor": "rgba(0,0,0,0)",
	    },
	}


	def _dark(fig: go.Figure, **kw) -> go.Figure:
	    """Apply dark theme to a Plotly figure.

	    Args:
	        fig: Figure to update in place.
	        **kw: Layout overrides (e.g. ``title``, ``height``, ``yaxis``). A key
	            also present in ``_DARK_LAYOUT`` replaces the themed value.

	    Returns:
	        The same ``fig`` object.
	    """
	    # Merge the two mappings; ``{_DARK_LAYOUT, kw}`` would try to build a
	    # set of dicts and raise TypeError (dicts are unhashable).
	    layout = {**_DARK_LAYOUT, **kw}
	    fig.update_layout(**layout)
	    return fig


	# ── Data layer (demo-only) ───────────────────────────────────────────

	# Module-level cache filled lazily by _fetch_records.
	_DEMO_RECORDS: list[EvalRecord] | None = None


	def _fetch_records(limit: int = 500) -> list[EvalRecord]:
	    """Return cached demo records (generated once on first call).

	    Args:
	        limit: Maximum number of records to return, taken from the start of
	            the cached list.

	    Returns:
	        A new list holding at most ``limit`` of the 200 cached records.
	    """
	    global _DEMO_RECORDS
	    if _DEMO_RECORDS is None:
	        _DEMO_RECORDS = generate_demo_records(200)
	    return _DEMO_RECORDS[:limit]


	def _fetch_alerts(limit: int = 20) -> list:
	    """Return the alert list, which is always empty in demo mode.

	    Args:
	        limit: Accepted but currently ignored.
	    """
	    return []


	# ── KPI card HTML helper ────────────────────────────────────────────


	def _kpi_card(label: str, value: str, sub: str, color: str) -> str:
	    """Render one KPI card as an inline-styled HTML snippet.

	    Args:
	        label: Small upper-case caption at the top of the card.
	        value: Main figure, drawn large in ``color``.
	        sub: Secondary line under the value.
	        color: CSS colour used for the top border and the value text.

	    Returns:
	        The HTML string. Arguments are interpolated as-is (no escaping).
	    """
	    return f"""<div style="
	background:linear-gradient(145deg,#111827,#0f172a);
	border:1px solid #1e293b;
	border-radius:14px;
	padding:18px 20px;
	border-top:2.5px solid {color};
	min-height:90px;
	min-width:0;
	width:100%;
	box-sizing:border-box;
	overflow:hidden;
	">
	<div style="
	font-family:'JetBrains Mono',monospace;
	font-size:0.62em;font-weight:600;
	text-transform:uppercase;letter-spacing:1.5px;
	color:#64748b;margin-bottom:8px;
	">{label}</div>
	<div style="
	font-family:'Outfit',sans-serif;
	font-size:1.8em;font-weight:700;
	color:{color};line-height:1;margin-bottom:5px;
	">{value}</div>
	<div style="
	font-family:'JetBrains Mono',monospace;
	font-size:0.68em;color:#475569;
	">{sub}</div>
	</div>"""


	# ── Tab 1: Overview ─────────────────────────────────────────────────


	def build_overview():
	    """Build every output of the Overview tab.

	    Returns:
	        A 7-tuple matching the tab's output components: four KPI-card HTML
	        strings (health, hallucination, drift, evaluation count), the health
	        gauge figure, the health trend figure, and the alert table rows.
	    """
	    records = _fetch_records(500)
	    alerts = _fetch_alerts(20)

	    if not records:
	        return (
	            _kpi_card("Health Score", "---", "no data", "#06d6a0"),
	            _kpi_card("Hallucination", "---", "no data", "#f59e0b"),
	            _kpi_card("Drift", "---", "no data", "#3b82f6"),
	            _kpi_card("Evaluations", "0", "", "#a78bfa"),
	            health_gauge_chart(None),
	            empty_figure("", "No evaluations yet"),
	            [["No alerts yet", "", "", "", "", ""]],
	        )

	    total = len(records)
	    avg_health = int(sum(r.health_score for r in records) / total)
	    avg_halluc = sum(r.hallucination_score for r in records) / total
	    drift_vals = [r.drift_score for r in records if r.drift_score is not None]
	    avg_drift = sum(drift_vals) / len(drift_vals) if drift_vals else None

	    if avg_health >= 90:
	        h_sub = "HEALTHY"
	    elif avg_health >= 75:
	        h_sub = "MONITORING"
	    elif avg_health >= 60:
	        h_sub = "DEGRADING"
	    else:
	        h_sub = "CRITICAL"

	    # No drift scores at all means the baseline is still being built.
	    if avg_drift is None:
	        d_val, d_sub = "...", "BUILDING BASELINE"
	    else:
	        d_val = f"{avg_drift:.3f}"
	        d_sub = "STABLE" if avg_drift < 0.15 else "DRIFTING"

	    sorted_recs = sorted(records, key=lambda r: r.timestamp)
	    times = [r.timestamp.strftime("%m-%d %H:%M") for r in sorted_recs]
	    scores = [r.health_score for r in sorted_recs]

	    # Lower bound of the y-axis: 10 points below the worst score, floored at 0.
	    min_score = max(0, min(scores) - 10)

	    trend = go.Figure()
	    trend.add_trace(
	        go.Scatter(
	            x=times,
	            y=scores,
	            mode="lines",
	            name="Health Score",
	            line=dict(color="#06d6a0", width=2, shape="spline"),
	            fill="tonexty" if min_score > 30 else "none",
	            fillcolor="rgba(6,214,160,0.06)",
	        )
	    )
	    # Only show threshold lines if they're within visible range
	    if min_score <= 75:
	        trend.add_hline(
	            y=75,
	            line_dash="dot",
	            line_color="#f59e0b",
	            line_width=1,
	            annotation_text="Warning: 75",
	            annotation_font_size=9,
	            annotation_font_color="#f59e0b",
	        )
	    if min_score <= 40:
	        trend.add_hline(
	            y=40,
	            line_dash="dot",
	            line_color="#ef4444",
	            line_width=1,
	            annotation_text="Critical: 40",
	            annotation_font_size=9,
	            annotation_font_color="#ef4444",
	        )
	    _dark(
	        trend,
	        title="Health Score Trend",
	        yaxis=dict(range=[min_score, 105], **_DARK_LAYOUT["yaxis"]),
	        height=350,
	    )

	    if alerts:
	        alert_rows = [
	            [
	                a.timestamp.strftime("%Y-%m-%d %H:%M"),
	                a.severity.upper(),
	                a.metric,
	                f"{a.value:.4f}",
	                f"{a.threshold:.4f}",
	                a.message,
	            ]
	            for a in alerts[:20]
	        ]
	    else:
	        alert_rows = [["---", "", "", "", "", "No alerts triggered"]]

	    return (
	        _kpi_card("Health Score", str(avg_health), h_sub, "#06d6a0"),
	        _kpi_card("Hallucination", f"{avg_halluc:.1%}", f"avg of {total}", "#f59e0b"),
	        _kpi_card("Drift", d_val, d_sub, "#3b82f6"),
	        _kpi_card("Evaluations", f"{total:,}", "total tracked", "#a78bfa"),
	        health_gauge_chart(avg_health),
	        trend,
	        alert_rows,
	    )


	# ── Tab 2: Hallucination ────────────────────────────────────────────


	def build_hallucination():
	    """Build every output of the Hallucination tab.

	    Returns:
	        A 4-tuple: score-over-time figure, score histogram, per-model
	        average bar chart, and table rows for the 10 highest-scoring
	        responses.
	    """
	    records = _fetch_records(500)
	    if not records:
	        e = empty_figure("", "No data yet")
	        return e, e, e, [["No data", "", "", "", ""]]

	    sorted_recs = sorted(records, key=lambda r: r.timestamp)
	    times = [r.timestamp.strftime("%m-%d %H:%M") for r in sorted_recs]
	    h_scores = [r.hallucination_score for r in sorted_recs]

	    # Time series with the 0.3 threshold marked.
	    rate = go.Figure()
	    rate.add_trace(
	        go.Scatter(
	            x=times,
	            y=h_scores,
	            mode="lines",
	            line=dict(color="#ef4444", width=2, shape="spline"),
	            fill="tozeroy",
	            fillcolor="rgba(239,68,68,0.08)",
	        )
	    )
	    rate.add_hline(
	        y=0.3,
	        line_dash="dot",
	        line_color="#f59e0b",
	        annotation_text="Threshold 0.3",
	        annotation_font_size=9,
	        annotation_font_color="#f59e0b",
	    )
	    _dark(
	        rate,
	        title="Hallucination Score Over Time",
	        yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
	        height=350,
	    )

	    # Histogram of all scores.
	    dist = go.Figure(
	        go.Histogram(
	            x=h_scores,
	            nbinsx=25,
	            marker_color="#ef4444",
	            opacity=0.7,
	            marker_line_width=0,
	        )
	    )
	    dist.add_vline(x=0.3, line_dash="dot", line_color="#f59e0b")
	    _dark(dist, title="Score Distribution", height=300, bargap=0.05)

	    # Average score per model; bars above the threshold are drawn red.
	    by_model: dict[str, list[float]] = defaultdict(list)
	    for r in records:
	        by_model[r.model_name].append(r.hallucination_score)
	    model_names = list(by_model)
	    avgs = [sum(v) / len(v) for v in by_model.values()]
	    model_fig = go.Figure(
	        go.Bar(
	            x=model_names,
	            y=avgs,
	            marker_color=["#ef4444" if a > 0.3 else "#06d6a0" for a in avgs],
	            marker_line_width=0,
	        )
	    )
	    _dark(model_fig, title="Avg Hallucination by Model", height=300)

	    # Ten worst responses, truncated for display.
	    top = sorted(records, key=lambda r: r.hallucination_score, reverse=True)[:10]
	    rows = [
	        [
	            r.timestamp.strftime("%H:%M:%S"),
	            r.query[:50],
	            r.response[:60],
	            f"{r.hallucination_score:.3f}",
	            ", ".join(r.flagged_claims[:2]) if r.flagged_claims else "",
	        ]
	        for r in top
	    ]

	    return rate, dist, model_fig, rows


	# ── Tab 3: Drift ────────────────────────────────────────────────────


	def build_drift():
	    """Build every output of the Semantic Drift tab.

	    Returns:
	        A 3-tuple: drift-over-time figure, embedding scatter figure, and a
	        short status string.
	    """
	    records = _fetch_records(500)
	    if not records:
	        e = empty_figure("", "No data yet")
	        return e, e, "No data"

	    sorted_recs = sorted(records, key=lambda r: r.timestamp)
	    drift_recs = [r for r in sorted_recs if r.drift_score is not None]

	    # Scatter of the first two embedding components, coloured by
	    # hallucination score; needs at least three usable records.
	    emb_recs = [
	        r for r in sorted_recs if r.embedding_vector and len(r.embedding_vector) > 2
	    ]
	    if len(emb_recs) >= 3:
	        embed = go.Figure(
	            go.Scatter(
	                x=[r.embedding_vector[0] for r in emb_recs],
	                y=[r.embedding_vector[1] for r in emb_recs],
	                mode="markers",
	                marker=dict(
	                    size=8,
	                    color=[r.hallucination_score for r in emb_recs],
	                    colorscale=[
	                        [0, "#06d6a0"],
	                        [0.5, "#f59e0b"],
	                        [1, "#ef4444"],
	                    ],
	                    showscale=True,
	                    colorbar=dict(
	                        # ``titlefont`` is deprecated in Plotly; the font
	                        # belongs inside the nested ``title`` object.
	                        title=dict(
	                            text="Halluc",
	                            font=dict(size=10, color="#64748b"),
	                        ),
	                        tickfont=dict(size=9, color="#64748b"),
	                    ),
	                    line=dict(width=0),
	                ),
	                text=[r.query[:30] for r in emb_recs],
	                hovertemplate="%{text}<br>Halluc: %{marker.color:.3f}<extra></extra>",
	            )
	        )
	        _dark(embed, title="Response Embedding Space", height=350)
	    else:
	        embed = empty_figure("", "Need more data for visualization")

	    if not drift_recs:
	        return (
	            empty_figure("", "Building baseline (need 10+ evaluations)"),
	            embed,
	            "Building baseline...",
	        )

	    times = [r.timestamp.strftime("%m-%d %H:%M") for r in drift_recs]
	    scores = [r.drift_score for r in drift_recs]

	    dfig = go.Figure()
	    dfig.add_trace(
	        go.Scatter(
	            x=times,
	            y=scores,
	            mode="lines",
	            line=dict(color="#a78bfa", width=2, shape="spline"),
	            fill="tozeroy",
	            fillcolor="rgba(167,139,250,0.08)",
	        )
	    )
	    dfig.add_hline(
	        y=0.15,
	        line_dash="dot",
	        line_color="#ef4444",
	        annotation_text="Threshold 0.15",
	        annotation_font_size=9,
	        annotation_font_color="#ef4444",
	    )
	    # Leave 20% headroom above the peak, but never zoom in past 0.3.
	    y_max = max(max(scores) * 1.2, 0.3)
	    _dark(
	        dfig,
	        title="Drift Score Over Time",
	        yaxis=dict(range=[0, y_max], **_DARK_LAYOUT["yaxis"]),
	        height=350,
	    )

	    avg = sum(scores) / len(scores)
	    if avg < 0.1:
	        st = "Stable"
	    elif avg < 0.2:
	        st = "Minor drift"
	    else:
	        st = "Significant drift!"

	    return dfig, embed, st


	# ── Tab 4: RAG & Quality ────────────────────────────────────────────


	def build_rag_quality():
	    """Build every output of the RAG & Quality tab.

	    Returns:
	        A 4-tuple of figures: sentiment/toxicity over time, RAG metrics over
	        time, RAG radar chart, and the language distribution bar chart.
	    """
	    records = _fetch_records(500)
	    if not records:
	        e = empty_figure("", "No data yet")
	        return e, e, e, e

	    sorted_recs = sorted(records, key=lambda r: r.timestamp)
	    times = [r.timestamp.strftime("%m-%d %H:%M") for r in sorted_recs]

	    qfig = go.Figure()
	    qfig.add_trace(
	        go.Scatter(
	            x=times,
	            y=[r.sentiment_score for r in sorted_recs],
	            mode="lines",
	            name="Sentiment",
	            line=dict(color="#3b82f6", width=2, shape="spline"),
	        )
	    )
	    qfig.add_trace(
	        go.Scatter(
	            x=times,
	            y=[r.toxicity_score for r in sorted_recs],
	            mode="lines",
	            name="Toxicity",
	            line=dict(color="#ef4444", width=2, shape="spline"),
	        )
	    )
	    _dark(
	        qfig,
	        title="Quality Metrics Over Time",
	        yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
	        height=350,
	    )

	    # Only records carrying a groundedness score count as RAG calls.
	    rag_recs = [r for r in sorted_recs if r.groundedness_score is not None]
	    if rag_recs:
	        rt = [r.timestamp.strftime("%m-%d %H:%M") for r in rag_recs]

	        def series(attr: str) -> list[float]:
	            """Values of ``attr`` across the RAG records, with None as 0."""
	            return [getattr(r, attr) or 0 for r in rag_recs]

	        def mean(attr: str) -> float:
	            """Average of ``attr`` across the RAG records, with None as 0."""
	            return sum(series(attr)) / len(rag_recs)

	        rfig = go.Figure()
	        rfig.add_trace(
	            go.Scatter(
	                x=rt,
	                y=series("faithfulness_score"),
	                mode="lines",
	                name="Faithfulness",
	                line=dict(color="#06d6a0", width=2, shape="spline"),
	            )
	        )
	        rfig.add_trace(
	            go.Scatter(
	                x=rt,
	                y=series("context_relevance"),
	                mode="lines",
	                name="Context Relevance",
	                line=dict(color="#3b82f6", width=2, shape="spline"),
	            )
	        )
	        rfig.add_trace(
	            go.Scatter(
	                x=rt,
	                y=series("groundedness_score"),
	                mode="lines",
	                name="Groundedness",
	                line=dict(color="#a78bfa", width=2, dash="dash"),
	            )
	        )
	        _dark(
	            rfig,
	            title="RAG Quality Metrics",
	            yaxis=dict(range=[0, 1.05], **_DARK_LAYOUT["yaxis"]),
	            height=350,
	        )

	        radar = radar_chart(
	            ["Faithfulness", "Context Relevance", "Answer Relevancy", "Groundedness"],
	            [
	                mean("faithfulness_score"),
	                mean("context_relevance"),
	                mean("answer_relevancy"),
	                mean("groundedness_score"),
	            ],
	            title="RAG Quality Radar",
	        )
	    else:
	        rfig = empty_figure("", "No RAG calls yet")
	        radar = empty_figure("", "No RAG data")

	    # Response count per detected language, plus the number of denials.
	    lang: dict[str, int] = defaultdict(int)
	    for r in records:
	        lang[r.language_detected] += 1
	    denials = sum(1 for r in records if r.is_denial)

	    bfig = go.Figure(
	        go.Bar(
	            x=list(lang.keys()),
	            y=list(lang.values()),
	            marker_color="#3b82f6",
	            marker_line_width=0,
	        )
	    )
	    _dark(
	        bfig,
	        # Plain "|" separator: an escaped "\|" would render the backslash.
	        title=f"Language Distribution | Denials: {denials}/{len(records)}",
	        height=300,
	    )

	    return qfig, rfig, radar, bfig


	# ── CSS ─────────────────────────────────────────────────────────────

	# Stylesheet passed to gr.Blocks(css=...) in create_app. It sets the dark
	# colour scheme, constrains Plotly/dataframe containers to the page width,
	# and defines the .ep-* classes used by the header and footer HTML.
	THEME_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;600;700&family=Outfit:wght@300;400;500;600;700;800&display=swap');

	body, .gradio-container {
	background: #060a14 !important;
	color: #e2e8f0 !important;
	font-family: 'Outfit', sans-serif !important;
	}
	.gradio-container {
	max-width: 100% !important;
	width: 100% !important;
	margin: 0 !important;
	padding: 0 20px !important;
	box-sizing: border-box !important;
	overflow-x: hidden !important;
	}
	.main, .wrap, .contain {
	max-width: 100% !important;
	width: 100% !important;
	overflow-x: hidden !important;
	}
	.app {
	max-width: 100% !important;
	overflow-x: hidden !important;
	}
	/* Plotly charts should not overflow */
	.js-plotly-plot, .plotly, .plot-container, .svg-container {
	max-width: 100% !important;
	width: 100% !important;
	overflow: hidden !important;
	}
	.js-plotly-plot .main-svg, .js-plotly-plot .svg-container {
	max-width: 100% !important;
	width: 100% !important;
	}
	.plot-container.plotly {
	width: 100% !important;
	}
	/* Gradio plot wrapper */
	.gr-plot, .plot-padding {
	max-width: 100% !important;
	overflow: hidden !important;
	}

	::-webkit-scrollbar { width: 6px; }
	::-webkit-scrollbar-track { background: #0a0e1a; }
	::-webkit-scrollbar-thumb { background: #1e293b; border-radius: 3px; }

	.ep-hdr {
	position: relative;
	padding: 24px 32px;
	margin: 0 -20px 20px -20px;
	background: linear-gradient(135deg, #0a0e1a 0%, #111827 50%, #0f172a 100%);
	border-bottom: 1px solid rgba(6,214,160,0.15);
	overflow: hidden;
	box-sizing: border-box;
	}
	.ep-hdr::before {
	content:'';position:absolute;inset:0;
	background:
	radial-gradient(ellipse 600px 300px at 15% 50%,rgba(6,214,160,0.06),transparent 70%),
	radial-gradient(ellipse 400px 200px at 85% 30%,rgba(59,130,246,0.04),transparent 70%);
	pointer-events:none;
	}
	.ep-hdr-in { position:relative;display:flex;align-items:center;justify-content:space-between;z-index:1; }
	.ep-brand { display:flex;align-items:center;gap:14px; }
	.ep-logo {
	width:40px;height:40px;border-radius:10px;
	background:linear-gradient(135deg,#06d6a0,#3b82f6);
	display:flex;align-items:center;justify-content:center;
	font-size:18px;font-weight:700;color:#060a14;
	font-family:'JetBrains Mono',monospace;
	box-shadow:0 0 20px rgba(6,214,160,0.3);
	}
	.ep-t { font-family:'Outfit';font-size:1.6em;font-weight:700;letter-spacing:-0.5px;color:#f1f5f9!important;margin:0!important; }
	.ep-st { font-family:'JetBrains Mono';font-size:0.7em;color:#64748b!important;margin:3px 0 0!important;letter-spacing:0.5px;text-transform:uppercase; }
	.ep-live { display:flex;align-items:center;gap:8px;font-family:'JetBrains Mono';font-size:0.72em;color:#06d6a0;letter-spacing:0.3px; }
	.ep-dot {
	width:7px;height:7px;border-radius:50%;background:#06d6a0;
	box-shadow:0 0 8px rgba(6,214,160,0.6);
	animation:pdot 2s ease-in-out infinite;
	}
	@keyframes pdot { 0%,100%{opacity:1} 50%{opacity:0.4} }

	.tab-nav { background:transparent!important;border:none!important;gap:4px!important;padding:0 0 14px!important;border-bottom:1px solid #1e293b!important;margin-bottom:18px!important; }
	.tab-nav button {
	font-family:'JetBrains Mono',monospace!important;font-size:0.76em!important;font-weight:500!important;
	letter-spacing:0.5px!important;text-transform:uppercase!important;color:#64748b!important;
	background:transparent!important;border:1px solid transparent!important;border-radius:8px!important;
	padding:8px 18px!important;transition:all 0.2s!important;
	}
	.tab-nav button:hover { color:#e2e8f0!important;background:rgba(255,255,255,0.03)!important; }
	.tab-nav button.selected { color:#06d6a0!important;background:rgba(6,214,160,0.08)!important;border-color:rgba(6,214,160,0.2)!important; }
	.tabitem { border:none!important;background:transparent!important;padding:0!important; }

	table { background:#111827!important;border:1px solid #1e293b!important;border-radius:10px!important;overflow:hidden!important; }
	table thead th {
	background:#0f172a!important;color:#64748b!important;
	font-family:'JetBrains Mono',monospace!important;font-size:0.7em!important;
	font-weight:600!important;letter-spacing:0.8px!important;text-transform:uppercase!important;
	padding:10px 14px!important;border-bottom:1px solid #1e293b!important;
	}
	table tbody td {
	background:#111827!important;color:#cbd5e1!important;
	font-family:'JetBrains Mono',monospace!important;font-size:0.78em!important;
	padding:8px 14px!important;border-bottom:1px solid rgba(30,41,59,0.5)!important;
	}
	table tbody tr:hover td { background:rgba(6,214,160,0.03)!important; }

	button.primary, button.secondary {
	font-family:'JetBrains Mono',monospace!important;font-size:0.74em!important;
	letter-spacing:0.4px!important;border-radius:8px!important;
	}
	button.primary { background:rgba(6,214,160,0.12)!important;color:#06d6a0!important;border:1px solid rgba(6,214,160,0.25)!important; }
	button.primary:hover { background:rgba(6,214,160,0.2)!important; }
	button.secondary { background:rgba(59,130,246,0.1)!important;color:#3b82f6!important;border:1px solid rgba(59,130,246,0.2)!important; }
	button.secondary:hover { background:rgba(59,130,246,0.18)!important; }

	.gr-row {
	gap:14px!important;
	flex-wrap: wrap !important;
	max-width: 100% !important;
	overflow: hidden !important;
	}
	/* Remove all white backgrounds from Gradio components */
	.gr-block, .block:not(.gr-group) { border:none!important;background:transparent!important; }
	.gr-padded { padding:0!important; }
	.label-wrap { background:#0a0e1a!important;border:1px solid #1e293b!important;border-radius:8px!important;padding:4px 10px!important; }
	.label-wrap span { color:#64748b!important;font-family:'JetBrains Mono',monospace!important;font-size:0.72em!important;letter-spacing:0.5px!important; }
	/* Plot containers */
	.gr-plot, .plot-wrap, .gradio-plot { background:transparent!important;border:none!important; }
	div[class*="plot"] { background:transparent!important; }
	/* All panel/group/box backgrounds */
	.panel, .gr-panel, .gr-box, .gr-form, .gr-input-label, .gr-check-radio { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
	/* File download component */
	.file-preview, .upload-button { background:#111827!important;border-color:#1e293b!important;color:#94a3b8!important; }
	/* Inputs and textboxes */
	input, textarea, select, .gr-input { background:#111827!important;border-color:#1e293b!important;color:#e2e8f0!important; }
	/* Any remaining white wrapper divs */
	.contain > div, .wrap > div { background:transparent!important; }
	/* Markdown text areas */
	.prose, .markdown-text, .md { background:transparent!important;color:#94a3b8!important; }
	/* Accordion headers */
	.accordion { background:#111827!important;border-color:#1e293b!important; }
	/* Prevent dataframes from causing horizontal scroll */
	.dataframe, .table-wrap, .svelte-table {
	max-width: 100% !important;
	overflow-x: auto !important;
	overflow-y: hidden !important;
	}
	/* KPI card row in HTML shouldn't overflow */
	div[style*="display:flex"] {
	flex-wrap: wrap !important;
	max-width: 100% !important;
	}

	.ep-ftr {
	margin-top:28px;padding:14px 0;border-top:1px solid #1e293b;
	text-align:center;font-family:'JetBrains Mono',monospace;
	font-size:0.68em;color:#334155;letter-spacing:0.3px;
	}
	.ep-ftr a { color:#475569;text-decoration:none; }
	.ep-ftr a:hover { color:#06d6a0; }

	.markdown-text h4 { color:#94a3b8!important;font-family:'Outfit',sans-serif!important; }
	.markdown-text p, .markdown-text { color:#94a3b8!important; }

	@media(max-width:768px) { .ep-hdr-in{flex-direction:column;gap:10px;align-items:flex-start;} }
	"""


	# ── App ─────────────────────────────────────────────────────────────


	def create_app() -> gr.Blocks:
	    """Assemble the Gradio dashboard.

	    Builds a header, four tabs (Overview, Hallucination, Semantic Drift,
	    RAG & Quality) and a footer. Each tab has a Refresh button bound to its
	    ``build_*`` function, and every tab is also populated once on page load.

	    Returns:
	        The un-launched ``gr.Blocks`` app.
	    """
	    with gr.Blocks(title="EvalPulse Dashboard", css=THEME_CSS) as app:
	        gr.HTML("""
	<div class="ep-hdr"><div class="ep-hdr-in">
	<div class="ep-brand">
	<div class="ep-logo">EP</div>
	<div><div class="ep-t">EvalPulse</div>
	<div class="ep-st">LLM Evaluation & Drift Monitor</div></div>
	</div>
	<div class="ep-live"><div class="ep-dot"></div>DEMO MODE</div>
	</div></div>
	""")

	        with gr.Tabs():
	            with gr.TabItem("Overview"):
	                with gr.Row():
	                    hc = gr.HTML("Loading...")
	                    hac = gr.HTML("Loading...")
	                    dc = gr.HTML("Loading...")
	                    tc = gr.HTML("Loading...")
	                with gr.Row():
	                    hg = gr.Plot(label="Health Gauge")
	                    ht = gr.Plot(label="Health Trend")
	                gr.Markdown("#### Recent Alerts")
	                at = gr.Dataframe(
	                    headers=[
	                        "Time",
	                        "Severity",
	                        "Metric",
	                        "Value",
	                        "Threshold",
	                        "Message",
	                    ],
	                    interactive=False,
	                )
	                overview_outputs = [hc, hac, dc, tc, hg, ht, at]
	                gr.Button("Refresh", variant="primary", size="sm").click(
	                    fn=build_overview, outputs=overview_outputs
	                )

	            with gr.TabItem("Hallucination"):
	                hr = gr.Plot()
	                with gr.Row():
	                    hd = gr.Plot()
	                    hm = gr.Plot()
	                gr.Markdown("#### Highest Hallucination Responses")
	                htb = gr.Dataframe(
	                    headers=["Time", "Query", "Response", "Score", "Flagged"],
	                    interactive=False,
	                )
	                hallucination_outputs = [hr, hd, hm, htb]
	                gr.Button("Refresh", variant="primary", size="sm").click(
	                    fn=build_hallucination, outputs=hallucination_outputs
	                )

	            with gr.TabItem("Semantic Drift"):
	                ds = gr.Markdown("Loading...")
	                dp = gr.Plot()
	                de = gr.Plot()
	                drift_outputs = [dp, de, ds]
	                gr.Button("Refresh", variant="primary", size="sm").click(
	                    fn=build_drift, outputs=drift_outputs
	                )

	            with gr.TabItem("RAG & Quality"):
	                qp = gr.Plot()
	                with gr.Row():
	                    rp = gr.Plot()
	                    rr = gr.Plot()
	                bp = gr.Plot()
	                rag_outputs = [qp, rp, rr, bp]
	                gr.Button("Refresh", variant="primary", size="sm").click(
	                    fn=build_rag_quality, outputs=rag_outputs
	                )

	        gr.HTML("""
	<div class="ep-ftr">
	EvalPulse v0.1.0 · Open Source LLM Evaluation & Drift Monitoring
	· <a href="https://github.com/ninjacode911/Project-EvalPulse">GitHub</a>
	</div>
	""")

	        # Populate every tab on page load. Previously only the Overview was
	        # loaded, so the other tabs stayed blank (or on "Loading...") until
	        # their Refresh button was clicked.
	        app.load(fn=build_overview, outputs=overview_outputs)
	        app.load(fn=build_hallucination, outputs=hallucination_outputs)
	        app.load(fn=build_drift, outputs=drift_outputs)
	        app.load(fn=build_rag_quality, outputs=rag_outputs)

	    return app


	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860 when run as a script.
	    create_app().launch(server_name="0.0.0.0", server_port=7860)