// ---- Module overview ----
// Evaluation dashboard: a React component (default export `EvaluationDashboard`)
// with three sections whose ids come from EVAL_TABS: "distribution",
// "disambiguation" and "retrieval". The description of the active section is
// looked up with EVAL_TABS.find(...) when rendering.
//
// Form row shapes:
//   GtRow        - { text, meaning }: one labelled example sentence.
//   RetrievalRow - { query, relevantText }: one query plus the snippet it should match.
// EXAMPLE_KEYWORD / EXAMPLE_MEANINGS / EXAMPLE_GT / EXAMPLE_RETRIEVAL are copied
// into state by loadDisambiguationExample() and loadRetrievalExample().
//
// getMeaningLabels(meanings): for each meaning string, takes the text before the
// first comma, trims it, and caps it at 20 characters.
//
// Component state: `section`, the three result holders (`distrib`, `disambig`,
// `retrieval`), `loading` ("" when idle, otherwise "distrib" | "disambig" |
// "retrieval"), `error` (message string, "" when none), plus the form state
// `keyword`, `meanings`, `gtRows` and `retRows`.
//
// Handlers:
//   fetchDistribution  - stores the result of api.getSimilarityDistribution().
//   removeMeaning      - no-op while only two meanings remain; otherwise drops the
//                        meaning and blanks `meaning` on rows equal to its label.
//   removeGtRow / removeRetRow - never remove the last remaining row.
//   runDisambiguation  - requires a non-blank keyword, at least 2 non-blank meanings
//                        and at least 2 rows with both text and meaning; posts
//                        { ground_truth, candidate_meanings } to
//                        api.evalDisambiguation and stores res.metrics.
//   runRetrieval       - requires at least one row with a non-blank query; posts the
//                        queries with k_values [1, 3, 5, 10] to api.evalRetrieval
//                        and stores res.metrics.
// Each async handler sets `loading` first, clears `error`, reports failures via
// getErrorMessage(), and resets `loading` in `finally`.
//
// NOTE(review): this text appears to have lost its original line breaks, its JSX
// element tags and its generic type arguments (see `useState("distribution")`,
// `Record = {` and `{error && }`). Restore the file from version control before
// making code changes - TODO confirm against the repository copy.
// NOTE(review): runDisambiguation computes `labels` but never reads it, and sends
// `true_meaning: r.meaning` unchanged while `candidate_meanings` carries the full
// meaning strings. The inline comments there describe mapping labels back to
// candidates, which the code does not do - confirm what the API matches against.
// NOTE(review): the retrieval summary averages divide by `retrieval.length` and are
// guarded only by `retrieval &&`; an empty metrics array would presumably show NaN.
// NOTE(review): the add/update/remove helpers build the next state from the
// closed-over value rather than the functional updater form - verify this is
// acceptable if several updates can be batched.
import { useState } from "react"; import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer, Cell, } from "recharts"; import { api, getErrorMessage } from "../api"; import type { EvalSection, SimilarityDistribution, DisambiguationMetric, RetrievalMetric } from "../types"; import StatusMessage from "./StatusMessage"; import MetricCard from "./MetricCard"; // ---- Structured form types ---- interface GtRow { text: string; meaning: string; } interface RetrievalRow { query: string; relevantText: string; } // ---- Example data ---- const EXAMPLE_KEYWORD = "pizza"; const EXAMPLE_MEANINGS = [ "school, education, and academic activities like homework and tests", "food, Italian cuisine, restaurant, cooking, and eating", ]; const EXAMPLE_GT: GtRow[] = [ { text: "I love pizza so much, I go there every day", meaning: "school" }, { text: "pizza gives me homework", meaning: "school" }, { text: "she made the best margherita pizza in the city", meaning: "food" }, { text: "pizza dough recipe used tipo 00 flour", meaning: "food" }, { text: "The pizza test is going to be so hard", meaning: "school" }, { text: "This pizza is amazing, the crust is perfectly crispy", meaning: "food" }, ]; const EXAMPLE_RETRIEVAL: RetrievalRow[] = [ { query: "kids using secret code words for school", relevantText: "secret language" }, { query: "Italian restaurant with wood-fired oven", relevantText: "pizza" }, ]; // ---- Meaning label helpers ---- function getMeaningLabels(meanings: string[]): string[] { return meanings.map((m) => { const first = m.split(",")[0].trim(); return first.length > 20 ? first.slice(0, 20) : first; }); } // ---- Tab config ---- const EVAL_TABS: { id: EvalSection; label: string; desc: string }[] = [ { id: "distribution", label: "Distribution", desc: "Analyze pairwise similarity distribution across your corpus. 
One-click — no setup needed.", }, { id: "disambiguation", label: "Disambiguation", desc: "Test whether the engine can tell apart different meanings of the same word. Provide example sentences and label each with the intended meaning.", }, { id: "retrieval", label: "Retrieval", desc: "Measure how well the engine finds relevant documents for a given query. Provide search queries and what text they should match.", }, ]; export default function EvaluationDashboard() { const [section, setSection] = useState("distribution"); const [distrib, setDistrib] = useState(null); const [disambig, setDisambig] = useState(null); const [retrieval, setRetrieval] = useState(null); const [loading, setLoading] = useState(""); const [error, setError] = useState(""); // Disambiguation structured form const [keyword, setKeyword] = useState(""); const [meanings, setMeanings] = useState(["", ""]); const [gtRows, setGtRows] = useState([{ text: "", meaning: "" }]); // Retrieval structured form const [retRows, setRetRows] = useState([{ query: "", relevantText: "" }]); // ---- Distribution ---- async function fetchDistribution() { setLoading("distrib"); setError(""); try { setDistrib(await api.getSimilarityDistribution()); } catch (err) { setError(getErrorMessage(err)); } finally { setLoading(""); } } // ---- Disambiguation ---- function loadDisambiguationExample() { setKeyword(EXAMPLE_KEYWORD); setMeanings([...EXAMPLE_MEANINGS]); setGtRows(EXAMPLE_GT.map((r) => ({ ...r }))); } function updateMeaning(i: number, val: string) { const next = [...meanings]; next[i] = val; setMeanings(next); } function addMeaning() { setMeanings([...meanings, ""]); } function removeMeaning(i: number) { if (meanings.length <= 2) return; setMeanings(meanings.filter((_, idx) => idx !== i)); // Update GT rows that referenced removed meaning const labels = getMeaningLabels(meanings); const removed = labels[i]; setGtRows(gtRows.map((r) => (r.meaning === removed ? 
{ ...r, meaning: "" } : r))); } function updateGtRow(i: number, field: keyof GtRow, val: string) { const next = [...gtRows]; next[i] = { ...next[i], [field]: val }; setGtRows(next); } function addGtRow() { setGtRows([...gtRows, { text: "", meaning: "" }]); } function removeGtRow(i: number) { if (gtRows.length <= 1) return; setGtRows(gtRows.filter((_, idx) => idx !== i)); } async function runDisambiguation() { if (!keyword.trim()) { setError("Enter a keyword."); return; } const validMeanings = meanings.filter((m) => m.trim()); if (validMeanings.length < 2) { setError("Add at least 2 meanings."); return; } const validGt = gtRows.filter((r) => r.text.trim() && r.meaning); if (validGt.length < 2) { setError("Add at least 2 labeled examples."); return; } setLoading("disambig"); setError(""); try { const labels = getMeaningLabels(meanings); const ground_truth = validGt.map((r) => ({ keyword: keyword.trim(), text: r.text, true_meaning: r.meaning, })); const candidate_meanings: Record = { [keyword.trim()]: validMeanings, }; // Map GT meaning labels back to full candidate strings for the API // The API compares against candidates, so true_meaning should match a candidate label // We use short labels for the dropdown, but the API uses them as-is for matching const res = await api.evalDisambiguation({ ground_truth, candidate_meanings }); setDisambig(res.metrics); } catch (e) { setError(getErrorMessage(e)); } finally { setLoading(""); } } // ---- Retrieval ---- function loadRetrievalExample() { setRetRows(EXAMPLE_RETRIEVAL.map((r) => ({ ...r }))); } function updateRetRow(i: number, field: keyof RetrievalRow, val: string) { const next = [...retRows]; next[i] = { ...next[i], [field]: val }; setRetRows(next); } function addRetRow() { setRetRows([...retRows, { query: "", relevantText: "" }]); } function removeRetRow(i: number) { if (retRows.length <= 1) return; setRetRows(retRows.filter((_, idx) => idx !== i)); } async function runRetrieval() { const valid = retRows.filter((r) => 
r.query.trim()); if (valid.length === 0) { setError("Add at least one query."); return; } setLoading("retrieval"); setError(""); try { const queries = valid.map((r) => ({ query: r.query, relevant_texts: r.relevantText.trim() ? [r.relevantText.trim()] : [], })); const res = await api.evalRetrieval({ queries, k_values: [1, 3, 5, 10] }); setRetrieval(res.metrics); } catch (e) { setError(getErrorMessage(e)); } finally { setLoading(""); } } // ---- Meaning labels for dropdown ---- const meaningLabels = getMeaningLabels(meanings); return (

{EVAL_TABS.find((t) => t.id === section)?.desc}

{error && } {/* ---- Similarity Distribution ---- */} {section === "distribution" && (
{distrib && (
{[ { label: "Mean", value: distrib.mean }, { label: "Std Dev", value: distrib.std }, { label: "Min", value: distrib.min }, { label: "Max", value: distrib.max }, ].map((m) => ( ))}

Histogram

v.toFixed(1)} /> [Number(value), "Count"]} labelFormatter={(v: unknown) => `Similarity: ${Number(v).toFixed(2)}`} /> {distrib.histogram.map((entry, i) => ( = 0.5 ? "var(--ok)" : entry.bin_start >= 0 ? "var(--accent)" : "var(--err)"} /> ))}

Percentiles

{Object.keys(distrib.percentiles).map((p) => ( ))} {Object.values(distrib.percentiles).map((v, i) => ( ))}
P{p}
{v.toFixed(4)}
)}
)} {/* ---- Disambiguation Evaluation ---- */} {section === "disambiguation" && (
{/* Keyword */}
setKeyword(e.target.value)} placeholder='e.g. "pizza"' />
{/* Candidate Meanings */}
{meanings.map((m, i) => (
{i + 1}. updateMeaning(i, e.target.value)} placeholder={`Meaning ${i + 1} description...`} style={{ flex: 1 }} /> {meanings.length > 2 && ( )}
))}
{/* Ground Truth Examples */}
{gtRows.map((row, i) => ( ))}
Sentence Correct Meaning
updateGtRow(i, "text", e.target.value)} placeholder="A sentence containing the keyword..." style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }} /> {gtRows.length > 1 && ( )}
{disambig && disambig.map((m) => (

Results: "{m.keyword}" ({m.total_samples} samples)

Per-Meaning Scores

{Object.keys(m.per_meaning_f1).map((meaning) => ( ))}
Meaning Precision Recall F1
{meaning} {m.per_meaning_precision[meaning]?.toFixed(4) ?? "-"} {m.per_meaning_recall[meaning]?.toFixed(4) ?? "-"} {m.per_meaning_f1[meaning]?.toFixed(4) ?? "-"}
{m.confusion_matrix && ( <>

Confusion Matrix

{Object.keys(m.per_meaning_f1).map((meaning) => ( ))} {m.confusion_matrix.map((row, i) => ( {row.map((val, j) => ( ))} ))}
True \ Predicted{meaning}
{Object.keys(m.per_meaning_f1)[i]} 0 ? "var(--err)" : "var(--text-dim)", }} > {val}
)}
))}
)} {/* ---- Retrieval Evaluation ---- */} {section === "retrieval" && (
{retRows.map((row, i) => ( ))}
Query Expected Match (text snippet)
updateRetRow(i, "query", e.target.value)} placeholder="A search query..." style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }} /> updateRetRow(i, "relevantText", e.target.value)} placeholder="Text that should match..." style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }} /> {retRows.length > 1 && ( )}
{retrieval && (
{retrieval.map((m, i) => ( ))}
Query MRR P@1 P@3 P@5 Top Score
{m.query.length > 50 ? m.query.slice(0, 50) + "..." : m.query} {m.mrr.toFixed(3)} {m.precision_at_k["1"]?.toFixed(2) ?? "-"} {m.precision_at_k["3"]?.toFixed(2) ?? "-"} {m.precision_at_k["5"]?.toFixed(2) ?? "-"} {m.top_score.toFixed(3)}
s + m.mrr, 0) / retrieval.length).toFixed(3)} label="Mean MRR" /> s + (m.precision_at_k["5"] ?? 0), 0) / retrieval.length).toFixed(3)} label="Mean P@5" /> s + m.top_score, 0) / retrieval.length).toFixed(3)} label="Mean Top Score" />
)}
)}
); }