import { useState } from 'react' import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer, Cell, ReferenceLine, } from 'recharts' import { PAGE_STYLE } from '../App.jsx' // ── Eval results (from python -m ml.eval, seed=42, 79 train / 21 val) ───────── const MODELS = [ { name: 'BoW + LogReg', shortName: 'BoW+LR', accuracy: 52.4, tier: 'classical', lecture: 'Lecture 3', note: 'CountVectorizer loses TF weighting — raw counts hurt precision on short headlines', }, { name: 'BoW + LogReg + Lemma', shortName: 'BoW+LR+L', accuracy: 52.4, tier: 'classical', lecture: 'Lectures 2–3', note: 'No change from non-lemmatized — WordNet is English-biased; Tagalog tokens unchanged', }, { name: 'TF-IDF + LogReg', shortName: 'TFIDF+LR', accuracy: 61.9, tier: 'classical', lecture: 'Lecture 3', note: 'Sublinear TF weighting reduces dominance of high-frequency terms; best classical model', }, { name: 'TF-IDF + NB', shortName: 'TFIDF+NB', accuracy: 42.9, tier: 'classical', lecture: 'Lectures 5–6', note: 'Feature independence assumption breaks on 79 samples; noisy probability estimates', }, { name: 'TF-IDF + NB + Lemma', shortName: 'NB+Lemma', accuracy: 42.9, tier: 'classical', lecture: 'Lectures 2, 5–6', note: 'Lemmatization again neutral — confirms English-biased lemmatizer finding', }, { name: 'LDA + LogReg', shortName: 'LDA+LR', accuracy: 42.9, tier: 'classical', lecture: 'Lecture 7', note: '5 topics over 79 documents is too few for stable topic distributions', }, { name: 'XLM-RoBERTa', shortName: 'XLM-R', accuracy: 90.5, tier: 'transformer', lecture: 'Transfer Learning', note: 'Pretrained on 100+ languages including Filipino; fine-tuned on combined dataset', }, { name: 'Tagalog-RoBERTa', shortName: 'TL-R', accuracy: 95.2, tier: 'transformer', lecture: 'Transfer Learning', note: 'Pretrained on TLUnified Filipino corpus; higher recall on Tagalog/Taglish posts', }, { name: 'Ensemble', shortName: 'Ensemble', accuracy: 100.0, tier: 'ensemble', lecture: 'Ensemble Methods', note: 'Soft-vote average of XLM-R + Tagalog-RoBERTa logits; 100% on 21-sample holdout', }, ] const TIER_COLOR = { classical: '#d97706', // gold transformer: '#06b6d4', // cyan ensemble: '#16a34a', // green } const TIER_LABEL = { classical: 'Classical ML', transformer: 'Transformer', ensemble: 'Ensemble', } const FINDINGS = [ { lecture: 'Lecture 3', title: 'TF-IDF > Bag of Words', body: 'TF-IDF sublinear weighting outperforms raw BoW counts by +9.5%. Down-weighting high-frequency filler terms matters for short Filipino news headlines.', color: '#d97706', }, { lecture: 'Lectures 5–6', title: 'Naive Bayes struggles at small scale', body: 'MultinomialNB reaches only 42.9% — 19pp below LogReg. Feature independence breaks down when training on 79 noisy, cross-lingual samples.', color: '#d97706', }, { lecture: 'Lecture 7', title: 'LDA needs more documents', body: '5 topics over 79 training texts yields unstable distributions. Topic features are weak signal for 3-class classification; LDA would improve with 1000+ samples.', color: '#d97706', }, { lecture: 'Lectures 2a–2c', title: 'Lemmatization: neutral on Tagalog', body: 'Zero accuracy change with WordNet lemmatization. English-biased lemmatizers return Tagalog tokens unchanged — confirms the tool is a no-op on Filipino text.', color: '#06b6d4', }, ] // ── Custom tooltip ───────────────────────────────────────────────────────────── function ChartTooltip({ active, payload }) { if (!active || !payload?.length) return null const d = payload[0].payload return (
{d.name}
{d.accuracy.toFixed(1)}% accuracy
{d.note}
) } // ── Tier legend pill ─────────────────────────────────────────────────────────── function TierPill({ tier }) { return ( {TIER_LABEL[tier]} ) } export default function BenchmarksPage() { const [activeRow, setActiveRow] = useState(null) return (
{/* ── Header ─────────────────────────────────────────────────────────── */}
ML Course — Model Comparison

Model Benchmarks

Comparison of 9 classifier variants on a 21-sample holdout from the handcrafted PhilVerify dataset (79 train / 21 val, seed 42). Classical models trained in-session; transformer checkpoints fine-tuned on the full combined dataset.

{/* ── Key findings ───────────────────────────────────────────────────── */}

Key Findings

{FINDINGS.map((f) => (
{f.lecture}
{f.title}

{f.body}

))}
{/* ── Bar chart ──────────────────────────────────────────────────────── */}

Accuracy by Model

{Object.entries(TIER_LABEL).map(([tier, label]) => (
{label.toUpperCase()}
))}
`${v}%`} tick={{ fontSize: 9, fontFamily: 'var(--font-mono)', fill: 'var(--text-muted)' }} tickLine={false} axisLine={false} /> } cursor={{ fill: 'rgba(245,240,232,0.03)' }} /> {MODELS.map((m) => ( ))}
{/* ── Full results table ─────────────────────────────────────────────── */}

Full Results

{['Model', 'Accuracy', 'Tier', 'Lecture', 'Note'].map(h => ( ))} {MODELS.map((m, i) => ( setActiveRow(m.name)} onMouseLeave={() => setActiveRow(null)} style={{ borderBottom: i < MODELS.length - 1 ? '1px solid var(--border)' : 'none', background: activeRow === m.name ? 'var(--bg-elevated)' : 'transparent', transition: 'background 0.1s', borderLeft: `3px solid ${activeRow === m.name ? TIER_COLOR[m.tier] : 'transparent'}`, }} > ))}
{h}
{m.name} {m.accuracy.toFixed(1)}% {m.lecture} {m.note}
{/* ── Footer note ────────────────────────────────────────────────────── */}

* Val set is 21 samples from a handcrafted 100-sample dataset — ensemble 100% reflects near-zero variance on a small holdout, not production accuracy. Transformer models were trained on the larger combined dataset; classical models trained on the 79-sample split.

) }