import { useState } from 'react'
import {
BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip,
ResponsiveContainer, Cell, ReferenceLine,
} from 'recharts'
import { PAGE_STYLE } from '../App.jsx'
// ── Eval results (from python -m ml.eval, seed=42, 79 train / 21 val) ─────────
// Benchmark rows, one per classifier variant, listed in display order.
// Each tuple is [name, shortName, accuracy (%), tier, lecture, note]; the
// trailing map() expands every tuple into an object with those six keys.
const MODELS = [
  [
    'BoW + LogReg', 'BoW+LR', 52.4, 'classical', 'Lecture 3',
    'CountVectorizer loses TF weighting — raw counts hurt precision on short headlines',
  ],
  [
    'BoW + LogReg + Lemma', 'BoW+LR+L', 52.4, 'classical', 'Lectures 2–3',
    'No change from non-lemmatized — WordNet is English-biased; Tagalog tokens unchanged',
  ],
  [
    'TF-IDF + LogReg', 'TFIDF+LR', 61.9, 'classical', 'Lecture 3',
    'Sublinear TF weighting reduces dominance of high-frequency terms; best classical model',
  ],
  [
    'TF-IDF + NB', 'TFIDF+NB', 42.9, 'classical', 'Lectures 5–6',
    'Feature independence assumption breaks on 79 samples; noisy probability estimates',
  ],
  [
    'TF-IDF + NB + Lemma', 'NB+Lemma', 42.9, 'classical', 'Lectures 2, 5–6',
    'Lemmatization again neutral — confirms English-biased lemmatizer finding',
  ],
  [
    'LDA + LogReg', 'LDA+LR', 42.9, 'classical', 'Lecture 7',
    '5 topics over 79 documents is too few for stable topic distributions',
  ],
  [
    'XLM-RoBERTa', 'XLM-R', 90.5, 'transformer', 'Transfer Learning',
    'Pretrained on 100+ languages including Filipino; fine-tuned on combined dataset',
  ],
  [
    'Tagalog-RoBERTa', 'TL-R', 95.2, 'transformer', 'Transfer Learning',
    'Pretrained on TLUnified Filipino corpus; higher recall on Tagalog/Taglish posts',
  ],
  [
    'Ensemble', 'Ensemble', 100.0, 'ensemble', 'Ensemble Methods',
    'Soft-vote average of XLM-R + Tagalog-RoBERTa logits; 100% on 21-sample holdout',
  ],
].map(([name, shortName, accuracy, tier, lecture, note]) => ({
  name,
  shortName,
  accuracy,
  tier,
  lecture,
  note,
}))
// Accent colour (hex) for each model tier, keyed by a model's `tier` value.
const TIER_COLOR = Object.fromEntries([
  ['classical', '#d97706'], // gold
  ['transformer', '#06b6d4'], // cyan
  ['ensemble', '#16a34a'], // green
])
// Human-readable label for each model tier. Entry order is significant:
// the page iterates Object.entries(TIER_LABEL) when listing the tiers.
const TIER_LABEL = Object.fromEntries([
  ['classical', 'Classical ML'],
  ['transformer', 'Transformer'],
  ['ensemble', 'Ensemble'],
])
// Key-finding cards shown above the chart, in display order.
// Each entry carries the course lecture it relates to, a short title, the
// body copy, and an accent colour (hex).
// Accuracy gaps in the body copy are absolute differences between two
// percentages, so they are written in percentage points ("pp"), not "%".
const FINDINGS = [
  {
    lecture: 'Lecture 3',
    title: 'TF-IDF > Bag of Words',
    // 61.9% (TF-IDF + LogReg) minus 52.4% (BoW + LogReg) = 9.5 percentage points.
    body: 'TF-IDF sublinear weighting outperforms raw BoW counts by +9.5pp. Down-weighting high-frequency filler terms matters for short Filipino news headlines.',
    color: '#d97706',
  },
  {
    lecture: 'Lectures 5–6',
    title: 'Naive Bayes struggles at small scale',
    // 61.9% (TF-IDF + LogReg) minus 42.9% (TF-IDF + NB) = 19 percentage points.
    body: 'MultinomialNB reaches only 42.9% — 19pp below LogReg. Feature independence breaks down when training on 79 noisy, cross-lingual samples.',
    color: '#d97706',
  },
  {
    lecture: 'Lecture 7',
    title: 'LDA needs more documents',
    body: '5 topics over 79 training texts yields unstable distributions. Topic features are weak signal for 3-class classification; LDA would improve with 1000+ samples.',
    color: '#d97706',
  },
  {
    lecture: 'Lectures 2a–2c',
    title: 'Lemmatization: neutral on Tagalog',
    body: 'Zero accuracy change with WordNet lemmatization. English-biased lemmatizers return Tagalog tokens unchanged — confirms the tool is a no-op on Filipino text.',
    color: '#06b6d4',
  },
]
// ── Custom tooltip ─────────────────────────────────────────────────────────────
/**
 * Custom tooltip body for a chart datum.
 *
 * Renders nothing (returns null) unless `active` is truthy and `payload` is a
 * non-empty array. Otherwise it reads the datum from `payload[0].payload` and
 * outputs its `name`, its `accuracy` formatted to one decimal place followed
 * by "% accuracy", and its `note`.
 *
 * NOTE(review): the datum is presumably one MODELS entry supplied by the
 * recharts tooltip — confirm against the chart wiring.
 * NOTE(review): the element markup that wraps the three expressions below is
 * not present in this view of the file — confirm against the original source.
 *
 * @param {object} props
 * @param {boolean} [props.active] - Whether the tooltip should be shown.
 * @param {Array<{payload: object}>} [props.payload] - Tooltip entries; only the first is used.
 */
function ChartTooltip({ active, payload }) {
if (!active || !payload?.length) return null
const d = payload[0].payload
return (
{d.name}
{d.accuracy.toFixed(1)}% accuracy
{d.note}
)
}
// ── Tier legend pill ───────────────────────────────────────────────────────────
/**
 * Small label for a model tier: outputs the display text looked up in
 * TIER_LABEL for the given `tier` key.
 *
 * NOTE(review): the wrapping element markup (and any styling on it) is not
 * present in this view of the file — confirm against the original source.
 *
 * @param {object} props
 * @param {string} props.tier - A key of TIER_LABEL.
 */
function TierPill({ tier }) {
return (
{TIER_LABEL[tier]}
)
}
/**
 * Model benchmarks page (default export).
 *
 * Sections, in order: a header with the page title and dataset description,
 * the key findings (one entry per FINDINGS item showing lecture, title and
 * body), an "Accuracy by Model" chart section with a legend built from the
 * TIER_LABEL entries (labels upper-cased), a "Full Results" table with one
 * row per MODELS entry, and a footer note about the small validation set.
 *
 * State: `activeRow` holds the `name` of the currently highlighted table row,
 * or null. A row whose name matches gets the elevated background and a left
 * border in its tier colour (TIER_COLOR); every row except the last draws a
 * bottom border. `onMouseLeave` resets `activeRow` to null.
 *
 * NOTE(review): most element markup (tags and their opening attributes) is
 * not present in this view of the file, so only the text and expressions are
 * visible below — confirm structure and props against the original source.
 */
export default function BenchmarksPage() {
const [activeRow, setActiveRow] = useState(null)
return (
{/* ── Header ─────────────────────────────────────────────────────────── */}
ML Course — Model Comparison
Model Benchmarks
Comparison of 9 classifier variants on a 21-sample holdout from the
handcrafted PhilVerify dataset (79 train / 21 val, seed 42). Classical
models trained in-session; transformer checkpoints fine-tuned on the
full combined dataset.
{/* ── Key findings ───────────────────────────────────────────────────── */}
Key Findings
{FINDINGS.map((f) => (
{f.lecture}
{f.title}
{f.body}
))}
{/* ── Bar chart ──────────────────────────────────────────────────────── */}
Accuracy by Model
{Object.entries(TIER_LABEL).map(([tier, label]) => (
{label.toUpperCase()}
))}
`${v}%`}
tick={{ fontSize: 9, fontFamily: 'var(--font-mono)', fill: 'var(--text-muted)' }}
tickLine={false}
axisLine={false}
/>
} cursor={{ fill: 'rgba(245,240,232,0.03)' }} />
{MODELS.map((m) => (
|
))}
{/* ── Full results table ─────────────────────────────────────────────── */}
Full Results
{['Model', 'Accuracy', 'Tier', 'Lecture', 'Note'].map(h => (
|
{h}
|
))}
{MODELS.map((m, i) => (
setActiveRow(m.name)}
onMouseLeave={() => setActiveRow(null)}
style={{
borderBottom: i < MODELS.length - 1 ? '1px solid var(--border)' : 'none',
background: activeRow === m.name ? 'var(--bg-elevated)' : 'transparent',
transition: 'background 0.1s',
borderLeft: `3px solid ${activeRow === m.name ? TIER_COLOR[m.tier] : 'transparent'}`,
}}
>
|
{m.name}
|
{m.accuracy.toFixed(1)}%
|
|
{m.lecture}
|
{m.note}
|
))}
{/* ── Footer note ────────────────────────────────────────────────────── */}
* Val set is 21 samples from a handcrafted 100-sample dataset — ensemble 100% reflects
near-zero variance on a small holdout, not production accuracy. Transformer models were
trained on the larger combined dataset; classical models trained on the 79-sample split.
)
}