| <div class="d3-text-metrics"></div> |
|
|
| <style> |
| .d3-text-metrics { |
| font-family: var(--default-font-family); |
| background: transparent; |
| padding: 0; |
| width: 100%; |
| position: relative; |
| } |
| |
| .d3-text-metrics .example-text { |
| font-size: 12px; |
| line-height: 1.8; |
| color: var(--text-color); |
| font-family: monospace; |
| margin: 8px 0; |
| padding: 10px 12px; |
| background: var(--surface-bg); |
| border: 1px solid var(--border-color); |
| border-radius: 6px; |
| } |
| |
| .d3-text-metrics .label { |
| font-size: 10px; |
| font-weight: 700; |
| color: var(--muted-color); |
| margin-right: 8px; |
| } |
| |
| .d3-text-metrics .metrics-grid { |
| display: grid; |
| grid-template-columns: repeat(3, 1fr); |
| gap: 12px; |
| margin: 16px 0; |
| } |
| |
| .d3-text-metrics .metric-box { |
| padding: 12px; |
| background: var(--surface-bg); |
| border: 1px solid var(--border-color); |
| border-radius: 8px; |
| transition: border-color 0.2s; |
| } |
| |
| .d3-text-metrics .metric-name { |
| font-size: 13px; |
| font-weight: 600; |
| color: var(--text-color); |
| margin-bottom: 6px; |
| } |
| |
| .d3-text-metrics .metric-score { |
| font-size: 22px; |
| font-weight: 700; |
| color: var(--primary-color); |
| margin-bottom: 4px; |
| } |
| |
| .d3-text-metrics .metric-detail { |
| font-size: 11px; |
| color: var(--muted-color); |
| line-height: 1.4; |
| } |
| |
| .d3-text-metrics .visualization { |
| margin-top: 8px; |
| padding: 8px; |
| background: oklch(from var(--primary-color) calc(l + 0.45) c h / 0.06); |
| border-radius: 4px; |
| font-size: 10px; |
| } |
| |
| [data-theme="dark"] .d3-text-metrics .visualization { |
| background: oklch(from var(--primary-color) calc(l + 0.20) c h / 0.1); |
| } |
| |
| .d3-text-metrics .token { |
| display: inline-block; |
| padding: 2px 5px; |
| margin: 2px; |
| border-radius: 3px; |
| font-size: 10px; |
| background: var(--surface-bg); |
| border: 1px solid var(--border-color); |
| } |
| |
| .d3-text-metrics .token.match { |
| background: oklch(from var(--primary-color) calc(l + 0.35) c h / 0.35); |
| border-color: var(--primary-color); |
| font-weight: 600; |
| } |
| |
| [data-theme="dark"] .d3-text-metrics .token.match { |
| background: oklch(from var(--primary-color) calc(l + 0.25) c h / 0.4); |
| } |
| |
| .d3-text-metrics .controls { |
| display: flex; |
| justify-content: center; |
| margin-bottom: 16px; |
| } |
| |
| .d3-text-metrics select { |
| font-size: 12px; |
| padding: 6px 24px 6px 10px; |
| border: 1px solid var(--border-color); |
| border-radius: 6px; |
| background: var(--surface-bg); |
| color: var(--text-color); |
| cursor: pointer; |
| appearance: none; |
| background-image: url("data:image/svg+xml;charset=UTF-8,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='currentColor' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3e%3cpolyline points='6 9 12 15 18 9'%3e%3c/polyline%3e%3c/svg%3e"); |
| background-repeat: no-repeat; |
| background-position: right 6px center; |
| background-size: 12px; |
| } |
| |
| @media (max-width: 768px) { |
| .d3-text-metrics .metrics-grid { |
| grid-template-columns: 1fr; |
| } |
| } |
| </style> |
|
|
| <script> |
| (() => { |
| const bootstrap = () => { |
| const scriptEl = document.currentScript; |
| let container = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!(container && container.classList && container.classList.contains('d3-text-metrics'))) { |
| const candidates = Array.from(document.querySelectorAll('.d3-text-metrics')) |
| .filter((el) => !(el.dataset && el.dataset.mounted === 'true')); |
| container = candidates[candidates.length - 1] || null; |
| } |
| |
| if (!container) return; |
| if (container.dataset) { |
| if (container.dataset.mounted === 'true') return; |
| container.dataset.mounted = 'true'; |
| } |
| |
| // Single example: Cat Evaluator |
| const reference = "My cat loves doing model evaluation and testing benchmarks"; |
| const prediction = "My cat enjoys model evaluation and testing models"; |
| |
/**
 * Split text into lowercase, whitespace-delimited tokens.
 *
 * Empty or whitespace-only text yields an empty array. A bare
 * `''.split(/\s+/)` would return `['']`, i.e. one phantom empty token,
 * which defeats the `length === 0` guards in the metric functions.
 *
 * @param {string} text - Raw sentence.
 * @returns {string[]} Lowercased tokens in order of appearance.
 */
const tokenize = (text) => {
  const normalized = text.toLowerCase().trim();
  return normalized === '' ? [] : normalized.split(/\s+/);
};
| |
/**
 * Build every contiguous n-gram of a token list.
 *
 * @param {string[]} tokens - Token sequence.
 * @param {number} n - N-gram order; must be a positive integer.
 * @returns {string[][]} One array of `n` tokens per window, in order. Empty when
 *   the sequence is shorter than `n` or when `n` is not a positive integer
 *   (a non-positive `n` would otherwise produce empty or wrapped-around slices).
 */
const getNgrams = (tokens, n) => {
  if (!Number.isInteger(n) || n < 1) return [];

  const windowCount = tokens.length - n + 1;
  if (windowCount <= 0) return [];

  return Array.from({ length: windowCount }, (_, start) => tokens.slice(start, start + n));
};
| |
/**
 * Exact-match score: 1 when the two strings are equal after lowercasing and
 * trimming surrounding whitespace, otherwise 0.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {number} 1.0 or 0.0.
 */
const computeExactMatch = (pred, ref) => {
  const canonical = (text) => text.toLowerCase().trim();
  if (canonical(pred) === canonical(ref)) {
    return 1.0;
  }
  return 0.0;
};
| |
/**
 * Simplified BLEU score for one prediction/reference pair.
 *
 * For each order n = 1..3 the clipped n-gram precision is computed: a predicted
 * n-gram counts as a match only as many times as it occurs in the reference.
 * The score is the geometric mean of the precisions that are greater than zero;
 * orders with zero precision are skipped and no brevity penalty is applied.
 *
 * Counts are kept in Maps rather than plain objects so that tokens such as
 * "__proto__" or "constructor" cannot collide with Object.prototype members
 * and be reported as matches.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {{score: number, details: Array<{n: number, matches: number, total: number, matchedNgrams: string[]}>}}
 *   `details` has one entry per order for which the prediction has at least one n-gram.
 */
const computeBleu = (pred, ref) => {
  const predTokens = tokenize(pred);
  const refTokens = tokenize(ref);
  if (predTokens.length === 0) return { score: 0, details: [] };

  const MAX_ORDER = 3;
  const details = [];
  const precisions = [];

  for (let n = 1; n <= MAX_ORDER; n++) {
    const predNgrams = getNgrams(predTokens, n);
    const refNgrams = getNgrams(refTokens, n);
    if (predNgrams.length === 0) {
      // Prediction is shorter than n tokens: contributes a zero precision, no detail row.
      precisions.push(0);
      continue;
    }

    const refCounts = new Map();
    for (const ngram of refNgrams) {
      const key = ngram.join(' ');
      refCounts.set(key, (refCounts.get(key) || 0) + 1);
    }

    let matches = 0;
    const seenCounts = new Map();
    const matched = new Set(); // insertion-ordered, de-duplicated

    for (const ngram of predNgrams) {
      const key = ngram.join(' ');
      const seen = (seenCounts.get(key) || 0) + 1;
      seenCounts.set(key, seen);
      // Clipping: the k-th occurrence only matches if the reference has at least k.
      if (seen <= (refCounts.get(key) || 0)) {
        matches++;
        matched.add(key);
      }
    }

    precisions.push(matches / predNgrams.length);
    details.push({ n, matches, total: predNgrams.length, matchedNgrams: [...matched] });
  }

  const validPrecisions = precisions.filter((p) => p > 0);
  const score = validPrecisions.length > 0
    ? Math.exp(validPrecisions.reduce((sum, p) => sum + Math.log(p), 0) / validPrecisions.length)
    : 0;

  return { score, details };
};
| |
/**
 * ROUGE-1: unigram overlap between prediction and reference, reported as F1.
 *
 * Overlap is the sum, over distinct reference tokens, of
 * min(count in prediction, count in reference).
 *
 * Token counts are kept in Maps: with plain objects a token such as
 * "constructor" picks up the inherited Object.prototype member, which produced
 * NaN recall/precision and a spurious matched token. A Map also lists matched
 * tokens in reference order even when tokens look like integers.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {{score: number, recall: number, precision: number, matchedTokens: string[]}}
 *   `score` is the F1 of `precision` and `recall`; `matchedTokens` lists each
 *   distinct reference token that also appears in the prediction.
 */
const computeRouge1 = (pred, ref) => {
  const predTokens = tokenize(pred);
  const refTokens = tokenize(ref);

  const countTokens = (tokens) => {
    const counts = new Map();
    for (const token of tokens) {
      counts.set(token, (counts.get(token) || 0) + 1);
    }
    return counts;
  };

  const predCounts = countTokens(predTokens);
  const refCounts = countTokens(refTokens);

  let overlap = 0;
  const matchedTokens = [];
  for (const [token, refCount] of refCounts) {
    const predCount = predCounts.get(token) || 0;
    if (predCount > 0) {
      overlap += Math.min(predCount, refCount);
      matchedTokens.push(token);
    }
  }

  const recall = refTokens.length > 0 ? overlap / refTokens.length : 0;
  const precision = predTokens.length > 0 ? overlap / predTokens.length : 0;
  const f1 = (precision + recall) > 0 ? 2 * precision * recall / (precision + recall) : 0;

  return { score: f1, recall, precision, matchedTokens };
};
| |
/**
 * ROUGE-2: bigram overlap between prediction and reference, reported as F1.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {{score: number, recall: number, precision: number, matchedBigrams: string[]}}
 *   All-zero scores and an empty list when the reference has no bigrams.
 *   `matchedBigrams` holds each distinct reference bigram (space-joined) that
 *   also occurs in the prediction.
 */
const computeRouge2 = (pred, ref) => {
  const predBigrams = getNgrams(tokenize(pred), 2);
  const refBigrams = getNgrams(tokenize(ref), 2);

  if (refBigrams.length === 0) {
    return { score: 0, recall: 0, precision: 0, matchedBigrams: [] };
  }

  // Frequency table keyed by the space-joined bigram.
  const tally = (bigrams) => {
    const table = new Map();
    for (const pair of bigrams) {
      const key = pair.join(' ');
      table.set(key, (table.get(key) || 0) + 1);
    }
    return table;
  };

  const predTally = tally(predBigrams);
  const refTally = tally(refBigrams);

  const matchedBigrams = [];
  let shared = 0;
  for (const [key, refCount] of refTally) {
    const predCount = predTally.get(key);
    if (!predCount) continue;
    shared += Math.min(predCount, refCount);
    matchedBigrams.push(key);
  }

  const recall = shared / refBigrams.length;
  const precision = predBigrams.length > 0 ? shared / predBigrams.length : 0;
  const denominator = precision + recall;
  const f1 = denominator > 0 ? (2 * precision * recall) / denominator : 0;

  return { score: f1, recall, precision, matchedBigrams };
};
| |
/**
 * Levenshtein distance between two sequences, plus one minimal edit script.
 *
 * @param {ArrayLike<*>} s1 - Source sequence (compared element-wise with ===).
 * @param {ArrayLike<*>} s2 - Target sequence.
 * @returns {{distance: number, operations: Array<Object>}} `operations` is ordered
 *   front to back and contains
 *   `{type: 'substitute', from, to, pos}`, `{type: 'delete', value, pos}` or
 *   `{type: 'insert', value, pos}`. `pos` is the 1-based index in `s1` for
 *   substitute/delete and the 1-based index in `s2` for insert.
 */
const computeEditDistanceWithOps = (s1, s2) => {
  const rows = s1.length;
  const cols = s2.length;

  // cost[r][c] = distance between the first r items of s1 and the first c items of s2.
  // Row 0 and column 0 are the pure-insert / pure-delete base cases.
  const cost = Array.from({ length: rows + 1 }, (_, r) =>
    Array.from({ length: cols + 1 }, (__, c) => {
      if (r === 0) return c;
      if (c === 0) return r;
      return 0;
    })
  );

  for (let r = 1; r <= rows; r++) {
    for (let c = 1; c <= cols; c++) {
      const diagonal = cost[r - 1][c - 1];
      cost[r][c] = s1[r - 1] === s2[c - 1]
        ? diagonal
        : 1 + Math.min(cost[r - 1][c], cost[r][c - 1], diagonal);
    }
  }

  // Walk back from the bottom-right corner; steps are collected last-to-first.
  const reversedOps = [];
  let r = rows;
  let c = cols;

  while (r > 0 || c > 0) {
    if (r > 0 && c > 0 && s1[r - 1] === s2[c - 1]) {
      // Equal items cost nothing and emit no operation.
      r--;
      c--;
      continue;
    }

    let step;
    if (r === 0) {
      step = 'insert';
    } else if (c === 0) {
      step = 'delete';
    } else {
      const deleteCost = cost[r - 1][c];
      const insertCost = cost[r][c - 1];
      const substituteCost = cost[r - 1][c - 1];
      // Tie-break order: substitute, then delete, then insert.
      if (substituteCost <= deleteCost && substituteCost <= insertCost) {
        step = 'substitute';
      } else if (deleteCost <= insertCost) {
        step = 'delete';
      } else {
        step = 'insert';
      }
    }

    switch (step) {
      case 'substitute':
        reversedOps.push({ type: 'substitute', from: s1[r - 1], to: s2[c - 1], pos: r });
        r--;
        c--;
        break;
      case 'delete':
        reversedOps.push({ type: 'delete', value: s1[r - 1], pos: r });
        r--;
        break;
      default:
        reversedOps.push({ type: 'insert', value: s2[c - 1], pos: c });
        c--;
        break;
    }
  }

  return { distance: cost[rows][cols], operations: reversedOps.reverse() };
};
| |
/**
 * Word-level error rate: token edit distance divided by the reference length.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {{score: number, edits: number, refLength: number, operations: Array<Object>}}
 *   `score` is `edits / refLength`, or 1.0 when the reference has no tokens;
 *   `operations` is the edit script returned by computeEditDistanceWithOps.
 */
const computeTer = (pred, ref) => {
  const hypothesis = tokenize(pred);
  const target = tokenize(ref);
  const { distance, operations } = computeEditDistanceWithOps(hypothesis, target);

  const refLength = target.length;
  let score = 1.0;
  if (refLength > 0) {
    score = distance / refLength;
  }

  return { score, edits: distance, refLength, operations };
};
| |
/**
 * Stand-in for a BLEURT score, derived from vocabulary overlap only.
 *
 * Computes the Jaccard similarity of the two token sets and maps it linearly
 * from [0, 1] onto [-0.5, 1.0] via `jaccard * 1.5 - 0.5`. No model is involved.
 *
 * @param {string} pred - Predicted text.
 * @param {string} ref - Reference text.
 * @returns {{score: number, jaccard: number}}
 */
const computeBleurtMock = (pred, ref) => {
  const predVocab = new Set(tokenize(pred));
  const refVocab = new Set(tokenize(ref));

  let sharedCount = 0;
  predVocab.forEach((token) => {
    if (refVocab.has(token)) sharedCount += 1;
  });

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionCount = predVocab.size + refVocab.size - sharedCount;
  const jaccard = unionCount > 0 ? sharedCount / unionCount : 0;

  return { score: jaccard * 1.5 - 0.5, jaccard };
};
| |
/**
 * Compute every metric for the example pair and write the metric cards into
 * `container.innerHTML`.
 *
 * Reads `prediction`, `reference` and `container` from the enclosing scope and
 * calls the sibling compute* helpers. All text that originates from the example
 * sentences (the sentences themselves, tokens, n-grams, edit operands) is
 * HTML-escaped before interpolation, so tokens such as "<unk>" or "&" are shown
 * literally instead of being parsed as markup.
 */
const render = () => {
  const exactMatch = computeExactMatch(prediction, reference);
  const bleu = computeBleu(prediction, reference);
  const rouge1 = computeRouge1(prediction, reference);
  const rouge2 = computeRouge2(prediction, reference);
  const ter = computeTer(prediction, reference);
  const bleurt = computeBleurtMock(prediction, reference);

  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  // Escape text for safe use inside element content or attribute values.
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  // Render the first `limit` items as highlighted chips, followed by a "+N more" note.
  const renderMatches = (items, limit) => {
    const chips = items
      .slice(0, limit)
      .map((item) => `<span class="token match">${escapeHtml(item)}</span>`)
      .join('');
    const hidden = items.length - limit;
    const more = hidden > 0
      ? `<span style="color: var(--muted-color); font-size: 10px;">+${hidden} more</span>`
      : '';
    return `${chips} ${more}`;
  };

  // One bullet line per edit operation; unknown operation types render nothing.
  const describeOperation = (op) => {
    switch (op.type) {
      case 'substitute':
        return `<div style="margin: 2px 0;">• Replace "<strong>${escapeHtml(op.from)}</strong>" → "<strong>${escapeHtml(op.to)}</strong>"</div>`;
      case 'delete':
        return `<div style="margin: 2px 0;">• Delete "<strong>${escapeHtml(op.value)}</strong>"</div>`;
      case 'insert':
        return `<div style="margin: 2px 0;">• Insert "<strong>${escapeHtml(op.value)}</strong>"</div>`;
      default:
        return '';
    }
  };

  container.innerHTML = `
    <div class="example-text">
      <span class="label">REF:</span>${escapeHtml(reference)}
    </div>
    <div class="example-text">
      <span class="label">PRED:</span>${escapeHtml(prediction)}
    </div>

    <div class="metrics-grid">

      <div class="metric-box">
        <div class="metric-name">Exact Match</div>
        <div class="metric-score">${exactMatch.toFixed(1)}</div>
        <div class="metric-detail">Binary: 1 or 0</div>
        <div class="visualization">
          <div style="margin: 4px 0; font-size: 14px;">
            ${exactMatch === 1 ? '✓ Strings are identical' : '✗ Strings differ'}
          </div>
          <div style="margin-top: 6px; font-size: 9px; color: var(--muted-color);">
            Most strict metric - no partial credit
          </div>
        </div>
      </div>

      <div class="metric-box">
        <div class="metric-name">Translation Error Rate</div>
        <div class="metric-score">${ter.score.toFixed(3)}</div>
        <div class="metric-detail">Edit distance normalized</div>
        <div class="visualization">
          <div style="margin: 4px 0;">
            <strong>${ter.edits}</strong> edits / <strong>${ter.refLength}</strong> words = <strong>${ter.score.toFixed(3)}</strong>
          </div>
          ${ter.operations.length > 0 ? `
            <div style="margin-top: 8px; font-size: 10px;">
              <div style="margin-bottom: 4px; color: var(--muted-color);">Edit operations:</div>
              ${ter.operations.map(describeOperation).join('')}
            </div>
          ` : ''}
          <div style="margin-top: 6px; font-size: 9px; color: var(--muted-color);">
            Lower is better (0 = identical)
          </div>
        </div>
      </div>

      <div class="metric-box">
        <div class="metric-name">BLEURT</div>
        <div class="metric-score">${bleurt.score.toFixed(3)}</div>
        <div class="metric-detail">Semantic similarity</div>
        <div class="visualization">
          <div style="margin-top: 6px; font-size: 9px; color: var(--muted-color); font-style: italic;">
            BLEURT uses BERT embeddings learned from real text.
          </div>
        </div>
      </div>

      <div class="metric-box">
        <div class="metric-name">BLEU</div>
        <div class="metric-score">${bleu.score.toFixed(3)}</div>
        <div class="metric-detail">N-gram precision-based</div>
        <div class="visualization">
          ${bleu.details.map((d) => `
            <div style="margin: 4px 0;">
              <strong>${d.n}-gram:</strong> ${d.matches}/${d.total} (${(d.matches / d.total * 100).toFixed(0)}%)
            </div>
            <div style="margin: 2px 0;">
              ${renderMatches(d.matchedNgrams, 3)}
            </div>
          `).join('')}
        </div>
      </div>

      <div class="metric-box">
        <div class="metric-name">ROUGE-1</div>
        <div class="metric-score">${rouge1.score.toFixed(3)}</div>
        <div class="metric-detail">Unigram-based F1</div>
        <div class="visualization">
          <div style="margin: 4px 0;">
            <strong>Recall:</strong> ${(rouge1.recall * 100).toFixed(0)}% | <strong>Precision:</strong> ${(rouge1.precision * 100).toFixed(0)}%
          </div>
          <div style="margin-top: 6px; font-size: 9px; color: var(--muted-color);">
            Matched unigrams:
          </div>
          ${rouge1.matchedTokens.length > 0 ? `
            <div style="margin: 2px 0;">
              ${renderMatches(rouge1.matchedTokens, 5)}
            </div>
          ` : ''}
        </div>
      </div>

      <div class="metric-box">
        <div class="metric-name">ROUGE-2</div>
        <div class="metric-score">${rouge2.score.toFixed(3)}</div>
        <div class="metric-detail">Bigram-based F1</div>
        <div class="visualization">
          <div style="margin: 4px 0;">
            <strong>Recall:</strong> ${(rouge2.recall * 100).toFixed(0)}% | <strong>Precision:</strong> ${(rouge2.precision * 100).toFixed(0)}%
          </div>
          <div style="margin-top: 6px; font-size: 9px; color: var(--muted-color);">
            Matched bigrams:
          </div>
          ${rouge2.matchedBigrams.length > 0 ? `
            <div style="margin: 2px 0;">
              ${renderMatches(rouge2.matchedBigrams, 3)}
            </div>
          ` : '<div style="margin: 2px 0; font-size: 10px; color: var(--muted-color);">No bigram matches</div>'}
        </div>
      </div>
    </div>
  `;
};
| |
| render(); |
| }; |
| |
// Mount immediately when the document has finished parsing; otherwise wait
// for DOMContentLoaded and mount exactly once.
const domStillLoading = document.readyState === 'loading';
if (!domStillLoading) {
  bootstrap();
} else {
  document.addEventListener('DOMContentLoaded', bootstrap, { once: true });
}
| })(); |
| </script> |
|
|