/* ============================================================
   Text Vectorization Lab — frontend logic
   Talks to the real Flask/sklearn/gensim backend in app.py and
   animates the response stage by stage along each pipeline tape.

   NOTE(review): the HTML inside the template literals of this file
   was lost in the copy it was restored from and has been rebuilt.
   Selectors the script itself reads (.tape-stage, .section,
   .nav-item, .tech-card, [data-run], [data-reset]) and the
   "step" / "show" / "done" / "current" / "active" classes are
   original; every other element/class name emitted below is a
   reconstruction — confirm against the stylesheet.
   ============================================================ */
(() => {
  "use strict";

  /** RGB triplets (no alpha) used for heat-map cells and scatter dots. */
  const PALETTE = {
    amber: "242,169,59",
    teal: "52,214,184",
    pink: "239,93,168",
    violet: "155,140,242",
  };

  /** Delay between two consecutive steps of a reveal animation. */
  const REVEAL_STEP_MS = 480;

  // ---------------------------------------------------------- utilities

  /** First element matching `sel` under `root` (or null). */
  const $ = (sel, root = document) => root.querySelector(sel);

  /** All elements matching `sel` under `root`, as a real array. */
  const $$ = (sel, root = document) => Array.from(root.querySelectorAll(sel));

  /**
   * Escape a value for safe interpolation into HTML text or a quoted attribute.
   * @param {*} str - any value; it is stringified first.
   * @returns {string}
   */
  function escapeHtml(str) {
    return String(str)
      .replace(/&/g, "&amp;") // must run first so later entities are not double-escaped
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");
  }

  /**
   * Format a matrix/score value for display.
   * Integers print as-is, other numbers with up to 3 decimals (trailing zeros
   * trimmed, but at least one decimal kept); non-numbers are HTML-escaped.
   * @param {*} v
   * @returns {string} HTML-safe text.
   */
  function formatNum(v) {
    if (typeof v !== "number") return escapeHtml(v);
    if (Number.isInteger(v)) return String(v);
    return v.toFixed(3).replace(/0+$/, "").replace(/\.$/, ".0");
  }

  /**
   * Background colour for a heat-map cell.
   * @param {string} colorKey - key of PALETTE; unknown keys fall back to teal.
   * @param {number} v - cell value; falsy or non-positive values are transparent.
   * @param {number} max - largest value in the matrix, used to scale the alpha.
   * @returns {string} CSS colour.
   */
  function heat(colorKey, v, max) {
    if (!v || v <= 0) return "transparent";
    const rgb = PALETTE[colorKey] || PALETTE.teal;
    const alpha = max > 0 ? 0.16 + 0.55 * (v / max) : 0.2;
    return `rgba(${rgb}, ${alpha.toFixed(2)})`;
  }

  /**
   * Render a list of values as "chips" with a staggered animation delay.
   * @param {Array<*>} items - chip texts (escaped here).
   * @param {string} [colorClass] - extra CSS class, e.g. "teal".
   * @param {number} [delayStep] - seconds added to the delay per chip.
   * @returns {string} HTML.
   */
  function chipList(items, colorClass = "", delayStep = 0.04) {
    const chips = items
      .map((text, i) => {
        const delay = (i * delayStep).toFixed(2);
        return `<span class="chip ${colorClass}" style="animation-delay:${delay}s">${escapeHtml(text)}</span>`;
      })
      .join("");
    return `<div class="chip-list">${chips}</div>`;
  }

  /**
   * Render a labelled numeric matrix as an HTML table.
   * @param {Array<*>} rowLabels - one label per matrix row (missing labels render empty).
   * @param {Array<*>} colLabels - one label per column.
   * @param {Array<Array<*>>} matrix - cell values.
   * @param {{heat?: string, corner?: string}} [opts] - `heat` is a PALETTE key
   *   enabling heat-map shading; `corner` is the text of the top-left cell.
   * @returns {string} HTML.
   */
  function matrixTable(rowLabels, colLabels, matrix, opts = {}) {
    // Largest numeric cell, used to normalise the heat-map shading.
    let max = 0;
    for (const row of matrix) {
      for (const v of row) {
        if (typeof v === "number" && v > max) max = v;
      }
    }

    const head = colLabels.map((c) => `<th>${escapeHtml(c)}</th>`).join("");
    const body = matrix
      .map((row, ri) => {
        const label = rowLabels[ri] == null ? "" : rowLabels[ri];
        const cells = row
          .map((v, ci) => {
            const idx = ri * row.length + ci;
            const bg = opts.heat ? heat(opts.heat, v, max) : "transparent";
            // Stagger cells in reading order; capped so big matrices don't lag.
            const delay = Math.min(idx * 0.01, 1).toFixed(2);
            return `<td style="background:${bg};animation-delay:${delay}s">${formatNum(v)}</td>`;
          })
          .join("");
        return `<tr><th>${escapeHtml(label)}</th>${cells}</tr>`;
      })
      .join("");

    return [
      `<div class="matrix-wrap"><table class="matrix">`,
      `<thead><tr><th class="corner">${escapeHtml(opts.corner || "")}</th>${head}</tr></thead>`,
      `<tbody>${body}</tbody>`,
      `</table></div>`,
    ].join("");
  }

  /**
   * Render [key, value] pairs as a compact key/value list.
   * @param {Array<[string, *]>} pairs
   * @param {string} colorClass - CSS class applied to each row.
   * @returns {string} HTML.
   */
  function kvList(pairs, colorClass) {
    const rows = pairs
      .map(
        ([k, v]) =>
          `<div class="kv ${colorClass}"><span class="kv-key">${escapeHtml(k)}</span><span class="kv-val">${formatNum(v)}</span></div>`
      )
      .join("");
    return `<div class="kv-list">${rows}</div>`;
  }

  /** Wrap already-safe HTML in an explanatory note paragraph. */
  function note(html) {
    return `<p class="note">${html}</p>`;
  }

  /** Render raw corpus lines, each shown in straight double quotes. */
  function quoteList(lines) {
    const quotes = lines.map((c) => `<p class="quote">"${escapeHtml(c)}"</p>`).join("");
    return `<div class="quote-list">${quotes}</div>`;
  }

  /** Render tokenized documents as "Doc N:" followed by amber chips. */
  function docTokenRows(tokenizedDocs) {
    return tokenizedDocs
      .map(
        (toks, i) =>
          `<div class="doc-row"><span class="doc-label">Doc ${i + 1}:</span> ${chipList(toks, "amber")}</div>`
      )
      .join("");
  }

  /** Render the leading components of a vector as "[ a, b, …, … ]". */
  function vectorLine(values) {
    return `<pre class="vector">[ ${values.map((v) => formatNum(v)).join(", ")}, … ]</pre>`;
  }

  // ---------------------------------------------------------- tape control

  /** Clear the progress markers on every stage of a pipeline tape. */
  function resetTape(tape) {
    $$(".tape-stage", tape).forEach((s) => s.classList.remove("done", "current"));
  }

  /**
   * Mark stages before `stageIndex` as done and `stageIndex` itself as current.
   * Each stage's index is read from its `data-stage` attribute.
   */
  function setTapeStage(tape, stageIndex) {
    $$(".tape-stage", tape).forEach((s) => {
      const idx = Number(s.dataset.stage);
      s.classList.remove("done", "current");
      if (idx < stageIndex) s.classList.add("done");
      else if (idx === stageIndex) s.classList.add("current");
    });
  }

  // ---------------------------------------------------------- sequential reveal

  /** Pending reveal timers per output container id, so a re-run can cancel them. */
  const pendingReveals = new Map();

  /** Cancel any reveal animation still scheduled for `containerId`. */
  function cancelReveal(containerId) {
    const timers = pendingReveals.get(containerId) || [];
    timers.forEach((id) => clearTimeout(id));
    pendingReveals.delete(containerId);
  }

  /**
   * Replace the container's content with `items`, appended one at a time
   * (REVEAL_STEP_MS apart) while the tape highlights each item's stage.
   * Starting a new reveal on the same container cancels the previous one, so
   * stale steps from an earlier run can never be appended to a newer result.
   * @param {string} containerId - id of the output element.
   * @param {string} tapeId - id of the pipeline tape element.
   * @param {Array<{stage: number, label: string, html: string}>} items -
   *   `label` is escaped here; `html` must already be safe markup.
   */
  function reveal(containerId, tapeId, items) {
    cancelReveal(containerId);
    const container = document.getElementById(containerId);
    const tape = document.getElementById(tapeId);
    if (!container) {
      console.warn(`reveal: no element with id "${containerId}"`);
      return;
    }
    container.innerHTML = "";
    if (tape) resetTape(tape);

    const timers = items.map((item, i) =>
      setTimeout(() => {
        if (tape) setTapeStage(tape, item.stage);
        const el = document.createElement("div");
        el.className = "step";
        el.innerHTML = `<div class="step-label">${escapeHtml(item.label)}</div><div class="step-body">${item.html}</div>`;
        container.appendChild(el);
        // Add "show" on the next frame so the CSS transition actually runs.
        requestAnimationFrame(() => el.classList.add("show"));
      }, i * REVEAL_STEP_MS)
    );

    if (items.length) {
      // One beat after the last step, re-assert the final stage on the tape.
      const lastStage = items[items.length - 1].stage;
      timers.push(
        setTimeout(() => {
          if (tape) setTapeStage(tape, lastStage);
        }, items.length * REVEAL_STEP_MS)
      );
    }
    pendingReveals.set(containerId, timers);
  }

  /** Markup for an error message shown in place of a result. */
  function errorBlock(msg) {
    return `<div class="error-block"><strong>Backend error</strong> — ${escapeHtml(msg)}</div>`;
  }

  /** Show `msg` in the output container, cancelling any reveal still running there. */
  function showError(containerId, msg) {
    cancelReveal(containerId);
    const container = document.getElementById(containerId);
    if (container) container.innerHTML = errorBlock(msg);
  }

  /** Non-empty, trimmed lines of the textarea with the given id. */
  function linesFromTextarea(id) {
    return $("#" + id)
      .value.split("\n")
      .map((s) => s.trim())
      .filter(Boolean);
  }

  /**
   * POST `body` as JSON and return the parsed JSON response.
   * @throws {Error} on a non-2xx status; the message is the response's
   *   `error` field when present, otherwise a generic one with the status.
   */
  async function postJSON(url, body) {
    const res = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body || {}),
    });
    if (!res.ok) {
      const detail = await res.json().catch(() => ({}));
      throw new Error(detail.error || `Request to ${url} failed (${res.status})`);
    }
    return res.json();
  }

  /**
   * postJSON that reports failures in the output container instead of throwing.
   * @returns {Promise<*>} the parsed response, or `undefined` after a failure.
   */
  async function requestOrReport(url, body, containerId) {
    try {
      return await postJSON(url, body);
    } catch (err) {
      showError(containerId, err.message);
      return undefined;
    }
  }

  /**
   * Wrap `fn` as a click handler that disables `btn` and shows a spinner
   * while `fn` runs. Errors are logged, not rethrown: the wrapper is used as
   * an event listener, where a rejection would only surface as an unhandled
   * promise rejection.
   */
  function withLoading(btn, fn) {
    return async (...args) => {
      if (btn.disabled) return;
      const originalHtml = btn.innerHTML;
      btn.disabled = true;
      btn.innerHTML = `<span class="spinner" aria-hidden="true"></span> Running…`;
      try {
        await fn(...args);
      } catch (err) {
        console.error(err);
      } finally {
        btn.disabled = false;
        btn.innerHTML = originalHtml;
      }
    };
  }

  // ============================================================
  // 1. ONE-HOT ENCODING
  // ============================================================
  async function runOnehot() {
    const corpus = linesFromTextarea("onehot-corpus");
    const data = await requestOrReport("/api/onehot", { corpus }, "out-onehot");
    if (data === undefined) return;

    const vocab = data.vocabulary;
    // Row per word, column per vocabulary index (the column index is used
    // directly instead of an indexOf lookup per cell).
    const identityRows = vocab.map((w) => vocab.map((_, ci) => (data.vectors[w][ci] ? 1 : 0)));

    const sentenceHtml = data.sentences
      .map((s) => {
        const chips = chipList(
          s.tokens.map((t, i) => `${t} → [${s.vectors[i].join(",")}]`),
          "teal"
        );
        return `<div class="sentence-block"><p class="quote">“${escapeHtml(s.sentence)}”</p>${chips}</div>`;
      })
      .join("");

    reveal("out-onehot", "tape-onehot", [
      { stage: 0, label: "Raw sentences", html: quoteList(corpus) },
      {
        stage: 1,
        label: "Tokenize each sentence",
        html: data.sentences.map((s) => `<div class="doc-row">${chipList(s.tokens, "amber")}</div>`).join(""),
      },
      {
        stage: 2,
        label: `Build vocabulary — ${vocab.length} unique words`,
        html: [
          chipList(vocab.map((w, i) => `${i}: ${w}`), "teal"),
          note(
            `Vector length will equal the vocabulary size: every word vector here is ${vocab.length}-dimensional.`
          ),
        ].join(""),
      },
      {
        stage: 3,
        label: "One-hot vectors (identity matrix over the vocabulary)",
        html: [
          matrixTable(vocab, vocab, identityRows, { heat: "amber", corner: "word \\ index" }),
          `<div class="sentence-list">${sentenceHtml}</div>`,
        ].join(""),
      },
      {
        stage: 4,
        label: "Cross-check with sklearn's OneHotEncoder",
        html: [
          matrixTable(
            data.sklearnCheck.inputWords.map((w, i) => `${w} (#${i})`),
            data.sklearnCheck.categories,
            data.sklearnCheck.matrix,
            { heat: "violet", corner: "token" }
          ),
          note(
            `Two real limitations: the vector size grows with every new word in the vocabulary, and a one-hot vector can't tell you that "cat" and "dog" are more alike than "cat" and "umbrella" — there's no notion of relationship or context.`
          ),
        ].join(""),
      },
    ]);
  }

  // ============================================================
  // 2. COUNT VECTORIZER
  // ============================================================
  async function runCount() {
    const corpus = linesFromTextarea("count-corpus");
    const stopWords = $("#count-stopwords").checked;
    const maxFeatures = $("#count-maxfeatures").value || null;
    const newDoc = $("#count-newdoc").value.trim();

    const data = await requestOrReport(
      "/api/count-vectorizer",
      { corpus, stopWords, maxFeatures, newDoc },
      "out-count"
    );
    if (data === undefined) return;

    const rowLabels = corpus.map((_, i) => `Doc ${i + 1}`);
    const items = [
      { stage: 0, label: "Raw corpus", html: quoteList(corpus) },
      { stage: 1, label: "Tokenize every document", html: docTokenRows(data.tokenizedDocs) },
      {
        stage: 2,
        label: `CountVectorizer.fit() → vocabulary (${data.vocabulary.length} terms)`,
        html: [
          chipList(data.vocabulary, "teal"),
          data.settings.stopWords ? note("English stop words removed before fitting.") : "",
          data.settings.maxFeatures
            ? note(`Limited to the top ${escapeHtml(data.settings.maxFeatures)} most frequent terms.`)
            : "",
        ].join(""),
      },
      {
        stage: 3,
        label: "Document–term count matrix",
        html: matrixTable(rowLabels, data.vocabulary, data.matrix, { heat: "teal", corner: "" }),
      },
    ];

    if (data.newDocResult) {
      items.push({
        stage: 4,
        label: "Transform a brand-new document with the fitted vocabulary",
        html: [
          `<p class="quote">“${escapeHtml(data.newDocResult.doc)}”</p>`,
          matrixTable(["New doc"], data.vocabulary, [data.newDocResult.vector], { heat: "pink" }),
          note(
            "Any word here that wasn't in the original vocabulary is simply ignored — CountVectorizer.transform() never grows the vocabulary after fit()."
          ),
        ].join(""),
      });
    } else {
      items.push({
        stage: 4,
        label: "Transform a new document",
        html: note(
          "Type a sentence into “New document to transform” above and run again to see .transform() applied to text the vectorizer never saw during .fit()."
        ),
      });
    }

    reveal("out-count", "tape-count", items);
  }

  // ============================================================
  // 3. BAG OF WORDS
  // ============================================================
  async function runBow() {
    const corpus = linesFromTextarea("bow-corpus");
    const data = await requestOrReport("/api/bow", { corpus }, "out-bow");
    if (data === undefined) return;

    const rowLabels = corpus.map((_, i) => `Doc ${i + 1}`);
    reveal("out-bow", "tape-bow", [
      { stage: 0, label: "Raw corpus", html: quoteList(corpus) },
      { stage: 1, label: "Tokenize", html: docTokenRows(data.tokenizedDocs) },
      {
        stage: 2,
        label: `Build the bag — ${data.vocabulary.length} unique words across the corpus`,
        html: chipList(data.vocabulary, "teal"),
      },
      {
        stage: 3,
        label: "Bag-of-Words frequency matrix (hand-rolled counter)",
        html: matrixTable(rowLabels, data.vocabulary, data.matrix, { heat: "teal" }),
      },
      {
        stage: 4,
        label: "Binary BoW — presence (1) vs. absence (0), not raw counts",
        html: matrixTable(rowLabels, data.binaryVocabulary, data.binaryMatrix, { heat: "amber" }),
      },
      {
        stage: 5,
        label: "Cosine similarity between documents, derived from the BoW vectors",
        html: [
          matrixTable(rowLabels, rowLabels, data.cosineSimilarity, { heat: "pink" }),
          note(
            "1.0 means identical word-frequency profiles; 0 means no shared vocabulary at all. Documents that share more frequent words end up closer together."
          ),
        ].join(""),
      },
    ]);
  }

  // ============================================================
  // 4. N-GRAMS
  // ============================================================
  async function runNgrams() {
    const sentence = $("#ngrams-sentence").value.trim();
    const corpus = linesFromTextarea("ngrams-corpus");
    const data = await requestOrReport("/api/ngrams", { sentence, corpus }, "out-ngrams");
    if (data === undefined) return;

    const rowLabels = data.corpus.map((_, i) => `Doc ${i + 1}`);
    const m = data.ngramMatrices;

    /** Heading plus document–term table for one n-gram configuration. */
    const ngramSection = (entry, color) =>
      [
        `<h4 class="matrix-title">${escapeHtml(entry.label)} — ${entry.vocabulary.length} terms</h4>`,
        matrixTable(rowLabels, entry.vocabulary, entry.matrix, { heat: color }),
      ].join("");

    const charSample = data.charLevel.sample.map((s) => `"${escapeHtml(s)}"`).join(", ");

    reveal("out-ngrams", "tape-ngrams", [
      { stage: 0, label: "Sentence", html: `<p class="quote">"${escapeHtml(data.sentence)}"</p>` },
      {
        stage: 1,
        label: "Unigrams (N=1) — identical to plain tokenization",
        html: chipList(data.manual.unigrams, "teal"),
      },
      {
        stage: 2,
        label: "Bigrams (N=2) — consecutive word pairs",
        html: chipList(data.manual.bigrams, "amber"),
      },
      {
        stage: 3,
        label: "Trigrams (N=3) — consecutive word triples",
        html: chipList(data.manual.trigrams, "pink"),
      },
      {
        stage: 4,
        label: "N-gram document–term matrices over the corpus",
        html: [
          ngramSection(m.unigrams, "teal"),
          ngramSection(m.bigrams, "amber"),
          ngramSection(m.uni_bi, "pink"),
          note(
            `Character-level bigrams+trigrams (analyzer='char_wb') on the same corpus produce ${escapeHtml(
              data.charLevel.vocabularySize
            )} features. Sample: ${charSample} …`
          ),
        ].join(""),
      },
    ]);
  }

  // ============================================================
  // 5. TF-IDF
  // ============================================================
  async function runTfidf() {
    const corpus = linesFromTextarea("tfidf-corpus");
    const data = await requestOrReport("/api/tfidf", { corpus }, "out-tfidf");
    if (data === undefined) return;

    const rowLabels = corpus.map((_, i) => `Doc ${i + 1}`);

    /** One block per document: its quoted text plus a key/value list of `field`. */
    const perDocBlocks = (field, color) =>
      data.manualPerDoc
        .map(
          (d, i) =>
            `<div class="doc-block"><p class="doc-title">Doc ${i + 1}: “${escapeHtml(d.doc)}”</p>${kvList(
              Object.entries(d[field]),
              color
            )}</div>`
        )
        .join("");

    const tfHtml = perDocBlocks("tf", "teal");
    const idfHtml = kvList(Object.entries(data.idf), "amber");
    const tfidfHtml = perDocBlocks("tfidf", "pink");
    const topWordsHtml = data.topWords
      .map(
        (d) =>
          `<div class="doc-block"><p class="quote">“${escapeHtml(d.doc)}”</p>${chipList(
            d.top.map((t) => `${t.word} · ${t.score}`),
            "amber"
          )}</div>`
      )
      .join("");

    reveal("out-tfidf", "tape-tfidf", [
      { stage: 0, label: "Raw corpus", html: quoteList(corpus) },
      {
        stage: 1,
        label: "Term Frequency — count(t, d) / total words in d",
        html: `<div class="doc-grid">${tfHtml}</div>`,
      },
      {
        stage: 2,
        label: "Inverse Document Frequency — log(N / (1 + df(t))) + 1, across the whole corpus",
        html: idfHtml,
      },
      {
        stage: 3,
        label: "TF × IDF, computed by hand per document (non-zero terms only)",
        html: `<div class="doc-grid">${tfidfHtml}</div>`,
      },
      {
        stage: 4,
        label: "scikit-learn's TfidfVectorizer, for comparison",
        html: [
          matrixTable(rowLabels, data.sklearn.vocabulary, data.sklearn.matrix, { heat: "violet" }),
          note(
            "sklearn additionally L2-normalizes each row, so the exact numbers differ slightly from the hand-rolled version above — the ranking of important words matches."
          ),
        ].join(""),
      },
      { stage: 5, label: "Top 3 highest-weighted words per document", html: topWordsHtml },
    ]);
  }

  // ============================================================
  // 6. WORD EMBEDDINGS
  // ============================================================

  /**
   * Draw the 2D-projected word vectors as an inline SVG scatter plot.
   * @param {Array<{x: number, y: number, word?: string}>} points -
   *   NOTE(review): the label field is assumed to be `word`; confirm against
   *   the `pcaPoints` payload produced by the backend.
   * @returns {string} HTML.
   */
  function renderScatter(points) {
    const w = 560;
    const h = 340;
    const pad = 40;
    if (!Array.isArray(points) || !points.length) {
      return note("Not enough plottable words.");
    }

    const xs = points.map((p) => p.x);
    const ys = points.map((p) => p.y);
    const minX = Math.min(...xs);
    const maxX = Math.max(...xs);
    const minY = Math.min(...ys);
    const maxY = Math.max(...ys);
    // "|| 1" keeps the scale finite when every point shares a coordinate.
    const spanX = maxX - minX || 1;
    const spanY = maxY - minY || 1;
    const colors = [PALETTE.amber, PALETTE.teal, PALETTE.pink, PALETTE.violet];
    const sx = (x) => pad + ((x - minX) / spanX) * (w - 2 * pad);
    // SVG's y axis points down, so flip it to keep larger values on top.
    const sy = (y) => h - pad - ((y - minY) / spanY) * (h - 2 * pad);

    const marks = points
      .map((p, i) => {
        const cx = sx(p.x);
        const cy = sy(p.y);
        const rgb = colors[i % colors.length];
        const label = p.word == null ? "" : p.word;
        return [
          `<circle cx="${cx.toFixed(1)}" cy="${cy.toFixed(1)}" r="4" fill="rgb(${rgb})"></circle>`,
          `<text x="${(cx + 7).toFixed(1)}" y="${(cy + 4).toFixed(1)}" font-size="11" fill="rgb(${rgb})">${escapeHtml(
            label
          )}</text>`,
        ].join("");
      })
      .join("");

    const svg = `<svg viewBox="0 0 ${w} ${h}" width="100%" role="img" aria-label="2D projection of the trained word vectors">${marks}</svg>`;

    return [
      `<div class="scatter">${svg}</div>`,
      note(
        "PCA reduces the 50-dimensional Word2Vec space to 2D so it can be drawn — distance and direction on this plot are only an approximation of similarity in the real, high-dimensional space."
      ),
    ].join("");
  }

  async function runEmbeddings() {
    const sentences = linesFromTextarea("embed-sentences");
    const data = await requestOrReport("/api/embeddings", { sentences }, "out-embeddings");
    if (data === undefined) return;

    const simRows = data.similarities.map((s) => {
      const pair = escapeHtml(s.pair.join(" ↔ "));
      return s.error
        ? `<tr><td>${pair}</td><td colspan="2">${escapeHtml(s.error)}</td></tr>`
        : `<tr><td>${pair}</td><td>${formatNum(s.skipgram)}</td><td>${formatNum(s.cbow)}</td></tr>`;
    });

    const mostSimHtml = Object.entries(data.mostSimilar)
      .map(
        ([word, sims]) =>
          `<div class="doc-row"><strong>${escapeHtml(word)}</strong> is most similar to: ${chipList(
            sims.map((s) => `${s.word} · ${s.score}`),
            "teal"
          )}</div>`
      )
      .join("");

    const items = [
      {
        stage: 0,
        label: `Training sentences (tokenized) — vocabulary of ${data.vocabSize} words`,
        html: chipList(data.sentences, "muted", 0.03),
      },
      {
        stage: 1,
        label: `Train Word2Vec (Skip-gram & CBOW) — finished in ${data.trainSeconds}s`,
        html: data.sampleWord
          ? `<p>First 10 of 50 dimensions for “${escapeHtml(data.sampleWord)}”:</p>${vectorLine(data.sampleVector)}`
          : note("No vocabulary produced — add a few more training sentences."),
      },
      {
        stage: 2,
        label: "Cosine similarity between word pairs, Skip-gram vs CBOW",
        html: [
          `<table class="matrix sim-table">`,
          `<thead><tr><th>pair</th><th>skip-gram</th><th>cbow</th></tr></thead>`,
          `<tbody>${simRows.join("")}</tbody>`,
          `</table>`,
        ].join(""),
      },
      {
        stage: 3,
        label: "Most similar words (Skip-gram model)",
        html:
          mostSimHtml ||
          note(
            `None of the probe words were found in this vocabulary — try adding "cat", "king" or "paris" to your sentences.`
          ),
      },
      {
        stage: 4,
        label: "All trained vectors, projected to 2D with PCA",
        html: renderScatter(data.pcaPoints),
      },
    ];

    if (data.fastText && !data.fastText.error) {
      items.push({
        stage: 5,
        label: "FastText: vectors for out-of-vocabulary words via character n-grams",
        html: [
          `<p>“${escapeHtml(data.fastText.inVocabWord)}” — seen during training</p>`,
          vectorLine(data.fastText.inVocabVector),
          `<p>“${escapeHtml(data.fastText.oovWord)}” — never seen during training</p>`,
          vectorLine(data.fastText.oovVector),
          note(
            `Why this works: ${escapeHtml(
              data.fastText.note
            )} Word2Vec, by contrast, would raise a KeyError for an unseen word — it has no concept of subword structure.`
          ),
        ].join(""),
      });
    }

    reveal("out-embeddings", "tape-embeddings", items);
  }

  // ============================================================
  // Navigation + wiring
  // ============================================================

  /** Section key → function that fetches and renders that section's demo. */
  const RUNNERS = {
    onehot: runOnehot,
    count: runCount,
    bow: runBow,
    ngrams: runNgrams,
    tfidf: runTfidf,
    embeddings: runEmbeddings,
  };

  /** Input element id → default text restored by the reset buttons. */
  const DEFAULT_TEXT = {
    "onehot-corpus": "I love NLP\nNLP is fun\nI love coding",
    "count-corpus": "I love NLP and I love Python\nNLP is amazing and fun\nPython is great for NLP",
    "bow-corpus": "the cat sat on the mat\nthe dog sat on the log\nthe cat and the dog are friends",
    "ngrams-sentence": "I love studying Natural Language Processing",
    "ngrams-corpus": "I love NLP and machine learning\nmachine learning is part of AI\nNLP is a branch of AI",
    "tfidf-corpus":
      "I love NLP and machine learning\nmachine learning is part of AI\nNLP is a branch of AI\nI love AI and deep learning",
    "embed-sentences":
      "the cat sat on the mat\nthe dog ran on the grass\ncats and dogs are pets\ni love my cat\ni love my dog\nking and queen are royalty\nman and woman are humans\nparis is the capital of france\nberlin is the capital of germany",
  };

  /** Sections whose demo has already been run automatically on first visit. */
  const autoRan = new Set();

  /**
   * Run the demo registered under `key`. Never rejects: anything thrown while
   * reading inputs or rendering the response (missing field, missing element)
   * is logged and shown in the section's `out-<key>` container.
   */
  async function runSection(key) {
    const runner = RUNNERS[key];
    if (typeof runner !== "function") {
      console.warn(`No runner registered for "${key}"`);
      return;
    }
    try {
      await runner();
    } catch (err) {
      console.error(err);
      showError(`out-${key}`, err && err.message ? err.message : String(err));
    }
  }

  /** Jump to the top of the page without smooth-scroll animation. */
  function scrollToTop() {
    try {
      window.scrollTo({ top: 0, behavior: "instant" });
    } catch (err) {
      // Engines that reject the "instant" enum value or the options form.
      window.scrollTo(0, 0);
    }
  }

  /** Show the section named `target`, highlight its nav item and auto-run it once. */
  function activateSection(target) {
    $$(".section").forEach((s) => s.classList.toggle("active", s.id === `sec-${target}`));
    $$(".nav-item").forEach((b) => b.classList.toggle("active", b.dataset.target === target));
    scrollToTop();
    if (RUNNERS[target] && !autoRan.has(target)) {
      autoRan.add(target);
      runSection(target); // never rejects, so it is safe not to await
    }
  }

  /** Make nav items and overview cards switch sections on click. */
  function wireNav() {
    $$(".nav-item, .tech-card").forEach((btn) => {
      btn.addEventListener("click", () => activateSection(btn.dataset.target));
    });
  }

  /** Hook up the "run" buttons ([data-run]) and "reset" buttons ([data-reset]). */
  function wireRunButtons() {
    $$("[data-run]").forEach((btn) => {
      const key = btn.dataset.run;
      btn.addEventListener(
        "click",
        withLoading(btn, () => runSection(key))
      );
    });

    $$("[data-reset]").forEach((btn) => {
      btn.addEventListener("click", () => {
        const key = btn.dataset.reset;
        // Restore every default whose input id belongs to this section.
        Object.keys(DEFAULT_TEXT)
          .filter((id) => id.startsWith(key + "-"))
          .forEach((id) => {
            const el = document.getElementById(id);
            if (el) el.value = DEFAULT_TEXT[id];
          });
        if (key === "count") {
          $("#count-maxfeatures").value = "";
          $("#count-newdoc").value = "";
          $("#count-stopwords").checked = false;
        }
        runSection(key);
      });
    });
  }

  /** Attach all event listeners. */
  function init() {
    wireNav();
    wireRunButtons();
  }

  // Browser-only script: without a DOM there is nothing to wire.
  if (typeof document === "undefined") return;

  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", init);
  } else {
    // The script was loaded after DOMContentLoaded already fired.
    init();
  }
})();