""" Text Vectorization Lab — backend ================================= A small Flask API that actually executes the same scikit-learn / numpy / gensim code used to teach One-Hot Encoding, Count Vectorizer, Bag-of-Words, N-grams, TF-IDF, and Word2Vec/FastText word embeddings. Every endpoint below is a thin, JSON-friendly wrapper around the exact computations from the reference notebook — nothing is hard-coded or faked. The frontend (static/js/main.js) calls these endpoints and animates the intermediate steps so the visitor can watch each technique build up. """ import math import re import time import numpy as np import pandas as pd from flask import Flask, jsonify, render_template, request from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import OneHotEncoder from sklearn.decomposition import PCA app = Flask(__name__) # ---------------------------------------------------------------------------- # Helpers # ---------------------------------------------------------------------------- def tokenize(text: str): """Simple whitespace/punctuation tokenizer used across the manual demos.""" return re.findall(r"[A-Za-z0-9']+", text.lower()) def clean_corpus(raw): """Normalize whatever the frontend sends into a list[str] of non-empty docs.""" if raw is None: return [] if isinstance(raw, str): raw = raw.split("\n") return [s.strip() for s in raw if isinstance(s, str) and s.strip()] DEFAULTS = { "onehot": ["I love NLP", "NLP is fun", "I love coding"], "count": [ "I love NLP and I love Python", "NLP is amazing and fun", "Python is great for NLP", ], "bow": [ "the cat sat on the mat", "the dog sat on the log", "the cat and the dog are friends", ], "ngrams_sentence": "I love studying Natural Language Processing", "ngrams_corpus": [ "I love NLP and machine learning", "machine learning is part of AI", "NLP is a branch of AI", ], "tfidf": [ "I love NLP and machine learning", 
"machine learning is part of AI", "NLP is a branch of AI", "I love AI and deep learning", ], "embeddings_sentences": [ "the cat sat on the mat", "the dog ran on the grass", "cats and dogs are pets", "i love my cat", "i love my dog", "king and queen are royalty", "man and woman are humans", "paris is the capital of france", "berlin is the capital of germany", ], } # ---------------------------------------------------------------------------- # Page # ---------------------------------------------------------------------------- @app.route("/") def index(): return render_template("index.html") # ---------------------------------------------------------------------------- # 1. One-Hot Encoding # ---------------------------------------------------------------------------- @app.route("/api/onehot", methods=["POST"]) def api_onehot(): payload = request.get_json(force=True) or {} corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["onehot"] tokenized = [tokenize(s) for s in corpus] vocabulary = sorted(set(tok for sent in tokenized for tok in sent)) word_to_idx = {w: i for i, w in enumerate(vocabulary)} # Manual / NumPy one-hot, identical approach to the notebook's # one_hot_encode() helper, run once per unique vocabulary word. vectors = {} for word in vocabulary: vec = np.zeros(len(vocabulary), dtype=int) vec[word_to_idx[word]] = 1 vectors[word] = vec.tolist() # sklearn OneHotEncoder cross-check (Method 2 in the notebook) so the # numbers are guaranteed to match what scikit-learn itself produces. 
flat_words = np.array([tok for sent in tokenized for tok in sent]).reshape(-1, 1) encoder = OneHotEncoder(sparse_output=False) sk_matrix = encoder.fit_transform(flat_words) if len(flat_words) else np.empty((0, 0)) sentences_out = [] for sent, toks in zip(corpus, tokenized): sentences_out.append({ "sentence": sent, "tokens": toks, "vectors": [vectors[t] for t in toks], }) return jsonify({ "corpus": corpus, "vocabulary": vocabulary, "vectorLength": len(vocabulary), "vectors": vectors, "sentences": sentences_out, "sklearnCheck": { "categories": encoder.categories_[0].tolist() if len(flat_words) else [], "matrix": sk_matrix.astype(int).tolist(), "inputWords": flat_words.flatten().tolist(), }, }) # ---------------------------------------------------------------------------- # 2. Count Vectorizer # ---------------------------------------------------------------------------- @app.route("/api/count-vectorizer", methods=["POST"]) def api_count_vectorizer(): payload = request.get_json(force=True) or {} corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["count"] use_stopwords = bool(payload.get("stopWords")) max_features = payload.get("maxFeatures") new_doc = (payload.get("newDoc") or "").strip() kwargs = {} if use_stopwords: kwargs["stop_words"] = "english" if max_features: try: kwargs["max_features"] = int(max_features) except (TypeError, ValueError): pass cv = CountVectorizer(**kwargs) X = cv.fit_transform(corpus) vocabulary = cv.get_feature_names_out().tolist() matrix = X.toarray().tolist() new_doc_result = None if new_doc: X_new = cv.transform([new_doc]) new_doc_result = { "doc": new_doc, "vector": X_new.toarray()[0].tolist(), } return jsonify({ "corpus": corpus, "tokenizedDocs": [tokenize(s) for s in corpus], "vocabulary": vocabulary, "matrix": matrix, "newDocResult": new_doc_result, "settings": {"stopWords": use_stopwords, "maxFeatures": max_features}, }) # ---------------------------------------------------------------------------- # 3. 
# ----------------------------------------------------------------------------
# 3. Bag-of-Words
# ----------------------------------------------------------------------------
def _bow_vector(tokens, vocab):
    """Return how often each *vocab* word occurs in *tokens*, in *vocab* order."""
    counts = dict.fromkeys(vocab, 0)
    for tok in tokens:
        if tok in counts:
            counts[tok] += 1
    return list(counts.values())


@app.route("/api/bow", methods=["POST"])
def api_bow():
    """Build manual and scikit-learn Bag-of-Words views of the posted corpus.

    Request JSON: ``{"corpus": <str | list[str]>}``; a missing or empty corpus
    falls back to ``DEFAULTS["bow"]``.

    Response JSON: the manual vocabulary and count matrix (built with
    ``tokenize``), the binary ``CountVectorizer`` vocabulary and matrix, and
    the pairwise cosine similarity of the count vectors rounded to 4 decimals.
    Returns ``{"error": ...}`` with status 400 when scikit-learn cannot build
    a vocabulary from the corpus.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        # A JSON body that is not an object carries no usable settings.
        payload = {}
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["bow"]

    tokenized = [tokenize(s) for s in corpus]
    vocabulary = sorted({tok for sent in tokenized for tok in sent})
    matrix = [_bow_vector(toks, vocabulary) for toks in tokenized]

    cv_binary = CountVectorizer(binary=True)
    cv_sim = CountVectorizer()
    try:
        X_binary = cv_binary.fit_transform(corpus)
        X_sim = cv_sim.fit_transform(corpus)
    except ValueError as exc:
        # Raised when the vectorizer's own tokenization keeps no term at all,
        # even though the manual tokenizer above may have found some.
        return jsonify({"error": f"Could not build a vocabulary: {exc}"}), 400

    binary_vocab = cv_binary.get_feature_names_out().tolist()
    binary_matrix = X_binary.toarray().tolist()
    sim_matrix = cosine_similarity(X_sim).round(4).tolist()

    return jsonify({
        "corpus": corpus,
        "tokenizedDocs": tokenized,
        "vocabulary": vocabulary,
        "matrix": matrix,
        "binaryVocabulary": binary_vocab,
        "binaryMatrix": binary_matrix,
        "cosineSimilarity": sim_matrix,
    })
# ----------------------------------------------------------------------------
# 4. N-grams
# ----------------------------------------------------------------------------
def generate_ngrams(text, n):
    """Return the word n-grams of *text* as space-joined strings.

    *text* is tokenized with ``tokenize``; texts with fewer than *n* tokens
    yield an empty list.

    Raises:
        ValueError: if *n* is smaller than 1 (a zero-length window would
            otherwise produce a list of empty strings).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    tokens = tokenize(text)
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def _fit_counts(corpus, **cv_kwargs):
    """Fit a ``CountVectorizer`` and return ``(vocabulary, matrix)`` as lists.

    When scikit-learn cannot extract a single term (for example bigrams from
    one-word documents) the vocabulary is empty and every row is empty.
    """
    vectorizer = CountVectorizer(**cv_kwargs)
    try:
        counts = vectorizer.fit_transform(corpus)
    except ValueError:
        return [], [[] for _ in corpus]
    return vectorizer.get_feature_names_out().tolist(), counts.toarray().tolist()


@app.route("/api/ngrams", methods=["POST"])
def api_ngrams():
    """Produce manual and scikit-learn n-gram views.

    Request JSON keys (both optional):
        sentence: text for the manual uni/bi/trigram demo; falls back to
                  ``DEFAULTS["ngrams_sentence"]``.
        corpus:   str or list[str] for the ``CountVectorizer`` matrices; falls
                  back to ``DEFAULTS["ngrams_corpus"]``.

    Response JSON: the manual n-grams of the sentence, one vocabulary/matrix
    per word n-gram range, and the size plus first 15 entries of the
    character-level (``char_wb``, 2–3) vocabulary.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        # A JSON body that is not an object carries no usable settings.
        payload = {}
    sentence = payload.get("sentence")
    sentence = sentence.strip() if isinstance(sentence, str) else ""
    sentence = sentence or DEFAULTS["ngrams_sentence"]
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["ngrams_corpus"]

    manual = {
        "unigrams": generate_ngrams(sentence, 1),
        "bigrams": generate_ngrams(sentence, 2),
        "trigrams": generate_ngrams(sentence, 3),
    }

    ngram_matrices = {}
    for key, ngram_range, label in [
        ("unigrams", (1, 1), "Unigrams"),
        ("bigrams", (2, 2), "Bigrams"),
        ("uni_bi", (1, 2), "Unigrams + Bigrams"),
    ]:
        vocab, matrix = _fit_counts(corpus, ngram_range=ngram_range)
        ngram_matrices[key] = {
            "label": label,
            "vocabulary": vocab,
            "matrix": matrix,
        }

    char_vocab, _ = _fit_counts(corpus, analyzer="char_wb", ngram_range=(2, 3))

    return jsonify({
        "sentence": sentence,
        "manual": manual,
        "corpus": corpus,
        "ngramMatrices": ngram_matrices,
        "charLevel": {
            "vocabularySize": len(char_vocab),
            "sample": char_vocab[:15],
        },
    })
# ----------------------------------------------------------------------------
# 5. TF-IDF
# ----------------------------------------------------------------------------
def compute_tf(tokens):
    """Return the relative term frequency of every token in *tokens*.

    Each value is ``count / len(tokens)``; an empty token list yields ``{}``.
    """
    if not tokens:
        return {}
    counts = {}
    for w in tokens:
        counts[w] = counts.get(w, 0) + 1
    total = len(tokens)
    return {w: c / total for w, c in counts.items()}


def compute_idf(tokenized_docs):
    """Return the smoothed inverse document frequency of every word.

    Uses scikit-learn's ``smooth_idf`` formula
    ``ln((1 + N) / (1 + df)) + 1``, where ``N`` is the number of documents and
    ``df`` the number of documents containing the word.  An empty corpus
    yields ``{}``.
    """
    n_docs = len(tokenized_docs)
    doc_freq = {}
    for doc in tokenized_docs:
        # set() so a word repeated inside one document is counted once.
        for w in set(doc):
            doc_freq[w] = doc_freq.get(w, 0) + 1
    return {
        w: math.log((1 + n_docs) / (1 + df)) + 1  # sklearn-style smoothing
        for w, df in doc_freq.items()
    }


@app.route("/api/tfidf", methods=["POST"])
def api_tfidf():
    """Compute manual and scikit-learn TF-IDF for the posted corpus.

    Request JSON: ``{"corpus": <str | list[str]>}``; a missing or empty corpus
    falls back to ``DEFAULTS["tfidf"]``.

    Response JSON: the manual IDF table (highest first), the manual TF and
    TF-IDF scores per document, the ``TfidfVectorizer`` vocabulary and matrix
    (rounded to 4 decimals) and, per document, up to three highest-scoring
    non-zero terms.  Returns ``{"error": ...}`` with status 400 when
    scikit-learn cannot build a vocabulary from the corpus.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        # A JSON body that is not an object carries no usable settings.
        payload = {}
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["tfidf"]

    tokenized_docs = [tokenize(s) for s in corpus]
    idf = compute_idf(tokenized_docs)

    manual_per_doc = []
    for doc, tokens in zip(corpus, tokenized_docs):
        tf = compute_tf(tokens)
        scores = {w: round(freq * idf[w], 4) for w, freq in tf.items()}
        manual_per_doc.append({
            "doc": doc,
            "tf": {w: round(v, 4) for w, v in tf.items()},
            "tfidf": dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)),
        })

    tfidf_vec = TfidfVectorizer()
    try:
        X_tfidf = tfidf_vec.fit_transform(corpus)
    except ValueError as exc:
        # Raised when the vectorizer's own tokenization keeps no term at all.
        return jsonify({"error": f"Could not build a vocabulary: {exc}"}), 400

    feature_names = tfidf_vec.get_feature_names_out().tolist()
    arr = X_tfidf.toarray()  # densify once; reused for the matrix and top words
    sk_matrix = arr.round(4).tolist()

    top_words = []
    for doc, row in zip(corpus, arr):
        # Indices of the three largest scores, best first.
        top_idx = row.argsort()[-3:][::-1]
        words = [
            {"word": feature_names[j], "score": round(float(row[j]), 4)}
            for j in top_idx
            if row[j] > 0
        ]
        top_words.append({"doc": doc, "top": words})

    return jsonify({
        "corpus": corpus,
        "idf": dict(sorted(idf.items(), key=lambda x: x[1], reverse=True)),
        "manualPerDoc": manual_per_doc,
        "sklearn": {"vocabulary": feature_names, "matrix": sk_matrix},
        "topWords": top_words,
    })
# ----------------------------------------------------------------------------
# 6. Word Embeddings (Word2Vec + FastText via gensim)
# ----------------------------------------------------------------------------
@app.route("/api/embeddings", methods=["POST"])
def api_embeddings():
    """Train small Word2Vec / FastText models on the posted sentences.

    Request JSON keys (all optional):
        sentences: str or list[str]; falls back to
                   ``DEFAULTS["embeddings_sentences"]``.
        wordPairs: list of ``[word, word]`` pairs to compare; entries that are
                   not a pair of two strings are reported with an ``error``.
        plotWords: list of words to project to 2-D with PCA; non-string
                   entries and out-of-vocabulary words are skipped.

    Response JSON: the vocabulary, training time, a 10-value sample of one
    vector, skip-gram/CBOW similarities per pair, the three nearest
    neighbours of "cat", "king" and "paris" (when present), the PCA points,
    and a FastText in-vocabulary vs. out-of-vocabulary demo.

    Returns ``{"error": ...}`` with status 400 when no sentence contains a
    token, and with status 500 when gensim is not installed.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        # A JSON body that is not an object carries no usable settings.
        payload = {}
    raw_sentences = clean_corpus(payload.get("sentences")) or DEFAULTS["embeddings_sentences"]
    sentences = [tokenize(s) for s in raw_sentences]
    sentences = [s for s in sentences if s]
    if not sentences:
        # Training cannot build a vocabulary from zero tokens.
        return jsonify({"error": "the sentences contain no words to train on"}), 400

    word_pairs = payload.get("wordPairs")
    if not word_pairs or not isinstance(word_pairs, list):
        word_pairs = [
            ["cat", "dog"], ["cat", "mat"], ["king", "queen"], ["paris", "berlin"],
        ]
    plot_words_req = payload.get("plotWords")
    if not plot_words_req or not isinstance(plot_words_req, list):
        plot_words_req = [
            "cat", "dog", "king", "queen", "paris", "berlin", "man", "woman",
        ]

    try:
        from gensim.models import Word2Vec, FastText
    except ImportError:
        return jsonify({"error": "gensim is not installed on the server. "
                                  "Run: pip install gensim"}), 500

    # perf_counter() is monotonic, unlike time.time(), so the measured
    # duration cannot be skewed by wall-clock adjustments.
    t0 = time.perf_counter()
    model_sg = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
                        sg=1, epochs=200, seed=42)
    model_cbow = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
                          sg=0, epochs=200, seed=42)
    train_seconds = round(time.perf_counter() - t0, 3)

    vocab = sorted(model_sg.wv.key_to_index.keys())

    similarities = []
    for pair in word_pairs:
        is_word_pair = (
            isinstance(pair, (list, tuple))
            and len(pair) == 2
            and all(isinstance(w, str) for w in pair)
        )
        if not is_word_pair:
            similarities.append({"pair": pair, "error": "expected a pair of two words"})
            continue
        w1, w2 = pair
        if w1 in model_sg.wv and w2 in model_sg.wv:
            similarities.append({
                "pair": [w1, w2],
                "skipgram": round(float(model_sg.wv.similarity(w1, w2)), 4),
                "cbow": round(float(model_cbow.wv.similarity(w1, w2)), 4),
            })
        else:
            similarities.append({"pair": [w1, w2], "error": "word not in vocabulary"})

    most_similar = {}
    for w in ["cat", "king", "paris"]:
        if w in model_sg.wv:
            most_similar[w] = [
                {"word": ww, "score": round(float(sc), 4)}
                for ww, sc in model_sg.wv.most_similar(w, topn=3)
            ]

    # Only strings are looked up, so a numeric entry is never used as an index.
    plot_words = [
        w for w in plot_words_req if isinstance(w, str) and w in model_sg.wv
    ]
    pca_points = []
    if len(plot_words) >= 2:
        vectors = np.array([model_sg.wv[w] for w in plot_words])
        coords = PCA(n_components=2, random_state=42).fit_transform(vectors)
        pca_points = [
            {"word": w, "x": round(float(c[0]), 4), "y": round(float(c[1]), 4)}
            for w, c in zip(plot_words, coords)
        ]

    sample_word = plot_words[0] if plot_words else (vocab[0] if vocab else None)
    sample_vector = (
        model_sg.wv[sample_word][:10].round(4).tolist() if sample_word else []
    )

    fasttext_demo = None
    try:
        ft = FastText(sentences, vector_size=50, window=3, min_count=1,
                      epochs=100, seed=42)
        in_vocab_word = sample_word
        oov_word = (in_vocab_word + "like") if in_vocab_word else "catlike"
        fasttext_demo = {
            "inVocabWord": in_vocab_word,
            "inVocabVector": ft.wv[in_vocab_word][:10].round(4).tolist() if in_vocab_word else [],
            "oovWord": oov_word,
            "oovVector": ft.wv[oov_word][:10].round(4).tolist(),
            "note": f"'{oov_word}' never appeared during training, but FastText "
                    f"still produces a vector from its character n-grams.",
        }
    except Exception as exc:  # pragma: no cover - defensive only
        # Deliberate best-effort: the FastText demo is optional, so a failure
        # is reported in the payload instead of failing the whole request.
        fasttext_demo = {"error": str(exc)}

    return jsonify({
        "sentences": raw_sentences,
        "vocabulary": vocab,
        "vocabSize": len(vocab),
        "trainSeconds": train_seconds,
        "sampleWord": sample_word,
        "sampleVector": sample_vector,
        "similarities": similarities,
        "mostSimilar": most_similar,
        "pcaPoints": pca_points,
        "fastText": fasttext_demo,
    })


if __name__ == "__main__":
    import os

    port = int(os.environ.get("PORT", 5000))
    # Debug mode is opt-in (FLASK_DEBUG=1): the server listens on all
    # interfaces, and the interactive debugger lets anyone who can reach it
    # execute arbitrary code.
    debug = os.environ.get("FLASK_DEBUG", "0") == "1"
    app.run(debug=debug, host="0.0.0.0", port=port)