Spaces:
Running
Running
| """ | |
| Text Vectorization Lab — backend | |
| ================================= | |
| A small Flask API that actually executes the same scikit-learn / numpy / | |
| gensim code used to teach One-Hot Encoding, Count Vectorizer, Bag-of-Words, | |
| N-grams, TF-IDF, and Word2Vec/FastText word embeddings. | |
| Every endpoint below is a thin, JSON-friendly wrapper around the exact | |
| computations from the reference notebook — nothing is hard-coded or faked. | |
| The frontend (static/js/main.js) calls these endpoints and animates the | |
| intermediate steps so the visitor can watch each technique build up. | |
| """ | |
| import math | |
| import re | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| from flask import Flask, jsonify, render_template, request | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.preprocessing import OneHotEncoder | |
| from sklearn.decomposition import PCA | |
| app = Flask(__name__) | |
| # ---------------------------------------------------------------------------- | |
| # Helpers | |
| # ---------------------------------------------------------------------------- | |
def tokenize(text: str):
    """Lowercase ``text`` and split it into word tokens.

    A token is a maximal run of ASCII letters, digits, or apostrophes; every
    other character acts as a separator. Used across the manual demos.
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r"[A-Za-z0-9']+", lowered)]
def clean_corpus(raw):
    """Normalize whatever the frontend sends into a list[str] of non-empty docs.

    Args:
        raw: The decoded JSON value. ``None`` yields an empty list, a string
            is split on newlines into one document per line, and any other
            iterable is scanned for string items.

    Returns:
        The stripped, non-empty string documents in their original order.
        Non-string items are dropped, and a non-iterable value (e.g. a JSON
        number or boolean) yields an empty list so callers can fall back to
        their defaults instead of crashing.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        raw = raw.split("\n")
    try:
        candidates = iter(raw)
    except TypeError:
        # JSON scalars such as 5 or true are not iterable; treat as "no corpus".
        return []
    stripped = (doc.strip() for doc in candidates if isinstance(doc, str))
    return [doc for doc in stripped if doc]
# Built-in example inputs, keyed by demo. Each API view falls back to its
# entry when the request supplies no usable corpus/sentence.
DEFAULTS = {
    # Read by api_onehot.
    "onehot": ["I love NLP", "NLP is fun", "I love coding"],
    # Read by api_count_vectorizer.
    "count": [
        "I love NLP and I love Python",
        "NLP is amazing and fun",
        "Python is great for NLP",
    ],
    # Read by api_bow.
    "bow": [
        "the cat sat on the mat",
        "the dog sat on the log",
        "the cat and the dog are friends",
    ],
    # Read by api_ngrams: the single sentence for the manual n-gram lists.
    "ngrams_sentence": "I love studying Natural Language Processing",
    # Read by api_ngrams: the corpus for the CountVectorizer n-gram matrices.
    "ngrams_corpus": [
        "I love NLP and machine learning",
        "machine learning is part of AI",
        "NLP is a branch of AI",
    ],
    # Read by api_tfidf.
    "tfidf": [
        "I love NLP and machine learning",
        "machine learning is part of AI",
        "NLP is a branch of AI",
        "I love AI and deep learning",
    ],
    # Read by api_embeddings: training sentences for Word2Vec / FastText.
    "embeddings_sentences": [
        "the cat sat on the mat",
        "the dog ran on the grass",
        "cats and dogs are pets",
        "i love my cat",
        "i love my dog",
        "king and queen are royalty",
        "man and woman are humans",
        "paris is the capital of france",
        "berlin is the capital of germany",
    ],
}
| # ---------------------------------------------------------------------------- | |
| # Page | |
| # ---------------------------------------------------------------------------- | |
def index():
    """Render and return the ``index.html`` template for the lab page.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    page = render_template("index.html")
    return page
| # ---------------------------------------------------------------------------- | |
| # 1. One-Hot Encoding | |
| # ---------------------------------------------------------------------------- | |
def api_onehot():
    """Build one-hot vectors for every vocabulary word of the posted corpus.

    Reads an optional ``corpus`` from the JSON body (falling back to
    ``DEFAULTS["onehot"]``), derives a sorted vocabulary with ``tokenize``,
    and returns the per-word vectors, the per-sentence vector sequences, and
    a scikit-learn ``OneHotEncoder`` cross-check over the flat token list.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    body = request.get_json(force=True) or {}
    corpus = clean_corpus(body.get("corpus")) or DEFAULTS["onehot"]

    docs_tokens = [tokenize(doc) for doc in corpus]
    every_token = [tok for toks in docs_tokens for tok in toks]
    vocabulary = sorted(set(every_token))
    size = len(vocabulary)

    def one_hot(position):
        # Manual / NumPy one-hot: all zeros except a single 1 at the word's
        # index in the sorted vocabulary.
        vec = np.zeros(size, dtype=int)
        vec[position] = 1
        return vec.tolist()

    vectors = {word: one_hot(pos) for pos, word in enumerate(vocabulary)}

    # sklearn OneHotEncoder cross-check over every token occurrence, one per
    # row. The encoder is only fitted when there is at least one token.
    flat_words = np.array(every_token).reshape(-1, 1)
    encoder = OneHotEncoder(sparse_output=False)
    if len(flat_words):
        sk_matrix = encoder.fit_transform(flat_words)
        categories = encoder.categories_[0].tolist()
    else:
        sk_matrix = np.empty((0, 0))
        categories = []

    sentences_out = [
        {
            "sentence": doc,
            "tokens": toks,
            "vectors": [vectors[tok] for tok in toks],
        }
        for doc, toks in zip(corpus, docs_tokens)
    ]

    sklearn_check = {
        "categories": categories,
        "matrix": sk_matrix.astype(int).tolist(),
        "inputWords": flat_words.flatten().tolist(),
    }
    return jsonify({
        "corpus": corpus,
        "vocabulary": vocabulary,
        "vectorLength": size,
        "vectors": vectors,
        "sentences": sentences_out,
        "sklearnCheck": sklearn_check,
    })
| # ---------------------------------------------------------------------------- | |
| # 2. Count Vectorizer | |
| # ---------------------------------------------------------------------------- | |
def api_count_vectorizer():
    """Fit a ``CountVectorizer`` on the posted corpus and return its matrix.

    JSON body fields (all optional):
        corpus: documents to fit on; falls back to ``DEFAULTS["count"]``.
        stopWords: truthy to enable the built-in English stop-word list.
        maxFeatures: positive integer vocabulary cap; values that cannot be
            parsed, or that are below 1, are ignored.
        newDoc: an extra document to transform with the fitted vocabulary.

    Returns:
        A JSON response with the corpus, the ``tokenize`` output per
        document, the fitted vocabulary, the document-term matrix, the
        optional new-document vector, and the settings echoed as received.
        When no term survives (for example a stop-words-only corpus) the
        vocabulary is empty and every row is an empty list, rather than the
        request failing.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    payload = request.get_json(force=True) or {}
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["count"]
    use_stopwords = bool(payload.get("stopWords"))
    max_features = payload.get("maxFeatures")
    raw_new_doc = payload.get("newDoc")
    # A non-string newDoc (number, list, ...) is treated as "not provided".
    new_doc = raw_new_doc.strip() if isinstance(raw_new_doc, str) else ""

    kwargs = {}
    if use_stopwords:
        kwargs["stop_words"] = "english"
    if max_features:
        try:
            limit = int(max_features)
        except (TypeError, ValueError, OverflowError):
            limit = None
        # CountVectorizer rejects max_features < 1, so only forward a usable cap.
        if limit is not None and limit >= 1:
            kwargs["max_features"] = limit

    cv = CountVectorizer(**kwargs)
    try:
        X = cv.fit_transform(corpus)
    except ValueError:
        # Raised when the vocabulary ends up empty, e.g. every token was a
        # stop word or shorter than two characters.
        fitted = False
        vocabulary = []
        matrix = [[] for _ in corpus]
    else:
        fitted = True
        vocabulary = cv.get_feature_names_out().tolist()
        matrix = X.toarray().tolist()

    new_doc_result = None
    if new_doc:
        vector = cv.transform([new_doc]).toarray()[0].tolist() if fitted else []
        new_doc_result = {"doc": new_doc, "vector": vector}

    return jsonify({
        "corpus": corpus,
        "tokenizedDocs": [tokenize(s) for s in corpus],
        "vocabulary": vocabulary,
        "matrix": matrix,
        "newDocResult": new_doc_result,
        "settings": {"stopWords": use_stopwords, "maxFeatures": max_features},
    })
| # ---------------------------------------------------------------------------- | |
| # 3. Bag-of-Words | |
| # ---------------------------------------------------------------------------- | |
def api_bow():
    """Return manual and scikit-learn Bag-of-Words views of the posted corpus.

    Reads an optional ``corpus`` from the JSON body (falling back to
    ``DEFAULTS["bow"]``) and responds with:
        - the manual count matrix over the sorted ``tokenize`` vocabulary,
        - the binary (presence/absence) ``CountVectorizer`` matrix,
        - the pairwise cosine similarity of the count vectors, rounded to
          four decimals.

    If scikit-learn finds no vocabulary (for example every word is a single
    character), the binary vocabulary is empty, its rows are empty lists, and
    the similarity matrix is all zeros, instead of the request failing.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    payload = request.get_json(force=True) or {}
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["bow"]
    tokenized = [tokenize(s) for s in corpus]
    vocabulary = sorted({tok for sent in tokenized for tok in sent})

    def bow_vector(tokens, vocab):
        # Dict insertion order follows ``vocab``, so the returned counts line
        # up with the vocabulary columns.
        counts = {w: 0 for w in vocab}
        for tok in tokens:
            if tok in counts:
                counts[tok] += 1
        return list(counts.values())

    matrix = [bow_vector(toks, vocabulary) for toks in tokenized]

    try:
        cv_binary = CountVectorizer(binary=True)
        X_binary = cv_binary.fit_transform(corpus)
        X_sim = CountVectorizer().fit_transform(corpus)
    except ValueError:
        # Empty vocabulary: there are no features to compare, so report no
        # binary columns and zero similarity between every pair of documents.
        binary_vocab = []
        binary_matrix = [[] for _ in corpus]
        sim_matrix = [[0.0] * len(corpus) for _ in corpus]
    else:
        binary_vocab = cv_binary.get_feature_names_out().tolist()
        binary_matrix = X_binary.toarray().tolist()
        sim_matrix = cosine_similarity(X_sim).round(4).tolist()

    return jsonify({
        "corpus": corpus,
        "tokenizedDocs": tokenized,
        "vocabulary": vocabulary,
        "matrix": matrix,
        "binaryVocabulary": binary_vocab,
        "binaryMatrix": binary_matrix,
        "cosineSimilarity": sim_matrix,
    })
| # ---------------------------------------------------------------------------- | |
| # 4. N-grams | |
| # ---------------------------------------------------------------------------- | |
def generate_ngrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    Args:
        text: Raw text; it is split with ``tokenize``.
        n: Number of tokens per n-gram.

    Returns:
        Every run of ``n`` consecutive tokens, in order. The list is empty
        when ``n`` is below 1 or the text has fewer than ``n`` tokens.
    """
    if n < 1:
        # A zero or negative window would otherwise yield empty or
        # meaningless strings.
        return []
    tokens = tokenize(text)
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
def _ngram_count_matrix(corpus, **vectorizer_kwargs):
    """Fit a CountVectorizer; return (vocabulary, rows), empty if no terms."""
    vectorizer = CountVectorizer(**vectorizer_kwargs)
    try:
        counts = vectorizer.fit_transform(corpus)
    except ValueError:
        # Empty vocabulary, e.g. bigrams requested from one-word documents.
        return [], [[] for _ in corpus]
    return vectorizer.get_feature_names_out().tolist(), counts.toarray().tolist()


def api_ngrams():
    """Return manual and CountVectorizer n-grams for the posted text.

    JSON body fields (all optional):
        sentence: text for the manual unigram/bigram/trigram lists; falls
            back to ``DEFAULTS["ngrams_sentence"]``.
        corpus: documents for the vectorizer matrices; falls back to
            ``DEFAULTS["ngrams_corpus"]``.

    The response holds the manual n-gram lists, one count matrix each for
    unigrams, bigrams, and unigrams+bigrams, and the size plus first 15
    entries of a character-level (``char_wb``, 2-3) vocabulary. A
    configuration that produces no terms reports an empty vocabulary with
    empty rows instead of failing the whole request.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    payload = request.get_json(force=True) or {}
    raw_sentence = payload.get("sentence")
    # A non-string sentence is treated as "not provided".
    sentence = raw_sentence.strip() if isinstance(raw_sentence, str) else ""
    sentence = sentence or DEFAULTS["ngrams_sentence"]
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["ngrams_corpus"]

    manual = {
        "unigrams": generate_ngrams(sentence, 1),
        "bigrams": generate_ngrams(sentence, 2),
        "trigrams": generate_ngrams(sentence, 3),
    }

    ngram_matrices = {}
    for key, ngram_range, label in [
        ("unigrams", (1, 1), "Unigrams"),
        ("bigrams", (2, 2), "Bigrams"),
        ("uni_bi", (1, 2), "Unigrams + Bigrams"),
    ]:
        vocab, rows = _ngram_count_matrix(corpus, ngram_range=ngram_range)
        ngram_matrices[key] = {
            "label": label,
            "vocabulary": vocab,
            "matrix": rows,
        }

    # Only the vocabulary of the character-level model is reported.
    cv_char = CountVectorizer(analyzer="char_wb", ngram_range=(2, 3))
    try:
        cv_char.fit_transform(corpus)
    except ValueError:
        char_vocab = []
    else:
        char_vocab = cv_char.get_feature_names_out().tolist()

    return jsonify({
        "sentence": sentence,
        "manual": manual,
        "corpus": corpus,
        "ngramMatrices": ngram_matrices,
        "charLevel": {
            "vocabularySize": len(char_vocab),
            "sample": char_vocab[:15],
        },
    })
| # ---------------------------------------------------------------------------- | |
| # 5. TF-IDF | |
| # ---------------------------------------------------------------------------- | |
def compute_tf(tokens):
    """Return each token's term frequency: its count divided by len(tokens).

    An empty token list yields an empty dict. Keys appear in order of first
    occurrence.
    """
    if not tokens:
        return {}
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1
    total = len(tokens)
    return {token: count / total for token, count in occurrences.items()}
def compute_idf(tokenized_docs):
    """Return the smoothed inverse document frequency of every word.

    Uses the smoothing scikit-learn applies by default,
    ``idf = ln((1 + N) / (1 + df)) + 1``, where ``N`` is the number of
    documents and ``df`` the number of documents containing the word. A word
    present in every document therefore scores exactly 1.0, and no score
    falls below 1.0.

    Args:
        tokenized_docs: One token list per document.

    Returns:
        A dict mapping each distinct word to its IDF; empty for no documents.
    """
    n_docs = len(tokenized_docs)
    # Count each word once per document by de-duplicating tokens first.
    doc_freq = {}
    for doc in tokenized_docs:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {
        word: math.log((1 + n_docs) / (1 + df)) + 1
        for word, df in doc_freq.items()
    }
def api_tfidf():
    """Return manual and scikit-learn TF-IDF scores for the posted corpus.

    Reads an optional ``corpus`` from the JSON body (falling back to
    ``DEFAULTS["tfidf"]``) and responds with:
        - ``idf``: manual IDF per word, highest first,
        - ``manualPerDoc``: per-document TF and TF x IDF (rounded to 4 dp),
        - ``sklearn``: the ``TfidfVectorizer`` vocabulary and matrix,
        - ``topWords``: up to three highest-scoring sklearn terms per document.

    If scikit-learn finds no vocabulary (for example every word is a single
    character), the sklearn vocabulary is empty, its rows are empty lists,
    and each document has no top words, instead of the request failing.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    payload = request.get_json(force=True) or {}
    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["tfidf"]
    tokenized_docs = [tokenize(s) for s in corpus]
    idf = compute_idf(tokenized_docs)

    manual_per_doc = []
    for doc, tokens in zip(corpus, tokenized_docs):
        tf = compute_tf(tokens)
        scores = {w: round(freq * idf[w], 4) for w, freq in tf.items()}
        manual_per_doc.append({
            "doc": doc,
            "tf": {w: round(v, 4) for w, v in tf.items()},
            "tfidf": dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)),
        })

    tfidf_vec = TfidfVectorizer()
    try:
        # Densify once; both the matrix and the top-word ranking read it.
        dense = tfidf_vec.fit_transform(corpus).toarray()
    except ValueError:
        # Empty vocabulary: keep the response shape with no feature columns.
        feature_names = []
        sk_matrix = [[] for _ in corpus]
        top_words = [{"doc": doc, "top": []} for doc in corpus]
    else:
        feature_names = tfidf_vec.get_feature_names_out().tolist()
        sk_matrix = dense.round(4).tolist()
        top_words = []
        for doc, row in zip(corpus, dense):
            # Indices of the three largest scores, best first.
            top_idx = row.argsort()[-3:][::-1]
            words = [
                {"word": feature_names[j], "score": round(float(row[j]), 4)}
                for j in top_idx if row[j] > 0
            ]
            top_words.append({"doc": doc, "top": words})

    return jsonify({
        "corpus": corpus,
        "idf": dict(sorted(idf.items(), key=lambda x: x[1], reverse=True)),
        "manualPerDoc": manual_per_doc,
        "sklearn": {"vocabulary": feature_names, "matrix": sk_matrix},
        "topWords": top_words,
    })
| # ---------------------------------------------------------------------------- | |
| # 6. Word Embeddings (Word2Vec + FastText via gensim) | |
| # ---------------------------------------------------------------------------- | |
def api_embeddings():
    """Train small Word2Vec / FastText models and return demo statistics.

    JSON body fields (all optional):
        sentences: training sentences; falls back to
            ``DEFAULTS["embeddings_sentences"]``.
        wordPairs: list of ``[word, word]`` pairs to compare; a missing,
            empty, or non-list value uses the built-in pairs.
        plotWords: list of words to project with PCA; a missing, empty, or
            non-list value uses the built-in words.

    Returns:
        A JSON response with the vocabulary, training time, a sample vector,
        skip-gram/CBOW similarities per pair, nearest neighbours for a few
        fixed probe words, 2-D PCA points, and a FastText out-of-vocabulary
        demo. Malformed pairs are reported with an ``error`` entry instead
        of aborting the request.

        Responds with ``{"error": ...}`` and status 400 when no word can be
        extracted from the supplied sentences, and status 500 when gensim is
        not installed.

    NOTE(review): no ``@app.route`` decorator is visible on this view here —
    confirm how it is registered with ``app``.
    """
    payload = request.get_json(force=True) or {}
    raw_sentences = (
        clean_corpus(payload.get("sentences")) or DEFAULTS["embeddings_sentences"]
    )
    sentences = [toks for toks in (tokenize(s) for s in raw_sentences) if toks]
    if not sentences:
        # gensim cannot build a vocabulary from an empty corpus (e.g. the
        # sentences contained only punctuation); fail with a clear message.
        return jsonify({"error": "No words could be extracted from the "
                                 "supplied sentences."}), 400

    word_pairs = payload.get("wordPairs")
    if not isinstance(word_pairs, list) or not word_pairs:
        word_pairs = [
            ["cat", "dog"], ["cat", "mat"], ["king", "queen"], ["paris", "berlin"],
        ]
    plot_words_req = payload.get("plotWords")
    if not isinstance(plot_words_req, list) or not plot_words_req:
        plot_words_req = [
            "cat", "dog", "king", "queen", "paris", "berlin", "man", "woman",
        ]

    try:
        from gensim.models import Word2Vec, FastText
    except ImportError:
        return jsonify({"error": "gensim is not installed on the server. "
                                 "Run: pip install gensim"}), 500

    t0 = time.time()
    model_sg = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
                        sg=1, epochs=200, seed=42)
    model_cbow = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
                          sg=0, epochs=200, seed=42)
    train_seconds = round(time.time() - t0, 3)

    vocab = sorted(model_sg.wv.key_to_index.keys())

    similarities = []
    for pair in word_pairs:
        well_formed = (
            isinstance(pair, (list, tuple))
            and len(pair) == 2
            and all(isinstance(w, str) for w in pair)
        )
        if not well_formed:
            similarities.append({"pair": pair,
                                 "error": "expected a pair of two words"})
            continue
        w1, w2 = pair
        if w1 in model_sg.wv and w2 in model_sg.wv:
            similarities.append({
                "pair": [w1, w2],
                "skipgram": round(float(model_sg.wv.similarity(w1, w2)), 4),
                "cbow": round(float(model_cbow.wv.similarity(w1, w2)), 4),
            })
        else:
            similarities.append({"pair": [w1, w2], "error": "word not in vocabulary"})

    most_similar = {}
    for w in ["cat", "king", "paris"]:
        if w in model_sg.wv:
            most_similar[w] = [
                {"word": ww, "score": round(float(sc), 4)}
                for ww, sc in model_sg.wv.most_similar(w, topn=3)
            ]

    # Only string entries can be vocabulary words; skip anything else.
    plot_words = [
        w for w in plot_words_req if isinstance(w, str) and w in model_sg.wv
    ]
    pca_points = []
    if len(plot_words) >= 2:
        vectors = np.array([model_sg.wv[w] for w in plot_words])
        coords = PCA(n_components=2, random_state=42).fit_transform(vectors)
        pca_points = [
            {"word": w, "x": round(float(c[0]), 4), "y": round(float(c[1]), 4)}
            for w, c in zip(plot_words, coords)
        ]

    sample_word = plot_words[0] if plot_words else (vocab[0] if vocab else None)
    sample_vector = (
        model_sg.wv[sample_word][:10].round(4).tolist() if sample_word else []
    )

    fasttext_demo = None
    try:
        ft = FastText(sentences, vector_size=50, window=3, min_count=1,
                      epochs=100, seed=42)
        in_vocab_word = sample_word
        # Appending a suffix gives a word that is very unlikely to be in the
        # training vocabulary, to show FastText's subword fallback.
        oov_word = (in_vocab_word + "like") if in_vocab_word else "catlike"
        fasttext_demo = {
            "inVocabWord": in_vocab_word,
            "inVocabVector": ft.wv[in_vocab_word][:10].round(4).tolist() if in_vocab_word else [],
            "oovWord": oov_word,
            "oovVector": ft.wv[oov_word][:10].round(4).tolist(),
            "note": f"'{oov_word}' never appeared during training, but FastText "
                    f"still produces a vector from its character n-grams.",
        }
    except Exception as exc:  # pragma: no cover - defensive only
        # Best-effort section: a FastText failure must not sink the response.
        fasttext_demo = {"error": str(exc)}

    return jsonify({
        "sentences": raw_sentences,
        "vocabulary": vocab,
        "vocabSize": len(vocab),
        "trainSeconds": train_seconds,
        "sampleWord": sample_word,
        "sampleVector": sample_vector,
        "similarities": similarities,
        "mostSimilar": most_similar,
        "pcaPoints": pca_points,
        "fastText": fasttext_demo,
    })
if __name__ == "__main__":
    import os

    # PORT selects the listening port (default 5000).
    port = int(os.environ.get("PORT", 5000))
    # The server binds to every interface, and Flask's debug mode exposes an
    # interactive debugger that can execute code, so debug is opt-in: set
    # FLASK_DEBUG=1 explicitly for local development.
    debug = os.environ.get("FLASK_DEBUG", "0") == "1"
    app.run(debug=debug, host="0.0.0.0", port=port)