Spaces:

shukdev3
/

Text-Vectorizer

Running

App Files Files Community

Text-Vectorizer / app.py

shukdev3

Upload 4 files

fd87dec verified 7 days ago

Raw

History Blame Contribute Delete

15.4 kB

	"""
	Text Vectorization Lab — backend
	=================================
	A small Flask API that actually executes the same scikit-learn / numpy /
	gensim code used to teach One-Hot Encoding, Count Vectorizer, Bag-of-Words,
	N-grams, TF-IDF, and Word2Vec/FastText word embeddings.

	Every endpoint below is a thin, JSON-friendly wrapper around the exact
	computations from the reference notebook — nothing is hard-coded or faked.
	The frontend (static/js/main.js) calls these endpoints and animates the
	intermediate steps so the visitor can watch each technique build up.
	"""
	import math
	import re
	import time

	import numpy as np
	import pandas as pd
	from flask import Flask, jsonify, render_template, request
	from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.decomposition import PCA

	# Flask application object; the @app.route decorators below register the
	# page and API handlers on it.
	app = Flask(__name__)

	# ----------------------------------------------------------------------------
	# Helpers
	# ----------------------------------------------------------------------------

	def tokenize(text: str):
	    """Lower-case *text* and split it into word tokens.

	    A token is a maximal run of ASCII letters, digits or apostrophes; every
	    other character acts as a separator and is discarded.
	    """
	    lowered = text.lower()
	    word_pattern = re.compile(r"[A-Za-z0-9']+")
	    return word_pattern.findall(lowered)


	def clean_corpus(raw):
	    """Normalize whatever the frontend sends into a list[str] of non-empty docs.

	    Args:
	        raw: ``None``, a newline-separated string, or an iterable of items.

	    Returns:
	        The stripped, non-empty string entries in their original order.
	        Non-string items are dropped. A value that is neither a string nor
	        iterable (for example a JSON number or boolean) yields ``[]``.
	    """
	    if raw is None:
	        return []
	    if isinstance(raw, str):
	        raw = raw.split("\n")
	    try:
	        items = iter(raw)
	    except TypeError:
	        # A scalar payload such as 5 or true cannot be iterated; report
	        # "no documents" instead of letting the request fail.
	        return []
	    return [s.strip() for s in items if isinstance(s, str) and s.strip()]


	# Built-in sample inputs, keyed by demo. Each API handler falls back to its
	# entry when the request carries no usable text.
	DEFAULTS = {
	    # Corpus for the one-hot encoding demo.
	    "onehot": ["I love NLP", "NLP is fun", "I love coding"],
	    # Corpus for the CountVectorizer demo.
	    "count": [
	        "I love NLP and I love Python",
	        "NLP is amazing and fun",
	        "Python is great for NLP",
	    ],
	    # Corpus for the bag-of-words demo.
	    "bow": [
	        "the cat sat on the mat",
	        "the dog sat on the log",
	        "the cat and the dog are friends",
	    ],
	    # Single sentence used for the hand-rolled n-gram listing.
	    "ngrams_sentence": "I love studying Natural Language Processing",
	    # Corpus for the vectorizer-based n-gram matrices.
	    "ngrams_corpus": [
	        "I love NLP and machine learning",
	        "machine learning is part of AI",
	        "NLP is a branch of AI",
	    ],
	    # Corpus for the TF-IDF demo.
	    "tfidf": [
	        "I love NLP and machine learning",
	        "machine learning is part of AI",
	        "NLP is a branch of AI",
	        "I love AI and deep learning",
	    ],
	    # Training sentences for the word-embedding demo.
	    "embeddings_sentences": [
	        "the cat sat on the mat",
	        "the dog ran on the grass",
	        "cats and dogs are pets",
	        "i love my cat",
	        "i love my dog",
	        "king and queen are royalty",
	        "man and woman are humans",
	        "paris is the capital of france",
	        "berlin is the capital of germany",
	    ],
	}


	# ----------------------------------------------------------------------------
	# Page
	# ----------------------------------------------------------------------------

	@app.route("/")
	def index():
	    """Render and return the ``index.html`` template for the root URL."""
	    page = render_template("index.html")
	    return page


	# ----------------------------------------------------------------------------
	# 1. One-Hot Encoding
	# ----------------------------------------------------------------------------

	@app.route("/api/onehot", methods=["POST"])
	def api_onehot():
	    """Build one-hot vectors for every word of the posted corpus.

	    Request JSON: ``{"corpus": ...}`` where the value is a newline-separated
	    string or a list of strings. ``DEFAULTS["onehot"]`` is used when no
	    usable text is sent.

	    Returns:
	        A JSON response with the corpus, its sorted vocabulary, the one-hot
	        vector of each vocabulary word, the per-sentence token vectors, and
	        a ``sklearnCheck`` section holding scikit-learn's ``OneHotEncoder``
	        output for the flattened token list.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["onehot"]

	    tokenized = [tokenize(s) for s in corpus]
	    vocabulary = sorted({tok for sent in tokenized for tok in sent})

	    # Manual / NumPy one-hot, identical approach to the notebook's
	    # one_hot_encode() helper, run once per unique vocabulary word.
	    vectors = {}
	    for idx, word in enumerate(vocabulary):
	        vec = np.zeros(len(vocabulary), dtype=int)
	        vec[idx] = 1
	        vectors[word] = vec.tolist()

	    # sklearn OneHotEncoder cross-check (Method 2 in the notebook) so the
	    # numbers are guaranteed to match what scikit-learn itself produces.
	    flat_words = np.array(
	        [tok for sent in tokenized for tok in sent]
	    ).reshape(-1, 1)
	    encoder = OneHotEncoder(sparse_output=False)
	    # The encoder cannot be fitted on zero samples (e.g. a corpus made only
	    # of punctuation), so the check is reported as empty in that case.
	    has_words = len(flat_words) > 0
	    sk_matrix = encoder.fit_transform(flat_words) if has_words else np.empty((0, 0))

	    sentences_out = [
	        {
	            "sentence": sent,
	            "tokens": toks,
	            "vectors": [vectors[t] for t in toks],
	        }
	        for sent, toks in zip(corpus, tokenized)
	    ]

	    return jsonify({
	        "corpus": corpus,
	        "vocabulary": vocabulary,
	        "vectorLength": len(vocabulary),
	        "vectors": vectors,
	        "sentences": sentences_out,
	        "sklearnCheck": {
	            "categories": encoder.categories_[0].tolist() if has_words else [],
	            "matrix": sk_matrix.astype(int).tolist(),
	            "inputWords": flat_words.flatten().tolist(),
	        },
	    })


	# ----------------------------------------------------------------------------
	# 2. Count Vectorizer
	# ----------------------------------------------------------------------------

	@app.route("/api/count-vectorizer", methods=["POST"])
	def api_count_vectorizer():
	    """Fit a ``CountVectorizer`` on the posted corpus and return its matrix.

	    Request JSON keys (all optional):
	        corpus: newline-separated string or list of strings; falls back to
	            ``DEFAULTS["count"]``.
	        stopWords: truthy to remove English stop words.
	        maxFeatures: positive integer cap on the vocabulary size; values
	            that are not a positive integer are ignored.
	        newDoc: extra document to transform with the fitted vocabulary.

	    Returns:
	        A JSON response with the corpus, its tokenized form, the learned
	        vocabulary, the document-term matrix, the optional ``newDoc``
	        vector and the settings echoed back. If scikit-learn cannot build a
	        vocabulary, a JSON ``{"error": ...}`` body with status 400.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["count"]
	    use_stopwords = bool(payload.get("stopWords"))
	    max_features = payload.get("maxFeatures")
	    new_doc = payload.get("newDoc")
	    new_doc = new_doc.strip() if isinstance(new_doc, str) else ""

	    kwargs = {}
	    if use_stopwords:
	        kwargs["stop_words"] = "english"
	    if max_features:
	        try:
	            limit = int(max_features)
	        except (TypeError, ValueError, OverflowError):
	            limit = 0
	        # CountVectorizer rejects max_features < 1, so only forward a
	        # genuinely positive cap.
	        if limit > 0:
	            kwargs["max_features"] = limit

	    cv = CountVectorizer(**kwargs)
	    try:
	        X = cv.fit_transform(corpus)
	    except ValueError as exc:
	        # Raised when no term survives analysis, e.g. the documents hold
	        # only stop words or one-character tokens.
	        return jsonify({"error": str(exc)}), 400
	    vocabulary = cv.get_feature_names_out().tolist()
	    matrix = X.toarray().tolist()

	    new_doc_result = None
	    if new_doc:
	        X_new = cv.transform([new_doc])
	        new_doc_result = {
	            "doc": new_doc,
	            "vector": X_new.toarray()[0].tolist(),
	        }

	    return jsonify({
	        "corpus": corpus,
	        "tokenizedDocs": [tokenize(s) for s in corpus],
	        "vocabulary": vocabulary,
	        "matrix": matrix,
	        "newDocResult": new_doc_result,
	        "settings": {"stopWords": use_stopwords, "maxFeatures": max_features},
	    })


	# ----------------------------------------------------------------------------
	# 3. Bag-of-Words
	# ----------------------------------------------------------------------------

	@app.route("/api/bow", methods=["POST"])
	def api_bow():
	    """Compute bag-of-words representations of the posted corpus.

	    Request JSON: ``{"corpus": ...}`` where the value is a newline-separated
	    string or a list of strings. ``DEFAULTS["bow"]`` is used when no usable
	    text is sent.

	    Returns:
	        A JSON response with a hand-counted matrix over the ``tokenize``
	        vocabulary, scikit-learn's binary (presence/absence) matrix with its
	        own vocabulary, and the pairwise cosine similarity of the documents'
	        count vectors rounded to 4 decimals. If scikit-learn cannot build a
	        vocabulary, a JSON ``{"error": ...}`` body with status 400.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["bow"]

	    tokenized = [tokenize(s) for s in corpus]
	    vocabulary = sorted({tok for sent in tokenized for tok in sent})

	    def bow_vector(tokens, vocab):
	        """Count how often each vocabulary word occurs in *tokens*."""
	        counts = dict.fromkeys(vocab, 0)
	        for tok in tokens:
	            if tok in counts:
	                counts[tok] += 1
	        return list(counts.values())

	    matrix = [bow_vector(toks, vocabulary) for toks in tokenized]

	    try:
	        cv_binary = CountVectorizer(binary=True)
	        X_binary = cv_binary.fit_transform(corpus)

	        cv_sim = CountVectorizer()
	        X_sim = cv_sim.fit_transform(corpus)
	    except ValueError as exc:
	        # Raised when no term survives scikit-learn's analysis, e.g. the
	        # documents hold only one-character tokens.
	        return jsonify({"error": str(exc)}), 400

	    binary_vocab = cv_binary.get_feature_names_out().tolist()
	    binary_matrix = X_binary.toarray().tolist()
	    sim_matrix = cosine_similarity(X_sim).round(4).tolist()

	    return jsonify({
	        "corpus": corpus,
	        "tokenizedDocs": tokenized,
	        "vocabulary": vocabulary,
	        "matrix": matrix,
	        "binaryVocabulary": binary_vocab,
	        "binaryMatrix": binary_matrix,
	        "cosineSimilarity": sim_matrix,
	    })


	# ----------------------------------------------------------------------------
	# 4. N-grams
	# ----------------------------------------------------------------------------

	def generate_ngrams(text, n):
	    """Return the word n-grams of *text* as space-joined strings.

	    Args:
	        text: Raw text; it is split into tokens with ``tokenize``.
	        n: Number of consecutive tokens per n-gram.

	    Returns:
	        The n-grams in reading order. The list is empty when *n* is below 1
	        or larger than the number of tokens.
	    """
	    if n < 1:
	        # Without this guard n == 0 would emit empty strings and a negative
	        # n would emit truncated slices of the token list.
	        return []
	    tokens = tokenize(text)
	    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


	@app.route("/api/ngrams", methods=["POST"])
	def api_ngrams():
	    """List manual n-grams of a sentence and n-gram count matrices of a corpus.

	    Request JSON keys (all optional):
	        sentence: text for the hand-rolled uni/bi/trigram listing; falls
	            back to ``DEFAULTS["ngrams_sentence"]``.
	        corpus: newline-separated string or list of strings; falls back to
	            ``DEFAULTS["ngrams_corpus"]``.

	    Returns:
	        A JSON response with the manual n-grams, one ``CountVectorizer``
	        vocabulary/matrix per word n-gram range, and the size plus the first
	        15 entries of a character-level (2-3 gram) vocabulary. A range that
	        produces no features is reported with an empty vocabulary and empty
	        rows rather than failing the request.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    sentence = payload.get("sentence")
	    sentence = sentence.strip() if isinstance(sentence, str) else ""
	    sentence = sentence or DEFAULTS["ngrams_sentence"]
	    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["ngrams_corpus"]

	    manual = {
	        "unigrams": generate_ngrams(sentence, 1),
	        "bigrams": generate_ngrams(sentence, 2),
	        "trigrams": generate_ngrams(sentence, 3),
	    }

	    ngram_matrices = {}
	    for key, ngram_range, label in [
	        ("unigrams", (1, 1), "Unigrams"),
	        ("bigrams", (2, 2), "Bigrams"),
	        ("uni_bi", (1, 2), "Unigrams + Bigrams"),
	    ]:
	        cv_ng = CountVectorizer(ngram_range=ngram_range)
	        try:
	            X_ng = cv_ng.fit_transform(corpus)
	        except ValueError:
	            # Empty vocabulary, e.g. bigrams over one-word documents. The
	            # other ranges may still succeed, so report this one as empty.
	            ng_vocab = []
	            ng_matrix = [[] for _ in corpus]
	        else:
	            ng_vocab = cv_ng.get_feature_names_out().tolist()
	            ng_matrix = X_ng.toarray().tolist()
	        ngram_matrices[key] = {
	            "label": label,
	            "vocabulary": ng_vocab,
	            "matrix": ng_matrix,
	        }

	    cv_char = CountVectorizer(analyzer="char_wb", ngram_range=(2, 3))
	    try:
	        cv_char.fit_transform(corpus)
	    except ValueError:
	        # No character n-grams could be extracted; report an empty sample.
	        char_vocab = []
	    else:
	        char_vocab = cv_char.get_feature_names_out().tolist()

	    return jsonify({
	        "sentence": sentence,
	        "manual": manual,
	        "corpus": corpus,
	        "ngramMatrices": ngram_matrices,
	        "charLevel": {
	            "vocabularySize": len(char_vocab),
	            "sample": char_vocab[:15],
	        },
	    })


	# ----------------------------------------------------------------------------
	# 5. TF-IDF
	# ----------------------------------------------------------------------------

	def compute_tf(tokens):
	    """Return each token's relative frequency within *tokens*.

	    The result maps every distinct token, in order of first appearance, to
	    its occurrence count divided by the total number of tokens. An empty
	    token list gives an empty dict.
	    """
	    if not tokens:
	        return {}
	    total = len(tokens)
	    occurrences = dict.fromkeys(tokens, 0)
	    for token in tokens:
	        occurrences[token] += 1
	    return {token: count / total for token, count in occurrences.items()}


	def compute_idf(tokenized_docs):
	    """Return the smoothed inverse document frequency of every word.

	    Uses scikit-learn's ``smooth_idf`` formula,
	    ``idf(w) = ln((1 + N) / (1 + df(w))) + 1``, where ``N`` is the number of
	    documents and ``df(w)`` the number of documents containing ``w``. A word
	    present in every document therefore scores exactly 1.

	    Args:
	        tokenized_docs: One token list per document.

	    Returns:
	        A dict mapping each distinct word to its IDF; empty when there are
	        no documents or no tokens.
	    """
	    n_docs = len(tokenized_docs)
	    doc_freq = {}
	    for doc in tokenized_docs:
	        # set() makes a word count once per document and avoids rescanning
	        # every token list for every vocabulary word.
	        for w in set(doc):
	            doc_freq[w] = doc_freq.get(w, 0) + 1
	    return {
	        w: math.log((1 + n_docs) / (1 + df)) + 1
	        for w, df in doc_freq.items()
	    }


	@app.route("/api/tfidf", methods=["POST"])
	def api_tfidf():
	    """Compute manual and scikit-learn TF-IDF scores for the posted corpus.

	    Request JSON: ``{"corpus": ...}`` where the value is a newline-separated
	    string or a list of strings. ``DEFAULTS["tfidf"]`` is used when no
	    usable text is sent.

	    Returns:
	        A JSON response with the IDF table (highest first), per-document TF
	        and TF-IDF from ``compute_tf`` / ``compute_idf``, the
	        ``TfidfVectorizer`` vocabulary and matrix rounded to 4 decimals, and
	        up to three top-scoring words per document. If scikit-learn cannot
	        build a vocabulary, a JSON ``{"error": ...}`` body with status 400.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    corpus = clean_corpus(payload.get("corpus")) or DEFAULTS["tfidf"]

	    tokenized_docs = [tokenize(s) for s in corpus]
	    idf = compute_idf(tokenized_docs)

	    manual_per_doc = []
	    for doc, tokens in zip(corpus, tokenized_docs):
	        tf = compute_tf(tokens)
	        scores = {w: round(freq * idf[w], 4) for w, freq in tf.items()}
	        manual_per_doc.append({
	            "doc": doc,
	            "tf": {w: round(v, 4) for w, v in tf.items()},
	            "tfidf": dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)),
	        })

	    tfidf_vec = TfidfVectorizer()
	    try:
	        X_tfidf = tfidf_vec.fit_transform(corpus)
	    except ValueError as exc:
	        # Raised when no term survives scikit-learn's analysis, e.g. the
	        # documents hold only one-character tokens.
	        return jsonify({"error": str(exc)}), 400
	    feature_names = tfidf_vec.get_feature_names_out().tolist()
	    # Densify once and reuse for both the rounded matrix and the ranking.
	    arr = X_tfidf.toarray()
	    sk_matrix = arr.round(4).tolist()

	    top_words = []
	    for doc, row in zip(corpus, arr):
	        # argsort is ascending: take the last three indices, best first.
	        top_idx = row.argsort()[-3:][::-1]
	        words = [
	            {"word": feature_names[j], "score": round(float(row[j]), 4)}
	            for j in top_idx if row[j] > 0
	        ]
	        top_words.append({"doc": doc, "top": words})

	    return jsonify({
	        "corpus": corpus,
	        "idf": dict(sorted(idf.items(), key=lambda x: x[1], reverse=True)),
	        "manualPerDoc": manual_per_doc,
	        "sklearn": {"vocabulary": feature_names, "matrix": sk_matrix},
	        "topWords": top_words,
	    })


	# ----------------------------------------------------------------------------
	# 6. Word Embeddings (Word2Vec + FastText via gensim)
	# ----------------------------------------------------------------------------

	@app.route("/api/embeddings", methods=["POST"])
	def api_embeddings():
	    """Train small Word2Vec / FastText models and report on their vectors.

	    Request JSON keys (all optional):
	        sentences: newline-separated string or list of strings; falls back
	            to ``DEFAULTS["embeddings_sentences"]``.
	        wordPairs: list of ``[word, word]`` pairs to compare.
	        plotWords: list of words to project to 2-D with PCA.

	    Returns:
	        A JSON response with the vocabulary, the training time, a sample
	        vector, skip-gram and CBOW similarities for each pair, nearest
	        neighbours of a few fixed words, PCA coordinates and a FastText
	        out-of-vocabulary demonstration. Status 400 with ``{"error": ...}``
	        when the sentences contain no tokens, 500 when gensim is missing.
	    """
	    payload = request.get_json(force=True) or {}
	    if not isinstance(payload, dict):
	        # A JSON array, string or number has no .get(); treat it like an
	        # empty request so the defaults apply instead of a server error.
	        payload = {}
	    raw_sentences = (
	        clean_corpus(payload.get("sentences")) or DEFAULTS["embeddings_sentences"]
	    )
	    sentences = [toks for toks in map(tokenize, raw_sentences) if toks]
	    if not sentences:
	        # E.g. punctuation-only input: there is nothing to train on.
	        return jsonify({"error": "sentences contain no usable words"}), 400

	    word_pairs = payload.get("wordPairs")
	    if not isinstance(word_pairs, list) or not word_pairs:
	        word_pairs = [
	            ["cat", "dog"], ["cat", "mat"], ["king", "queen"], ["paris", "berlin"],
	        ]
	    plot_words_req = payload.get("plotWords")
	    if not isinstance(plot_words_req, list) or not plot_words_req:
	        plot_words_req = [
	            "cat", "dog", "king", "queen", "paris", "berlin", "man", "woman",
	        ]

	    # Imported lazily so the rest of the API works without gensim installed.
	    try:
	        from gensim.models import Word2Vec, FastText
	    except ImportError:
	        return jsonify({"error": "gensim is not installed on the server. "
	                                 "Run: pip install gensim"}), 500

	    # perf_counter is monotonic, so the elapsed time cannot be skewed by
	    # wall-clock adjustments.
	    t0 = time.perf_counter()
	    model_sg = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
	                        sg=1, epochs=200, seed=42)
	    model_cbow = Word2Vec(sentences, vector_size=50, window=3, min_count=1,
	                          sg=0, epochs=200, seed=42)
	    train_seconds = round(time.perf_counter() - t0, 3)

	    vocab = sorted(model_sg.wv.key_to_index.keys())

	    similarities = []
	    for pair in word_pairs:
	        well_formed = (
	            isinstance(pair, (list, tuple))
	            and len(pair) == 2
	            and all(isinstance(w, str) for w in pair)
	        )
	        if not well_formed:
	            # Report the bad entry instead of failing on unpacking/hashing.
	            similarities.append({"pair": pair, "error": "expected a pair of words"})
	            continue
	        w1, w2 = pair
	        if w1 in model_sg.wv and w2 in model_sg.wv:
	            similarities.append({
	                "pair": [w1, w2],
	                "skipgram": round(float(model_sg.wv.similarity(w1, w2)), 4),
	                "cbow": round(float(model_cbow.wv.similarity(w1, w2)), 4),
	            })
	        else:
	            similarities.append({"pair": [w1, w2], "error": "word not in vocabulary"})

	    most_similar = {}
	    for w in ["cat", "king", "paris"]:
	        if w in model_sg.wv:
	            most_similar[w] = [
	                {"word": ww, "score": round(float(sc), 4)}
	                for ww, sc in model_sg.wv.most_similar(w, topn=3)
	            ]

	    # Non-string entries are skipped so lookups never see unhashable values.
	    plot_words = [
	        w for w in plot_words_req if isinstance(w, str) and w in model_sg.wv
	    ]
	    pca_points = []
	    if len(plot_words) >= 2:
	        vectors = np.array([model_sg.wv[w] for w in plot_words])
	        coords = PCA(n_components=2, random_state=42).fit_transform(vectors)
	        pca_points = [
	            {"word": w, "x": round(float(c[0]), 4), "y": round(float(c[1]), 4)}
	            for w, c in zip(plot_words, coords)
	        ]

	    sample_word = plot_words[0] if plot_words else (vocab[0] if vocab else None)
	    sample_vector = (
	        model_sg.wv[sample_word][:10].round(4).tolist() if sample_word else []
	    )

	    fasttext_demo = None
	    try:
	        ft = FastText(sentences, vector_size=50, window=3, min_count=1,
	                      epochs=100, seed=42)
	        # sample_word already falls back to the first vocabulary entry.
	        in_vocab_word = sample_word
	        oov_word = (in_vocab_word + "like") if in_vocab_word else "catlike"
	        fasttext_demo = {
	            "inVocabWord": in_vocab_word,
	            "inVocabVector": ft.wv[in_vocab_word][:10].round(4).tolist() if in_vocab_word else [],
	            "oovWord": oov_word,
	            "oovVector": ft.wv[oov_word][:10].round(4).tolist(),
	            "note": f"'{oov_word}' never appeared during training, but FastText "
	                    f"still produces a vector from its character n-grams.",
	        }
	    except Exception as exc:  # pragma: no cover - defensive only
	        # Best effort: the FastText demo is optional, so its failure is
	        # reported in the payload instead of failing the whole response.
	        fasttext_demo = {"error": str(exc)}

	    return jsonify({
	        "sentences": raw_sentences,
	        "vocabulary": vocab,
	        "vocabSize": len(vocab),
	        "trainSeconds": train_seconds,
	        "sampleWord": sample_word,
	        "sampleVector": sample_vector,
	        "similarities": similarities,
	        "mostSimilar": most_similar,
	        "pcaPoints": pca_points,
	        "fastText": fasttext_demo,
	    })


	if __name__ == "__main__":
	    import os

	    # Port comes from the PORT environment variable, defaulting to 5000.
	    port = int(os.environ.get("PORT", 5000))
	    # The interactive debugger lets a browser run arbitrary Python, and the
	    # server listens on every interface, so debug mode must be requested
	    # explicitly with FLASK_DEBUG=1 instead of being on by default.
	    debug = os.environ.get("FLASK_DEBUG", "0") == "1"
	    app.run(debug=debug, host="0.0.0.0", port=port)