"""NLP Playground: a kid-friendly Gradio app teaching text preprocessing
and embedding-based similarity search (TF-IDF / Bag of Words + cosine)."""

import re
import string

import numpy as np
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# ----------------------------
# 1) BASIC NLP PREPROCESSING
# ----------------------------

BASIC_STOPWORDS = {
    # small kid-friendly stopword list (no external downloads)
    "a", "an", "the", "and", "or", "but", "if", "then", "so", "because",
    "is", "am", "are", "was", "were", "be", "been", "being",
    "i", "you", "he", "she", "it", "we", "they",
    "me", "my", "your", "his", "her", "our", "their",
    "to", "of", "in", "on", "at", "for", "with", "from", "as", "by", "about",
    "this", "that", "these", "those",
    "do", "does", "did", "doing",
    "have", "has", "had",
    # keep "not" if you want sentiment nuance; we let the user choose
    "not", "no", "yes",
    "very", "really", "just",
}


def simple_stem(word: str) -> str:
    """
    A tiny, kid-friendly stemmer (NOT perfect).
    Real stemming uses libraries; this keeps the app simple for HF.

    Strips the first matching suffix, but only when enough of the word
    remains (> suffix length + 2 chars) so short words stay intact.
    """
    # FIX: the original list repeated "edly" three times; duplicates removed.
    # Order matters: longer/more specific suffixes are tried first.
    for suf in ("ing", "edly", "ed", "ly", "s"):
        if word.endswith(suf) and len(word) > len(suf) + 2:
            return word[: -len(suf)]
    return word


def preprocess_text(
    text: str,
    do_lower: bool = True,
    do_remove_punct: bool = True,
    do_remove_numbers: bool = False,
    do_stopwords: bool = False,
    keep_not: bool = True,
    do_stem: bool = False,
):
    """
    Run a configurable cleaning pipeline over `text`.

    Steps (each toggleable): lowercase -> strip punctuation -> strip digits
    -> tokenize -> stopword removal (optionally keeping "not"/"no")
    -> tiny demo stemming.

    Returns:
        (cleaned, tokens): the cleaned string and the token list.
    """
    t = text
    # 1) lowercase
    if do_lower:
        t = t.lower()
    # 2) remove punctuation (single C-level pass via translate)
    if do_remove_punct:
        t = t.translate(str.maketrans("", "", string.punctuation))
    # 3) remove numbers
    if do_remove_numbers:
        t = re.sub(r"\d+", "", t)
    # 4) tokenize (simple word tokens)
    tokens = re.findall(r"\b\w+\b", t)
    # 5) stopwords removal
    if do_stopwords:
        sw = BASIC_STOPWORDS.copy()
        if keep_not:
            # negations carry meaning ("not good" != "good"), so allow keeping them
            sw.discard("not")
            sw.discard("no")
        tokens = [w for w in tokens if w not in sw]
    # 6) stemming (tiny demo)
    if do_stem:
        tokens = [simple_stem(w) for w in tokens]
    cleaned = " ".join(tokens).strip()
    return cleaned, tokens


# ----------------------------
# 2) EMBEDDINGS + SIMILARITY
# ----------------------------

DEFAULT_CORPUS = """A cat drinks milk and sleeps on the sofa.
A dog likes to play fetch with a ball.
Kittens are small cats and they love to nap.
Puppies are small dogs and they love to play.
The airplane flies in the sky above the clouds.
A ship sails on the ocean and carries cargo.
Trucks and cars drive on roads and highways.
A bird can fly and sing in the morning.
Fish swim in water and live in rivers.
The teacher explains math in the classroom."""


def parse_corpus(corpus_text: str):
    """Split the corpus textbox into one document per non-empty line."""
    lines = [ln.strip() for ln in corpus_text.splitlines()]
    lines = [ln for ln in lines if ln]  # remove empty lines
    return lines


def build_vectorizer(method: str, ngrams: str):
    """Create the chosen sklearn vectorizer (TF-IDF or raw counts)."""
    if ngrams == "Unigrams (1 word)":
        ngram_range = (1, 1)
    else:
        ngram_range = (1, 2)  # uni + bi
    if method == "TF-IDF (recommended)":
        return TfidfVectorizer(lowercase=True, ngram_range=ngram_range, stop_words="english")
    else:
        return CountVectorizer(lowercase=True, ngram_range=ngram_range, stop_words="english")


def similarity_search(corpus_lines, query, method, ngrams, top_k):
    """
    Embed the corpus and the query, rank documents by cosine similarity.

    Returns:
        (df, X, vec): top-k results DataFrame, doc-term matrix, fitted vectorizer.
        (empty DataFrame, None, None) when the corpus is empty.
    """
    if len(corpus_lines) == 0:
        return pd.DataFrame(columns=["rank", "score", "text"]), None, None
    vec = build_vectorizer(method, ngrams)
    X = vec.fit_transform(corpus_lines)
    q = vec.transform([query])
    sims = cosine_similarity(q, X)[0]  # (num_docs,)
    # argsort ascending, reversed for descending; slicing past len() is safe
    order = np.argsort(sims)[::-1][:top_k]
    rows = []
    for r, idx in enumerate(order, start=1):
        rows.append({"rank": r, "score": float(sims[idx]), "text": corpus_lines[int(idx)]})
    df = pd.DataFrame(rows)
    return df, X, vec


# ----------------------------
# 3) VISUALIZATIONS
# ----------------------------

def plot_similarity_heatmap(X):
    """Render the full pairwise cosine-similarity matrix of the corpus."""
    S = cosine_similarity(X)
    fig = plt.figure(figsize=(6, 5))
    plt.imshow(S)
    plt.title("Similarity Heatmap (Corpus vs Corpus)")
    plt.xlabel("Doc index")
    plt.ylabel("Doc index")
    plt.colorbar()
    plt.tight_layout()
    return fig


def plot_2d_map(X, corpus_lines):
    """Project documents to 2-D with TruncatedSVD and scatter-plot them."""
    # compress to 2D for visualization
    # FIX: TruncatedSVD requires n_components < n_features, so a tiny
    # vocabulary (<= 2 terms) crashed the original. Clamp, and zero-pad
    # back to two columns so the scatter plot always has (x, y).
    n_components = min(2, X.shape[1] - 1)
    if n_components >= 1:
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        X2 = svd.fit_transform(X)
    else:
        # single-feature corpus: plot the raw feature values directly
        X2 = np.asarray(X.todense())
    if X2.shape[1] < 2:
        X2 = np.column_stack([X2, np.zeros(X2.shape[0])])
    fig = plt.figure(figsize=(7, 5))
    plt.scatter(X2[:, 0], X2[:, 1])
    for i, (x, y) in enumerate(X2):
        plt.text(x + 0.01, y + 0.01, f"D{i}", fontsize=9)
    plt.title("2D Meaning Map (SVD on Embeddings)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.tight_layout()
    return fig


# ----------------------------
# GRADIO APP LOGIC
# ----------------------------

def run_preprocessing(
    text,
    do_lower,
    do_remove_punct,
    do_remove_numbers,
    do_stopwords,
    keep_not,
    do_stem,
):
    """Gradio callback: clean `text` and return (cleaned, token list str, count)."""
    cleaned, tokens = preprocess_text(
        text=text,
        do_lower=do_lower,
        do_remove_punct=do_remove_punct,
        do_remove_numbers=do_remove_numbers,
        do_stopwords=do_stopwords,
        keep_not=keep_not,
        do_stem=do_stem,
    )
    # show tokens nicely (cap the preview at 200 tokens)
    tokens_str = ", ".join(tokens[:200]) + (" ..." if len(tokens) > 200 else "")
    return cleaned, tokens_str, len(tokens)


def run_similarity(
    corpus_text, query, method, ngrams, top_k, show_heatmap, show_map
):
    """Gradio callback: search the corpus and optionally build both plots."""
    corpus_lines = parse_corpus(corpus_text)
    if not query.strip():
        # no query: show nothing but still report corpus size
        return (
            pd.DataFrame(columns=["rank", "score", "text"]),
            None,
            None,
            f"Corpus size: {len(corpus_lines)}",
        )
    df, X, vec = similarity_search(corpus_lines, query, method, ngrams, int(top_k))
    heat_fig = None
    map_fig = None
    # plots only make sense with at least two documents
    if X is not None and show_heatmap and X.shape[0] >= 2:
        heat_fig = plot_similarity_heatmap(X)
    if X is not None and show_map and X.shape[0] >= 2:
        map_fig = plot_2d_map(X, corpus_lines)
    info = f"Corpus size: {len(corpus_lines)} | Embedding dims: {X.shape[1] if X is not None else 0}"
    return df, heat_fig, map_fig, info


# ----------------------------
# UI
# ----------------------------

with gr.Blocks(theme=gr.themes.Soft(), title="NLP Preprocessing + Similarity (Kid Friendly)") as demo:
    gr.Markdown(
        """
# 🧠 NLP Playground (Preprocessing + Similarity Search)

This app teaches two basic NLP superpowers:

### 1) Preprocessing (cleaning text)
You can turn cleaning steps on/off and see how the text changes.

### 2) Embeddings + Similarity Search
You can paste a mini “library of sentences” and search it by meaning using embeddings.

✅ Works great on **Hugging Face Spaces**.
"""
    )

    with gr.Tabs():
        # ----------------------------
        # TAB 1: PREPROCESSING
        # ----------------------------
        with gr.Tab("🧽 Preprocessing Lab"):
            gr.Markdown(
                """
### What students learn here
- **Lowercase** makes words match better (Cat = cat)
- **Remove punctuation** removes extra symbols
- **Remove numbers** removes digits if you want
- **Stopwords** removes super common words (“the”, “is”)
- **Stemming** is a simple trick to chop endings (play → play, playing → play)

Try toggling things and watching the output change.
"""
            )

            inp = gr.Textbox(
                label="Type any sentence",
                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
                lines=3,
            )

            with gr.Row():
                do_lower = gr.Checkbox(True, label="lowercase")
                do_remove_punct = gr.Checkbox(True, label="remove punctuation")
                do_remove_numbers = gr.Checkbox(False, label="remove numbers")

            with gr.Row():
                do_stopwords = gr.Checkbox(False, label="remove stopwords")
                keep_not = gr.Checkbox(True, label="keep 'not' and 'no' (important for meaning)")
                do_stem = gr.Checkbox(False, label="tiny stemming (demo)")

            btn = gr.Button("✨ Run Preprocessing", variant="primary")

            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
            token_count = gr.Number(label="Token count", precision=0)

            btn.click(
                fn=run_preprocessing,
                inputs=[inp, do_lower, do_remove_punct, do_remove_numbers, do_stopwords, keep_not, do_stem],
                outputs=[cleaned_out, tokens_out, token_count],
            )

        # ----------------------------
        # TAB 2: SIMILARITY SEARCH
        # ----------------------------
        with gr.Tab("🔎 Similarity Search Lab"):
            gr.Markdown(
                """
### What students learn here
- An **embedding** turns each sentence into numbers.
- **Cosine similarity** measures how close meanings are.
- You can build a tiny “Google-like search” over your own sentences.
"""
            )

            corpus = gr.Textbox(
                label="Corpus (one sentence per line) — students can edit this",
                value=DEFAULT_CORPUS,
                lines=10,
            )
            query = gr.Textbox(
                label="Query (what you want to search)",
                value="small baby cats love sleeping",
                lines=2,
            )

            with gr.Row():
                method = gr.Radio(
                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
                    value="TF-IDF (recommended)",
                    label="Embedding method",
                )
                ngrams = gr.Radio(
                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
                    value="Unigrams + Bigrams (1-2 words)",
                    label="N-grams",
                )

            with gr.Row():
                top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
                show_heatmap = gr.Checkbox(False, label="Show similarity heatmap (slow for big corpus)")
                show_map = gr.Checkbox(True, label="Show 2D meaning map")

            run_btn = gr.Button("🔍 Search by Meaning", variant="primary")

            info = gr.Markdown("")
            results_table = gr.Dataframe(
                headers=["rank", "score", "text"],
                datatype=["number", "number", "str"],
                label="Top matches (sorted by similarity)",
            )

            with gr.Row():
                heat_plot = gr.Plot(label="Similarity Heatmap")
                map_plot = gr.Plot(label="2D Meaning Map")

            run_btn.click(
                fn=run_similarity,
                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
                outputs=[results_table, heat_plot, map_plot, info],
            )

    gr.Markdown(
        """
---
## ✅ Classroom mini-challenges
1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords. What changes?
2) In **Similarity Search**, add your own lines like:
   - "I love pizza and burgers."
   - "Math homework is difficult."
   - "Dogs are playful and friendly."

   Then search:
   - “food I like”
   - “school work”
   - “animals that play”

   Watch which sentences become “closest”.
"""
    )

if __name__ == "__main__":
    demo.launch()