Spaces:
Sleeping
Sleeping
| import re | |
| import string | |
| import numpy as np | |
| import pandas as pd | |
| import gradio as gr | |
| import matplotlib.pyplot as plt | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.decomposition import TruncatedSVD | |
| # ---------------------------- | |
| # 1) BASIC NLP PREPROCESSING | |
| # ---------------------------- | |
# Small, kid-friendly stopword list kept inline so the app needs no external
# downloads. It is a plain (mutable) set of lowercase words.
BASIC_STOPWORDS = {
    # articles and connectors
    "a", "an", "the", "and", "or", "but", "if", "then", "so", "because",
    # forms of "to be"
    "is", "am", "are", "was", "were", "be", "been", "being",
    # pronouns and possessives
    "i", "you", "he", "she", "it", "we", "they",
    "me", "my", "your", "his", "her", "our", "their",
    # prepositions
    "to", "of", "in", "on", "at", "for", "with", "from", "as", "by", "about",
    # demonstratives
    "this", "that", "these", "those",
    # auxiliary verbs
    "do", "does", "did", "doing",
    "have", "has", "had",
    # negation / affirmation: "not" and "no" matter for sentiment nuance, so
    # the preprocessing step lets the user choose whether to keep them
    "not", "no", "yes",
    # intensifiers
    "very", "really", "just",
}
def simple_stem(word: str) -> str:
    """
    A tiny, kid-friendly stemmer (NOT perfect).
    Real stemming uses libraries; this keeps the app simple for HF.

    Strips the first matching suffix from ``word`` and returns the result.
    A suffix is only removed when more than two characters would remain,
    so short words such as "is", "bus" or "red" are returned unchanged.
    """
    # Checked in order; "edly" must come before "ly" so that e.g.
    # "supposedly" loses the whole "edly" ending rather than just "ly".
    for suffix in ("ing", "edly", "ed", "ly", "s"):
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[: -len(suffix)]
    return word
def preprocess_text(
    text: str,
    do_lower: bool = True,
    do_remove_punct: bool = True,
    do_remove_numbers: bool = False,
    do_stopwords: bool = False,
    keep_not: bool = True,
    do_stem: bool = False,
):
    """
    Clean ``text`` with the selected steps and split it into word tokens.

    Steps run in a fixed order: lowercase, strip ASCII punctuation, strip
    digit runs, tokenize, drop stopwords, stem.

    Args:
        text: Raw input text. ``None`` is treated as an empty string.
        do_lower: Lowercase the text before anything else.
        do_remove_punct: Delete every character in ``string.punctuation``.
        do_remove_numbers: Delete runs of digits.
        do_stopwords: Drop tokens found in ``BASIC_STOPWORDS``. Matching is
            case-insensitive so it also works when ``do_lower`` is off.
        keep_not: When removing stopwords, keep "not" and "no".
        do_stem: Apply ``simple_stem`` to every remaining token.

    Returns:
        A ``(cleaned, tokens)`` tuple: the tokens joined by single spaces,
        and the token list itself.
    """
    # Treat a missing value as empty text instead of crashing on .lower().
    working = "" if text is None else str(text)

    # 1) lowercase
    if do_lower:
        working = working.lower()

    # 2) remove punctuation
    if do_remove_punct:
        working = working.translate(str.maketrans("", "", string.punctuation))

    # 3) remove numbers
    if do_remove_numbers:
        working = re.sub(r"\d+", "", working)

    # 4) tokenize (simple word tokens)
    tokens = re.findall(r"\b\w+\b", working)

    # 5) stopwords removal
    if do_stopwords:
        # Set difference builds a new set, so the shared constant is never mutated.
        stopwords = BASIC_STOPWORDS - {"not", "no"} if keep_not else BASIC_STOPWORDS
        # The stopword list is all lowercase; compare on the lowercased token
        # so "The" is removed even when lowercasing is switched off.
        tokens = [tok for tok in tokens if tok.lower() not in stopwords]

    # 6) stemming (tiny demo)
    if do_stem:
        tokens = [simple_stem(tok) for tok in tokens]

    # Tokens never contain whitespace, so the joined string needs no strip().
    return " ".join(tokens), tokens
| # ---------------------------- | |
| # 2) EMBEDDINGS + SIMILARITY | |
| # ---------------------------- | |
# Starter corpus for the similarity tab: ten short sentences, one per line,
# with no trailing newline.
DEFAULT_CORPUS = "\n".join(
    (
        "A cat drinks milk and sleeps on the sofa.",
        "A dog likes to play fetch with a ball.",
        "Kittens are small cats and they love to nap.",
        "Puppies are small dogs and they love to play.",
        "The airplane flies in the sky above the clouds.",
        "A ship sails on the ocean and carries cargo.",
        "Trucks and cars drive on roads and highways.",
        "A bird can fly and sing in the morning.",
        "Fish swim in water and live in rivers.",
        "The teacher explains math in the classroom.",
    )
)
def parse_corpus(corpus_text: str):
    """Return the non-empty lines of ``corpus_text`` with surrounding whitespace trimmed."""
    trimmed = (raw_line.strip() for raw_line in corpus_text.splitlines())
    # Lines that are empty after trimming are dropped.
    return [sentence for sentence in trimmed if sentence]
def build_vectorizer(method: str, ngrams: str):
    """
    Create the (unfitted) scikit-learn vectorizer selected in the UI.

    ``method`` equal to "TF-IDF (recommended)" yields a ``TfidfVectorizer``;
    any other value yields a ``CountVectorizer``. ``ngrams`` equal to
    "Unigrams (1 word)" uses single words only; any other value uses
    unigrams plus bigrams. Both vectorizers lowercase the text and drop
    scikit-learn's built-in English stopwords.
    """
    longest_ngram = 1 if ngrams == "Unigrams (1 word)" else 2  # 2 => uni + bi
    vectorizer_cls = TfidfVectorizer if method == "TF-IDF (recommended)" else CountVectorizer
    return vectorizer_cls(
        lowercase=True,
        ngram_range=(1, longest_ngram),
        stop_words="english",
    )
def similarity_search(corpus_lines, query, method, ngrams, top_k):
    """
    Rank corpus sentences by cosine similarity to ``query``.

    Args:
        corpus_lines: List of sentences to search.
        query: Text to search for.
        method: Embedding method label, forwarded to ``build_vectorizer``.
        ngrams: N-gram label, forwarded to ``build_vectorizer``.
        top_k: Maximum number of results; negative values are treated as 0.

    Returns:
        ``(df, X, vec)`` where ``df`` is a DataFrame with columns
        ``rank``, ``score`` and ``text`` (best match first), ``X`` is the
        fitted corpus matrix and ``vec`` the fitted vectorizer. When the
        corpus is empty, or no usable vocabulary can be built from it,
        ``df`` is empty and ``X`` and ``vec`` are ``None``.
    """
    columns = ["rank", "score", "text"]
    if len(corpus_lines) == 0:
        return pd.DataFrame(columns=columns), None, None

    vec = build_vectorizer(method, ngrams)
    try:
        X = vec.fit_transform(corpus_lines)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every
        # token is filtered out, e.g. a corpus made only of stopwords.
        # Report "no results" the same way as for an empty corpus.
        return pd.DataFrame(columns=columns), None, None

    q = vec.transform([query])
    sims = cosine_similarity(q, X)[0]  # (num_docs,)

    # A negative top_k would otherwise slice from the wrong end.
    limit = max(int(top_k), 0)
    # Stable sort on negated scores: highest score first, and equal scores
    # keep their corpus order instead of coming out reversed.
    order = np.argsort(-sims, kind="stable")[:limit]

    rows = [
        {"rank": rank, "score": float(sims[idx]), "text": corpus_lines[int(idx)]}
        for rank, idx in enumerate(order, start=1)
    ]
    # Passing columns keeps the frame's shape consistent even with no rows.
    return pd.DataFrame(rows, columns=columns), X, vec
| # ---------------------------- | |
| # 3) VISUALIZATIONS | |
| # ---------------------------- | |
def plot_similarity_heatmap(X):
    """
    Draw the corpus-vs-corpus cosine similarity matrix as a heatmap.

    Args:
        X: Document matrix with one row per corpus sentence.

    Returns:
        The matplotlib Figure containing the heatmap and its colorbar.
    """
    S = cosine_similarity(X)

    # Use the Axes API rather than pyplot's implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(S)
    ax.set_title("Similarity Heatmap (Corpus vs Corpus)")
    ax.set_xlabel("Doc index")
    ax.set_ylabel("Doc index")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()

    # Deregister the figure from pyplot so repeated calls in a long-running
    # app do not pile up open figures; the Figure object stays renderable.
    plt.close(fig)
    return fig
def plot_2d_map(X, corpus_lines):
    """
    Project the document vectors to 2D with truncated SVD and scatter-plot them.

    Each point is labelled ``D<i>`` where ``i`` is the row index in ``X``.

    Args:
        X: Document matrix with one row per corpus sentence.
        corpus_lines: The corpus sentences. Currently unused; kept so the
            call signature stays the same.

    Returns:
        The matplotlib Figure containing the scatter plot.
    """
    n_docs, n_features = X.shape

    # compress to 2D for visualization
    # TruncatedSVD rejects n_components larger than the number of features,
    # which happens when the vocabulary has a single term. Cap it here and
    # pad the missing axis with zeros so the plot is still drawn.
    n_components = min(2, n_features)
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    coords = svd.fit_transform(X)
    if coords.shape[1] < 2:
        padding = np.zeros((n_docs, 2 - coords.shape[1]))
        coords = np.hstack([coords, padding])

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(coords[:, 0], coords[:, 1])
    for i, (x, y) in enumerate(coords):
        # Small offset so the label does not sit on top of its marker.
        ax.text(x + 0.01, y + 0.01, f"D{i}", fontsize=9)
    ax.set_title("2D Meaning Map (SVD on Embeddings)")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    fig.tight_layout()

    # Deregister the figure from pyplot so repeated calls in a long-running
    # app do not pile up open figures; the Figure object stays renderable.
    plt.close(fig)
    return fig
| # ---------------------------- | |
| # GRADIO APP LOGIC | |
| # ---------------------------- | |
def run_preprocessing(
    text,
    do_lower,
    do_remove_punct,
    do_remove_numbers,
    do_stopwords,
    keep_not,
    do_stem
):
    """
    Gradio callback for the preprocessing tab.

    Runs ``preprocess_text`` with the chosen options and returns
    ``(cleaned_text, token_preview, token_count)``. The preview lists at
    most the first 200 tokens, comma-separated, followed by " ..." when
    there are more.
    """
    preview_limit = 200
    options = {
        "do_lower": do_lower,
        "do_remove_punct": do_remove_punct,
        "do_remove_numbers": do_remove_numbers,
        "do_stopwords": do_stopwords,
        "keep_not": keep_not,
        "do_stem": do_stem,
    }
    cleaned, tokens = preprocess_text(text=text, **options)

    # show tokens nicely
    preview = ", ".join(tokens[:preview_limit])
    if len(tokens) > preview_limit:
        preview += " ..."
    return cleaned, preview, len(tokens)
def run_similarity(
    corpus_text,
    query,
    method,
    ngrams,
    top_k,
    show_heatmap,
    show_map
):
    """
    Gradio callback for the similarity tab.

    Returns ``(results_df, heatmap_fig, map_fig, info_text)``. A blank
    query short-circuits with an empty table and no plots. Each plot is
    only produced when its checkbox is on and the corpus matrix has at
    least two rows; otherwise that slot is ``None``.
    """
    corpus_lines = parse_corpus(corpus_text)
    corpus_size = len(corpus_lines)

    # Nothing to search for: report only the corpus size.
    if not query.strip():
        empty_table = pd.DataFrame(columns=["rank", "score", "text"])
        return empty_table, None, None, f"Corpus size: {corpus_size}"

    df, X, _vec = similarity_search(corpus_lines, query, method, ngrams, int(top_k))

    # Both plots need a matrix with at least two documents.
    can_plot = X is not None and X.shape[0] >= 2
    heat_fig = plot_similarity_heatmap(X) if can_plot and show_heatmap else None
    map_fig = plot_2d_map(X, corpus_lines) if can_plot and show_map else None

    embedding_dims = X.shape[1] if X is not None else 0
    info = f"Corpus size: {corpus_size} | Embedding dims: {embedding_dims}"
    return df, heat_fig, map_fig, info
| # ---------------------------- | |
| # UI | |
| # ---------------------------- | |
# Top-level layout: an intro header, two tabs (preprocessing and similarity
# search), and a closing list of classroom challenges. The Blocks object is
# bound to `demo`, which is launched at the bottom of the file.
with gr.Blocks(theme=gr.themes.Soft(), title="NLP Preprocessing + Similarity (Kid Friendly)") as demo:
    # Page header shown above the tabs.
    gr.Markdown(
        """
# 🧠 NLP Playground (Preprocessing + Similarity Search)
This app teaches two basic NLP superpowers:
### 1) Preprocessing (cleaning text)
You can turn cleaning steps on/off and see how the text changes.
### 2) Embeddings + Similarity Search
You can paste a mini “library of sentences” and search it by meaning using embeddings.
✅ Works great on **Hugging Face Spaces**.
"""
    )
    with gr.Tabs():
        # ----------------------------
        # TAB 1: PREPROCESSING
        # ----------------------------
        with gr.Tab("🧽 Preprocessing Lab"):
            gr.Markdown(
                """
### What students learn here
- **Lowercase** makes words match better (Cat = cat)
- **Remove punctuation** removes extra symbols
- **Remove numbers** removes digits if you want
- **Stopwords** removes super common words (“the”, “is”)
- **Stemming** is a simple trick to chop endings (play → play, playing → play)
Try toggling things and watching the output change.
"""
            )
            # Input sentence; the default value exercises case, punctuation and digits.
            inp = gr.Textbox(
                label="Type any sentence",
                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
                lines=3
            )
            # One checkbox per preprocessing option; the initial values match
            # the keyword defaults of preprocess_text.
            with gr.Row():
                do_lower = gr.Checkbox(True, label="lowercase")
                do_remove_punct = gr.Checkbox(True, label="remove punctuation")
                do_remove_numbers = gr.Checkbox(False, label="remove numbers")
            with gr.Row():
                do_stopwords = gr.Checkbox(False, label="remove stopwords")
                keep_not = gr.Checkbox(True, label="keep 'not' and 'no' (important for meaning)")
                do_stem = gr.Checkbox(False, label="tiny stemming (demo)")
            btn = gr.Button("✨ Run Preprocessing", variant="primary")
            # Output widgets, in the same order as run_preprocessing's return tuple.
            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
            token_count = gr.Number(label="Token count", precision=0)
            # The inputs list is positional: it must follow run_preprocessing's parameter order.
            btn.click(
                fn=run_preprocessing,
                inputs=[inp, do_lower, do_remove_punct, do_remove_numbers, do_stopwords, keep_not, do_stem],
                outputs=[cleaned_out, tokens_out, token_count]
            )
        # ----------------------------
        # TAB 2: SIMILARITY SEARCH
        # ----------------------------
        with gr.Tab("🔎 Similarity Search Lab"):
            gr.Markdown(
                """
### What students learn here
- An **embedding** turns each sentence into numbers.
- **Cosine similarity** measures how close meanings are.
- You can build a tiny “Google-like search” over your own sentences.
"""
            )
            # Editable corpus, pre-filled with DEFAULT_CORPUS (one sentence per line).
            corpus = gr.Textbox(
                label="Corpus (one sentence per line) — students can edit this",
                value=DEFAULT_CORPUS,
                lines=10
            )
            query = gr.Textbox(
                label="Query (what you want to search)",
                value="small baby cats love sleeping",
                lines=2
            )
            # The choice strings below are compared literally inside
            # build_vectorizer, so they must stay in sync with it.
            with gr.Row():
                method = gr.Radio(
                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
                    value="TF-IDF (recommended)",
                    label="Embedding method"
                )
                ngrams = gr.Radio(
                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
                    value="Unigrams + Bigrams (1-2 words)",
                    label="N-grams"
                )
            with gr.Row():
                top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
                show_heatmap = gr.Checkbox(False, label="Show similarity heatmap (slow for big corpus)")
                show_map = gr.Checkbox(True, label="Show 2D meaning map")
            run_btn = gr.Button("🔍 Search by Meaning", variant="primary")
            # Receives the "Corpus size ... | Embedding dims ..." text from run_similarity.
            info = gr.Markdown("")
            results_table = gr.Dataframe(
                headers=["rank", "score", "text"],
                datatype=["number", "number", "str"],
                label="Top matches (sorted by similarity)"
            )
            with gr.Row():
                heat_plot = gr.Plot(label="Similarity Heatmap")
                map_plot = gr.Plot(label="2D Meaning Map")
            # inputs/outputs are positional: they must follow run_similarity's
            # parameter order and return-tuple order respectively.
            run_btn.click(
                fn=run_similarity,
                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
                outputs=[results_table, heat_plot, map_plot, info]
            )
    # Closing section with exercises, created after the tabs.
    gr.Markdown(
        """
---
## ✅ Classroom mini-challenges
1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords.
What changes?
2) In **Similarity Search**, add your own lines like:
- "I love pizza and burgers."
- "Math homework is difficult."
- "Dogs are playful and friendly."
Then search:
- “food I like”
- “school work”
- “animals that play”
Watch which sentences become “closest”.
"""
    )
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()