Spaces:

adityaardak
/

NLP

Sleeping

App Files Files Community

adityaardak committed on Jan 2

Commit

5b861c2

verified ·

1 Parent(s): 3a24638

Create app.py

Browse files

Files changed (1) hide show

app.py +364 -0

app.py ADDED Viewed

	@@ -0,0 +1,364 @@

+import re
+import string
+import numpy as np
+import pandas as pd
+import gradio as gr
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.decomposition import TruncatedSVD
+# ----------------------------
+# 1) BASIC NLP PREPROCESSING
+# ----------------------------
+# Compact, kid-friendly stopword list bundled with the app (no external downloads).
+# Kept as a plain mutable ``set`` so callers can copy it and discard entries.
+BASIC_STOPWORDS = set(
+    (
+        # articles and connectors
+        "a an the and or but if then so because "
+        # forms of "to be"
+        "is am are was were be been being "
+        # pronouns and possessives
+        "i you he she it we they me my your his her our their "
+        # prepositions
+        "to of in on at for with from as by about "
+        # demonstratives
+        "this that these those "
+        # forms of "to do" and "to have"
+        "do does did doing "
+        "have has had "
+        # negation / affirmation: "not" matters for sentiment, so the user
+        # decides at preprocessing time whether it is really removed
+        "not no yes "
+        # intensifiers
+        "very really just"
+    ).split()
+)
+# Suffixes tried in order; "edly" must come before "ed" and "ly" to win on words like "excitedly".
+_STEM_SUFFIXES = ("ing", "edly", "ed", "ly", "s")
+def simple_stem(word: str) -> str:
+    """
+    Chop one common English ending off ``word`` (tiny demo stemmer, NOT perfect).
+
+    Real stemming uses libraries; this keeps the app simple for HF.
+    The first suffix in ``_STEM_SUFFIXES`` that matches is removed, provided
+    at least three characters remain. Matching is case-sensitive, so only
+    lowercase endings are recognised. Words ending in "ss" ("class", "glass")
+    are left alone instead of losing their final "s".
+
+    Returns the stemmed word, or ``word`` unchanged when no rule applies.
+    """
+    for suf in _STEM_SUFFIXES:
+        # require a stem of more than two characters so "is"/"bus"/"red" survive
+        if not word.endswith(suf) or len(word) <= len(suf) + 2:
+            continue
+        if suf == "s" and word.endswith("ss"):
+            # a double "s" is not a plural marker; stripping it would give "clas"
+            continue
+        return word[:-len(suf)]
+    return word
+def preprocess_text(
+    text: str,
+    do_lower: bool = True,
+    do_remove_punct: bool = True,
+    do_remove_numbers: bool = False,
+    do_stopwords: bool = False,
+    keep_not: bool = True,
+    do_stem: bool = False,
+):
+    """
+    Clean ``text`` with the selected steps and split it into word tokens.
+
+    Steps run in a fixed order: lowercase, strip ASCII punctuation, strip
+    digits, tokenize, drop stopwords, stem. Tokenizing keeps only runs of
+    word characters, so punctuation never appears in the output even when
+    ``do_remove_punct`` is off (that flag mainly decides whether "don't"
+    becomes "dont" or "don" + "t").
+
+    Args:
+        text: raw input; ``None`` is treated as an empty string.
+        do_lower: lowercase the text first.
+        do_remove_punct: delete every character in ``string.punctuation``.
+        do_remove_numbers: delete every run of digits.
+        do_stopwords: drop tokens found in ``BASIC_STOPWORDS``.
+        keep_not: with ``do_stopwords``, keep "not" and "no" in the text.
+        do_stem: apply ``simple_stem`` to every remaining token.
+
+    Returns:
+        ``(cleaned, tokens)`` where ``cleaned`` is the tokens joined by single
+        spaces and ``tokens`` is the list of token strings.
+    """
+    # an empty or missing textbox value must not crash the handler
+    t = text or ""
+    # 1) lowercase
+    if do_lower:
+        t = t.lower()
+    # 2) remove punctuation
+    if do_remove_punct:
+        t = t.translate(str.maketrans("", "", string.punctuation))
+    # 3) remove numbers
+    if do_remove_numbers:
+        t = re.sub(r"\d+", "", t)
+    # 4) tokenize (simple word tokens)
+    tokens = re.findall(r"\b\w+\b", t)
+    # 5) stopwords removal
+    if do_stopwords:
+        sw = BASIC_STOPWORDS - {"not", "no"} if keep_not else BASIC_STOPWORDS
+        # compare case-insensitively so "The" is still removed when lowercasing is off
+        tokens = [w for w in tokens if w.lower() not in sw]
+    # 6) stemming (tiny demo)
+    if do_stem:
+        tokens = [simple_stem(w) for w in tokens]
+    # tokens never contain whitespace, so the joined string needs no extra trimming
+    cleaned = " ".join(tokens)
+    return cleaned, tokens
+# ----------------------------
+# 2) EMBEDDINGS + SIMILARITY
+# ----------------------------
+# Starter corpus for the similarity tab: ten short sentences, one per line,
+# with no trailing newline.
+DEFAULT_CORPUS = "\n".join(
+    (
+        "A cat drinks milk and sleeps on the sofa.",
+        "A dog likes to play fetch with a ball.",
+        "Kittens are small cats and they love to nap.",
+        "Puppies are small dogs and they love to play.",
+        "The airplane flies in the sky above the clouds.",
+        "A ship sails on the ocean and carries cargo.",
+        "Trucks and cars drive on roads and highways.",
+        "A bird can fly and sing in the morning.",
+        "Fish swim in water and live in rivers.",
+        "The teacher explains math in the classroom.",
+    )
+)
+def parse_corpus(corpus_text: str):
+    """Return the lines of ``corpus_text`` with surrounding whitespace trimmed and blank lines dropped."""
+    trimmed = (raw.strip() for raw in corpus_text.splitlines())
+    return [sentence for sentence in trimmed if sentence]
+def build_vectorizer(method: str, ngrams: str):
+    """
+    Create the scikit-learn vectorizer selected by the two UI labels.
+
+    ``ngrams`` equal to "Unigrams (1 word)" gives single-word features; any
+    other value gives unigrams plus bigrams. ``method`` equal to
+    "TF-IDF (recommended)" gives a ``TfidfVectorizer``; any other value gives
+    a ``CountVectorizer``. Both lowercase the text and drop English stop words.
+    """
+    max_n = 1 if ngrams == "Unigrams (1 word)" else 2  # 2 means uni + bi
+    vectorizer_cls = TfidfVectorizer if method == "TF-IDF (recommended)" else CountVectorizer
+    return vectorizer_cls(lowercase=True, ngram_range=(1, max_n), stop_words="english")
+def similarity_search(corpus_lines, query, method, ngrams, top_k):
+    """
+    Rank ``corpus_lines`` by cosine similarity to ``query``.
+
+    The vectorizer from ``build_vectorizer(method, ngrams)`` is fitted on the
+    corpus only; the query is transformed with that fitted vocabulary.
+
+    Args:
+        corpus_lines: list of document strings.
+        query: search text.
+        method: embedding label passed to ``build_vectorizer``.
+        ngrams: n-gram label passed to ``build_vectorizer``.
+        top_k: maximum number of rows to return; values below 0 count as 0.
+
+    Returns:
+        ``(df, X, vec)``: a DataFrame with columns rank/score/text (best match
+        first), the fitted corpus matrix and the fitted vectorizer. When the
+        corpus is empty, or the vectorizer cannot build a vocabulary from it,
+        the result is ``(empty DataFrame, None, None)``.
+    """
+    columns = ["rank", "score", "text"]
+    if not corpus_lines:
+        return pd.DataFrame(columns=columns), None, None
+    vec = build_vectorizer(method, ngrams)
+    try:
+        X = vec.fit_transform(corpus_lines)
+    except ValueError:
+        # scikit-learn raises ValueError ("empty vocabulary") when every token
+        # is filtered out, e.g. a corpus made only of stop words; report
+        # "no results" instead of crashing the UI handler
+        return pd.DataFrame(columns=columns), None, None
+    q = vec.transform([query])
+    sims = cosine_similarity(q, X)[0]  # (num_docs,)
+    # clamp so a negative top_k cannot turn into a "drop the last N" slice
+    limit = max(int(top_k), 0)
+    order = np.argsort(sims)[::-1][:limit]
+    rows = [
+        {"rank": r, "score": float(sims[idx]), "text": corpus_lines[int(idx)]}
+        for r, idx in enumerate(order, start=1)
+    ]
+    # explicit columns keep the frame's shape stable even when no rows are produced
+    df = pd.DataFrame(rows, columns=columns)
+    return df, X, vec
+# ----------------------------
+# 3) VISUALIZATIONS
+# ----------------------------
+def plot_similarity_heatmap(X):
+    """
+    Draw the pairwise cosine similarity of the rows of ``X`` as a heatmap.
+
+    ``X`` is whatever ``cosine_similarity`` accepts (presumably the corpus
+    matrix returned by ``similarity_search``). Drawing goes through explicit
+    figure/axes objects rather than pyplot's implicit "current figure", and
+    the figure is unregistered from pyplot before it is returned so repeated
+    calls do not pile up open figures.
+
+    Returns the matplotlib ``Figure``.
+    """
+    S = cosine_similarity(X)
+    fig, ax = plt.subplots(figsize=(6, 5))
+    image = ax.imshow(S)
+    ax.set_title("Similarity Heatmap (Corpus vs Corpus)")
+    ax.set_xlabel("Doc index")
+    ax.set_ylabel("Doc index")
+    fig.colorbar(image, ax=ax)
+    fig.tight_layout()
+    # closing only drops pyplot's reference; the Figure object can still be rendered
+    plt.close(fig)
+    return fig
+def plot_2d_map(X, corpus_lines):
+    """
+    Compress the document vectors in ``X`` to 2D with TruncatedSVD and scatter-plot them.
+
+    Each point is labelled ``D<row index>``. ``corpus_lines`` is accepted for
+    interface compatibility but is not used. When ``X`` has a single feature
+    column only one component is computed and the second axis is left at
+    zero, instead of failing on the missing second component.
+
+    Returns the matplotlib ``Figure``; it is unregistered from pyplot before
+    returning so repeated calls do not pile up open figures.
+    """
+    n_docs, n_features = X.shape
+    # TruncatedSVD cannot produce more components than there are features
+    n_components = min(2, n_features)
+    coords = np.zeros((n_docs, 2))
+    svd = TruncatedSVD(n_components=n_components, random_state=42)
+    coords[:, :n_components] = svd.fit_transform(X)
+    fig, ax = plt.subplots(figsize=(7, 5))
+    ax.scatter(coords[:, 0], coords[:, 1])
+    for i, (x, y) in enumerate(coords):
+        # small offset so the label does not sit on top of its marker
+        ax.text(x + 0.01, y + 0.01, f"D{i}", fontsize=9)
+    ax.set_title("2D Meaning Map (SVD on Embeddings)")
+    ax.set_xlabel("Component 1")
+    ax.set_ylabel("Component 2")
+    fig.tight_layout()
+    # closing only drops pyplot's reference; the Figure object can still be rendered
+    plt.close(fig)
+    return fig
+# ----------------------------
+# GRADIO APP LOGIC
+# ----------------------------
+def run_preprocessing(
+    text,
+    do_lower,
+    do_remove_punct,
+    do_remove_numbers,
+    do_stopwords,
+    keep_not,
+    do_stem
+):
+    """
+    Gradio handler for the preprocessing tab.
+
+    Forwards the textbox value and the six toggles to ``preprocess_text`` and
+    returns ``(cleaned text, token preview, token count)``. The preview lists
+    at most the first 200 tokens, followed by " ..." when more exist.
+    """
+    options = {
+        "do_lower": do_lower,
+        "do_remove_punct": do_remove_punct,
+        "do_remove_numbers": do_remove_numbers,
+        "do_stopwords": do_stopwords,
+        "keep_not": keep_not,
+        "do_stem": do_stem,
+    }
+    cleaned, tokens = preprocess_text(text=text, **options)
+    total = len(tokens)
+    # cap the preview so a huge input does not flood the textbox
+    tokens_str = ", ".join(tokens[:200])
+    if total > 200:
+        tokens_str += " ..."
+    return cleaned, tokens_str, total
+def run_similarity(
+    corpus_text,
+    query,
+    method,
+    ngrams,
+    top_k,
+    show_heatmap,
+    show_map
+):
+    """
+    Gradio handler for the similarity tab.
+
+    Returns ``(results table, heatmap figure, 2D map figure, info text)``.
+    A blank query gives an empty table and no figures. Each figure is only
+    drawn when its checkbox is on and the corpus matrix has at least two rows.
+    """
+    corpus_lines = parse_corpus(corpus_text)
+    n_docs = len(corpus_lines)
+    if query.strip() == "":
+        no_results = pd.DataFrame(columns=["rank", "score", "text"])
+        return no_results, None, None, f"Corpus size: {n_docs}"
+    df, X, _ = similarity_search(corpus_lines, query, method, ngrams, int(top_k))
+    # both plots compare documents with each other, so they need two or more rows
+    plottable = X is not None and X.shape[0] >= 2
+    heat_fig = plot_similarity_heatmap(X) if (plottable and show_heatmap) else None
+    map_fig = plot_2d_map(X, corpus_lines) if (plottable and show_map) else None
+    dims = 0 if X is None else X.shape[1]
+    info = f"Corpus size: {n_docs} | Embedding dims: {dims}"
+    return df, heat_fig, map_fig, info
+# ----------------------------
+# UI
+# ----------------------------
+# Build the whole interface inside one Blocks context bound to `demo`:
+# an intro text, two tabs (preprocessing, similarity search) and a footer.
+# Components appear on the page in the order they are created here.
+with gr.Blocks(theme=gr.themes.Soft(), title="NLP Preprocessing + Similarity (Kid Friendly)") as demo:
+    gr.Markdown(
+        """
+# 🧠 NLP Playground (Preprocessing + Similarity Search)
+This app teaches two basic NLP superpowers:
+### 1) Preprocessing (cleaning text)
+You can turn cleaning steps on/off and see how the text changes.
+### 2) Embeddings + Similarity Search
+You can paste a mini “library of sentences” and search it by meaning using embeddings.
+✅ Works great on **Hugging Face Spaces**.
+        """
+    )
+    with gr.Tabs():
+        # ----------------------------
+        # TAB 1: PREPROCESSING
+        # ----------------------------
+        with gr.Tab("🧽 Preprocessing Lab"):
+            gr.Markdown(
+                """
+### What students learn here
+- **Lowercase** makes words match better (Cat = cat)
+- **Remove punctuation** removes extra symbols
+- **Remove numbers** removes digits if you want
+- **Stopwords** removes super common words (“the”, “is”)
+- **Stemming** is a simple trick to chop endings (play → play, playing → play)
+Try toggling things and watching the output change.
+                """
+            )
+            inp = gr.Textbox(
+                label="Type any sentence",
+                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
+                lines=3
+            )
+            # One checkbox per preprocess_text option; the initial states match
+            # that function's keyword defaults.
+            with gr.Row():
+                do_lower = gr.Checkbox(True, label="lowercase")
+                do_remove_punct = gr.Checkbox(True, label="remove punctuation")
+                do_remove_numbers = gr.Checkbox(False, label="remove numbers")
+            with gr.Row():
+                do_stopwords = gr.Checkbox(False, label="remove stopwords")
+                keep_not = gr.Checkbox(True, label="keep 'not' and 'no' (important for meaning)")
+                do_stem = gr.Checkbox(False, label="tiny stemming (demo)")
+            btn = gr.Button("✨ Run Preprocessing", variant="primary")
+            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
+            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
+            token_count = gr.Number(label="Token count", precision=0)
+            # The inputs list follows run_preprocessing's parameter order and the
+            # outputs list follows the order of its three return values.
+            btn.click(
+                fn=run_preprocessing,
+                inputs=[inp, do_lower, do_remove_punct, do_remove_numbers, do_stopwords, keep_not, do_stem],
+                outputs=[cleaned_out, tokens_out, token_count]
+            )
+        # ----------------------------
+        # TAB 2: SIMILARITY SEARCH
+        # ----------------------------
+        with gr.Tab("🔎 Similarity Search Lab"):
+            gr.Markdown(
+                """
+### What students learn here
+- An **embedding** turns each sentence into numbers.
+- **Cosine similarity** measures how close meanings are.
+- You can build a tiny “Google-like search” over your own sentences.
+                """
+            )
+            corpus = gr.Textbox(
+                label="Corpus (one sentence per line) — students can edit this",
+                value=DEFAULT_CORPUS,
+                lines=10
+            )
+            query = gr.Textbox(
+                label="Query (what you want to search)",
+                value="small baby cats love sleeping",
+                lines=2
+            )
+            # The radio labels below are compared as exact strings in
+            # build_vectorizer, so renaming a choice here changes which
+            # vectorizer / n-gram range gets selected.
+            with gr.Row():
+                method = gr.Radio(
+                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
+                    value="TF-IDF (recommended)",
+                    label="Embedding method"
+                )
+                ngrams = gr.Radio(
+                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
+                    value="Unigrams + Bigrams (1-2 words)",
+                    label="N-grams"
+                )
+            with gr.Row():
+                top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
+                show_heatmap = gr.Checkbox(False, label="Show similarity heatmap (slow for big corpus)")
+                show_map = gr.Checkbox(True, label="Show 2D meaning map")
+            run_btn = gr.Button("🔍 Search by Meaning", variant="primary")
+            info = gr.Markdown("")
+            results_table = gr.Dataframe(
+                headers=["rank", "score", "text"],
+                datatype=["number", "number", "str"],
+                label="Top matches (sorted by similarity)"
+            )
+            with gr.Row():
+                heat_plot = gr.Plot(label="Similarity Heatmap")
+                map_plot = gr.Plot(label="2D Meaning Map")
+            # The inputs list follows run_similarity's parameter order; its four
+            # return values (table, heatmap, map, info text) fill the outputs in order.
+            run_btn.click(
+                fn=run_similarity,
+                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
+                outputs=[results_table, heat_plot, map_plot, info]
+            )
+    # Footer with exercises; created outside gr.Tabs, so it is not part of either tab.
+    gr.Markdown(
+        """
+---
+## ✅ Classroom mini-challenges
+1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords.
+   What changes?
+2) In **Similarity Search**, add your own lines like:
+- "I love pizza and burgers."
+- "Math homework is difficult."
+- "Dogs are playful and friendly."
+Then search:
+- “food I like”
+- “school work”
+- “animals that play”
+Watch which sentences become “closest”.
+        """
+    )
+# Launch the Gradio app only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    demo.launch()