Spaces:

adityaardak
/

NLP

Sleeping

NLP

File size: 11,622 Bytes

5b861c2

import re
import string
import numpy as np
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD


# ----------------------------
# 1) BASIC NLP PREPROCESSING
# ----------------------------
# Small, kid-friendly stopword list kept inline so no external download is
# needed. The negations "not" and "no" are part of the list; preprocess_text
# removes them from its working copy when the user chooses to keep them.
BASIC_STOPWORDS = set(
    (
        # articles and connectors
        "a an the and or but if then so because "
        # forms of "to be"
        "is am are was were be been being "
        # pronouns and possessives
        "i you he she it we they me my your his her our their "
        # prepositions
        "to of in on at for with from as by about "
        # demonstratives
        "this that these those "
        # forms of "to do" / "to have"
        "do does did doing "
        "have has had "
        # negations / affirmation
        "not no yes "
        # intensifiers
        "very really just"
    ).split()
)

def simple_stem(word: str) -> str:
    """
    Chop one common English ending off ``word`` (a tiny, kid-friendly stemmer).

    This is NOT a real stemmer; it only keeps the app free of extra libraries.
    Suffixes are tried in a fixed order ("ing", "edly", "ed", "ly", "s") and the
    first one that matches is removed, provided at least three characters
    remain. Words ending in "ss" (e.g. "class") are not treated as plurals.

    Args:
        word: A single token.

    Returns:
        The token with at most one suffix removed, or the token unchanged.
    """
    # Longer suffixes come before their shorter tails ("edly" before "ly").
    suffixes = ("ing", "edly", "ed", "ly", "s")
    for suf in suffixes:
        # A trailing double "s" is part of the word, not a plural marker.
        if suf == "s" and word.endswith("ss"):
            continue
        # Require a stem of 3+ characters so short words ("is", "bus") survive.
        if word.endswith(suf) and len(word) > len(suf) + 2:
            return word[: -len(suf)]
    return word

def preprocess_text(
    text: str,
    do_lower: bool = True,
    do_remove_punct: bool = True,
    do_remove_numbers: bool = False,
    do_stopwords: bool = False,
    keep_not: bool = True,
    do_stem: bool = False,
) -> tuple:
    """
    Clean ``text`` with the selected steps and split it into word tokens.

    Steps run in a fixed order: lowercase, strip punctuation, strip digits,
    tokenize, drop stopwords, stem.

    Args:
        text: Raw input text. ``None`` is treated as an empty string.
        do_lower: Lowercase the text before anything else.
        do_remove_punct: Delete every character listed in ``string.punctuation``
            (ASCII punctuation only).
        do_remove_numbers: Delete runs of digits.
        do_stopwords: Drop tokens found in ``BASIC_STOPWORDS``. Matching is
            case-insensitive so it still works when ``do_lower`` is off.
        keep_not: When removing stopwords, keep the negations "not" and "no".
        do_stem: Apply ``simple_stem`` to every remaining token.

    Returns:
        A ``(cleaned, tokens)`` pair: the tokens joined by single spaces, and
        the token list itself.
    """
    t = "" if text is None else text

    # 1) lowercase
    if do_lower:
        t = t.lower()

    # 2) remove punctuation (characters are deleted, not replaced by spaces)
    if do_remove_punct:
        t = t.translate(str.maketrans("", "", string.punctuation))

    # 3) remove numbers
    if do_remove_numbers:
        t = re.sub(r"\d+", "", t)

    # 4) tokenize (simple word tokens; any leftover punctuation is dropped here)
    tokens = re.findall(r"\b\w+\b", t)

    # 5) stopwords removal
    if do_stopwords:
        # Build a fresh set so the module-level list is never mutated.
        sw = BASIC_STOPWORDS - {"not", "no"} if keep_not else BASIC_STOPWORDS
        # Compare in lowercase: the stopword list is lowercase, the tokens may
        # not be when do_lower is disabled.
        tokens = [w for w in tokens if w.lower() not in sw]

    # 6) stemming (tiny demo)
    if do_stem:
        tokens = [simple_stem(w) for w in tokens]

    cleaned = " ".join(tokens)
    return cleaned, tokens


# ----------------------------
# 2) EMBEDDINGS + SIMILARITY
# ----------------------------
# Starter corpus for the similarity tab: ten short sentences, one per line,
# with no trailing newline.
DEFAULT_CORPUS = "\n".join(
    (
        "A cat drinks milk and sleeps on the sofa.",
        "A dog likes to play fetch with a ball.",
        "Kittens are small cats and they love to nap.",
        "Puppies are small dogs and they love to play.",
        "The airplane flies in the sky above the clouds.",
        "A ship sails on the ocean and carries cargo.",
        "Trucks and cars drive on roads and highways.",
        "A bird can fly and sing in the morning.",
        "Fish swim in water and live in rivers.",
        "The teacher explains math in the classroom.",
    )
)

def parse_corpus(corpus_text: str):
    """Split the corpus text into documents: one whitespace-trimmed, non-empty line each."""
    trimmed = (raw.strip() for raw in corpus_text.splitlines())
    # Lines that were blank (or whitespace only) become "" and are skipped.
    return [doc for doc in trimmed if doc]

def build_vectorizer(method: str, ngrams: str):
    """
    Create the scikit-learn vectorizer matching the UI radio selections.

    ``ngrams`` equal to "Unigrams (1 word)" gives single-word features; any
    other value gives unigrams plus bigrams. ``method`` equal to
    "TF-IDF (recommended)" gives a ``TfidfVectorizer``; any other value gives
    a ``CountVectorizer``. Both lowercase their input and use scikit-learn's
    built-in English stopword list.
    """
    max_n = 1 if ngrams == "Unigrams (1 word)" else 2
    vectorizer_cls = TfidfVectorizer if method == "TF-IDF (recommended)" else CountVectorizer
    return vectorizer_cls(lowercase=True, ngram_range=(1, max_n), stop_words="english")

def similarity_search(corpus_lines, query, method, ngrams, top_k):
    """
    Rank corpus lines by cosine similarity to ``query``.

    Args:
        corpus_lines: Sequence of document strings.
        query: Search text.
        method: Embedding choice, passed to ``build_vectorizer``.
        ngrams: N-gram choice, passed to ``build_vectorizer``.
        top_k: Maximum number of rows to return; values below 0 count as 0.

    Returns:
        ``(df, X, vec)`` where ``df`` has columns rank/score/text (best match
        first, ties in corpus order), ``X`` is the fitted document-term matrix
        and ``vec`` the fitted vectorizer. ``X`` and ``vec`` are ``None`` when
        there is nothing to embed (empty corpus, or no usable vocabulary).
    """
    columns = ["rank", "score", "text"]
    if len(corpus_lines) == 0:
        return pd.DataFrame(columns=columns), None, None

    vec = build_vectorizer(method, ngrams)
    try:
        X = vec.fit_transform(corpus_lines)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every token
        # is filtered out, e.g. a corpus made only of stopwords. Treat it like
        # an empty corpus instead of crashing the request.
        return pd.DataFrame(columns=columns), None, None
    q = vec.transform([query])

    sims = cosine_similarity(q, X)[0]  # (num_docs,)
    # Stable sort on the negated scores: highest first, and equal scores keep
    # their original corpus order so results are deterministic.
    k = max(int(top_k), 0)
    order = np.argsort(-sims, kind="stable")[:k]

    rows = [
        {"rank": rank, "score": float(sims[idx]), "text": corpus_lines[int(idx)]}
        for rank, idx in enumerate(order, start=1)
    ]
    # Passing columns keeps the frame's shape consistent even with zero rows.
    return pd.DataFrame(rows, columns=columns), X, vec


# ----------------------------
# 3) VISUALIZATIONS
# ----------------------------
def plot_similarity_heatmap(X):
    """
    Draw the corpus-vs-corpus cosine similarity matrix as a heatmap.

    Args:
        X: Document-term matrix with one row per document.

    Returns:
        The Matplotlib figure containing the heatmap and its colorbar.
    """
    S = cosine_similarity(X)

    # Draw on explicit figure/axes objects instead of pyplot's implicit
    # "current figure", which is shared global state.
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(S)
    ax.set_title("Similarity Heatmap (Corpus vs Corpus)")
    ax.set_xlabel("Doc index")
    ax.set_ylabel("Doc index")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()

    # Unregister the figure from pyplot so figures do not accumulate across
    # requests; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig

def plot_2d_map(X, corpus_lines):
    """
    Project the document vectors to 2D with truncated SVD and scatter-plot them.

    Each point is labelled ``D<row index>``.

    Args:
        X: Document-term matrix with one row per document.
        corpus_lines: The documents themselves. Currently unused; kept so
            existing callers keep working.

    Returns:
        The Matplotlib figure containing the scatter plot.
    """
    # TruncatedSVD cannot produce more components than there are features, so
    # a one-word vocabulary is reduced to 1D and padded with a zero column.
    n_components = min(2, X.shape[1])
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    coords = svd.fit_transform(X)
    if coords.shape[1] < 2:
        coords = np.hstack([coords, np.zeros((coords.shape[0], 1))])

    # Explicit figure/axes avoid pyplot's shared "current figure" state.
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(coords[:, 0], coords[:, 1])
    for i, (x, y) in enumerate(coords):
        # Offset in points (not data units) so labels sit next to their
        # markers regardless of the axis scale.
        ax.annotate(f"D{i}", (x, y), xytext=(3, 3), textcoords="offset points", fontsize=9)
    ax.set_title("2D Meaning Map (SVD on Embeddings)")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    fig.tight_layout()

    # Unregister from pyplot so figures do not pile up across requests.
    plt.close(fig)
    return fig


# ----------------------------
# GRADIO APP LOGIC
# ----------------------------
def run_preprocessing(
    text,
    do_lower,
    do_remove_punct,
    do_remove_numbers,
    do_stopwords,
    keep_not,
    do_stem
):
    """
    Gradio handler for the preprocessing tab.

    Forwards the textbox value and checkbox states to ``preprocess_text`` and
    returns ``(cleaned_text, token_preview, token_count)``. The preview lists
    at most the first 200 tokens, comma-separated, followed by " ..." when
    more tokens exist.
    """
    options = dict(
        do_lower=do_lower,
        do_remove_punct=do_remove_punct,
        do_remove_numbers=do_remove_numbers,
        do_stopwords=do_stopwords,
        keep_not=keep_not,
        do_stem=do_stem,
    )
    cleaned, tokens = preprocess_text(text=text, **options)

    # Cap the preview so very long inputs do not flood the textbox.
    preview_limit = 200
    preview = ", ".join(tokens[:preview_limit])
    if len(tokens) > preview_limit:
        preview += " ..."

    return cleaned, preview, len(tokens)

def run_similarity(
    corpus_text,
    query,
    method,
    ngrams,
    top_k,
    show_heatmap,
    show_map
):
    """
    Gradio handler for the similarity tab.

    Returns ``(results_table, heatmap_figure, map_figure, info_text)``. A blank
    query yields an empty table, no figures, and only the corpus size. Each
    figure is produced only when its checkbox is ticked and the embedded corpus
    has at least two documents.
    """
    docs = parse_corpus(corpus_text)

    # Nothing to search for: report the corpus size and stop.
    if query.strip() == "":
        empty_table = pd.DataFrame(columns=["rank", "score", "text"])
        return empty_table, None, None, f"Corpus size: {len(docs)}"

    results, matrix, _vectorizer = similarity_search(docs, query, method, ngrams, int(top_k))

    # Both plots need an embedding matrix with two or more rows.
    can_plot = matrix is not None and matrix.shape[0] >= 2
    heatmap = plot_similarity_heatmap(matrix) if (can_plot and show_heatmap) else None
    meaning_map = plot_2d_map(matrix, docs) if (can_plot and show_map) else None

    dims = 0 if matrix is None else matrix.shape[1]
    summary = f"Corpus size: {len(docs)} | Embedding dims: {dims}"
    return results, heatmap, meaning_map, summary


# ----------------------------
# UI
# ----------------------------
# Build the Gradio interface: an intro, two tabs (preprocessing and similarity
# search) and a closing list of classroom challenges. The Blocks object is
# bound to `demo`, which is launched at the bottom of the file.
with gr.Blocks(theme=gr.themes.Soft(), title="NLP Preprocessing + Similarity (Kid Friendly)") as demo:
    gr.Markdown(
        """
# 🧠 NLP Playground (Preprocessing + Similarity Search)

This app teaches two basic NLP superpowers:

### 1) Preprocessing (cleaning text)
You can turn cleaning steps on/off and see how the text changes.

### 2) Embeddings + Similarity Search
You can paste a mini “library of sentences” and search it by meaning using embeddings.

✅ Works great on **Hugging Face Spaces**.
        """
    )

    with gr.Tabs():
        # ----------------------------
        # TAB 1: PREPROCESSING
        # ----------------------------
        with gr.Tab("🧽 Preprocessing Lab"):
            gr.Markdown(
                """
### What students learn here
- **Lowercase** makes words match better (Cat = cat)
- **Remove punctuation** removes extra symbols
- **Remove numbers** removes digits if you want
- **Stopwords** removes super common words (“the”, “is”)
- **Stemming** is a simple trick to chop endings (play → play, playing → play)

Try toggling things and watching the output change.
                """
            )

            inp = gr.Textbox(
                label="Type any sentence",
                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
                lines=3
            )

            # Checkbox defaults mirror the keyword defaults of preprocess_text.
            with gr.Row():
                do_lower = gr.Checkbox(True, label="lowercase")
                do_remove_punct = gr.Checkbox(True, label="remove punctuation")
                do_remove_numbers = gr.Checkbox(False, label="remove numbers")

            with gr.Row():
                do_stopwords = gr.Checkbox(False, label="remove stopwords")
                keep_not = gr.Checkbox(True, label="keep 'not' and 'no' (important for meaning)")
                do_stem = gr.Checkbox(False, label="tiny stemming (demo)")

            btn = gr.Button("✨ Run Preprocessing", variant="primary")

            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
            token_count = gr.Number(label="Token count", precision=0)

            # The inputs list follows run_preprocessing's parameter order; its
            # three return values fill the three output widgets in order.
            btn.click(
                fn=run_preprocessing,
                inputs=[inp, do_lower, do_remove_punct, do_remove_numbers, do_stopwords, keep_not, do_stem],
                outputs=[cleaned_out, tokens_out, token_count]
            )

        # ----------------------------
        # TAB 2: SIMILARITY SEARCH
        # ----------------------------
        with gr.Tab("🔎 Similarity Search Lab"):
            gr.Markdown(
                """
### What students learn here
- An **embedding** turns each sentence into numbers.
- **Cosine similarity** measures how close meanings are.
- You can build a tiny “Google-like search” over your own sentences.
                """
            )

            corpus = gr.Textbox(
                label="Corpus (one sentence per line) — students can edit this",
                value=DEFAULT_CORPUS,
                lines=10
            )

            query = gr.Textbox(
                label="Query (what you want to search)",
                value="small baby cats love sleeping",
                lines=2
            )

            # The choice strings below are compared literally in
            # build_vectorizer, so the two places must stay in sync.
            with gr.Row():
                method = gr.Radio(
                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
                    value="TF-IDF (recommended)",
                    label="Embedding method"
                )
                ngrams = gr.Radio(
                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
                    value="Unigrams + Bigrams (1-2 words)",
                    label="N-grams"
                )

            with gr.Row():
                top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
                show_heatmap = gr.Checkbox(False, label="Show similarity heatmap (slow for big corpus)")
                show_map = gr.Checkbox(True, label="Show 2D meaning map")

            run_btn = gr.Button("🔍 Search by Meaning", variant="primary")

            info = gr.Markdown("")
            results_table = gr.Dataframe(
                headers=["rank", "score", "text"],
                datatype=["number", "number", "str"],
                label="Top matches (sorted by similarity)"
            )

            with gr.Row():
                heat_plot = gr.Plot(label="Similarity Heatmap")
                map_plot = gr.Plot(label="2D Meaning Map")

            # run_similarity returns (table, heatmap figure, map figure, info
            # text), matching the outputs list order.
            run_btn.click(
                fn=run_similarity,
                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
                outputs=[results_table, heat_plot, map_plot, info]
            )

    # Closing notes, placed inside the Blocks context but outside the Tabs.
    gr.Markdown(
        """
---
## ✅ Classroom mini-challenges

1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords.  
   What changes?

2) In **Similarity Search**, add your own lines like:
- "I love pizza and burgers."
- "Math homework is difficult."
- "Dogs are playful and friendly."

Then search:
- “food I like”
- “school work”
- “animals that play”

Watch which sentences become “closest”.
        """
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()