# NLP / app.py
# Author: adityaardak — "Create app.py" (commit 5b861c2, verified)
import re
import string
import numpy as np
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
# ----------------------------
# 1) BASIC NLP PREPROCESSING
# ----------------------------
# Minimal built-in English stopword set so the demo needs no external
# downloads (no NLTK/spaCy); preprocess_text() can optionally drop these.
BASIC_STOPWORDS = {
    # small kid-friendly stopword list (no external downloads)
    "a","an","the","and","or","but","if","then","so","because",
    "is","am","are","was","were","be","been","being",
    "i","you","he","she","it","we","they","me","my","your","his","her","our","their",
    "to","of","in","on","at","for","with","from","as","by","about",
    "this","that","these","those",
    "do","does","did","doing",
    "have","has","had",
    "not","no","yes",  # keep "not" if you want sentiment nuance; we let user choose
    "very","really","just"
}
def simple_stem(word: str) -> str:
    """
    A tiny, kid-friendly stemmer (NOT perfect).

    Strips at most one common suffix ("ing", "edly", "ed", "ly", "s"),
    and only when the remaining stem keeps at least 3 characters;
    otherwise the word is returned unchanged.
    Real stemming uses libraries; this keeps the app simple for HF.
    """
    # NOTE: the original list repeated "edly" three times; the duplicates
    # were dead code because the first match returns immediately.
    for suf in ("ing", "edly", "ed", "ly", "s"):
        if word.endswith(suf) and len(word) > len(suf) + 2:
            return word[:-len(suf)]
    return word
def preprocess_text(
    text: str,
    do_lower: bool = True,
    do_remove_punct: bool = True,
    do_remove_numbers: bool = False,
    do_stopwords: bool = False,
    keep_not: bool = True,
    do_stem: bool = False,
):
    """
    Clean *text* with a sequence of optional steps and return
    ``(cleaned_string, token_list)``.

    Steps, in order: lowercase, strip punctuation, strip digits,
    tokenize on word boundaries, drop stopwords (optionally keeping
    "not"/"no" for sentiment nuance), and apply the tiny demo stemmer.
    """
    working = text
    if do_lower:
        working = working.lower()
    if do_remove_punct:
        # One C-level pass removes every punctuation character.
        working = working.translate(str.maketrans("", "", string.punctuation))
    if do_remove_numbers:
        working = re.sub(r"\d+", "", working)
    # Simple word tokenization on \b..\b boundaries.
    tokens = re.findall(r"\b\w+\b", working)
    if do_stopwords:
        stop_set = BASIC_STOPWORDS.copy()
        if keep_not:
            # Negations carry meaning; let the user keep them.
            stop_set.discard("not")
            stop_set.discard("no")
        tokens = [tok for tok in tokens if tok not in stop_set]
    if do_stem:
        tokens = [simple_stem(tok) for tok in tokens]
    return " ".join(tokens).strip(), tokens
# ----------------------------
# 2) EMBEDDINGS + SIMILARITY
# ----------------------------
# Ten short example sentences (one per line) used as the default searchable
# corpus in the Similarity Search tab; users can edit it in the UI textbox.
DEFAULT_CORPUS = """A cat drinks milk and sleeps on the sofa.
A dog likes to play fetch with a ball.
Kittens are small cats and they love to nap.
Puppies are small dogs and they love to play.
The airplane flies in the sky above the clouds.
A ship sails on the ocean and carries cargo.
Trucks and cars drive on roads and highways.
A bird can fly and sing in the morning.
Fish swim in water and live in rivers.
The teacher explains math in the classroom."""
def parse_corpus(corpus_text: str):
    """Split the corpus textbox into a list of stripped, non-empty lines."""
    return [stripped for stripped in
            (raw.strip() for raw in corpus_text.splitlines())
            if stripped]
def build_vectorizer(method: str, ngrams: str):
    """
    Create the text vectorizer selected in the UI.

    *method* and *ngrams* are the exact radio-button strings from the app;
    anything other than "Unigrams (1 word)" enables uni+bi-grams, and
    anything other than "TF-IDF (recommended)" falls back to raw counts.
    """
    ngram_range = (1, 1) if ngrams == "Unigrams (1 word)" else (1, 2)
    vectorizer_cls = (TfidfVectorizer
                      if method == "TF-IDF (recommended)"
                      else CountVectorizer)
    return vectorizer_cls(lowercase=True, ngram_range=ngram_range, stop_words="english")
def similarity_search(corpus_lines, query, method, ngrams, top_k):
    """
    Embed the corpus and the query, then rank corpus lines by cosine
    similarity to the query.

    Parameters:
        corpus_lines: list[str], one document per entry.
        query: str, the search text.
        method, ngrams: UI strings forwarded to build_vectorizer().
        top_k: number of results to return.

    Returns:
        (results_df, X, vec) where results_df has columns rank/score/text,
        X is the fitted document-term matrix and vec the fitted vectorizer.
        Returns (empty_df, None, None) when the corpus is empty OR when
        vectorization yields an empty vocabulary (e.g. every word is an
        English stopword), instead of crashing the Gradio handler.
    """
    empty = pd.DataFrame(columns=["rank", "score", "text"])
    if len(corpus_lines) == 0:
        return empty, None, None
    vec = build_vectorizer(method, ngrams)
    try:
        X = vec.fit_transform(corpus_lines)
    except ValueError:
        # CountVectorizer/TfidfVectorizer raise "empty vocabulary" when no
        # token survives stopword filtering; fail soft like the empty case.
        return empty, None, None
    q = vec.transform([query])
    sims = cosine_similarity(q, X)[0]  # shape: (num_docs,)
    order = np.argsort(sims)[::-1][:top_k]  # highest similarity first
    rows = [
        {"rank": r, "score": float(sims[idx]), "text": corpus_lines[int(idx)]}
        for r, idx in enumerate(order, start=1)
    ]
    return pd.DataFrame(rows), X, vec
# ----------------------------
# 3) VISUALIZATIONS
# ----------------------------
def plot_similarity_heatmap(X):
    """Render the doc-vs-doc cosine-similarity matrix as a heatmap figure."""
    sim_matrix = cosine_similarity(X)
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(sim_matrix)
    ax.set_title("Similarity Heatmap (Corpus vs Corpus)")
    ax.set_xlabel("Doc index")
    ax.set_ylabel("Doc index")
    fig.colorbar(image)
    fig.tight_layout()
    return fig
def plot_2d_map(X, corpus_lines):
    """
    Project the document vectors to 2D with TruncatedSVD and scatter-plot
    them, labelling each point D0, D1, ...

    `corpus_lines` is accepted for interface compatibility with the caller
    but is not used for the plot itself.
    """
    n_docs, n_features = X.shape
    if n_features > 2:
        # Normal case: compress high-dimensional embeddings to 2 components.
        svd = TruncatedSVD(n_components=2, random_state=42)
        X2 = svd.fit_transform(X)
    else:
        # TruncatedSVD requires n_components < n_features, so for tiny
        # vocabularies plot the raw coordinates (zero-padded to 2 columns).
        X2 = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
        if X2.shape[1] < 2:
            X2 = np.hstack([X2, np.zeros((n_docs, 2 - X2.shape[1]))])
    fig = plt.figure(figsize=(7, 5))
    plt.scatter(X2[:, 0], X2[:, 1])
    for i, (x, y) in enumerate(X2):
        plt.text(x + 0.01, y + 0.01, f"D{i}", fontsize=9)
    plt.title("2D Meaning Map (SVD on Embeddings)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.tight_layout()
    return fig
# ----------------------------
# GRADIO APP LOGIC
# ----------------------------
def run_preprocessing(
    text,
    do_lower,
    do_remove_punct,
    do_remove_numbers,
    do_stopwords,
    keep_not,
    do_stem
):
    """
    Gradio handler for the Preprocessing tab: runs preprocess_text() and
    returns (cleaned_text, token_preview_string, token_count).
    """
    cleaned, tokens = preprocess_text(
        text=text,
        do_lower=do_lower,
        do_remove_punct=do_remove_punct,
        do_remove_numbers=do_remove_numbers,
        do_stopwords=do_stopwords,
        keep_not=keep_not,
        do_stem=do_stem,
    )
    # Preview at most 200 tokens so huge inputs stay readable.
    preview = ", ".join(tokens[:200])
    if len(tokens) > 200:
        preview += " ..."
    return cleaned, preview, len(tokens)
def run_similarity(
    corpus_text,
    query,
    method,
    ngrams,
    top_k,
    show_heatmap,
    show_map
):
    """
    Gradio handler for the Similarity Search tab.

    Returns (results_dataframe, heatmap_figure_or_None,
    map_figure_or_None, info_markdown_string).
    """
    docs = parse_corpus(corpus_text)
    # Blank query: show nothing but still report the corpus size.
    if not query.strip():
        empty = pd.DataFrame(columns=["rank", "score", "text"])
        return empty, None, None, f"Corpus size: {len(docs)}"
    df, X, vec = similarity_search(docs, query, method, ngrams, int(top_k))
    heat_fig = None
    map_fig = None
    # Plots only make sense with at least two documents embedded.
    plottable = X is not None and X.shape[0] >= 2
    if plottable and show_heatmap:
        heat_fig = plot_similarity_heatmap(X)
    if plottable and show_map:
        map_fig = plot_2d_map(X, docs)
    dims = X.shape[1] if X is not None else 0
    info = f"Corpus size: {len(docs)} | Embedding dims: {dims}"
    return df, heat_fig, map_fig, info
# ----------------------------
# UI
# ----------------------------
# Build the two-tab Gradio UI; `demo` stays at module scope so hosting
# platforms can discover it without running the __main__ guard.
with gr.Blocks(theme=gr.themes.Soft(), title="NLP Preprocessing + Similarity (Kid Friendly)") as demo:
    # Intro banner shown above both tabs.
    gr.Markdown(
        """
# 🧠 NLP Playground (Preprocessing + Similarity Search)
This app teaches two basic NLP superpowers:
### 1) Preprocessing (cleaning text)
You can turn cleaning steps on/off and see how the text changes.
### 2) Embeddings + Similarity Search
You can paste a mini “library of sentences” and search it by meaning using embeddings.
✅ Works great on **Hugging Face Spaces**.
"""
    )
    with gr.Tabs():
        # ----------------------------
        # TAB 1: PREPROCESSING
        # ----------------------------
        with gr.Tab("🧽 Preprocessing Lab"):
            gr.Markdown(
                """
### What students learn here
- **Lowercase** makes words match better (Cat = cat)
- **Remove punctuation** removes extra symbols
- **Remove numbers** removes digits if you want
- **Stopwords** removes super common words (“the”, “is”)
- **Stemming** is a simple trick to chop endings (play → play, playing → play)
Try toggling things and watching the output change.
"""
            )
            # Input sentence plus one checkbox per cleaning step; the order
            # below matches preprocess_text()'s parameters.
            inp = gr.Textbox(
                label="Type any sentence",
                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
                lines=3
            )
            with gr.Row():
                do_lower = gr.Checkbox(True, label="lowercase")
                do_remove_punct = gr.Checkbox(True, label="remove punctuation")
                do_remove_numbers = gr.Checkbox(False, label="remove numbers")
            with gr.Row():
                do_stopwords = gr.Checkbox(False, label="remove stopwords")
                keep_not = gr.Checkbox(True, label="keep 'not' and 'no' (important for meaning)")
                do_stem = gr.Checkbox(False, label="tiny stemming (demo)")
            btn = gr.Button("✨ Run Preprocessing", variant="primary")
            # Outputs: cleaned string, comma-joined token preview, token count.
            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
            token_count = gr.Number(label="Token count", precision=0)
            btn.click(
                fn=run_preprocessing,
                inputs=[inp, do_lower, do_remove_punct, do_remove_numbers, do_stopwords, keep_not, do_stem],
                outputs=[cleaned_out, tokens_out, token_count]
            )
        # ----------------------------
        # TAB 2: SIMILARITY SEARCH
        # ----------------------------
        with gr.Tab("🔎 Similarity Search Lab"):
            gr.Markdown(
                """
### What students learn here
- An **embedding** turns each sentence into numbers.
- **Cosine similarity** measures how close meanings are.
- You can build a tiny “Google-like search” over your own sentences.
"""
            )
            # Editable corpus (one document per line) and the search query.
            corpus = gr.Textbox(
                label="Corpus (one sentence per line) — students can edit this",
                value=DEFAULT_CORPUS,
                lines=10
            )
            query = gr.Textbox(
                label="Query (what you want to search)",
                value="small baby cats love sleeping",
                lines=2
            )
            with gr.Row():
                # These choice strings are matched literally in build_vectorizer().
                method = gr.Radio(
                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
                    value="TF-IDF (recommended)",
                    label="Embedding method"
                )
                ngrams = gr.Radio(
                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
                    value="Unigrams + Bigrams (1-2 words)",
                    label="N-grams"
                )
            with gr.Row():
                top_k = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
                show_heatmap = gr.Checkbox(False, label="Show similarity heatmap (slow for big corpus)")
                show_map = gr.Checkbox(True, label="Show 2D meaning map")
            run_btn = gr.Button("🔍 Search by Meaning", variant="primary")
            # Info line, result table, and the two optional plots.
            info = gr.Markdown("")
            results_table = gr.Dataframe(
                headers=["rank", "score", "text"],
                datatype=["number", "number", "str"],
                label="Top matches (sorted by similarity)"
            )
            with gr.Row():
                heat_plot = gr.Plot(label="Similarity Heatmap")
                map_plot = gr.Plot(label="2D Meaning Map")
            run_btn.click(
                fn=run_similarity,
                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
                outputs=[results_table, heat_plot, map_plot, info]
            )
    # Closing notes with classroom exercises, shown below the tabs.
    gr.Markdown(
        """
---
## ✅ Classroom mini-challenges
1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords.
What changes?
2) In **Similarity Search**, add your own lines like:
- "I love pizza and burgers."
- "Math homework is difficult."
- "Dogs are playful and friendly."
Then search:
- “food I like”
- “school work”
- “animals that play”
Watch which sentences become “closest”.
"""
    )
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()