adityaardak committed on
Commit
5b861c2
·
verified ·
1 Parent(s): 3a24638

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -0
app.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import matplotlib.pyplot as plt
7
+
8
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from sklearn.decomposition import TruncatedSVD
11
+
12
+
13
+ # ----------------------------
14
+ # 1) BASIC NLP PREPROCESSING
15
+ # ----------------------------
16
# Small, kid-friendly stopword list (no external downloads needed).
# It stays a plain mutable ``set`` because preprocess_text() copies it and
# discards "not"/"no" from the copy when the user asks to keep them.
BASIC_STOPWORDS = set(
    (
        # articles and connectors
        "a an the and or but if then so because "
        # forms of "to be"
        "is am are was were be been being "
        # pronouns and possessives
        "i you he she it we they me my your his her our their "
        # prepositions
        "to of in on at for with from as by about "
        # demonstratives
        "this that these those "
        # forms of "to do" / "to have"
        "do does did doing "
        "have has had "
        # keep "not" if you want sentiment nuance; we let the user choose
        "not no yes "
        # intensifiers
        "very really just"
    ).split()
)
28
+
29
def simple_stem(word: str) -> str:
    """Strip at most one common English suffix from ``word``.

    A tiny, kid-friendly stemmer (NOT perfect). Real stemming uses
    libraries; this keeps the app simple for HF.

    Args:
        word: A single token. It is compared as-is, so pass lowercase text
            if the suffixes should match regardless of case.

    Returns:
        ``word`` without the first matching suffix, or ``word`` unchanged
        when no suffix matches or the remaining stem would be shorter than
        three characters.
    """
    # Order matters: "edly" must be tried before "ly" so that
    # "reportedly" becomes "report" rather than "reported".
    suffixes = ("ing", "edly", "ed", "ly", "s")
    for suffix in suffixes:
        if suffix == "s" and word.endswith("ss"):
            # "class", "glass", "boss" are not plurals; leave them alone.
            continue
        # Require at least three characters to remain after stripping.
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[: -len(suffix)]
    return word
38
+
39
def preprocess_text(
    text: str,
    do_lower: bool = True,
    do_remove_punct: bool = True,
    do_remove_numbers: bool = False,
    do_stopwords: bool = False,
    keep_not: bool = True,
    do_stem: bool = False,
):
    """Clean ``text`` with the selected steps and split it into tokens.

    The steps run in a fixed order: lowercase, remove punctuation, remove
    digits, tokenize, drop stopwords, stem.

    Args:
        text: Raw input text. ``None`` is treated as an empty string.
        do_lower: Lowercase the text first.
        do_remove_punct: Delete every ASCII punctuation character
            (``string.punctuation``).
        do_remove_numbers: Delete every run of digits.
        do_stopwords: Drop tokens found in ``BASIC_STOPWORDS``. Matching
            ignores case, so "The" is removed even when ``do_lower`` is off.
        keep_not: When dropping stopwords, keep "not" and "no".
        do_stem: Apply ``simple_stem`` to each remaining token.

    Returns:
        A ``(cleaned, tokens)`` tuple: ``tokens`` is the list of surviving
        tokens and ``cleaned`` is those tokens joined by single spaces.
    """
    # A cleared text box may hand us None; treat it like empty text.
    working = text or ""

    # 1) lowercase
    if do_lower:
        working = working.lower()

    # 2) remove punctuation (single C-level pass over the string)
    if do_remove_punct:
        working = working.translate(str.maketrans("", "", string.punctuation))

    # 3) remove numbers
    if do_remove_numbers:
        working = re.sub(r"\d+", "", working)

    # 4) tokenize: runs of word characters, so stray symbols never become
    #    tokens even when punctuation removal is switched off
    tokens = re.findall(r"\b\w+\b", working)

    # 5) stopword removal
    if do_stopwords:
        stopwords = set(BASIC_STOPWORDS)
        if keep_not:
            stopwords -= {"not", "no"}
        # The stopword list is lowercase, so compare case-insensitively.
        tokens = [tok for tok in tokens if tok.lower() not in stopwords]

    # 6) stemming (tiny demo)
    if do_stem:
        tokens = [simple_stem(tok) for tok in tokens]

    # Tokens are non-empty and whitespace-free, so no extra strip is needed.
    return " ".join(tokens), tokens
79
+
80
+
81
+ # ----------------------------
82
+ # 2) EMBEDDINGS + SIMILARITY
83
+ # ----------------------------
84
# Ten short example sentences, one document per line, used as the initial
# value of the corpus text box.
DEFAULT_CORPUS = "\n".join(
    (
        "A cat drinks milk and sleeps on the sofa.",
        "A dog likes to play fetch with a ball.",
        "Kittens are small cats and they love to nap.",
        "Puppies are small dogs and they love to play.",
        "The airplane flies in the sky above the clouds.",
        "A ship sails on the ocean and carries cargo.",
        "Trucks and cars drive on roads and highways.",
        "A bird can fly and sing in the morning.",
        "Fish swim in water and live in rivers.",
        "The teacher explains math in the classroom.",
    )
)
94
+
95
def parse_corpus(corpus_text: str):
    """Split ``corpus_text`` into trimmed, non-empty lines.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. Returns the remaining lines as a list, in
    their original order.
    """
    stripped = (raw.strip() for raw in corpus_text.splitlines())
    return [sentence for sentence in stripped if sentence]
99
+
100
def build_vectorizer(method: str, ngrams: str):
    """Create the (unfitted) text vectorizer selected in the UI.

    Args:
        method: ``"TF-IDF (recommended)"`` selects ``TfidfVectorizer``; any
            other value selects ``CountVectorizer``.
        ngrams: ``"Unigrams (1 word)"`` selects single words only; any other
            value selects unigrams plus bigrams.

    Returns:
        A new vectorizer that lowercases its input and uses scikit-learn's
        built-in English stop-word list.
    """
    # (1, 1) = unigrams only, (1, 2) = unigrams + bigrams
    ngram_range = (1, 1) if ngrams == "Unigrams (1 word)" else (1, 2)
    vectorizer_cls = (
        TfidfVectorizer if method == "TF-IDF (recommended)" else CountVectorizer
    )
    return vectorizer_cls(
        lowercase=True,
        ngram_range=ngram_range,
        stop_words="english",
    )
110
+
111
def similarity_search(corpus_lines, query, method, ngrams, top_k):
    """Rank corpus sentences by cosine similarity to ``query``.

    Args:
        corpus_lines: List of sentences (documents) to search.
        query: The search text.
        method: Embedding method label, forwarded to ``build_vectorizer``.
        ngrams: N-gram label, forwarded to ``build_vectorizer``.
        top_k: Maximum number of results; values below zero are treated as 0.

    Returns:
        A ``(df, X, vec)`` tuple. ``df`` is a DataFrame with the columns
        ``rank``, ``score`` and ``text``, best match first (ties keep corpus
        order). ``X`` is the fitted corpus matrix and ``vec`` the fitted
        vectorizer. ``X`` and ``vec`` are ``None`` when the corpus is empty
        or contains no usable words.
    """
    columns = ["rank", "score", "text"]
    if len(corpus_lines) == 0:
        return pd.DataFrame(columns=columns), None, None

    vec = build_vectorizer(method, ngrams)
    try:
        X = vec.fit_transform(corpus_lines)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every
        # document consists only of stop words or non-word characters.
        return pd.DataFrame(columns=columns), None, None
    q = vec.transform([query])

    sims = cosine_similarity(q, X)[0]  # shape: (num_docs,)

    # A stable sort on the negated scores gives descending order while
    # keeping equally scored documents in their original corpus order.
    limit = max(int(top_k), 0)
    order = np.argsort(-sims, kind="stable")[:limit]

    rows = [
        {"rank": rank, "score": float(sims[idx]), "text": corpus_lines[int(idx)]}
        for rank, idx in enumerate(order, start=1)
    ]

    # Passing ``columns`` keeps the headers even when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns), X, vec
128
+
129
+
130
+ # ----------------------------
131
+ # 3) VISUALIZATIONS
132
+ # ----------------------------
133
def plot_similarity_heatmap(X):
    """Draw a heatmap of pairwise cosine similarity between corpus documents.

    Args:
        X: Document-term matrix with one row per document.

    Returns:
        A matplotlib ``Figure`` showing the document-by-document similarity
        matrix with a colorbar.
    """
    S = cosine_similarity(X)

    # Draw on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", which is shared module-level state.
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(S)
    ax.set_title("Similarity Heatmap (Corpus vs Corpus)")
    ax.set_xlabel("Doc index")
    ax.set_ylabel("Doc index")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()

    # Unregister the figure from pyplot so repeated calls do not accumulate
    # open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig
143
+
144
def plot_2d_map(X, corpus_lines):
    """Scatter-plot the corpus documents after reducing them to 2D.

    Args:
        X: Document-term matrix with one row per document.
        corpus_lines: The corpus sentences. Currently unused; kept so the
            call signature stays the same for existing callers.

    Returns:
        A matplotlib ``Figure`` with one point per document, labelled
        ``D0``, ``D1``, ... by row index.
    """
    n_docs, n_features = X.shape
    if n_features >= 2:
        # compress to 2D for visualization
        coords = TruncatedSVD(n_components=2, random_state=42).fit_transform(X)
    else:
        # TruncatedSVD cannot produce two components from fewer than two
        # features, so place the single feature on the first axis.
        dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
        coords = np.zeros((n_docs, 2))
        coords[:, :n_features] = dense

    # Draw on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", which is shared module-level state.
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(coords[:, 0], coords[:, 1])
    for doc_index, (x, y) in enumerate(coords):
        # Small offset so the label does not sit on top of its point.
        ax.text(x + 0.01, y + 0.01, f"D{doc_index}", fontsize=9)
    ax.set_title("2D Meaning Map (SVD on Embeddings)")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    fig.tight_layout()

    # Unregister the figure from pyplot so repeated calls do not accumulate
    # open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig
159
+
160
+
161
+ # ----------------------------
162
+ # GRADIO APP LOGIC
163
+ # ----------------------------
164
def run_preprocessing(
    text,
    do_lower,
    do_remove_punct,
    do_remove_numbers,
    do_stopwords,
    keep_not,
    do_stem,
):
    """Button handler for the preprocessing tab.

    Forwards the text and the six switches to ``preprocess_text`` and
    formats the result for display.

    Returns:
        ``(cleaned_text, token_preview, token_count)``. The preview lists at
        most the first 200 tokens, comma-separated, followed by `` ...``
        when there are more.
    """
    options = dict(
        do_lower=do_lower,
        do_remove_punct=do_remove_punct,
        do_remove_numbers=do_remove_numbers,
        do_stopwords=do_stopwords,
        keep_not=keep_not,
        do_stem=do_stem,
    )
    cleaned, tokens = preprocess_text(text=text, **options)

    # show tokens nicely, capped so the output box stays readable
    max_shown = 200
    preview = ", ".join(tokens[:max_shown])
    if len(tokens) > max_shown:
        preview += " ..."

    return cleaned, preview, len(tokens)
185
+
186
def run_similarity(
    corpus_text,
    query,
    method,
    ngrams,
    top_k,
    show_heatmap,
    show_map,
):
    """Button handler for the similarity-search tab.

    Parses the corpus, runs the search and optionally builds the two plots.

    Returns:
        ``(results_df, heatmap_fig, map_fig, info_text)``. Either figure is
        ``None`` when its checkbox is off, when no corpus matrix was built,
        or when the corpus has fewer than two documents. A blank query
        yields an empty table, no figures, and only the corpus size.
    """
    corpus_lines = parse_corpus(corpus_text)
    corpus_size = len(corpus_lines)

    # Nothing to search for: report the corpus size only.
    if not query.strip():
        empty_table = pd.DataFrame(columns=["rank", "score", "text"])
        return empty_table, None, None, f"Corpus size: {corpus_size}"

    df, X, _vec = similarity_search(corpus_lines, query, method, ngrams, int(top_k))

    # Both plots need a corpus matrix with at least two documents.
    can_plot = X is not None and X.shape[0] >= 2
    heat_fig = plot_similarity_heatmap(X) if (can_plot and show_heatmap) else None
    map_fig = plot_2d_map(X, corpus_lines) if (can_plot and show_map) else None

    dims = X.shape[1] if X is not None else 0
    info = f"Corpus size: {corpus_size} | Embedding dims: {dims}"
    return df, heat_fig, map_fig, info
212
+
213
+
214
+ # ----------------------------
215
+ # UI
216
+ # ----------------------------
217
# Build the two-tab Gradio interface. ``demo`` is the app object that the
# ``__main__`` guard at the bottom launches.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="NLP Preprocessing + Similarity (Kid Friendly)",
) as demo:
    # Introduction shown above the tabs.
    gr.Markdown(
        value="""
# 🧠 NLP Playground (Preprocessing + Similarity Search)

This app teaches two basic NLP superpowers:

### 1) Preprocessing (cleaning text)
You can turn cleaning steps on/off and see how the text changes.

### 2) Embeddings + Similarity Search
You can paste a mini “library of sentences” and search it by meaning using embeddings.

✅ Works great on **Hugging Face Spaces**.
"""
    )

    with gr.Tabs():
        # ----------------------------
        # TAB 1: PREPROCESSING
        # ----------------------------
        with gr.Tab("🧽 Preprocessing Lab"):
            gr.Markdown(
                value="""
### What students learn here
- **Lowercase** makes words match better (Cat = cat)
- **Remove punctuation** removes extra symbols
- **Remove numbers** removes digits if you want
- **Stopwords** removes super common words (“the”, “is”)
- **Stemming** is a simple trick to chop endings (play → play, playing → play)

Try toggling things and watching the output change.
"""
            )

            # Free-text input, pre-filled with a deliberately messy sentence.
            inp = gr.Textbox(
                label="Type any sentence",
                value="Wow!!! I LOVE cats, cats, and more cats... I won 1000 points!!!",
                lines=3,
            )

            # Switches for the individual cleaning steps.
            with gr.Row():
                do_lower = gr.Checkbox(value=True, label="lowercase")
                do_remove_punct = gr.Checkbox(value=True, label="remove punctuation")
                do_remove_numbers = gr.Checkbox(value=False, label="remove numbers")

            with gr.Row():
                do_stopwords = gr.Checkbox(value=False, label="remove stopwords")
                keep_not = gr.Checkbox(
                    value=True,
                    label="keep 'not' and 'no' (important for meaning)",
                )
                do_stem = gr.Checkbox(value=False, label="tiny stemming (demo)")

            btn = gr.Button(value="✨ Run Preprocessing", variant="primary")

            # Outputs of the preprocessing run.
            cleaned_out = gr.Textbox(label="Cleaned text (what model sees)", lines=2)
            tokens_out = gr.Textbox(label="Tokens (split words)", lines=3)
            token_count = gr.Number(label="Token count", precision=0)

            btn.click(
                fn=run_preprocessing,
                inputs=[
                    inp,
                    do_lower,
                    do_remove_punct,
                    do_remove_numbers,
                    do_stopwords,
                    keep_not,
                    do_stem,
                ],
                outputs=[cleaned_out, tokens_out, token_count],
            )

        # ----------------------------
        # TAB 2: SIMILARITY SEARCH
        # ----------------------------
        with gr.Tab("🔎 Similarity Search Lab"):
            gr.Markdown(
                value="""
### What students learn here
- An **embedding** turns each sentence into numbers.
- **Cosine similarity** measures how close meanings are.
- You can build a tiny “Google-like search” over your own sentences.
"""
            )

            # Editable corpus, one document per line.
            corpus = gr.Textbox(
                label="Corpus (one sentence per line) — students can edit this",
                value=DEFAULT_CORPUS,
                lines=10,
            )

            query = gr.Textbox(
                label="Query (what you want to search)",
                value="small baby cats love sleeping",
                lines=2,
            )

            # Vectorizer settings; the choice strings are matched verbatim
            # inside build_vectorizer().
            with gr.Row():
                method = gr.Radio(
                    choices=["TF-IDF (recommended)", "Bag of Words (counts)"],
                    value="TF-IDF (recommended)",
                    label="Embedding method",
                )
                ngrams = gr.Radio(
                    choices=["Unigrams (1 word)", "Unigrams + Bigrams (1-2 words)"],
                    value="Unigrams + Bigrams (1-2 words)",
                    label="N-grams",
                )

            with gr.Row():
                top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Top-K results",
                )
                show_heatmap = gr.Checkbox(
                    value=False,
                    label="Show similarity heatmap (slow for big corpus)",
                )
                show_map = gr.Checkbox(value=True, label="Show 2D meaning map")

            run_btn = gr.Button(value="🔍 Search by Meaning", variant="primary")

            # Status line, results table and the two optional plots.
            info = gr.Markdown(value="")
            results_table = gr.Dataframe(
                headers=["rank", "score", "text"],
                datatype=["number", "number", "str"],
                label="Top matches (sorted by similarity)",
            )

            with gr.Row():
                heat_plot = gr.Plot(label="Similarity Heatmap")
                map_plot = gr.Plot(label="2D Meaning Map")

            run_btn.click(
                fn=run_similarity,
                inputs=[corpus, query, method, ngrams, top_k, show_heatmap, show_map],
                outputs=[results_table, heat_plot, map_plot, info],
            )

    # Exercises shown below the tabs.
    gr.Markdown(
        value="""
---
## ✅ Classroom mini-challenges

1) In the **Preprocessing** tab, make the cleaned text remove punctuation and stopwords.
What changes?

2) In **Similarity Search**, add your own lines like:
- "I love pizza and burgers."
- "Math homework is difficult."
- "Dogs are playful and friendly."

Then search:
- “food I like”
- “school work”
- “animals that play”

Watch which sentences become “closest”.
"""
    )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()