clowwendy committed on
Commit
348e339
·
verified ·
1 Parent(s): 42b6750

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import gradio as gr
4
+
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ from nltk.stem import WordNetLemmatizer
8
+
9
# Ensure NLTK resources are available at runtime.
# Recent NLTK releases load the "punkt_tab" tables (instead of the pickled
# "punkt" models) inside nltk.word_tokenize, so both are requested: "punkt"
# for older installations, "punkt_tab" for newer ones. nltk.download() reports
# an unknown package via its return value rather than raising, so requesting
# both is safe on either side.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# ---------- Normalization helpers ----------
# Shared, module-level state built once at import time.
_wordnet_lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership tests in remove_stopwords().
_stop_words = set(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation.
_punct_table = str.maketrans("", "", string.punctuation)
19
+
20
def word_tokenize(text: str):
    """Split *text* into tokens by delegating to ``nltk.word_tokenize``.

    NOTE(review): NLTK's default word tokenizer is Treebank-style and is
    documented to split contractions (e.g. "isn't" -> "is", "n't"), so
    apostrophes are not necessarily kept inside words -- confirm against the
    installed NLTK version.
    """
    tokens = nltk.word_tokenize(text)
    return tokens
23
+
24
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of tokens containing only ASCII characters. Tokens that
        consisted solely of non-ASCII characters (for example emoji) are
        dropped instead of being kept as empty strings, matching how
        ``remove_punctuation`` discards tokens that become empty.
    """
    ascii_only = (w.encode("ascii", "ignore").decode("ascii") for w in words)
    # Filter out tokens that were reduced to "" by the encode/decode round trip.
    return [w for w in ascii_only if w]
26
+
27
def to_lowercase(words):
    """Return a new list with every token converted to lower case."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered
29
+
30
def remove_punctuation(words):
    """Delete punctuation characters from every token.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of tokens with all ``string.punctuation`` characters
        removed (via the module-level ``_punct_table``). Tokens that become
        empty after stripping are dropped.
    """
    # Translate each token exactly once, then filter on the translated value.
    stripped = (w.translate(_punct_table) for w in words)
    return [w for w in stripped if w != ""]
32
+
33
def remove_stopwords(words):
    """Return the tokens of *words* that are not in the module-level stop-word set."""
    kept = []
    for token in words:
        if token in _stop_words:
            continue
        kept.append(token)
    return kept
35
+
36
def lemmatize_list(words):
    """Lemmatize each token as a noun, then lemmatize that result as a verb.

    The verb pass is always applied to the noun lemma; it is not conditional
    on the noun pass having left the token unchanged.
    """
    noun_lemmas = (
        _wordnet_lemmatizer.lemmatize(token, pos="n") for token in words
    )
    return [_wordnet_lemmatizer.lemmatize(noun, pos="v") for noun in noun_lemmas]
44
+
45
def normalize_pipeline(text: str):
    """
    Run the full preprocessing pipeline and record the output of every step.

    Returns a ``(steps, final_text)`` tuple: ``steps`` is a dict mapping each
    step name to its value (the original string, a list of tokens, or the
    final joined string) in execution order, and ``final_text`` is the
    space-joined result of the last token step.
    """
    # Token-level stages applied in order after tokenization.
    stages = (
        ("2) remove_non_ascii", remove_non_ascii),
        ("3) to_lowercase", to_lowercase),
        ("4) remove_punctuation", remove_punctuation),
        ("5) remove_stopwords", remove_stopwords),
        ("6) lemmatize", lemmatize_list),
    )

    current = word_tokenize(text)
    steps = {"original": text, "1) tokenize": current}

    for label, stage in stages:
        current = stage(current)
        steps[label] = current

    final_text = " ".join(current)
    steps["7) join"] = final_text
    return steps, final_text
74
+
75
+ # ---------- Gradio UI ----------
76
# Sample inputs offered in the UI; each one exercises different pipeline
# steps (contractions, mixed case, punctuation, emoji / non-ASCII characters).
EXAMPLES = [
    "Habitat's 20 by 28 campaign is inspiring—let's build more homes in Jackson!",
    "NLTK makes text preprocessing EASY: Tokenize, lowercase, remove punctuation & stopwords, then lemmatize.",
    "Cats were running, jumped over fences; the dogs' tails were wagging! 🐶🐱",
    "Email me at Example@Domain.com!!! This, perhaps, isn't AS easy as it looks...",
]
82
+
83
def run_pipeline(text):
    """Run the normalization pipeline and build the outputs shown in the UI.

    Returns a ``(steps, final_text, trace)`` tuple: the per-step dict and the
    final string from ``normalize_pipeline``, plus a human-readable multiline
    trace with one "name:/value" section per step.
    """
    steps, final_text = normalize_pipeline(text)

    def _render(value):
        # Non-list values (the original and final strings) are shown as-is.
        if not isinstance(value, list):
            return value
        # Token lists are comma-joined and capped at 40 entries, with a
        # trailing marker when something was cut off.
        suffix = " ..." if len(value) > 40 else ""
        return ", ".join(value[:40]) + suffix

    trace = "\n".join(
        f"{name}:\n{_render(value)}\n" for name, value in steps.items()
    )
    return steps, final_text, trace
95
+
96
# Build the Gradio interface: input + examples + button on the left, results
# on the right, and a JSON view of every pipeline step.
# NOTE(review): the nesting below was reconstructed from a flattened diff in
# which indentation was lost; in particular, confirm whether `steps_json`
# belongs at the Blocks level (as written here) or inside the right column.
with gr.Blocks(title="Text Normalization Demo") as demo:
    gr.Markdown("# Text Normalization (Step-by-Step)")
    gr.Markdown(
        "Enter text on the left or choose an example. The pipeline shows each step: "
        "**tokenize → remove_non_ascii → lowercase → remove_punctuation → remove_stopwords → lemmatize → join**."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(label="Input text", lines=6, placeholder="Type or pick an example below...")
            # Each example is wrapped in a one-element list because there is
            # a single input component.
            examples = gr.Examples(
                examples=[[e] for e in EXAMPLES],
                inputs=[text_in],
                label="Try these examples",
            )
            run_btn = gr.Button("Run normalization")
        with gr.Column():
            final_out = gr.Textbox(label="Final normalized text", lines=2)
            trace_out = gr.Code(label="Step-by-step trace (human readable)", language="markdown")

    # Collapsible JSON view for each step (for clarity & grading)
    steps_json = gr.JSON(label="Detailed steps (JSON)")

    # The output order matches run_pipeline's return tuple:
    # (steps, final_text, trace).
    run_btn.click(fn=run_pipeline, inputs=text_in, outputs=[steps_json, final_out, trace_out])

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()