ajayinsac committed on
Commit
d65f58c
·
verified ·
1 Parent(s): 58c6939

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -145
app.py CHANGED
@@ -1,151 +1,27 @@
1
- #!/usr/bin/env python3
2
- """
3
- Gradio app: Text normalization pipeline with step-by-step outputs.
4
- Run locally:
5
- pip install -r requirements.txt
6
- python app.py
7
- """
8
-
9
- import os
10
- import string
11
- import pandas as pd
12
  import gradio as gr
13
- import nltk
14
-
15
- # Detect if running on Hugging Face Spaces
16
- IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
17
-
18
- # Lightweight tokenizer that needs no punkt download
19
- from nltk.tokenize import wordpunct_tokenize
20
-
21
- # Optional NLTK corpora: use if present; otherwise fall back
22
- try:
23
- from nltk.corpus import stopwords
24
- _STOPWORDS = set(stopwords.words("english"))
25
- except Exception:
26
- # Minimal built-in fallback list to avoid startup downloads
27
- _STOPWORDS = {
28
- "a","an","and","are","as","at","be","but","by","for","if","in","into",
29
- "is","it","no","not","of","on","or","such","that","the","their","then",
30
- "there","these","they","this","to","was","will","with","were","from","your"
31
- }
32
-
33
- # Prefer WordNet lemmatizer; if unavailable, fall back to PorterStemmer (no corpora)
34
- try:
35
- from nltk.stem import WordNetLemmatizer
36
- _lemmatizer = WordNetLemmatizer()
37
- _use_porter = False
38
- except Exception:
39
- from nltk.stem import PorterStemmer
40
- _stemmer = PorterStemmer()
41
- _use_porter = True
42
-
43
-
44
- # -------- Pipeline helpers --------
45
- def remove_non_ascii(words):
46
- """Strip non-ASCII chars from each token and drop empties."""
47
- cleaned = []
48
- for w in words:
49
- ascii_w = w.encode("ascii", "ignore").decode("ascii")
50
- if ascii_w:
51
- cleaned.append(ascii_w)
52
- return cleaned
53
-
54
- def to_lowercase(words):
55
- return [w.lower() for w in words]
56
-
57
- def remove_punctuation(words):
58
- """Remove punctuation characters from each token and drop empties."""
59
- table = str.maketrans("", "", string.punctuation)
60
- stripped = [w.translate(table) for w in words]
61
- return [w for w in stripped if w and not w.isspace()]
62
-
63
- def remove_stopwords(words):
64
- return [w for w in words if w not in _STOPWORDS]
65
 
66
- def lemmatize_list(words):
67
- if _use_porter:
68
- # Fallback: stemming when WordNet corpus isn't available
69
- return [_stemmer.stem(w) for w in words]
70
- else:
71
- return [_lemmatizer.lemmatize(w) for w in words]
72
-
73
-
74
- # -------- Core pipeline (from prompt) --------
75
- def normalize(text: str):
76
- """Full preprocessing pipeline"""
77
- words = wordpunct_tokenize(text or "")
78
- words = remove_non_ascii(words)
79
- words = to_lowercase(words)
80
- words = remove_punctuation(words)
81
- words = remove_stopwords(words)
82
- words = lemmatize_list(words)
83
- return " ".join(words)
84
-
85
-
86
- # -------- Step-by-step output for UI --------
87
- def normalize_with_steps(text: str):
88
- if not text or not text.strip():
89
- empty_df = pd.DataFrame([["—", [], 0]], columns=["Step", "Tokens", "Count"])
90
- return empty_df, ""
91
-
92
- steps = []
93
- # 1) Tokenize (no punkt dependency)
94
- tokens = wordpunct_tokenize(text)
95
- steps.append(("1) Tokenize", tokens.copy(), len(tokens)))
96
- # 2) Remove non-ASCII
97
- tokens = remove_non_ascii(tokens)
98
- steps.append(("2) Remove non-ASCII", tokens.copy(), len(tokens)))
99
- # 3) Lowercase
100
- tokens = to_lowercase(tokens)
101
- steps.append(("3) Lowercase", tokens.copy(), len(tokens)))
102
- # 4) Remove punctuation
103
- tokens = remove_punctuation(tokens)
104
- steps.append(("4) Remove punctuation", tokens.copy(), len(tokens)))
105
- # 5) Remove stopwords
106
- tokens = remove_stopwords(tokens)
107
- steps.append(("5) Remove stopwords", tokens.copy(), len(tokens)))
108
- # 6) Lemmatize (or stem if WordNet missing)
109
- tokens = lemmatize_list(tokens)
110
- steps.append(("6) Lemmatize", tokens.copy(), len(tokens)))
111
-
112
- df = pd.DataFrame(steps, columns=["Step", "Tokens", "Count"])
113
- final_text = " ".join(tokens)
114
- return df, final_text
115
-
116
-
117
- # -------- Gradio UI --------
118
- EXAMPLES = [
119
- ["The QUICK brown foxes, jumping over 13 lazy dogs!!!"],
120
- ["Café prices in 2024 were higher—aren't they? 🤔"],
121
- ["NLTK's tokenization isn't perfect; e.g., 'don't' becomes two tokens."],
122
- ["Hello!!! This is a TEST of the FULL preprocessing PIPELINE."],
123
- ["E-mail: ajay@example.com; Visit https://example.org soon..."],
124
  ]
125
 
126
- with gr.Blocks(title="Text Normalization Pipeline") as demo:
127
- gr.Markdown(
128
- "# Text Normalization Pipeline\n"
129
- "Type text below or click an example. Click **Normalize** to see each step and the final result."
130
- )
131
- with gr.Row():
132
- with gr.Column(scale=1):
133
- inp = gr.Textbox(lines=8, label="Input Text", placeholder="Type or paste text here...")
134
- btn = gr.Button("Normalize", variant="primary")
135
- gr.Examples(EXAMPLES, inputs=inp, label="Examples")
136
- with gr.Column(scale=1):
137
- out_df = gr.Dataframe(headers=["Step", "Tokens", "Count"], wrap=True, label="Step-by-step outputs")
138
- out_final = gr.Textbox(label="Final normalized text")
139
- btn.click(normalize_with_steps, inputs=inp, outputs=[out_df, out_final])
140
-
141
 
142
- # -------- Launch (Spaces-friendly & Local public link) --------
143
  if __name__ == "__main__":
144
- demo.queue()
145
- demo.launch(
146
- server_name="0.0.0.0",
147
- server_port=7860,
148
- ssr_mode=False,
149
- share=True
150
- #share=not IN_SPACES, # no warning on Spaces; public link when running locally
151
- )
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from normalize_pipeline import normalize
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ examples = [
5
+ "The quick brown fox jumps over the lazy dog!",
6
+ "NLTK is a leading platform for building Python programs to work with human language data.",
7
+ "Text normalization is important for NLP tasks.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  ]
9
 
10
+ def show_steps(text):
11
+ steps = normalize(text)
12
+ output = ""
13
+ for step, value in steps.items():
14
+ output += f"<b>{step}:</b> {value}<br>"
15
+ return output
16
+
17
+ iface = gr.Interface(
18
+ fn=show_steps,
19
+ inputs=gr.Textbox(lines=3, label="Enter text to normalize"),
20
+ outputs=gr.HTML(label="Step-by-step normalization"),
21
+ examples=[[ex] for ex in examples],
22
+ title="Text Normalization Pipeline",
23
+ description="Enter text or select an example to see each step of the normalization process.",
24
+ )
25
 
 
26
  if __name__ == "__main__":
27
+ iface.launch()