import string

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are available at runtime
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)  # needed by word_tokenize on newer NLTK releases
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# ---------- Normalization helpers ----------
_wordnet_lemmatizer = WordNetLemmatizer()
_stop_words = set(stopwords.words("english"))
_punct_table = str.maketrans("", "", string.punctuation)

def word_tokenize(text: str):
    # Thin wrapper around NLTK's Treebank-style tokenizer; note that it
    # splits contractions rather than keeping apostrophes inside words
    return nltk.word_tokenize(text)
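# Illustrative expected output (a sketch; assumes NLTK's default Treebank behavior):
#   word_tokenize("This isn't hard!") -> ['This', 'is', "n't", 'hard', '!']
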
def remove_non_ascii(words):
    # Drop non-ASCII characters (e.g. emoji, accented letters) from each token
    return [w.encode("ascii", "ignore").decode("ascii") for w in words]

def to_lowercase(words):
    return [w.lower() for w in words]

def remove_punctuation(words):
    # Strip punctuation characters from each token and drop tokens that become empty
    cleaned = (w.translate(_punct_table) for w in words)
    return [w for w in cleaned if w]
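# Illustrative expected output (hypothetical input, shown for clarity):
#   remove_punctuation(['is', "n't", 'hard', '!']) -> ['is', 'nt', 'hard']
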
def remove_stopwords(words):
    # Assumes tokens are already lowercased (NLTK's stopword list is lowercase)
    return [w for w in words if w not in _stop_words]
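# Illustrative expected output ("were" is in NLTK's English stopword list):
#   remove_stopwords(['cats', 'were', 'running']) -> ['cats', 'running']
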
def lemmatize_list(words):
    # Lemmatize each word as a noun, then lemmatize the result as a verb
    # (catches both "cats" -> "cat" and "running" -> "run")
    out = []
    for w in words:
        n = _wordnet_lemmatizer.lemmatize(w, pos="n")
        v = _wordnet_lemmatizer.lemmatize(n, pos="v")
        out.append(v)
    return out
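# Illustrative expected output (assumes standard WordNet data):
#   lemmatize_list(['cats', 'running', 'fences']) -> ['cat', 'run', 'fence']
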
def normalize_pipeline(text: str):
    """
    Run the full preprocessing pipeline, recording step-by-step outputs.

    Returns a (steps, final_text) tuple, where steps maps each step name to
    its value (a list of tokens, or a string for the original/joined text).
    """
    steps = {}
    steps["original"] = text
    tokens = word_tokenize(text)
    steps["1) tokenize"] = tokens
    words = remove_non_ascii(tokens)
    steps["2) remove_non_ascii"] = words
    words = to_lowercase(words)
    steps["3) to_lowercase"] = words
    words = remove_punctuation(words)
    steps["4) remove_punctuation"] = words
    words = remove_stopwords(words)
    steps["5) remove_stopwords"] = words
    words = lemmatize_list(words)
    steps["6) lemmatize"] = words
    final_text = " ".join(words)
    steps["7) join"] = final_text
    return steps, final_text
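# End-to-end sketch of the pipeline on a toy sentence (expected, not executed here):
#   steps, final = normalize_pipeline("Cats were running!")
#   final == "cat run"   # "were" removed as a stopword; "!" removed as punctuation
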
# ---------- Gradio UI ----------
EXAMPLES = [
    "Habitat's 20 by 28 campaign is inspiring—let's build more homes in Jackson!",
    "NLTK makes text preprocessing EASY: Tokenize, lowercase, remove punctuation & stopwords, then lemmatize.",
    "Cats were running, jumped over fences; the dogs' tails were wagging! 🐶🐱",
    "Email me at Example@Domain.com!!! This, perhaps, isn't AS easy as it looks...",
]

def run_pipeline(text):
    steps, final_text = normalize_pipeline(text)
    # Build a human-friendly multiline trace; token lists are truncated to 40 items
    trace_lines = []
    for k, v in steps.items():
        if isinstance(v, list):
            display = ", ".join(v[:40]) + (" ..." if len(v) > 40 else "")
        else:
            display = v
        trace_lines.append(f"{k}:\n{display}\n")
    trace = "\n".join(trace_lines)
    return steps, final_text, trace

with gr.Blocks(title="Text Normalization Demo") as demo:
    gr.Markdown("# Text Normalization (Step-by-Step)")
    gr.Markdown(
        "Enter text on the left or choose an example. The pipeline shows each step: "
        "**tokenize → remove_non_ascii → lowercase → remove_punctuation → remove_stopwords → lemmatize → join**."
    )
    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(label="Input text", lines=6, placeholder="Type or pick an example below...")
            gr.Examples(
                examples=[[e] for e in EXAMPLES],
                inputs=[text_in],
                label="Try these examples",
            )
            run_btn = gr.Button("Run normalization")
        with gr.Column():
            final_out = gr.Textbox(label="Final normalized text", lines=2)
            trace_out = gr.Code(label="Step-by-step trace (human readable)", language="markdown")
            # Collapsible JSON view for each step (for clarity & grading)
            steps_json = gr.JSON(label="Detailed steps (JSON)")

    run_btn.click(fn=run_pipeline, inputs=text_in, outputs=[steps_json, final_out, trace_out])

if __name__ == "__main__":
    demo.launch()