"""Gradio demo that walks a text through an NLTK normalization pipeline.

The pipeline is: tokenize -> remove non-ASCII -> lowercase -> remove
punctuation -> remove stopwords -> lemmatize -> join. Every intermediate
result is exposed so the UI can show the effect of each step.
"""

import re
import string

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are available at runtime.
# "punkt_tab" is the tokenizer data looked up by nltk.word_tokenize in recent
# NLTK releases; "punkt" is kept for older releases that still use it.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_resource, quiet=True)


# ---------- Normalization helpers ----------

_wordnet_lemmatizer = WordNetLemmatizer()
_stop_words = set(stopwords.words("english"))
_punct_table = str.maketrans("", "", string.punctuation)

# Maximum number of tokens shown per step in the human-readable trace.
_TRACE_TOKEN_LIMIT = 40


def word_tokenize(text: str):
    """Split *text* into tokens by delegating to ``nltk.word_tokenize``.

    Note that NLTK's default tokenizer splits contractions into separate
    tokens (e.g. "isn't" -> "is", "n't"); the leftover apostrophes are
    stripped later by ``remove_punctuation``.
    """
    return nltk.word_tokenize(text)


def remove_non_ascii(words):
    """Drop every non-ASCII character from each token.

    Tokens made up solely of non-ASCII characters (e.g. emoji) become empty
    strings here; they are kept in the list so the step output stays aligned
    with its input, and are filtered out by ``remove_punctuation``.
    """
    return [w.encode("ascii", "ignore").decode("ascii") for w in words]


def to_lowercase(words):
    """Return the tokens converted to lowercase."""
    return [w.lower() for w in words]


def remove_punctuation(words):
    """Strip ``string.punctuation`` characters and drop tokens left empty."""
    # Translate each token exactly once, then discard the empty results.
    stripped = (w.translate(_punct_table) for w in words)
    return [w for w in stripped if w]


def remove_stopwords(words):
    """Return the tokens that are not in NLTK's English stopword list."""
    return [w for w in words if w not in _stop_words]


def lemmatize_list(words):
    """Lemmatize each token as a noun, then lemmatize that result as a verb.

    The verb pass is always applied to the output of the noun pass, so a
    token such as "running" (unchanged as a noun) still reduces to "run".
    """
    lemmatize = _wordnet_lemmatizer.lemmatize
    return [lemmatize(lemmatize(w, pos="n"), pos="v") for w in words]


def normalize_pipeline(text: str):
    """Run the full preprocessing pipeline and record every step.

    Args:
        text: The raw input string.

    Returns:
        A ``(steps, final_text)`` tuple. ``steps`` is a dict, in execution
        order, mapping each step name to its output (the original string, a
        list of tokens, or the final joined string). ``final_text`` is the
        space-joined result of the last step.
    """
    steps = {"original": text}

    tokens = word_tokenize(text)
    steps["1) tokenize"] = tokens

    words = remove_non_ascii(tokens)
    steps["2) remove_non_ascii"] = words

    words = to_lowercase(words)
    steps["3) to_lowercase"] = words

    words = remove_punctuation(words)
    steps["4) remove_punctuation"] = words

    words = remove_stopwords(words)
    steps["5) remove_stopwords"] = words

    words = lemmatize_list(words)
    steps["6) lemmatize"] = words

    final_text = " ".join(words)
    steps["7) join"] = final_text

    return steps, final_text


# ---------- Gradio UI ----------

EXAMPLES = [
    "Habitat's 20 by 28 campaign is inspiring—let's build more homes in Jackson!",
    "NLTK makes text preprocessing EASY: Tokenize, lowercase, remove punctuation & stopwords, then lemmatize.",
    "Cats were running, jumped over fences; the dogs' tails were wagging! 🐶🐱",
    "Email me at Example@Domain.com!!! This, perhaps, isn't AS easy as it looks...",
]


def run_pipeline(text):
    """Gradio callback: normalize *text* and build the three UI outputs.

    Args:
        text: Contents of the input textbox. ``None`` is treated as an empty
            string so the tokenizer is never handed a non-string value.

    Returns:
        A ``(steps, final_text, trace)`` tuple: the per-step dict for the
        JSON view, the final normalized string, and a multiline
        human-readable trace of every step.
    """
    steps, final_text = normalize_pipeline(text or "")

    # Provide a human-friendly multiline trace.
    trace_lines = []
    for name, value in steps.items():
        if isinstance(value, list):
            # Cap long token lists so the trace stays readable.
            display = ", ".join(value[:_TRACE_TOKEN_LIMIT])
            if len(value) > _TRACE_TOKEN_LIMIT:
                display += " ..."
        else:
            display = value
        trace_lines.append(f"{name}:\n{display}\n")

    return steps, final_text, "\n".join(trace_lines)


with gr.Blocks(title="Text Normalization Demo") as demo:
    gr.Markdown("# Text Normalization (Step-by-Step)")
    gr.Markdown(
        "Enter text on the left or choose an example. The pipeline shows each step: "
        "**tokenize → remove_non_ascii → lowercase → remove_punctuation → remove_stopwords → lemmatize → join**."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Input text",
                lines=6,
                placeholder="Type or pick an example below...",
            )
            examples = gr.Examples(
                examples=[[e] for e in EXAMPLES],
                inputs=[text_in],
                label="Try these examples",
            )
            run_btn = gr.Button("Run normalization")

        with gr.Column():
            final_out = gr.Textbox(label="Final normalized text", lines=2)
            trace_out = gr.Code(
                label="Step-by-step trace (human readable)",
                language="markdown",
            )

    # NOTE(review): the original indentation was lost, so placing the JSON
    # view below the row (rather than inside the right column) is an
    # assumption -- confirm the intended layout.
    # Collapsible JSON view for each step (for clarity & grading)
    steps_json = gr.JSON(label="Detailed steps (JSON)")

    run_btn.click(
        fn=run_pipeline,
        inputs=text_in,
        outputs=[steps_json, final_out, trace_out],
    )


if __name__ == "__main__":
    demo.launch()