#!/usr/bin/env python3
"""Gradio app: text normalization pipeline with step-by-step outputs.

Run locally:
    pip install -r requirements.txt
    python app.py
"""

import os
import re
import string
from collections import OrderedDict

import gradio as gr

# Detect if running on Hugging Face Spaces (avoid share=True there).
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))

# ---- Optional NLTK pieces (NO downloads at startup) ----
# Use real stopwords if available; otherwise fall back to a small set.
try:
    import nltk  # noqa: F401
    from nltk.corpus import stopwords as nltk_stopwords

    _STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    _STOPWORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this",
        "to", "was", "will", "with", "were", "from", "your",
    }

# Decide lemmatizer vs stemmer based on whether the *corpus* exists.
# _use_porter: True -> stem, False -> lemmatize, None -> identity fallback.
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer

    # Only use WordNetLemmatizer if the *wordnet* corpus is present.
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        from nltk.stem import PorterStemmer

        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later.
    _lemmatizer = None
    _stemmer = None
    _use_porter = None


# ---- Pipeline helpers ----
def tokenize(text: str) -> list[str]:
    """Split *text* into word tokens and single non-space symbols.

    Simple, dependency-free tokenizer; None/empty input yields [].
    """
    return re.findall(r"\w+|[^\w\s]", text or "", flags=re.UNICODE)


def remove_non_ascii(tokens: list[str]) -> list[str]:
    """Strip non-ASCII characters from each token, dropping tokens
    that become empty (e.g. emoji-only tokens)."""
    cleaned = []
    for w in tokens:
        ascii_w = w.encode("ascii", "ignore").decode("ascii")
        if ascii_w:
            cleaned.append(ascii_w)
    return cleaned


def to_lowercase(tokens: list[str]) -> list[str]:
    """Lowercase every token."""
    return [w.lower() for w in tokens]


def remove_punctuation(tokens: list[str]) -> list[str]:
    """Delete ASCII punctuation characters from each token and drop
    tokens that are empty or whitespace-only afterwards."""
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    return [w for w in stripped if w and not w.isspace()]


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Drop tokens found in the (NLTK or fallback) stopword set."""
    return [w for w in tokens if w not in _STOPWORDS]


def lemmatize_list(tokens: list[str]) -> list[str]:
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.

    Also guards against runtime LookupError during example caching by
    switching to PorterStemmer on the fly.
    """
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet disappeared at runtime: degrade to stemming.
            try:
                from nltk.stem import PorterStemmer

                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        return tokens


# ---- Core pipeline (returns step-by-step dict) ----
def normalize(text: str) -> OrderedDict:
    """Run the full normalization pipeline on *text*.

    Returns an OrderedDict mapping step labels to token lists, plus a
    final "Final normalized text" entry holding the joined string.
    """
    steps = OrderedDict()
    t1 = tokenize(text)
    steps["1) Tokenize"] = t1
    t2 = remove_non_ascii(t1)
    steps["2) Remove non-ASCII"] = t2
    t3 = to_lowercase(t2)
    steps["3) Lowercase"] = t3
    t4 = remove_punctuation(t3)
    steps["4) Remove punctuation"] = t4
    t5 = remove_stopwords(t4)
    steps["5) Remove stopwords"] = t5
    t6 = lemmatize_list(t5)
    steps["6) Lemmatize"] = t6
    steps["Final normalized text"] = " ".join(t6)
    return steps


# ---- Gradio wiring ----
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]


def show_steps(text: str) -> str:
    """Format the per-step pipeline output as one string, one step per line.

    NOTE(review): this string is rendered via gr.HTML, where '\n' does not
    produce a visible line break — confirm whether '<br>' was intended.
    """
    steps = normalize(text)
    parts = []
    for step, value in steps.items():
        if isinstance(value, list):
            pretty = " ".join(value)
            parts.append(f"{step}: {pretty} ({len(value)} tokens)")
        else:
            parts.append(f"{step}: {value}")
    return "\n".join(parts)


iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    examples=[[ex] for ex in examples],
    cache_examples=False,  # avoid startup caching
    flagging_mode="never",
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process.",
)

if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,  # disable SSR for stability
        share=(not IN_SPACES),  # public link only when local; avoids Spaces warning
        quiet=True,  # suppresses the “To create a public link…” tip
    )