# NOTE: the original capture began with a "Spaces: Sleeping" status banner —
# a Hugging Face page artifact, not part of the application source.
#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.

Run locally:
    pip install -r requirements.txt
    python app.py
"""
import os
import re
import string
from collections import OrderedDict

import gradio as gr

# Hugging Face Spaces sets SPACE_ID (older images: HF_SPACE_ID); when either
# is present we must not ask Gradio for a public share link.
IN_SPACES = any(os.getenv(var) for var in ("SPACE_ID", "HF_SPACE_ID"))
| # ---- Optional NLTK pieces (NO downloads at startup) ---- | |
| # Use real stopwords if available; otherwise fall back to a small set. | |
| try: | |
| import nltk # noqa: F401 | |
| from nltk.corpus import stopwords as nltk_stopwords | |
| _STOPWORDS = set(nltk_stopwords.words("english")) | |
| except Exception: | |
| _STOPWORDS = { | |
| "a","an","and","are","as","at","be","but","by","for","if","in","into", | |
| "is","it","no","not","of","on","or","such","that","the","their","then", | |
| "there","these","they","this","to","was","will","with","were","from","your" | |
| } | |
# Decide lemmatizer vs stemmer based on whether the *corpus* exists.
# Three-valued flag: False -> WordNet lemmatizer, True -> Porter stemmer,
# None -> NLTK unusable, lemmatize_list() falls back to identity.
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present;
    # nltk.data.find raises LookupError when it is not installed.
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # wordnet corpus missing -> stem instead (no downloads at startup).
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None
| # ---- Pipeline helpers ---- | |
def tokenize(text: str):
    """Split *text* into tokens without external dependencies.

    A run of word characters forms one token; every non-space,
    non-word character becomes its own single-character token.
    Falsy input (``None``, ``""``) yields an empty list.
    """
    pattern = r"\w+|[^\w\s]"
    return re.findall(pattern, text or "", flags=re.UNICODE)
def remove_non_ascii(tokens):
    """Strip non-ASCII characters from each token.

    Tokens that become empty after stripping (e.g. pure emoji) are
    dropped from the result entirely.
    """
    ascii_only = (tok.encode("ascii", "ignore").decode("ascii") for tok in tokens)
    return [tok for tok in ascii_only if tok]
def to_lowercase(tokens):
    """Return a new list with every token lower-cased."""
    return list(map(str.lower, tokens))
# Translation table that deletes every ASCII punctuation character.
# Built once at import time instead of on every remove_punctuation() call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(tokens):
    """Delete ASCII punctuation characters from each token.

    Tokens that become empty (or were pure punctuation/whitespace)
    are dropped from the result.
    """
    stripped = [w.translate(_PUNCT_TABLE) for w in tokens]
    return [w for w in stripped if w and not w.isspace()]
def remove_stopwords(tokens):
    """Filter out English stopwords (expects tokens already lower-cased)."""
    return [tok for tok in tokens if tok not in _STOPWORDS]
def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.
    Also guards against runtime LookupError during example caching."""
    # This function may rewrite the module-level strategy state at runtime
    # (switching from lemmatizer to stemmer), hence the global declaration.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet vanished / was unusable at call time: permanently
            # switch this process over to Porter stemming for later calls.
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Stemmer unavailable too -> return tokens unmodified.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # NLTK absent entirely (_use_porter is None) -> identity transform.
        return tokens
| # ---- Core pipeline (returns step-by-step dict) ---- | |
def normalize(text: str) -> OrderedDict:
    """Run the full pipeline on *text*, recording each stage's output.

    Returns an OrderedDict mapping step labels to token lists, plus a
    final entry with the space-joined normalized string.
    """
    stages = (
        ("1) Tokenize", tokenize),
        ("2) Remove non-ASCII", remove_non_ascii),
        ("3) Lowercase", to_lowercase),
        ("4) Remove punctuation", remove_punctuation),
        ("5) Remove stopwords", remove_stopwords),
        ("6) Lemmatize", lemmatize_list),
    )
    steps = OrderedDict()
    current = text
    for label, stage_fn in stages:
        current = stage_fn(current)
        steps[label] = current
    steps["Final normalized text"] = " ".join(current)
    return steps
| # ---- Gradio wiring ---- | |
# Sample inputs shown beneath the textbox. The last one deliberately
# exercises the non-ASCII, punctuation, and emoji-stripping steps.
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]
def show_steps(text):
    """Render each normalization step of *text* as one HTML fragment.

    Token lists are space-joined and annotated with their token count;
    the final normalized string is shown on its own line. User-derived
    text is HTML-escaped before interpolation so input containing markup
    (e.g. ``<script>``) cannot inject HTML into the output pane.
    """
    import html  # stdlib; local import keeps this edit self-contained

    parts = []
    for step, value in normalize(text).items():
        if isinstance(value, list):
            pretty = html.escape(" ".join(value))
            parts.append(
                f"<b>{step}</b>: {pretty} <small>({len(value)} tokens)</small>"
            )
        else:
            parts.append(f"<b>{step}</b>: {html.escape(str(value))}")
    return "<br>".join(parts)
# Single-input, single-output Gradio UI around show_steps().
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    # Interface expects one argument-list per example row.
    examples=[[ex] for ex in examples],
    cache_examples=False,  # avoid startup caching (would run NLTK at boot)
    flagging_mode="never",
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process."
)
if __name__ == "__main__":
    # Bind to all interfaces on the conventional Spaces port.
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,  # disable SSR for stability
        share=(not IN_SPACES),  # public link only when local; avoids Spaces warning
        quiet=True,  # suppresses the "To create a public link…" tip
    )