# app.py — Gradio text-normalization demo (Hugging Face Space)
#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.
Run locally:
pip install -r requirements.txt
python app.py
"""
import os
import re
import string
from collections import OrderedDict
import gradio as gr
# Detect if running on Hugging Face Spaces (avoid share=True there).
# Spaces set SPACE_ID in the environment; HF_SPACE_ID is checked as a
# defensive alternative. Used below to decide whether launch() shares a link.
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
# ---- Optional NLTK pieces (NO downloads at startup) ----
# Use real stopwords if available; otherwise fall back to a small set.
try:
    import nltk  # noqa: F401
    from nltk.corpus import stopwords as nltk_stopwords
    # words("english") raises LookupError if the corpus was never downloaded;
    # the broad `except Exception` below also covers a missing nltk install.
    _STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    # Minimal hand-picked English stopword list so the pipeline still works
    # in an environment without NLTK data.
    _STOPWORDS = {
        "a","an","and","are","as","at","be","but","by","for","if","in","into",
        "is","it","no","not","of","on","or","such","that","the","their","then",
        "there","these","they","this","to","was","will","with","were","from","your"
    }
# Decide lemmatizer vs stemmer based on whether the *corpus* exists.
# State encoding used by lemmatize_list():
#   _use_porter False -> WordNetLemmatizer available
#   _use_porter True  -> PorterStemmer fallback (no corpus data needed)
#   _use_porter None  -> NLTK unusable; lemmatize_list() returns tokens as-is
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present;
    # constructing it succeeds even without the corpus, so probe explicitly.
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # wordnet corpus missing: Porter stemming requires no corpus data.
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None
# ---- Pipeline helpers ----
def tokenize(text: str):
    """Split *text* into word tokens and single non-space symbol tokens.

    Dependency-free tokenizer: a token is either a run of word characters
    or one individual character that is neither word nor whitespace.
    A falsy input (``None``, ``""``) yields an empty list.
    """
    pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return pattern.findall(text or "")
def remove_non_ascii(tokens):
    """Strip non-ASCII characters from each token.

    Tokens that become empty after stripping (e.g. a lone emoji) are dropped.
    """
    stripped = (tok.encode("ascii", "ignore").decode("ascii") for tok in tokens)
    return [tok for tok in stripped if tok]
def to_lowercase(tokens):
    """Return a new list with every token converted to lowercase."""
    return list(map(str.lower, tokens))
# Translation table deleting all ASCII punctuation. Built once at import time:
# the original rebuilt it on every call even though it never changes.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def remove_punctuation(tokens):
    """Remove ASCII punctuation characters from each token.

    Tokens that become empty or whitespace-only after stripping are dropped
    (e.g. a standalone "!" token produced by the tokenizer).
    """
    stripped = (tok.translate(_PUNCT_TABLE) for tok in tokens)
    return [tok for tok in stripped if tok and not tok.isspace()]
def remove_stopwords(tokens):
    """Drop tokens found in the module-level ``_STOPWORDS`` set."""
    return [tok for tok in tokens if tok not in _STOPWORDS]
def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.
    Also guards against runtime LookupError during example caching."""
    # Globals hold the strategy chosen at import time; this function may
    # flip them permanently if lemmatization fails mid-run.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet data disappeared (or was only partially installed):
            # switch the whole module over to stemming and retry this batch.
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Stemmer unavailable too; pass tokens through unchanged.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # NLTK never initialized (_use_porter is None): identity fallback.
        return tokens
# ---- Core pipeline (returns step-by-step dict) ----
def normalize(text: str) -> OrderedDict:
    """Run the full normalization pipeline, recording every stage's output.

    Returns an OrderedDict mapping each numbered step label to the token
    list produced at that step, plus a final "Final normalized text" entry
    holding the space-joined result.
    """
    stages = (
        ("1) Tokenize", tokenize),
        ("2) Remove non-ASCII", remove_non_ascii),
        ("3) Lowercase", to_lowercase),
        ("4) Remove punctuation", remove_punctuation),
        ("5) Remove stopwords", remove_stopwords),
        ("6) Lemmatize", lemmatize_list),
    )
    steps = OrderedDict()
    current = text  # first stage consumes the raw string; the rest, token lists
    for label, stage in stages:
        current = stage(current)
        steps[label] = current
    steps["Final normalized text"] = " ".join(current)
    return steps
# ---- Gradio wiring ----
# Clickable demo inputs for the UI; the last one exercises the
# non-ASCII-removal step (accent, em dash, emoji).
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]
def show_steps(text):
    """Render each normalization stage of *text* as one HTML line.

    Token-list stages get a token count; the final string stage is shown
    verbatim. Lines are joined with ``<br>`` for the gr.HTML output.
    """
    rendered = []
    for label, value in normalize(text).items():
        if isinstance(value, list):
            joined = " ".join(value)
            rendered.append(
                f"<b>{label}</b>: {joined} <small>({len(value)} tokens)</small>"
            )
        else:
            rendered.append(f"<b>{label}</b>: {value}")
    return "<br>".join(rendered)
# Single-function Gradio UI: text in, HTML step-by-step report out.
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    examples=[[ex] for ex in examples],  # Interface expects one list per example row
    cache_examples=False,  # avoid startup caching
    flagging_mode="never",
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process."
)
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",  # bind all interfaces (required in containers/Spaces)
        server_port=7860,  # standard Hugging Face Spaces port
        ssr_mode=False,  # disable SSR for stability
        share=(not IN_SPACES),  # public link only when local; avoids Spaces warning
        quiet=True,  # suppresses the “To create a public link…” tip
    )