File size: 5,181 Bytes
4657ed8
 
 
 
 
 
 
 
 
 
 
 
608f313
5723f1d
183842b
4657ed8
 
183842b
 
4657ed8
 
 
 
 
 
 
 
 
 
 
183842b
 
 
 
4657ed8
ccdb725
4657ed8
183842b
4657ed8
183842b
 
 
 
4657ed8
 
 
183842b
 
 
 
 
4657ed8
 
 
 
51abd9e
4657ed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183842b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4657ed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51abd9e
d65f58c
 
 
 
4657ed8
5723f1d
 
d65f58c
 
51abd9e
d65f58c
4657ed8
 
51abd9e
4657ed8
51abd9e
 
d65f58c
 
 
4657ed8
d65f58c
 
a3a9b3a
f50428b
d65f58c
51abd9e
d65f58c
5723f1d
 
183842b
 
 
ccdb725
 
 
183842b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.
Run locally:
  pip install -r requirements.txt
  python app.py
"""

import os
import re
import string
from collections import OrderedDict
import gradio as gr

# Detect if running on Hugging Face Spaces (avoid share=True there)
# Either env var being set (non-empty) marks a Spaces deployment; checked
# once at import time and used by the launch() call at the bottom.
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))

# ---- Optional NLTK pieces (NO downloads at startup) ----
# Use real stopwords if available; otherwise fall back to a small set.
try:
    import nltk  # noqa: F401
    from nltk.corpus import stopwords as nltk_stopwords
    # Raises LookupError if the "stopwords" corpus was never downloaded;
    # the broad except below deliberately catches that case too, as well
    # as nltk itself being uninstalled.
    _STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    # Minimal hand-picked English stopword set used when NLTK (or its
    # stopwords corpus) is unavailable.
    _STOPWORDS = {
        "a","an","and","are","as","at","be","but","by","for","if","in","into",
        "is","it","no","not","of","on","or","such","that","the","their","then",
        "there","these","they","this","to","was","will","with","were","from","your"
    }

# Decide lemmatizer vs stemmer based on whether the *corpus* exists
# _use_porter is tri-state after this block:
#   False -> wordnet corpus found, use _lemmatizer
#   True  -> wordnet missing, use _stemmer (PorterStemmer)
#   None  -> NLTK itself unusable; lemmatize_list() falls back to identity
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present
    try:
        # Probe the data path without triggering any download.
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # Corpus absent: stemming needs no data files, so it is the safe default.
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None


# ---- Pipeline helpers ----
def tokenize(text: str):
    """Split *text* into tokens: runs of word characters, or single
    non-word non-space symbols. ``None``/empty input yields ``[]``."""
    # Dependency-free tokenizer; the pattern is Unicode-aware.
    word_or_symbol = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return word_or_symbol.findall(text or "")

def remove_non_ascii(tokens):
    """Strip non-ASCII characters from every token; tokens that become
    empty (e.g. an emoji-only token) are dropped entirely."""
    asciified = (tok.encode("ascii", "ignore").decode("ascii") for tok in tokens)
    return [tok for tok in asciified if tok]

def to_lowercase(tokens):
    """Return a list with every token lower-cased."""
    return list(map(str.lower, tokens))

# Built once at import time: translation table deleting every ASCII
# punctuation character. The original rebuilt this on every call even
# though it never changes — hoisted here as a call-invariant constant.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def remove_punctuation(tokens):
    """Delete ASCII punctuation characters from each token.

    Tokens that become empty or whitespace-only after stripping (e.g. a
    standalone "!" emitted by the tokenizer) are discarded.

    Args:
        tokens: iterable of str tokens.
    Returns:
        list[str]: stripped, non-empty tokens in original order.
    """
    stripped = (w.translate(_PUNCT_TABLE) for w in tokens)
    return [w for w in stripped if w and not w.isspace()]

def remove_stopwords(tokens):
    """Filter out tokens present in the module-level stopword set."""
    return [tok for tok in tokens if tok not in _STOPWORDS]

def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.
       Also guards against runtime LookupError during example caching."""
    # _use_porter is tri-state (set at import time): False -> lemmatize,
    # True -> Porter-stem, None -> NLTK unusable, return tokens unchanged.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet data disappeared (or wasn't really loadable) after the
            # import-time probe: permanently downgrade to the stemmer by
            # mutating the module-level flags, then retry this batch.
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Even the stemmer can't be constructed: identity fallback.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # NLTK missing entirely (_use_porter is None): identity transform.
        return tokens


# ---- Core pipeline (returns step-by-step dict) ----
def normalize(text: str) -> OrderedDict:
    """Run the full normalization pipeline over *text*.

    Returns an OrderedDict mapping each step's display label to the token
    list that step produced, plus the space-joined result under the key
    "Final normalized text".
    """
    # Data-driven pipeline: each stage consumes the previous stage's output
    # (the first stage receives the raw string, the rest receive token lists).
    pipeline = (
        ("1) Tokenize", tokenize),
        ("2) Remove non-ASCII", remove_non_ascii),
        ("3) Lowercase", to_lowercase),
        ("4) Remove punctuation", remove_punctuation),
        ("5) Remove stopwords", remove_stopwords),
        ("6) Lemmatize", lemmatize_list),
    )

    steps = OrderedDict()
    current = text
    for label, stage in pipeline:
        current = stage(current)
        steps[label] = current

    steps["Final normalized text"] = " ".join(current)
    return steps


# ---- Gradio wiring ----
# Canned inputs shown in the Gradio UI; the last one deliberately includes
# non-ASCII characters (accents, an em dash, an emoji) to exercise the
# remove_non_ascii step.
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]

def show_steps(text):
    """Render normalize(*text*) as an HTML fragment: one <b>label</b> line
    per step, token lists annotated with their token count."""
    rendered = []
    for label, result in normalize(text).items():
        if isinstance(result, list):
            body = f"{' '.join(result)} <small>({len(result)} tokens)</small>"
        else:
            body = result
        rendered.append(f"<b>{label}</b>: {body}")
    return "<br>".join(rendered)

# Single-function Gradio interface: free-text in, HTML step report out.
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    examples=[[ex] for ex in examples],  # Interface expects one list per example row
    cache_examples=False,             # avoid startup caching
    flagging_mode="never",            # NOTE(review): Gradio 5.x name for allow_flagging — confirm installed version
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process."
)

# Script entry point: bind on all interfaces so the app is reachable
# inside a container; port 7860 is the Spaces/Gradio default.
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,          # disable SSR for stability
        share=(not IN_SPACES),   # public link only when local; avoids Spaces warning
        quiet=True,              # suppresses the “To create a public link…” tip
    )