#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.
Run locally:
    pip install -r requirements.txt
    python app.py
"""
import os
import re
import string
from collections import OrderedDict
import gradio as gr
# Detect if running on Hugging Face Spaces (avoid share=True there)
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
# ---- Optional NLTK pieces (NO downloads at startup) ----
# Use real stopwords if available; otherwise fall back to a small set.
try:
    import nltk  # noqa: F401
    from nltk.corpus import stopwords as nltk_stopwords
    _STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    _STOPWORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
        "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with", "were", "from", "your",
    }
# Decide lemmatizer vs stemmer based on whether the *corpus* exists
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None
# ---- Pipeline helpers ----
def tokenize(text: str):
    # Simple, dependency-free tokenizer: words or single non-space symbols
    return re.findall(r"\w+|[^\w\s]", text or "", flags=re.UNICODE)
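
# Illustrative behaviour of the tokenizer above (a sketch of what the regex yields):
#   tokenize("Café prices, 2024!") -> ["Café", "prices", ",", "2024", "!"]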
def remove_non_ascii(tokens):
    cleaned = []
    for w in tokens:
        ascii_w = w.encode("ascii", "ignore").decode("ascii")
        if ascii_w:
            cleaned.append(ascii_w)
    return cleaned
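
# Sketch of the ASCII filter: accented characters are dropped rather than
# transliterated, and emoji-only tokens disappear entirely, e.g.
#   remove_non_ascii(["Café", "🤔", "ok"]) -> ["Caf", "ok"]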
def to_lowercase(tokens):
    return [w.lower() for w in tokens]
def remove_punctuation(tokens):
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    return [w for w in stripped if w and not w.isspace()]
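
# Sketch: punctuation-only tokens vanish and punctuation inside tokens is stripped, e.g.
#   remove_punctuation(["aren", "'", "t", "2024", "!"]) -> ["aren", "t", "2024"]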
def remove_stopwords(tokens):
    return [w for w in tokens if w not in _STOPWORDS]
def lemmatize_list(tokens):
"""Lemmatize if wordnet is present; otherwise stem; otherwise identity.
Also guards against runtime LookupError during example caching."""
global _use_porter, _lemmatizer, _stemmer
if _use_porter is False and _lemmatizer is not None:
try:
return [_lemmatizer.lemmatize(w) for w in tokens]
except LookupError:
try:
from nltk.stem import PorterStemmer
_stemmer = PorterStemmer()
_use_porter = True
return [_stemmer.stem(w) for w in tokens]
except Exception:
return tokens
elif _use_porter is True and _stemmer is not None:
return [_stemmer.stem(w) for w in tokens]
else:
return tokens
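
# Rough expectation, depending on which NLTK data is installed (treat as a sketch):
#   lemmatize_list(["cats", "running"]) -> ["cat", "running"] with the WordNet
#   lemmatizer (default noun POS), or ["cat", "run"] with the Porter stemmer.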
# ---- Core pipeline (returns step-by-step dict) ----
def normalize(text: str) -> OrderedDict:
    steps = OrderedDict()
    t1 = tokenize(text)
    steps["1) Tokenize"] = t1
    t2 = remove_non_ascii(t1)
    steps["2) Remove non-ASCII"] = t2
    t3 = to_lowercase(t2)
    steps["3) Lowercase"] = t3
    t4 = remove_punctuation(t3)
    steps["4) Remove punctuation"] = t4
    t5 = remove_stopwords(t4)
    steps["5) Remove stopwords"] = t5
    t6 = lemmatize_list(t5)
    steps["6) Lemmatize"] = t6
    steps["Final normalized text"] = " ".join(t6)
    return steps
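
# End-to-end sketch (the exact result varies with the stopword list and the
# lemmatizer/stemmer that happen to be available):
#   normalize("Cats were sleeping!")["Final normalized text"]
#   -> "cat sleeping" (WordNet lemmatizer) or "cat sleep" (Porter stemmer)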
# ---- Gradio wiring ----
examples = [
"The quick brown fox jumps over the lazy dog!",
"NLTK is a leading platform for building Python programs to work with human language data.",
"Text normalization is important for NLP tasks.",
"Café prices in 2024 were higher—aren't they? 🤔",
]
def show_steps(text):
    steps = normalize(text)
    parts = []
    for step, value in steps.items():
        if isinstance(value, list):
            pretty = " ".join(value)
            parts.append(f"<b>{step}</b>: {pretty} <small>({len(value)} tokens)</small>")
        else:
            parts.append(f"<b>{step}</b>: {value}")
    return "<br>".join(parts)
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    examples=[[ex] for ex in examples],
    cache_examples=False,  # avoid startup caching
    flagging_mode="never",
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process.",
)
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,          # disable SSR for stability
        share=(not IN_SPACES),   # public link only when local; avoids Spaces warning
        quiet=True,              # suppresses the “To create a public link…” tip
    )