Update app.py
app.py
CHANGED
@@ -13,11 +13,11 @@ from collections import OrderedDict
 
 import gradio as gr
 
-# Detect if running on Hugging Face Spaces (
+# Detect if running on Hugging Face Spaces (don't use share=True there)
 IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
 
 # ---- Optional NLTK pieces (no downloads at startup) ----
-#
+# Use real stopwords/lemmatizer if available; otherwise fall back.
 try:
     import nltk  # noqa: F401
     from nltk.corpus import stopwords as nltk_stopwords
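For context on the pattern this hunk documents: the try/except import is the usual optional-dependency guard, so the app uses NLTK's stopword list when the corpus is present and degrades gracefully otherwise. A minimal sketch, assuming a hand-rolled fallback list (illustrative only, not the app's actual fallback):

try:
    from nltk.corpus import stopwords as nltk_stopwords
    STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    # ImportError if nltk is missing, LookupError if the corpus was never downloaded
    STOPWORDS = {"a", "an", "the", "and", "or", "of", "to", "in"}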
@@ -39,15 +39,13 @@ except Exception:
     _stemmer = PorterStemmer()
     _use_porter = True
 except Exception:
-    # Last-resort fallback: identity lemmatizer
     _lemmatizer = None
     _use_porter = None
 
 
 # ---- Pipeline helpers ----
 def tokenize(text: str):
-    # Simple, dependency-free tokenizer:
-    # split into "word" blocks and single non-space symbols to preserve punctuation step
+    # Simple, dependency-free tokenizer: words or single non-space symbols
     return re.findall(r"\w+|[^\w\s]", text or "", flags=re.UNICODE)
 
 def remove_non_ascii(tokens):
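A quick check of what the tokenizer regex produces: \w+ captures runs of word characters and [^\w\s] captures each remaining non-space symbol, so punctuation survives as separate tokens for the later punctuation-removal step (stdlib only):

import re

def tokenize(text: str):
    return re.findall(r"\w+|[^\w\s]", text or "", flags=re.UNICODE)

print(tokenize("Don't stop!"))  # ['Don', "'", 't', 'stop', '!']
print(tokenize(None))           # [] -- the `or ""` guards against None input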
@@ -75,7 +73,6 @@ def lemmatize_list(tokens):
     elif _use_porter is False:
         return [_lemmatizer.lemmatize(w) for w in tokens]
     else:
-        # identity if no lemmatizer/stemmer
         return tokens
 
 
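The bare `return tokens` stays safe without the deleted comment because `_use_porter` is tri-state: True selects the Porter stemmer, False the WordNet lemmatizer, and None (set in the last-resort except branch) means identity. A self-contained sketch of that dispatch, assuming the flag semantics visible in the surrounding hunks:

from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()
_lemmatizer = None
_use_porter = True  # True: stemmer, False: lemmatizer, None: identity

def lemmatize_list(tokens):
    if _use_porter is True:
        return [_stemmer.stem(w) for w in tokens]
    elif _use_porter is False:
        return [_lemmatizer.lemmatize(w) for w in tokens]
    else:
        return tokens  # identity when neither tool could be imported

print(lemmatize_list(["running", "jumps"]))  # ['run', 'jump']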
@@ -105,7 +102,7 @@ def normalize(text: str) -> OrderedDict:
     return steps
 
 
-# ---- Gradio wiring
+# ---- Gradio wiring ----
 examples = [
     "The quick brown fox jumps over the lazy dog!",
     "NLTK is a leading platform for building Python programs to work with human language data.",
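One wiring detail worth flagging (see the Interface call in the last hunk below): gr.Interface expects `examples` as a list of rows with one value per input component, which is why each string gets wrapped as `[ex]`. Sketch, with `rows` as a hypothetical name (the app builds the list inline):

examples = ["The quick brown fox jumps over the lazy dog!"]
rows = [[ex] for ex in examples]  # one-element rows: the app has a single text input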
@@ -115,14 +112,14 @@ examples = [
 
 def show_steps(text):
     steps = normalize(text)
-
+    parts = []
     for step, value in steps.items():
         if isinstance(value, list):
             pretty = " ".join(value)
-
+            parts.append(f"<b>{step}</b>: {pretty} <small>({len(value)} tokens)</small>")
         else:
-
-    return "<br>".join(
+            parts.append(f"<b>{step}</b>: {value}")
+    return "<br>".join(parts)
 
 iface = gr.Interface(
     fn=show_steps,
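The rewritten show_steps accumulates one HTML line per pipeline stage and joins them with <br>. A runnable sketch with a stubbed normalize (the app's real normalize returns an OrderedDict with many more stages):

from collections import OrderedDict

def normalize(text):
    # stub for illustration only
    return OrderedDict(original=text, tokens=text.split())

def show_steps(text):
    steps = normalize(text)
    parts = []
    for step, value in steps.items():
        if isinstance(value, list):
            pretty = " ".join(value)
            parts.append(f"<b>{step}</b>: {pretty} <small>({len(value)} tokens)</small>")
        else:
            parts.append(f"<b>{step}</b>: {value}")
    return "<br>".join(parts)

print(show_steps("Hello, world"))
# <b>original</b>: Hello, world<br><b>tokens</b>: Hello, world <small>(2 tokens)</small>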
@@ -130,11 +127,9 @@ iface = gr.Interface(
     outputs=gr.HTML(label="Step-by-step normalization"),
     examples=[[ex] for ex in examples],
     title="Text Normalization Pipeline",
-    description="Enter text or select an example to see each step of the normalization process."
+    description="Enter text or select an example to see each step of the normalization process."
 )
 
 if __name__ == "__main__":
-
-
-        server_port=7860,
-        share=not
+    # share=True only when running locally (avoids Spaces warning)
+    iface.launch(server_name="0.0.0.0", server_port=7860, share=(not IN_SPACES))
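On the launch change: share=True makes Gradio open a temporary public tunnel, which a Space neither needs nor supports (the platform already serves the app), hence the warning the new comment refers to; port 7860 is the Spaces default. A standalone illustration with a placeholder app (the lambda is not this repo's interface):

import os
import gradio as gr

IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
demo = gr.Interface(fn=lambda t: t, inputs="text", outputs="text")
demo.launch(server_name="0.0.0.0", server_port=7860, share=not IN_SPACES)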