Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,14 +10,13 @@ import os
|
|
| 10 |
import re
|
| 11 |
import string
|
| 12 |
from collections import OrderedDict
|
| 13 |
-
|
| 14 |
import gradio as gr
|
| 15 |
|
| 16 |
-
# Detect if running on Hugging Face Spaces (
|
| 17 |
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
|
| 18 |
|
| 19 |
-
# ---- Optional NLTK pieces (
|
| 20 |
-
# Use real stopwords
|
| 21 |
try:
|
| 22 |
import nltk # noqa: F401
|
| 23 |
from nltk.corpus import stopwords as nltk_stopwords
|
|
@@ -29,18 +28,27 @@ except Exception:
|
|
| 29 |
"there","these","they","this","to","was","will","with","were","from","your"
|
| 30 |
}
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
try:
|
|
|
|
| 33 |
from nltk.stem import WordNetLemmatizer
|
| 34 |
-
|
| 35 |
-
_use_porter = False
|
| 36 |
-
except Exception:
|
| 37 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
from nltk.stem import PorterStemmer
|
| 39 |
_stemmer = PorterStemmer()
|
| 40 |
_use_porter = True
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
# ---- Pipeline helpers ----
|
|
@@ -68,11 +76,25 @@ def remove_stopwords(tokens):
|
|
| 68 |
return [w for w in tokens if w not in _STOPWORDS]
|
| 69 |
|
| 70 |
def lemmatize_list(tokens):
|
| 71 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return [_stemmer.stem(w) for w in tokens]
|
| 73 |
-
elif _use_porter is False:
|
| 74 |
-
return [_lemmatizer.lemmatize(w) for w in tokens]
|
| 75 |
else:
|
|
|
|
| 76 |
return tokens
|
| 77 |
|
| 78 |
|
|
@@ -126,10 +148,16 @@ iface = gr.Interface(
|
|
| 126 |
inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
|
| 127 |
outputs=gr.HTML(label="Step-by-step normalization"),
|
| 128 |
examples=[[ex] for ex in examples],
|
|
|
|
|
|
|
| 129 |
title="Text Normalization Pipeline",
|
| 130 |
description="Enter text or select an example to see each step of the normalization process."
|
| 131 |
)
|
| 132 |
|
| 133 |
if __name__ == "__main__":
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import re
|
| 11 |
import string
|
| 12 |
from collections import OrderedDict
|
|
|
|
| 13 |
import gradio as gr
|
| 14 |
|
| 15 |
+
# Detect if running on Hugging Face Spaces (avoid share=True there)
|
| 16 |
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
|
| 17 |
|
| 18 |
+
# ---- Optional NLTK pieces (NO downloads at startup) ----
|
| 19 |
+
# Use real stopwords if available; otherwise fall back to a small set.
|
| 20 |
try:
|
| 21 |
import nltk # noqa: F401
|
| 22 |
from nltk.corpus import stopwords as nltk_stopwords
|
|
|
|
| 28 |
"there","these","they","this","to","was","will","with","were","from","your"
|
| 29 |
}
|
| 30 |
|
| 31 |
+
# Decide lemmatizer vs stemmer based on whether the *corpus* exists
# (no NLTK downloads are attempted here — see module header).
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # wordnet corpus is not installed: fall back to Porter stemming,
        # which needs no corpus data.
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    # (lemmatize_list returns tokens unchanged when _use_porter is None).
    _lemmatizer = None
    _stemmer = None
    _use_porter = None
|
| 52 |
|
| 53 |
|
| 54 |
# ---- Pipeline helpers ----
|
|
|
|
| 76 |
return [w for w in tokens if w not in _STOPWORDS]
|
| 77 |
|
| 78 |
def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.

    Also guards against runtime LookupError during example caching."""
    # Globals are reassigned here when WordNet turns out to be missing at
    # call time, so later calls take the Porter path directly.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # WordNet corpus not actually present; switch to Porter
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Even Porter is unavailable — give the tokens back untouched.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # Last resort: return as-is
        return tokens
|
| 99 |
|
| 100 |
|
|
|
|
| 148 |
inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
|
| 149 |
outputs=gr.HTML(label="Step-by-step normalization"),
|
| 150 |
examples=[[ex] for ex in examples],
|
| 151 |
+
cache_examples=False, # <-- avoid startup caching (which runs the fn at launch)
|
| 152 |
+
allow_flagging="never",
|
| 153 |
title="Text Normalization Pipeline",
|
| 154 |
description="Enter text or select an example to see each step of the normalization process."
|
| 155 |
)
|
| 156 |
|
| 157 |
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",  # bind all interfaces so the container/Space can reach it
        server_port=7860,  # default port expected by Hugging Face Spaces
        ssr_mode=False,  # <-- disable SSR (prevents blank/fragile startup)
        share=(not IN_SPACES),  # <-- no share warning on Spaces; public link when local
    )
|