File size: 5,181 Bytes
4657ed8
 
 
 
 
 
 
 
 
 
 
 
608f313
5723f1d
183842b
4657ed8
 
183842b
 
4657ed8
 
 
 
 
 
 
 
 
 
 
183842b
 
 
 
4657ed8
ccdb725
4657ed8
183842b
4657ed8
183842b
 
 
 
4657ed8
 
 
183842b
 
 
 
 
4657ed8
 
 
 
51abd9e
4657ed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183842b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4657ed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51abd9e
d65f58c
 
 
 
4657ed8
5723f1d
 
d65f58c
 
51abd9e
d65f58c
4657ed8
 
51abd9e
4657ed8
51abd9e
 
d65f58c
 
 
4657ed8
d65f58c
 
a3a9b3a
f50428b
d65f58c
51abd9e
d65f58c
5723f1d
 
183842b
 
 
ccdb725
 
 
183842b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.
Run locally:
  pip install -r requirements.txt
  python app.py
"""

import os
import re
import string
from collections import OrderedDict
import gradio as gr

# Detect if running on Hugging Face Spaces (avoid share=True there)
# Either env var being set (non-empty) marks a Spaces deployment; checked
# once at import time and used by the launch() call at the bottom.
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))

# ---- Optional NLTK pieces (NO downloads at startup) ----
# Use real stopwords if available; otherwise fall back to a small set.
try:
    import nltk  # noqa: F401
    from nltk.corpus import stopwords as nltk_stopwords
    # Raises LookupError if the "stopwords" corpus was never downloaded;
    # the broad except below deliberately catches that case too, as well
    # as nltk itself being uninstalled.
    _STOPWORDS = set(nltk_stopwords.words("english"))
except Exception:
    # Minimal hand-picked English stopword set used when NLTK (or its
    # stopwords corpus) is unavailable.
    _STOPWORDS = {
        "a","an","and","are","as","at","be","but","by","for","if","in","into",
        "is","it","no","not","of","on","or","such","that","the","their","then",
        "there","these","they","this","to","was","will","with","were","from","your"
    }

# Decide lemmatizer vs stemmer based on whether the *corpus* exists
# _use_porter is tri-state after this block:
#   False -> wordnet corpus found, use _lemmatizer
#   True  -> wordnet missing, use _stemmer (PorterStemmer)
#   None  -> NLTK itself unusable; lemmatize_list() falls back to identity
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present
    try:
        # Probe the data path without triggering any download.
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # Corpus absent: stemming needs no data files, so it is the safe default.
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None


# ---- Pipeline helpers ----
def tokenize(text: str):
    """Split *text* into tokens: runs of word characters, or single
    non-word non-space symbols. ``None``/empty input yields ``[]``."""
    # Dependency-free tokenizer; the pattern is Unicode-aware.
    word_or_symbol = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return word_or_symbol.findall(text or "")

def remove_non_ascii(tokens):
    """Strip non-ASCII characters from every token; tokens that become
    empty (e.g. an emoji-only token) are dropped entirely."""
    asciified = (tok.encode("ascii", "ignore").decode("ascii") for tok in tokens)
    return [tok for tok in asciified if tok]

def to_lowercase(tokens):
    """Return a list with every token lower-cased."""
    return list(map(str.lower, tokens))

# Built once at import time: translation table deleting every ASCII
# punctuation character. The original rebuilt this on every call even
# though it never changes — hoisted here as a call-invariant constant.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def remove_punctuation(tokens):
    """Delete ASCII punctuation characters from each token.

    Tokens that become empty or whitespace-only after stripping (e.g. a
    standalone "!" emitted by the tokenizer) are discarded.

    Args:
        tokens: iterable of str tokens.
    Returns:
        list[str]: stripped, non-empty tokens in original order.
    """
    stripped = (w.translate(_PUNCT_TABLE) for w in tokens)
    return [w for w in stripped if w and not w.isspace()]

def remove_stopwords(tokens):
    """Filter out tokens present in the module-level stopword set."""
    return [tok for tok in tokens if tok not in _STOPWORDS]

def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.
       Also guards against runtime LookupError during example caching."""
    # _use_porter is tri-state (set at import time): False -> lemmatize,
    # True -> Porter-stem, None -> NLTK unusable, return tokens unchanged.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet data disappeared (or wasn't really loadable) after the
            # import-time probe: permanently downgrade to the stemmer by
            # mutating the module-level flags, then retry this batch.
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Even the stemmer can't be constructed: identity fallback.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # NLTK missing entirely (_use_porter is None): identity transform.
        return tokens


# ---- Core pipeline (returns step-by-step dict) ----
def normalize(text: str) -> OrderedDict:
    """Run the full normalization pipeline over *text*.

    Returns an OrderedDict mapping each step's display label to the token
    list that step produced, plus the space-joined result under the key
    "Final normalized text".
    """
    # Data-driven pipeline: each stage consumes the previous stage's output
    # (the first stage receives the raw string, the rest receive token lists).
    pipeline = (
        ("1) Tokenize", tokenize),
        ("2) Remove non-ASCII", remove_non_ascii),
        ("3) Lowercase", to_lowercase),
        ("4) Remove punctuation", remove_punctuation),
        ("5) Remove stopwords", remove_stopwords),
        ("6) Lemmatize", lemmatize_list),
    )

    steps = OrderedDict()
    current = text
    for label, stage in pipeline:
        current = stage(current)
        steps[label] = current

    steps["Final normalized text"] = " ".join(current)
    return steps


# ---- Gradio wiring ----
# Canned inputs shown in the Gradio UI; the last one deliberately includes
# non-ASCII characters (accents, an em dash, an emoji) to exercise the
# remove_non_ascii step.
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]

def show_steps(text):
    """Render normalize(*text*) as an HTML fragment: one <b>label</b> line
    per step, token lists annotated with their token count."""
    rendered = []
    for label, result in normalize(text).items():
        if isinstance(result, list):
            body = f"{' '.join(result)} <small>({len(result)} tokens)</small>"
        else:
            body = result
        rendered.append(f"<b>{label}</b>: {body}")
    return "<br>".join(rendered)

# Single-function Gradio interface: free-text in, HTML step report out.
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    examples=[[ex] for ex in examples],  # Interface expects one list per example row
    cache_examples=False,             # avoid startup caching
    flagging_mode="never",            # NOTE(review): Gradio 5.x name for allow_flagging — confirm installed version
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process."
)

# Script entry point: bind on all interfaces so the app is reachable
# inside a container; port 7860 is the Spaces/Gradio default.
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,          # disable SSR for stability
        share=(not IN_SPACES),   # public link only when local; avoids Spaces warning
        quiet=True,              # suppresses the “To create a public link…” tip
    )