File size: 4,148 Bytes
348e339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
import string
import gradio as gr

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are available at runtime.
# Recent NLTK releases load the "punkt_tab" tokenizer data for
# nltk.word_tokenize, while older releases load "punkt"; request both so the
# tokenizer works regardless of the installed version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

# ---------- Normalization helpers ----------
# Module-level objects built once at import time and shared by the helper
# functions below.
_wordnet_lemmatizer = WordNetLemmatizer()  # used by lemmatize_list
_stop_words = set(stopwords.words("english"))  # set, for membership tests in remove_stopwords
_punct_table = str.maketrans("", "", string.punctuation)  # translation table deleting every char in string.punctuation

def word_tokenize(text: str):
    """Split *text* into tokens by delegating to ``nltk.word_tokenize``.

    NOTE(review): tokens are whatever NLTK's default tokenizer produces. It
    presumably splits contractions (e.g. "isn't" -> "is", "n't") rather than
    keeping apostrophes inside words -- confirm against the installed NLTK.
    """
    return nltk.word_tokenize(text)

def remove_non_ascii(words):
    """Return a new list with every non-ASCII character stripped from each token.

    Tokens made up solely of non-ASCII characters come back as empty strings;
    they are not dropped here, so the output has the same length as *words*.
    """
    cleaned = []
    for token in words:
        # Encoding with "ignore" silently discards characters outside ASCII.
        ascii_bytes = token.encode("ascii", "ignore")
        cleaned.append(ascii_bytes.decode("ascii"))
    return cleaned

def to_lowercase(words):
    """Return a new list containing the lowercase form of every token in *words*."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Strip punctuation characters from each token and drop emptied tokens.

    Every character listed in the module-level ``_punct_table`` is deleted
    from each token. Tokens that become empty (pure punctuation, or already
    empty on input) are omitted from the result.

    Returns a new list; *words* is not modified.
    """
    # Translate each token exactly once (lazily), then filter out the empties.
    stripped = (token.translate(_punct_table) for token in words)
    return [token for token in stripped if token != ""]

def remove_stopwords(words):
    """Return the tokens of *words* that are not in the module-level stop-word set.

    The comparison is an exact string match against ``_stop_words``; order of
    the surviving tokens is preserved.
    """
    kept = []
    for token in words:
        if token in _stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize_list(words):
    """Lemmatize every token, first as a noun and then as a verb.

    The verb pass is applied unconditionally to the result of the noun pass
    (it is not skipped when the noun pass already changed the token).

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmatize = _wordnet_lemmatizer.lemmatize
    return [lemmatize(lemmatize(token, pos="n"), pos="v") for token in words]

def normalize_pipeline(text: str):
    """
    Run the full preprocessing pipeline and record the output of every step.

    Returns a ``(steps, final_text)`` tuple:
      * ``steps`` maps a step label to that step's output, in execution
        order: the original string, a token list for steps 1-6, and the
        joined string for step 7.
      * ``final_text`` is the space-joined result of the last token step.
    """
    # Token-level stages, applied in order; each consumes the previous output.
    stages = (
        ("1) tokenize", word_tokenize),
        ("2) remove_non_ascii", remove_non_ascii),
        ("3) to_lowercase", to_lowercase),
        ("4) remove_punctuation", remove_punctuation),
        ("5) remove_stopwords", remove_stopwords),
        ("6) lemmatize", lemmatize_list),
    )

    steps = {"original": text}
    current = text
    for label, stage in stages:
        current = stage(current)
        steps[label] = current

    final_text = " ".join(current)
    steps["7) join"] = final_text
    return steps, final_text

# ---------- Gradio UI ----------
# Sample inputs offered in the UI's examples widget. They mix apostrophes,
# an em dash, emoji, mixed case and heavy punctuation.
EXAMPLES = [
    "Habitat's 20 by 28 campaign is inspiring—let's build more homes in Jackson!",
    "NLTK makes text preprocessing EASY: Tokenize, lowercase, remove punctuation & stopwords, then lemmatize.",
    "Cats were running, jumped over fences; the dogs' tails were wagging! 🐶🐱",
    "Email me at Example@Domain.com!!! This, perhaps, isn't AS easy as it looks...",
]

def run_pipeline(text):
    """Run the normalization pipeline and build a readable trace of its steps.

    Returns a ``(steps, final_text, trace)`` tuple: the per-step dict and the
    final string from ``normalize_pipeline``, plus a multiline string with one
    "label:\\nvalue\\n" section per step. List values are shown comma-separated
    and truncated to the first 40 tokens, with " ..." appended when truncated.
    """
    steps, final_text = normalize_pipeline(text)

    max_shown = 40
    sections = []
    for name, value in steps.items():
        if not isinstance(value, list):
            rendered = value
        else:
            rendered = ", ".join(value[:max_shown])
            if len(value) > max_shown:
                rendered += " ..."
        sections.append(f"{name}:\n{rendered}\n")

    return steps, final_text, "\n".join(sections)

# Build the demo UI: a header, an input column (textbox, examples, run
# button), an output column (final text and readable trace), and a JSON view
# of the per-step results underneath.
with gr.Blocks(title="Text Normalization Demo") as demo:
    gr.Markdown("# Text Normalization (Step-by-Step)")
    gr.Markdown(
        "Enter text on the left or choose an example. The pipeline shows each step: "
        "**tokenize → remove_non_ascii → lowercase → remove_punctuation → remove_stopwords → lemmatize → join**."
    )

    with gr.Row():
        # Input side: free text plus the canned EXAMPLES.
        with gr.Column():
            text_in = gr.Textbox(label="Input text", lines=6, placeholder="Type or pick an example below...")
            # Each example is wrapped in a one-element list, one value per
            # component listed in `inputs`.
            examples = gr.Examples(
                examples=[[e] for e in EXAMPLES],
                inputs=[text_in],
                label="Try these examples",
            )
            run_btn = gr.Button("Run normalization")
        # Output side: final normalized string and the human-readable trace.
        with gr.Column():
            final_out = gr.Textbox(label="Final normalized text", lines=2)
            trace_out = gr.Code(label="Step-by-step trace (human readable)", language="markdown")

    # JSON view of each step's output (for clarity & grading)
    steps_json = gr.JSON(label="Detailed steps (JSON)")

    # run_pipeline returns (steps, final_text, trace); the outputs list is in
    # the same order so each value lands in its matching component.
    run_btn.click(fn=run_pipeline, inputs=text_in, outputs=[steps_json, final_out, trace_out])

# Launch the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()