Update app.py
app.py CHANGED
@@ -1,151 +1,27 @@
-#!/usr/bin/env python3
-"""
-Gradio app: Text normalization pipeline with step-by-step outputs.
-Run locally:
-  pip install -r requirements.txt
-  python app.py
-"""
-
-import os
-import string
-import pandas as pd
 import gradio as gr
-import
-
-# Detect if running on Hugging Face Spaces
-IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
-
-# Lightweight tokenizer that needs no punkt download
-from nltk.tokenize import wordpunct_tokenize
-
-# Optional NLTK corpora: use if present; otherwise fall back
-try:
-    from nltk.corpus import stopwords
-    _STOPWORDS = set(stopwords.words("english"))
-except Exception:
-    # Minimal built-in fallback list to avoid startup downloads
-    _STOPWORDS = {
-        "a","an","and","are","as","at","be","but","by","for","if","in","into",
-        "is","it","no","not","of","on","or","such","that","the","their","then",
-        "there","these","they","this","to","was","will","with","were","from","your"
-    }
-
-# Prefer WordNet lemmatizer; if unavailable, fall back to PorterStemmer (no corpora)
-try:
-    from nltk.stem import WordNetLemmatizer
-    _lemmatizer = WordNetLemmatizer()
-    _use_porter = False
-except Exception:
-    from nltk.stem import PorterStemmer
-    _stemmer = PorterStemmer()
-    _use_porter = True
-
-
-# -------- Pipeline helpers --------
-def remove_non_ascii(words):
-    """Strip non-ASCII chars from each token and drop empties."""
-    cleaned = []
-    for w in words:
-        ascii_w = w.encode("ascii", "ignore").decode("ascii")
-        if ascii_w:
-            cleaned.append(ascii_w)
-    return cleaned
-
-def to_lowercase(words):
-    return [w.lower() for w in words]
-
-def remove_punctuation(words):
-    """Remove punctuation characters from each token and drop empties."""
-    table = str.maketrans("", "", string.punctuation)
-    stripped = [w.translate(table) for w in words]
-    return [w for w in stripped if w and not w.isspace()]
-
-def remove_stopwords(words):
-    return [w for w in words if w not in _STOPWORDS]
+from normalize_pipeline import normalize
 
-
-
-
-
-    else:
-        return [_lemmatizer.lemmatize(w) for w in words]
-
-
-# -------- Core pipeline (from prompt) --------
-def normalize(text: str):
-    """Full preprocessing pipeline"""
-    words = wordpunct_tokenize(text or "")
-    words = remove_non_ascii(words)
-    words = to_lowercase(words)
-    words = remove_punctuation(words)
-    words = remove_stopwords(words)
-    words = lemmatize_list(words)
-    return " ".join(words)
-
-
-# -------- Step-by-step output for UI --------
-def normalize_with_steps(text: str):
-    if not text or not text.strip():
-        empty_df = pd.DataFrame([["—", [], 0]], columns=["Step", "Tokens", "Count"])
-        return empty_df, ""
-
-    steps = []
-    # 1) Tokenize (no punkt dependency)
-    tokens = wordpunct_tokenize(text)
-    steps.append(("1) Tokenize", tokens.copy(), len(tokens)))
-    # 2) Remove non-ASCII
-    tokens = remove_non_ascii(tokens)
-    steps.append(("2) Remove non-ASCII", tokens.copy(), len(tokens)))
-    # 3) Lowercase
-    tokens = to_lowercase(tokens)
-    steps.append(("3) Lowercase", tokens.copy(), len(tokens)))
-    # 4) Remove punctuation
-    tokens = remove_punctuation(tokens)
-    steps.append(("4) Remove punctuation", tokens.copy(), len(tokens)))
-    # 5) Remove stopwords
-    tokens = remove_stopwords(tokens)
-    steps.append(("5) Remove stopwords", tokens.copy(), len(tokens)))
-    # 6) Lemmatize (or stem if WordNet missing)
-    tokens = lemmatize_list(tokens)
-    steps.append(("6) Lemmatize", tokens.copy(), len(tokens)))
-
-    df = pd.DataFrame(steps, columns=["Step", "Tokens", "Count"])
-    final_text = " ".join(tokens)
-    return df, final_text
-
-
-# -------- Gradio UI --------
-EXAMPLES = [
-    ["The QUICK brown foxes, jumping over 13 lazy dogs!!!"],
-    ["Café prices in 2024 were higher—aren't they? 🤔"],
-    ["NLTK's tokenization isn't perfect; e.g., 'don't' becomes two tokens."],
-    ["Hello!!! This is a TEST of the FULL preprocessing PIPELINE."],
-    ["E-mail: ajay@example.com; Visit https://example.org soon..."],
+examples = [
+    "The quick brown fox jumps over the lazy dog!",
+    "NLTK is a leading platform for building Python programs to work with human language data.",
+    "Text normalization is important for NLP tasks.",
 ]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def show_steps(text):
+    steps = normalize(text)
+    output = ""
+    for step, value in steps.items():
+        output += f"<b>{step}:</b> {value}<br>"
+    return output
+
+iface = gr.Interface(
+    fn=show_steps,
+    inputs=gr.Textbox(lines=3, label="Enter text to normalize"),
+    outputs=gr.HTML(label="Step-by-step normalization"),
+    examples=[[ex] for ex in examples],
+    title="Text Normalization Pipeline",
+    description="Enter text or select an example to see each step of the normalization process.",
+)
 
-# -------- Launch (Spaces-friendly & Local public link) --------
 if __name__ == "__main__":
-
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        ssr_mode=False,
-        share=True
-        #share=not IN_SPACES, # no warning on Spaces; public link when running locally
-    )
+    iface.launch()
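
The rewritten app.py imports normalize from a normalize_pipeline module that is not part of this commit. Judging by show_steps, which iterates steps.items() and renders one "<b>step:</b> value" line per entry, normalize presumably returns an insertion-ordered dict mapping each step name to its intermediate result. Below is a minimal sketch of what such a module could look like, reassembled from the pipeline this commit deletes; the module name and the dict-returning shape are inferred from the diff, and every other identifier here is an assumption:

# normalize_pipeline.py -- hypothetical sketch; this module is NOT shown in the
# commit above. app.py iterates normalize(text).items(), so normalize()
# presumably returns an ordered mapping of step name -> intermediate result.
# The steps mirror the pipeline that this commit deletes from app.py.
import string

from nltk.tokenize import wordpunct_tokenize  # regex tokenizer, no punkt download

try:
    # stopwords.words() raises LookupError if the corpus was never downloaded
    from nltk.corpus import stopwords
    _STOPWORDS = set(stopwords.words("english"))
except Exception:
    # small fallback so the app starts without any NLTK downloads
    _STOPWORDS = {"a", "an", "and", "are", "in", "is", "it", "of", "on", "the", "to"}

try:
    from nltk.stem import WordNetLemmatizer
    _wnl = WordNetLemmatizer()
    _wnl.lemmatize("tests")  # force a WordNet lookup; fails here if data is missing
    _normalize_word = _wnl.lemmatize
except Exception:
    from nltk.stem import PorterStemmer
    _normalize_word = PorterStemmer().stem  # stemming fallback, needs no corpora

def normalize(text):
    """Run the pipeline and record each step's output under a readable key."""
    steps = {}
    tokens = wordpunct_tokenize(text or "")
    steps["Tokenize"] = tokens
    tokens = [w.encode("ascii", "ignore").decode("ascii") for w in tokens]
    tokens = [w for w in tokens if w]
    steps["Remove non-ASCII"] = tokens
    tokens = [w.lower() for w in tokens]
    steps["Lowercase"] = tokens
    table = str.maketrans("", "", string.punctuation)
    tokens = [w for w in (w.translate(table) for w in tokens) if w.strip()]
    steps["Remove punctuation"] = tokens
    tokens = [w for w in tokens if w not in _STOPWORDS]
    steps["Remove stopwords"] = tokens
    tokens = [_normalize_word(w) for w in tokens]
    steps["Lemmatize"] = tokens
    steps["Final text"] = " ".join(tokens)
    return steps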
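
For a quick check outside Gradio, the sketch can be exercised directly (exact outputs depend on which NLTK data is installed):

# demo: assumes the hypothetical normalize_pipeline sketch above
from normalize_pipeline import normalize

for step, value in normalize("The QUICK brown foxes, jumping over 13 lazy dogs!!!").items():
    print(f"{step}: {value}")

Dicts preserve insertion order in Python 3.7+, so the steps print here, and render in the app, in pipeline order. Note also that app.py passes examples=[[ex] for ex in examples] because a Gradio examples row is a list with one value per input component, even when the interface has only a single Textbox.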