# Testspace / NLP test
# Author: Konaguy
# Commit: 77733ce (verified) — "Create NLP test"
import gradio as gr
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# --- 1. NLTK Resource Downloads ---
# Ensure necessary NLTK data is present when running on fresh environments
# (like Hugging Face Spaces).
#
# NOTE: nltk.data.find() raises LookupError when a resource is missing --
# not nltk.downloader.DownloadError (which the downloader itself raises).
# Catching LookupError is what actually triggers the on-demand download.
for _package, _data_path in [
    ('punkt', 'tokenizers/punkt'),        # tokenizer models for word_tokenize
    ('stopwords', 'corpora/stopwords'),   # English stop-word list
    ('wordnet', 'corpora/wordnet'),       # lexical DB for WordNetLemmatizer
]:
    try:
        nltk.data.find(_data_path)
    except LookupError:
        nltk.download(_package)

# Initialize NLP resources once at module level for efficiency.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()
# --- 2. Helper Preprocessing Functions (Based on User's Outline) ---
def remove_non_ascii(words):
    """Strip any non-ASCII characters out of each token in *words*."""
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    return [non_ascii.sub('', token) for token in words]
def to_lowercase(words):
    """Return *words* with every token converted to lowercase."""
    return list(map(str.lower, words))
def remove_punctuation(words):
    """Drop empty tokens and tokens made up entirely of punctuation.

    The previous check (``word not in string.punctuation``) was a *substring*
    test against the punctuation alphabet, so multi-character punctuation
    tokens that word_tokenize actually emits -- e.g. '...', '``' and "''" --
    slipped through the filter. Testing every character keeps only tokens
    that contain at least one non-punctuation character.
    """
    return [
        word for word in words
        if word and not all(ch in string.punctuation for ch in word)
    ]
def remove_stopwords(words):
    """Filter out common English stop words from the token list.

    Punctuation should be stripped *before* this step, otherwise
    punctuation-bearing tokens may survive the stop-word check.
    """
    return list(filter(lambda token: token not in STOP_WORDS, words))
def lemmatize_list(words):
    """Reduce each token to its base form (lemma).

    Tags every token as a verb ('v') -- a common heuristic when full POS
    tagging is skipped for simplicity.
    """
    lemmatize = LEMMATIZER.lemmatize  # hoist the bound-method lookup
    return [lemmatize(token, pos='v') for token in words]
# --- 3. Main Normalization Function with Step Tracking ---
def normalize_with_steps(text):
    """
    Full preprocessing pipeline that tracks the output at each step.

    Returns a Markdown string showing the token list after every stage,
    followed by the final joined text, for display in the Gradio UI.
    """
    sections = [f"**Input Text:**\n`{text}`\n\n---"]

    # Step 1 is special: it turns the raw string into a token list.
    tokens = word_tokenize(text)
    sections.append(
        "\n\n**Step 1: Tokenization (nltk.word_tokenize)**\n"
        f"Breaks text into individual words, including punctuation.\n`{tokens}`"
    )

    # Steps 2-6 all share the same shape: tokens in, tokens out.
    # (title, one-line explanation, transform) -- applied in order.
    pipeline = [
        ("Step 2: Remove Non-ASCII",
         "Removes characters outside the standard ASCII set.",
         remove_non_ascii),
        ("Step 3: To Lowercase",
         "Converts all characters to lowercase for consistency.",
         to_lowercase),
        ("Step 4: Remove Punctuation",
         "Filters out tokens that are only punctuation marks.",
         remove_punctuation),
        ("Step 5: Remove Stopwords",
         "Removes common, less meaningful words (e.g., 'the', 'is', 'a').",
         remove_stopwords),
        ("Step 6: Lemmatization (WordNetLemmatizer)",
         "Reduces words to their dictionary root form (e.g., 'running' -> 'run').",
         lemmatize_list),
    ]
    for title, blurb, transform in pipeline:
        tokens = transform(tokens)
        sections.append(f"\n\n**{title}**\n{blurb}\n`{tokens}`")

    # Step 7: join the surviving tokens back into a single string.
    final_text = ' '.join(tokens)
    sections.append("\n\n---")
    sections.append(f"\n\n**Final Output (Joined Text):**\n`{final_text}`")
    return ''.join(sections)
# --- 4. Gradio Interface Setup ---

# Sample sentences the user can load with one click.
examples = [
    ["The dogs are running very quickly in the parks and they aren't stopping!"],
    ["Washingtón, D.C. has a lot of interesting historical monuments. $1000 is needed."],
    ["I've been playing football for years, but now I'm traveling to London."]
]

# Build the widgets separately so the Interface() call stays readable.
text_input = gr.Textbox(
    lines=3,
    label="Enter Text to Preprocess",
    placeholder="Type a sentence here...",
    value=examples[0][0]  # pre-fill with the first example
)
steps_output = gr.Markdown(
    label="NLP Preprocessing Steps (Step-by-Step Transformation)",
)

iface = gr.Interface(
    fn=normalize_with_steps,
    inputs=text_input,
    outputs=steps_output,
    title="NLP Text Normalization Pipeline Demo",
    description="This application demonstrates a full text preprocessing pipeline using NLTK, showing the transformation after each step: Tokenization, Lowercasing, Punctuation Removal, Stopword Removal, and Lemmatization.",
    examples=examples,
    allow_flagging='never'
)

# Launch the app. Pass share=True when running locally or in a notebook if a
# public link is needed; on Hugging Face the platform's execution environment
# serves the app.
iface.launch()