import re
import string

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# --- 1. NLTK Resource Downloads ---
def _ensure_nltk_resource(resource_path, download_name):
    """Download an NLTK data package if it is not already installed.

    Args:
        resource_path: The nltk.data.find() lookup path (e.g. 'corpora/wordnet').
        download_name: The package name passed to nltk.download().
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # BUG FIX: nltk.data.find() raises LookupError when a resource is
        # missing. The original code caught nltk.downloader.DownloadError,
        # which data.find() never raises, so on a fresh environment (e.g.
        # Hugging Face) the downloads silently never ran and the app crashed.
        nltk.download(download_name)


# Ensure necessary NLTK data is downloaded for running on fresh environments.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')
# NOTE(review): NLTK >= 3.8.2 requires the 'punkt_tab' package for
# word_tokenize; fetching both keeps older and newer NLTK versions working.
_ensure_nltk_resource('tokenizers/punkt_tab', 'punkt_tab')
_ensure_nltk_resource('corpora/stopwords', 'stopwords')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')

# Initialize NLP resources once at module level so they are not rebuilt on
# every request.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


# --- 2. Helper Preprocessing Functions (Based on User's Outline) ---

def remove_non_ascii(words):
    """Removes non-ASCII characters from a list of tokens."""
    return [re.sub(r'[^\x00-\x7F]+', '', word) for word in words]


def to_lowercase(words):
    """Converts all tokens to lowercase."""
    return [word.lower() for word in words]


def remove_punctuation(words):
    """Removes punctuation tokens.

    Also filters out empty strings, which remove_non_ascii() can produce
    when a token was made up entirely of non-ASCII characters.
    """
    return [word for word in words if word not in string.punctuation and word != '']


def remove_stopwords(words):
    """Removes common English stop words.

    Note: Punctuation must be removed *before* this step, otherwise it might
    be preserved.
    """
    return [word for word in words if word not in STOP_WORDS]


def lemmatize_list(words):
    """Lemmatizes tokens to their base form (lemma).

    Using pos='v' (verb) is a common heuristic when POS tagging is skipped
    for simplicity.
    """
    return [LEMMATIZER.lemmatize(word, pos='v') for word in words]


# --- 3. Main Normalization Function with Step Tracking ---

def normalize_with_steps(text):
    """Full preprocessing pipeline that tracks the output at each step.

    Args:
        text: The raw input string to normalize.

    Returns:
        A Markdown-formatted string showing the token list after every
        pipeline stage (tokenize -> de-ASCII -> lowercase -> de-punctuate ->
        de-stopword -> lemmatize) plus the final re-joined text.
    """
    output_log = f"**Input Text:**\n`{text}`\n\n---"

    # Step 1: Tokenization
    words = word_tokenize(text)
    output_log += f"\n\n**Step 1: Tokenization (nltk.word_tokenize)**\nBreaks text into individual words, including punctuation.\n`{words}`"

    # Step 2: Remove Non-ASCII
    words = remove_non_ascii(words)
    output_log += f"\n\n**Step 2: Remove Non-ASCII**\nRemoves characters outside the standard ASCII set.\n`{words}`"

    # Step 3: To Lowercase
    words = to_lowercase(words)
    output_log += f"\n\n**Step 3: To Lowercase**\nConverts all characters to lowercase for consistency.\n`{words}`"

    # Step 4: Remove Punctuation
    words = remove_punctuation(words)
    output_log += f"\n\n**Step 4: Remove Punctuation**\nFilters out tokens that are only punctuation marks.\n`{words}`"

    # Step 5: Remove Stopwords
    words = remove_stopwords(words)
    output_log += f"\n\n**Step 5: Remove Stopwords**\nRemoves common, less meaningful words (e.g., 'the', 'is', 'a').\n`{words}`"

    # Step 6: Lemmatization
    words = lemmatize_list(words)
    output_log += f"\n\n**Step 6: Lemmatization (WordNetLemmatizer)**\nReduces words to their dictionary root form (e.g., 'running' -> 'run').\n`{words}`"

    # Step 7: Join Final Tokens
    final_text = ' '.join(words)
    output_log += f"\n\n---"
    output_log += f"\n\n**Final Output (Joined Text):**\n`{final_text}`"

    return output_log


# --- 4. Gradio Interface Setup ---

# Example Inputs for the user to try
examples = [
    ["The dogs are running very quickly in the parks and they aren't stopping!"],
    ["Washingtón, D.C. has a lot of interesting historical monuments. \n$1000 is needed."],
    ["I've been playing football for years, but now I'm traveling to London."],
]

# Create the Gradio interface
iface = gr.Interface(
    fn=normalize_with_steps,
    inputs=gr.Textbox(
        lines=3,
        label="Enter Text to Preprocess",
        placeholder="Type a sentence here...",
        value=examples[0][0]  # Set default value
    ),
    outputs=gr.Markdown(
        label="NLP Preprocessing Steps (Step-by-Step Transformation)",
    ),
    title="NLP Text Normalization Pipeline Demo",
    description="This application demonstrates a full text preprocessing pipeline using NLTK, showing the transformation after each step: Tokenization, Lowercasing, Punctuation Removal, Stopword Removal, and Lemmatization.",
    examples=examples,
    allow_flagging='never'  # NOTE(review): deprecated in Gradio 4+ in favor of flagging_mode; kept for compatibility with older Gradio
)

# Launch the app (Setting share=True allows for deployment)
# iface.launch(share=True)  # Use this command if running locally or in a notebook
# For Hugging Face deployment, the app is launched via the `app.py` execution environment.
iface.launch()