| import gradio as gr |
| import nltk |
| import string |
| import re |
| from nltk.corpus import stopwords |
| from nltk.stem import WordNetLemmatizer |
| from nltk.tokenize import word_tokenize |
|
|
| |
| |
def _ensure_nltk_resource(resource_path, package_name):
    """Download an NLTK data package if it is not already installed.

    Args:
        resource_path: path used by ``nltk.data.find`` (e.g. 'tokenizers/punkt').
        package_name: package id passed to ``nltk.download`` (e.g. 'punkt').

    BUGFIX: ``nltk.data.find`` raises ``LookupError`` when the resource is
    missing -- it never raises ``nltk.downloader.DownloadError``, so the
    original ``except`` clauses were dead code and the script crashed with
    an unhandled LookupError on a machine without the data.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


_ensure_nltk_resource('tokenizers/punkt', 'punkt')
# NOTE(review): recent NLTK releases also require the 'punkt_tab' package
# for word_tokenize -- confirm against the pinned nltk version.
_ensure_nltk_resource('corpora/stopwords', 'stopwords')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')


# Shared module-level resources, built once at import time so each request
# does not rebuild the stop-word set or re-instantiate the lemmatizer.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()
|
|
| |
|
|
def remove_non_ascii(words):
    """Strip every non-ASCII character out of each token.

    Tokens made up entirely of non-ASCII characters become empty strings;
    a later pipeline stage filters those out.
    """
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    cleaned = []
    for token in words:
        cleaned.append(non_ascii.sub('', token))
    return cleaned
|
|
def to_lowercase(words):
    """Return the tokens with every character folded to lowercase."""
    return list(map(str.lower, words))
|
|
def remove_punctuation(words):
    """Drop empty tokens and tokens that are pure punctuation.

    Membership is tested against the ``string.punctuation`` string itself
    (substring semantics), matching the original behaviour exactly.
    """
    kept = []
    for token in words:
        if token == '' or token in string.punctuation:
            continue
        kept.append(token)
    return kept
|
|
def remove_stopwords(words):
    """Drop tokens that appear in the module-level English stop-word set."""
    kept = []
    for token in words:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept
|
|
def lemmatize_list(words):
    """Reduce each token to its lemma using the shared WordNet lemmatizer.

    Every token is lemmatized as a verb (pos='v'), so e.g. 'running' -> 'run';
    tokens that are not verbs pass through the verb lemmatizer unchanged or
    nearly so.
    """
    lemmatize = LEMMATIZER.lemmatize  # hoist the bound method out of the loop
    return [lemmatize(token, pos='v') for token in words]
|
|
| |
|
|
def normalize_with_steps(text):
    """
    Run the full preprocessing pipeline over *text*, recording the token
    list produced by every stage so the Gradio UI can render the
    step-by-step transformation as Markdown.
    """
    log_parts = [f"**Input Text:**\n`{text}`\n\n---"]

    # Step 1 is special: it turns the raw string into a token list.
    tokens = word_tokenize(text)
    log_parts.append(
        f"\n\n**Step 1: Tokenization (nltk.word_tokenize)**"
        f"\nBreaks text into individual words, including punctuation.\n`{tokens}`"
    )

    # The remaining stages all map a token list to a token list, so they
    # can be driven from a (title, description, function) table.
    stages = [
        ("Step 2: Remove Non-ASCII",
         "Removes characters outside the standard ASCII set.",
         remove_non_ascii),
        ("Step 3: To Lowercase",
         "Converts all characters to lowercase for consistency.",
         to_lowercase),
        ("Step 4: Remove Punctuation",
         "Filters out tokens that are only punctuation marks.",
         remove_punctuation),
        ("Step 5: Remove Stopwords",
         "Removes common, less meaningful words (e.g., 'the', 'is', 'a').",
         remove_stopwords),
        ("Step 6: Lemmatization (WordNetLemmatizer)",
         "Reduces words to their dictionary root form (e.g., 'running' -> 'run').",
         lemmatize_list),
    ]
    for title, blurb, stage_fn in stages:
        tokens = stage_fn(tokens)
        log_parts.append(f"\n\n**{title}**\n{blurb}\n`{tokens}`")

    log_parts.append("\n\n---")
    log_parts.append(f"\n\n**Final Output (Joined Text):**\n`{' '.join(tokens)}`")

    return ''.join(log_parts)
|
|
| |
|
|
| |
# Example inputs pre-loaded into the Gradio UI; each inner list is one row
# of arguments for the interface (here: a single Textbox value).
examples = [
    ["The dogs are running very quickly in the parks and they aren't stopping!"],
    ["Washingtón, D.C. has a lot of interesting historical monuments. $1000 is needed."],
    ["I've been playing football for years, but now I'm traveling to London."]
]


# Wire the pipeline into a single-input / single-output Gradio interface.
iface = gr.Interface(
    fn=normalize_with_steps,
    inputs=gr.Textbox(
        lines=3,
        label="Enter Text to Preprocess",
        placeholder="Type a sentence here...",
        value=examples[0][0]  # pre-fill the box with the first example
    ),
    outputs=gr.Markdown(
        label="NLP Preprocessing Steps (Step-by-Step Transformation)",
    ),
    title="NLP Text Normalization Pipeline Demo",
    description="This application demonstrates a full text preprocessing pipeline using NLTK, showing the transformation after each step: Tokenization, Lowercasing, Punctuation Removal, Stopword Removal, and Lemmatization.",
    examples=examples,
    # NOTE(review): allow_flagging was deprecated in Gradio 4 and removed in
    # Gradio 5 (renamed flagging_mode) -- confirm against the installed version.
    allow_flagging='never'
)


# Start the local web server (blocks until the app is stopped).
iface.launch()