# Testspace / NLP test
# Author: Konaguy
# Commit: 77733ce (verified) — "Create NLP test"
import gradio as gr
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# --- 1. NLTK Resource Downloads ---
# Ensure necessary NLTK data is present when running on fresh environments
# (like Hugging Face Spaces).
#
# NOTE: nltk.data.find() raises LookupError when a resource is missing --
# not nltk.downloader.DownloadError (which the downloader itself raises).
# Catching LookupError is what actually triggers the on-demand download.
for _package, _data_path in [
    ('punkt', 'tokenizers/punkt'),        # tokenizer models for word_tokenize
    ('stopwords', 'corpora/stopwords'),   # English stop-word list
    ('wordnet', 'corpora/wordnet'),       # lexical DB for WordNetLemmatizer
]:
    try:
        nltk.data.find(_data_path)
    except LookupError:
        nltk.download(_package)

# Initialize NLP resources once at module level for efficiency.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()
# --- 2. Helper Preprocessing Functions (Based on User's Outline) ---
def remove_non_ascii(words):
    """Strip any non-ASCII characters out of each token in *words*."""
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    return [non_ascii.sub('', token) for token in words]
def to_lowercase(words):
    """Return *words* with every token converted to lowercase."""
    return list(map(str.lower, words))
def remove_punctuation(words):
    """Drop empty tokens and tokens made up entirely of punctuation.

    The previous check (``word not in string.punctuation``) was a *substring*
    test against the punctuation alphabet, so multi-character punctuation
    tokens that word_tokenize actually emits -- e.g. '...', '``' and "''" --
    slipped through the filter. Testing every character keeps only tokens
    that contain at least one non-punctuation character.
    """
    return [
        word for word in words
        if word and not all(ch in string.punctuation for ch in word)
    ]
def remove_stopwords(words):
    """Filter out common English stop words from the token list.

    Punctuation should be stripped *before* this step, otherwise
    punctuation-bearing tokens may survive the stop-word check.
    """
    return list(filter(lambda token: token not in STOP_WORDS, words))
def lemmatize_list(words):
    """Reduce each token to its base form (lemma).

    Tags every token as a verb ('v') -- a common heuristic when full POS
    tagging is skipped for simplicity.
    """
    lemmatize = LEMMATIZER.lemmatize  # hoist the bound-method lookup
    return [lemmatize(token, pos='v') for token in words]
# --- 3. Main Normalization Function with Step Tracking ---
def normalize_with_steps(text):
    """
    Full preprocessing pipeline that tracks the output at each step.

    Returns a Markdown string showing the token list after every stage,
    followed by the final joined text, for display in the Gradio UI.
    """
    sections = [f"**Input Text:**\n`{text}`\n\n---"]

    # Step 1 is special: it turns the raw string into a token list.
    tokens = word_tokenize(text)
    sections.append(
        "\n\n**Step 1: Tokenization (nltk.word_tokenize)**\n"
        f"Breaks text into individual words, including punctuation.\n`{tokens}`"
    )

    # Steps 2-6 all share the same shape: tokens in, tokens out.
    # (title, one-line explanation, transform) -- applied in order.
    pipeline = [
        ("Step 2: Remove Non-ASCII",
         "Removes characters outside the standard ASCII set.",
         remove_non_ascii),
        ("Step 3: To Lowercase",
         "Converts all characters to lowercase for consistency.",
         to_lowercase),
        ("Step 4: Remove Punctuation",
         "Filters out tokens that are only punctuation marks.",
         remove_punctuation),
        ("Step 5: Remove Stopwords",
         "Removes common, less meaningful words (e.g., 'the', 'is', 'a').",
         remove_stopwords),
        ("Step 6: Lemmatization (WordNetLemmatizer)",
         "Reduces words to their dictionary root form (e.g., 'running' -> 'run').",
         lemmatize_list),
    ]
    for title, blurb, transform in pipeline:
        tokens = transform(tokens)
        sections.append(f"\n\n**{title}**\n{blurb}\n`{tokens}`")

    # Step 7: join the surviving tokens back into a single string.
    final_text = ' '.join(tokens)
    sections.append("\n\n---")
    sections.append(f"\n\n**Final Output (Joined Text):**\n`{final_text}`")
    return ''.join(sections)
# --- 4. Gradio Interface Setup ---

# Sample sentences the user can load with one click.
examples = [
    ["The dogs are running very quickly in the parks and they aren't stopping!"],
    ["Washingtón, D.C. has a lot of interesting historical monuments. $1000 is needed."],
    ["I've been playing football for years, but now I'm traveling to London."]
]

# Build the widgets separately so the Interface() call stays readable.
text_input = gr.Textbox(
    lines=3,
    label="Enter Text to Preprocess",
    placeholder="Type a sentence here...",
    value=examples[0][0]  # pre-fill with the first example
)
steps_output = gr.Markdown(
    label="NLP Preprocessing Steps (Step-by-Step Transformation)",
)

iface = gr.Interface(
    fn=normalize_with_steps,
    inputs=text_input,
    outputs=steps_output,
    title="NLP Text Normalization Pipeline Demo",
    description="This application demonstrates a full text preprocessing pipeline using NLTK, showing the transformation after each step: Tokenization, Lowercasing, Punctuation Removal, Stopword Removal, and Lemmatization.",
    examples=examples,
    allow_flagging='never'
)

# Launch the app. Pass share=True when running locally or in a notebook if a
# public link is needed; on Hugging Face the platform's execution environment
# serves the app.
iface.launch()