# NOTE: the original capture began with a "Spaces: Sleeping" status banner —
# a Hugging Face page artifact, not part of the application source.
#!/usr/bin/env python3
"""
Gradio app: Text normalization pipeline with step-by-step outputs.

Run locally:
    pip install -r requirements.txt
    python app.py
"""
import os
import re
import string
from collections import OrderedDict

import gradio as gr

# Hugging Face Spaces sets SPACE_ID (older images: HF_SPACE_ID); when either
# is present we must not ask Gradio for a public share link.
IN_SPACES = any(os.getenv(var) for var in ("SPACE_ID", "HF_SPACE_ID"))
| # ---- Optional NLTK pieces (NO downloads at startup) ---- | |
| # Use real stopwords if available; otherwise fall back to a small set. | |
| try: | |
| import nltk # noqa: F401 | |
| from nltk.corpus import stopwords as nltk_stopwords | |
| _STOPWORDS = set(nltk_stopwords.words("english")) | |
| except Exception: | |
| _STOPWORDS = { | |
| "a","an","and","are","as","at","be","but","by","for","if","in","into", | |
| "is","it","no","not","of","on","or","such","that","the","their","then", | |
| "there","these","they","this","to","was","will","with","were","from","your" | |
| } | |
# Decide lemmatizer vs stemmer based on whether the *corpus* exists.
# Three-valued flag: False -> WordNet lemmatizer, True -> Porter stemmer,
# None -> NLTK unusable, lemmatize_list() falls back to identity.
_use_porter = True
_lemmatizer = None
_stemmer = None
try:
    import nltk  # noqa: F401
    from nltk.stem import WordNetLemmatizer
    # Only use WordNetLemmatizer if the *wordnet* corpus is present;
    # nltk.data.find raises LookupError when it is not installed.
    try:
        nltk.data.find("corpora/wordnet")
        _lemmatizer = WordNetLemmatizer()
        _use_porter = False
    except LookupError:
        # wordnet corpus missing -> stem instead (no downloads at startup).
        from nltk.stem import PorterStemmer
        _stemmer = PorterStemmer()
        _use_porter = True
except Exception:
    # If NLTK isn't fully available, fall back to identity later
    _lemmatizer = None
    _stemmer = None
    _use_porter = None
| # ---- Pipeline helpers ---- | |
def tokenize(text: str):
    """Split *text* into tokens without external dependencies.

    A run of word characters forms one token; every non-space,
    non-word character becomes its own single-character token.
    Falsy input (``None``, ``""``) yields an empty list.
    """
    pattern = r"\w+|[^\w\s]"
    return re.findall(pattern, text or "", flags=re.UNICODE)
def remove_non_ascii(tokens):
    """Strip non-ASCII characters from each token.

    Tokens that become empty after stripping (e.g. pure emoji) are
    dropped from the result entirely.
    """
    ascii_only = (tok.encode("ascii", "ignore").decode("ascii") for tok in tokens)
    return [tok for tok in ascii_only if tok]
def to_lowercase(tokens):
    """Return a new list with every token lower-cased."""
    return list(map(str.lower, tokens))
# Translation table that deletes every ASCII punctuation character.
# Built once at import time instead of on every remove_punctuation() call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(tokens):
    """Delete ASCII punctuation characters from each token.

    Tokens that become empty (or were pure punctuation/whitespace)
    are dropped from the result.
    """
    stripped = [w.translate(_PUNCT_TABLE) for w in tokens]
    return [w for w in stripped if w and not w.isspace()]
def remove_stopwords(tokens):
    """Filter out English stopwords (expects tokens already lower-cased)."""
    return [tok for tok in tokens if tok not in _STOPWORDS]
def lemmatize_list(tokens):
    """Lemmatize if wordnet is present; otherwise stem; otherwise identity.
    Also guards against runtime LookupError during example caching."""
    # This function may rewrite the module-level strategy state at runtime
    # (switching from lemmatizer to stemmer), hence the global declaration.
    global _use_porter, _lemmatizer, _stemmer
    if _use_porter is False and _lemmatizer is not None:
        try:
            return [_lemmatizer.lemmatize(w) for w in tokens]
        except LookupError:
            # wordnet vanished / was unusable at call time: permanently
            # switch this process over to Porter stemming for later calls.
            try:
                from nltk.stem import PorterStemmer
                _stemmer = PorterStemmer()
                _use_porter = True
                return [_stemmer.stem(w) for w in tokens]
            except Exception:
                # Stemmer unavailable too -> return tokens unmodified.
                return tokens
    elif _use_porter is True and _stemmer is not None:
        return [_stemmer.stem(w) for w in tokens]
    else:
        # NLTK absent entirely (_use_porter is None) -> identity transform.
        return tokens
| # ---- Core pipeline (returns step-by-step dict) ---- | |
def normalize(text: str) -> OrderedDict:
    """Run the full pipeline on *text*, recording each stage's output.

    Returns an OrderedDict mapping step labels to token lists, plus a
    final entry with the space-joined normalized string.
    """
    stages = (
        ("1) Tokenize", tokenize),
        ("2) Remove non-ASCII", remove_non_ascii),
        ("3) Lowercase", to_lowercase),
        ("4) Remove punctuation", remove_punctuation),
        ("5) Remove stopwords", remove_stopwords),
        ("6) Lemmatize", lemmatize_list),
    )
    steps = OrderedDict()
    current = text
    for label, stage_fn in stages:
        current = stage_fn(current)
        steps[label] = current
    steps["Final normalized text"] = " ".join(current)
    return steps
| # ---- Gradio wiring ---- | |
# Sample inputs shown beneath the textbox. The last one deliberately
# exercises the non-ASCII, punctuation, and emoji-stripping steps.
examples = [
    "The quick brown fox jumps over the lazy dog!",
    "NLTK is a leading platform for building Python programs to work with human language data.",
    "Text normalization is important for NLP tasks.",
    "Café prices in 2024 were higher—aren't they? 🤔",
]
def show_steps(text):
    """Render each normalization step of *text* as one HTML fragment.

    Token lists are space-joined and annotated with their token count;
    the final normalized string is shown on its own line. User-derived
    text is HTML-escaped before interpolation so input containing markup
    (e.g. ``<script>``) cannot inject HTML into the output pane.
    """
    import html  # stdlib; local import keeps this edit self-contained

    parts = []
    for step, value in normalize(text).items():
        if isinstance(value, list):
            pretty = html.escape(" ".join(value))
            parts.append(
                f"<b>{step}</b>: {pretty} <small>({len(value)} tokens)</small>"
            )
        else:
            parts.append(f"<b>{step}</b>: {html.escape(str(value))}")
    return "<br>".join(parts)
# Single-input, single-output Gradio UI around show_steps().
iface = gr.Interface(
    fn=show_steps,
    inputs=gr.Textbox(lines=4, label="Enter text to normalize"),
    outputs=gr.HTML(label="Step-by-step normalization"),
    # Interface expects one argument-list per example row.
    examples=[[ex] for ex in examples],
    cache_examples=False,  # avoid startup caching (would run NLTK at boot)
    flagging_mode="never",
    title="Text Normalization Pipeline",
    description="Enter text or select an example to see each step of the normalization process."
)
if __name__ == "__main__":
    # Bind to all interfaces on the conventional Spaces port.
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,  # disable SSR for stability
        share=(not IN_SPACES),  # public link only when local; avoids Spaces warning
        quiet=True,  # suppresses the "To create a public link…" tip
    )