Spaces:

NzTama
/

Sentiment

Runtime error

App Files Files Community

Sentiment / services /preprocessing.py

NzTama

Initial clean deploy: Sentiment Analysis

fa8ff66 2 months ago

raw

history blame contribute delete

4.35 kB

	"""
	preprocessing.py – Clean & preprocess text for sentiment analysis.
	Only contains utility functions; no Colab/notebook code.
	"""
	import re
	import html as html_lib

	from bs4 import BeautifulSoup

	# Optional dependency: Sastrawi supplies the stemmer and one stopword list.
	# When it is not installed the module still imports; stemming is skipped.
	try:
	    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
	    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
	    _sastrawi_available = True
	except ImportError:
	    _sastrawi_available = False

	# Optional dependency: the `stop_words` package. The broad except is a
	# deliberate best effort: a missing package or a failed lookup both fall
	# back to an empty list instead of breaking module import.
	try:
	    from stop_words import get_stop_words
	    _stopwords_id = get_stop_words('indonesian')
	except Exception:
	    _stopwords_id = []

	# ── Stopwords ──────────────────────────────────────────────────────────────────
	_sastrawi_stopwords: list = []
	_stemmer = None  # stays None when Sastrawi is unavailable

	if _sastrawi_available:
	    _stemmer = StemmerFactory().create_stemmer()
	    _sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()

	# Hand-maintained extra stopwords.
	_ADDITIONAL_STOPWORDS = [
	    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk',
	    'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi',
	    'ya','lbh','digunakan','semangat','dah','sangat','penting',
	    'lancar','cepat','senang','makasih','bermanfaat','keren','baik',
	    'terimakasih','bagus','semoga','aplikasi','transaksi','banget','pakai',
	    'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk',
	    'baru','jelas','yuk','mohon','punya','cara','hari','kota','berita',
	    # HTML tag and attribute names
	    'class','id','span','div','href','src','style','alt','aria','role',
	    'tabindex','button','label','img','input','placeholder','form',
	    'field','hidden','value','by','link','tags',
	]

	_NOISE_STOPWORDS = [
	    'xd','xyri','yu','uobl','ypdohk','xt','pz','lziwak',
	    'rp','xdj','xggy','xjbqb','xstzfhl','hfl','xat',
	    'qhh','dhg','cr','tdsg','ct','etr','nq','oe','ejq','psk',
	    'hl','hd','sy','amp','fbf',
	]

	# Iterating the string yields its 26 characters, one set member each.
	_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

	# Union of every stopword source plus all single lowercase letters.
	FINAL_STOPWORDS: set = set(
	    _stopwords_id + _sastrawi_stopwords + _ADDITIONAL_STOPWORDS + _NOISE_STOPWORDS
	) | _SINGLE_LETTERS


	# ── Individual text cleaners ───────────────────────────────────────────────────

	# Matches from "author" (followed by a word boundary) up to the nearest
	# following "comment" that starts at a word boundary. The match is
	# non-greedy; DOTALL lets it span newlines and IGNORECASE ignores case.
	_AUTHOR_COMMENT_PATTERN = re.compile(
	    r"author\b.*?\bcomment",
	    flags=re.IGNORECASE | re.DOTALL,
	)

	def clean_html(text: str) -> str:
	    """Strip HTML markup from *text* and return whitespace-normalised text.

	    ``<script>`` and ``<style>`` elements are removed together with their
	    contents, the remaining text is extracted, HTML entities are unescaped
	    and every run of whitespace collapses to a single space.

	    Args:
	        text: Raw input; it is coerced with ``str()`` before parsing.

	    Returns:
	        The cleaned text, or ``""`` when *text* is falsy (``None``, ``""``).
	    """
	    if not text:
	        return ""
	    raw = str(text)
	    try:
	        soup = BeautifulSoup(raw, "html.parser")
	        for tag in soup(["script", "style"]):
	            tag.decompose()
	        cleaned = soup.get_text(separator=" ")
	    except Exception:
	        # Best effort: if parsing fails, carry on with the unparsed text so
	        # entity unescaping and whitespace normalisation still apply.
	        cleaned = raw
	    cleaned = html_lib.unescape(cleaned)
	    return re.sub(r"\s+", " ", cleaned).strip()


	def clean_text(text: str) -> str:
	    """Lowercase *text* and reduce it to space-separated ASCII alphanumerics.

	    Steps, in order: lowercase; drop every span matched by
	    ``_AUTHOR_COMMENT_PATTERN``; drop URL-like tokens starting with
	    ``http`` or ``www``; replace every character that is not an ASCII
	    letter, digit or whitespace with a space; collapse whitespace.

	    Args:
	        text: Input text; it is coerced with ``str()``.

	    Returns:
	        The cleaned string, or ``""`` when *text* is falsy.
	    """
	    if not text:
	        return ""
	    text = str(text).lower()
	    text = _AUTHOR_COMMENT_PATTERN.sub("", text)
	    # The alternation bars must be unescaped: written as `\|` the pattern
	    # would only match a literal "|" and bare "www..." URLs would be kept.
	    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
	    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text


	def _preprocess_single(text: str) -> str:
	    """Run the full cleaning pipeline on one string.

	    Pipeline: ``clean_html`` -> ``clean_text`` -> stemming (only when a
	    stemmer was created at import time) -> token filtering.

	    A token is kept only if it is longer than one character, is not in
	    ``FINAL_STOPWORDS`` and contains at least one lowercase ASCII letter
	    (so purely numeric tokens are dropped).

	    Args:
	        text: Raw text, possibly containing HTML.

	    Returns:
	        The surviving tokens joined by single spaces; may be ``""``.
	    """
	    # 1-2. Strip HTML, then lowercase and remove URLs / non-alphanumerics.
	    text = clean_text(clean_html(text))
	    # 3. Stem; _stemmer is None when Sastrawi is not installed.
	    if _stemmer:
	        text = _stemmer.stem(text)
	    # 4-5. Filter in a single pass.
	    # NOTE(review): stopwords are matched against *stemmed* tokens, so
	    # inflected entries in the stopword lists may never match -- confirm
	    # this ordering is intended.
	    tokens = [
	        w for w in text.split()
	        if len(w) > 1 and w not in FINAL_STOPWORDS and re.search(r'[a-z]', w)
	    ]
	    return " ".join(tokens)


	# ── Public API ─────────────────────────────────────────────────────────────────

	def preprocess_text(texts) -> list:
	    """Clean one string or an iterable of strings.

	    Args:
	        texts: A single string, an iterable of items, or ``None``.

	    Returns:
	        A list of cleaned strings. A single string yields a one-element
	        list and ``None`` yields ``[]``. Items that are not ``str`` are
	        skipped, so the result can be shorter than the input and positions
	        may not line up with the original sequence.
	    """
	    if texts is None:
	        # Treat "no input" as an empty batch instead of raising TypeError.
	        return []
	    if isinstance(texts, str):
	        texts = [texts]
	    return [_preprocess_single(t) for t in texts if isinstance(t, str)]