# Sentiment/services/preprocessing.py
# NzTama's picture
# Initial clean deploy: Sentiment Analysis
# fa8ff66
"""
preprocessing.py – Clean & preprocess text for sentiment analysis.
Only contains utility functions; no Colab/notebook code.
"""
import re
import html as html_lib
from bs4 import BeautifulSoup
# Sastrawi is optional: record whether its factories could be imported so the
# stemmer and its stopword list are only built when the package is present.
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
except ImportError:
    _sastrawi_available = False
else:
    _sastrawi_available = True

# The `stop_words` package is optional as well; any failure while importing it
# or loading the Indonesian list degrades to an empty list.
try:
    from stop_words import get_stop_words
    _stopwords_id = get_stop_words('indonesian')
except Exception:
    _stopwords_id = []

# ── Stopwords ──────────────────────────────────────────────────────────────────
# Stemmer instance (None when Sastrawi is missing) and Sastrawi's stopword list.
_stemmer = StemmerFactory().create_stemmer() if _sastrawi_available else None
_sastrawi_stopwords: list = (
    StopWordRemoverFactory().get_stop_words() if _sastrawi_available else []
)

# Hand-maintained additions: informal spellings/abbreviations and common words.
# NOTE(review): this list also contains sentiment-bearing words such as
# 'bagus', 'baik', 'senang' and 'keren' — confirm that dropping them is intended.
_ADDITIONAL_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya', 'oke', 'ok', 'bgt', 'jg', 'utk',
    'deh', 'sih', 'kok', 'dong', 'udah', 'sdh', 'blm', 'bgmn', 'dgn', 'lgi',
    'ya', 'lbh', 'digunakan', 'semangat', 'dah', 'sangat', 'penting',
    'lancar', 'cepat', 'senang', 'makasih', 'bermanfaat', 'keren', 'baik',
    'terimakasih', 'bagus', 'semoga', 'aplikasi', 'transaksi', 'banget', 'pakai',
    'hp', 'tolong', 'gimana', 'iya', 'jadi', 'ambil', 'buka', 'butuh', 'masuk',
    'baru', 'jelas', 'yuk', 'mohon', 'punya', 'cara', 'hari', 'kota', 'berita',
    # HTML attributes
    'class', 'id', 'span', 'div', 'href', 'src', 'style', 'alt', 'aria', 'role',
    'tabindex', 'button', 'label', 'img', 'input', 'placeholder', 'form',
    'field', 'hidden', 'value', 'by', 'link', 'tags',
]

# Short junk tokens to discard.
# NOTE(review): presumably residue from scraped markup — confirm the origin.
_NOISE_STOPWORDS = [
    'xd', 'xyri', 'yu', 'uobl', 'ypdohk', 'xt', 'pz', 'lziwak',
    'rp', 'xdj', 'xggy', 'xjbqb', 'xstzfhl', 'hfl', 'xat',
    'qhh', 'dhg', 'cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk',
    'hl', 'hd', 'sy', 'amp', 'fbf',
]

# Every single lowercase ASCII letter counts as a stopword.
_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

# Union of all sources above; membership in this set removes a token.
FINAL_STOPWORDS: set = set().union(
    _stopwords_id,
    _sastrawi_stopwords,
    _ADDITIONAL_STOPWORDS,
    _NOISE_STOPWORDS,
    _SINGLE_LETTERS,
)
# ── Individual text cleaners ───────────────────────────────────────────────────
# Matches the shortest span that begins with "author" (followed by a word
# boundary) and ends with the next "comment" (preceded by a word boundary).
# IGNORECASE makes the match case-insensitive and DOTALL lets ".*?" cross
# newlines. clean_text() deletes every such span.
# NOTE(review): presumably targets comment-widget boilerplate in scraped pages;
# it will also remove ordinary prose lying between those two words — confirm.
_AUTHOR_COMMENT_PATTERN = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
def clean_html(text: str) -> str:
    """Return the visible text of an HTML fragment as a single line.

    Falsy input yields an empty string. Otherwise the value is parsed with
    BeautifulSoup's ``html.parser``, ``<script>`` and ``<style>`` elements are
    removed, and the remaining text nodes are joined with spaces. If parsing
    raises for any reason, the raw string is used instead (best effort).
    HTML entities are then unescaped and whitespace runs are collapsed.
    """
    if not text:
        return ""

    try:
        document = BeautifulSoup(str(text), "html.parser")
        # Script/style bodies are not visible text, so drop them entirely.
        for node in document(["script", "style"]):
            node.decompose()
        plain = document.get_text(separator=" ")
    except Exception:
        # Best-effort fallback: continue with the unparsed input.
        plain = str(text)

    unescaped = html_lib.unescape(plain)
    return re.sub(r"\s+", " ", unescaped).strip()
def clean_text(text: str) -> str:
    """Normalise one string to lowercase ASCII letters, digits and single spaces.

    Falsy input yields an empty string. Otherwise the text is lowercased,
    spans matched by ``_AUTHOR_COMMENT_PATTERN`` are deleted, tokens starting
    with ``http``/``www`` are removed, every character that is not an ASCII
    letter, digit or whitespace becomes a space, and whitespace runs are
    collapsed to one space with the ends trimmed.
    """
    if not text:
        return ""

    lowered = str(text).lower()
    without_author = _AUTHOR_COMMENT_PATTERN.sub("", lowered)
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', without_author)
    # Digits are kept here; only punctuation and non-ASCII symbols turn into spaces.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_urls)
    return re.sub(r'\s+', ' ', alnum_only).strip()
def _preprocess_single(text: str) -> str:
    """Run the complete cleaning pipeline on one string.

    Steps: strip HTML (``clean_html``), normalise (``clean_text``), stem with
    the module-level stemmer when one is available, then keep only tokens that
    are longer than one character, are not in ``FINAL_STOPWORDS`` and contain
    at least one lowercase ASCII letter. The surviving tokens are joined with
    single spaces.
    """
    # 1-2. HTML removal followed by lowercase/URL/symbol normalisation.
    normalized = clean_text(clean_html(text))

    # 3. Stemming is skipped when no stemmer was created.
    if _stemmer:
        normalized = _stemmer.stem(normalized)

    # 4-5. Single pass applying the stopword/length filter and the letter check.
    kept = []
    for word in normalized.split():
        if len(word) <= 1 or word in FINAL_STOPWORDS:
            continue
        if re.search(r'[a-z]', word):
            kept.append(word)

    return " ".join(kept).strip()
# ── Public API ─────────────────────────────────────────────────────────────────
def preprocess_text(texts) -> list:
    """Clean one string or an iterable of strings.

    Args:
        texts: A single string, an iterable of strings, or ``None``.

    Returns:
        A list of cleaned strings. A single string produces a one-element
        list and ``None`` produces an empty list. Items that are not ``str``
        are skipped, so the result can be shorter than the input and is not
        guaranteed to align with it position by position.
    """
    # Treat a missing input as empty instead of failing on iteration, in line
    # with clean_html()/clean_text(), which also accept None.
    if texts is None:
        return []
    if isinstance(texts, str):
        texts = [texts]
    return [_preprocess_single(item) for item in texts if isinstance(item, str)]