""" preprocessing.py – Clean & preprocess text for sentiment analysis. Only contains utility functions; no Colab/notebook code. """ import re import html as html_lib from bs4 import BeautifulSoup try: from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory _sastrawi_available = True except ImportError: _sastrawi_available = False try: from stop_words import get_stop_words _stopwords_id = get_stop_words('indonesian') except Exception: _stopwords_id = [] # ── Stopwords ────────────────────────────────────────────────────────────────── _sastrawi_stopwords: list = [] _stemmer = None if _sastrawi_available: _stemmer = StemmerFactory().create_stemmer() _sastrawi_stopwords = StopWordRemoverFactory().get_stop_words() _ADDITIONAL_STOPWORDS = [ 'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk', 'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi', 'ya','lbh','digunakan','semangat','dah','sangat','penting', 'lancar','cepat','senang','makasih','bermanfaat','keren','baik', 'terimakasih','bagus','semoga','aplikasi','transaksi','banget','pakai', 'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk', 'baru','jelas','yuk','mohon','punya','cara','hari','kota','berita', # HTML attributes 'class','id','span','div','href','src','style','alt','aria','role', 'tabindex','button','label','img','input','placeholder','form', 'field','hidden','value','by','link','tags', ] _NOISE_STOPWORDS = [ 'xd','xyri','yu','uobl','ypdohk','xt','pz','lziwak', 'rp','xdj','xggy','xjbqb','xstzfhl','hfl','xat', 'qhh','dhg','cr','tdsg','ct','etr','nq','oe','ejq','psk', 'hl','hd','sy','amp','fbf', ] _SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz') FINAL_STOPWORDS: set = set( _stopwords_id + _sastrawi_stopwords + _ADDITIONAL_STOPWORDS + _NOISE_STOPWORDS ) | _SINGLE_LETTERS # ── Individual text cleaners ─────────────────────────────────────────────────── _AUTHOR_COMMENT_PATTERN = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL) def clean_html(text: str) -> str: """Strip HTML tags and unescape HTML entities.""" if not text: return "" try: soup = BeautifulSoup(str(text), "html.parser") for tag in soup(["script", "style"]): tag.decompose() cleaned = soup.get_text(separator=" ") except Exception: cleaned = str(text) cleaned = html_lib.unescape(cleaned) cleaned = re.sub(r"\s+", " ", cleaned).strip() return cleaned def clean_text(text: str) -> str: """Basic single-string cleaner: lowercase, remove URLs, non-alpha chars.""" if not text: return "" text = str(text).lower() text = _AUTHOR_COMMENT_PATTERN.sub("", text) text = re.sub(r'http\S+|www\S+|https\S+', '', text) text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) text = re.sub(r'\s+', ' ', text).strip() return text def _preprocess_single(text: str) -> str: """Full pipeline for one text string.""" # 1. Strip HTML text = clean_html(text) # 2. Lowercase + remove URLs/non-alpha text = clean_text(text) # 3. Stem (Sastrawi) if _stemmer: text = _stemmer.stem(text) # 4. Remove stopwords & noise tokens = [ w for w in text.split() if w not in FINAL_STOPWORDS and len(w) > 1 ] # 5. Keep only tokens with at least one letter tokens = [t for t in tokens if re.search(r'[a-z]', t)] return " ".join(tokens).strip() # ── Public API ───────────────────────────────────────────────────────────────── def preprocess_text(texts) -> list: """ Accept either a single string or a list of strings. Returns a list of cleaned strings. """ if isinstance(texts, str): texts = [texts] return [_preprocess_single(t) for t in texts if isinstance(t, str)]