| """ |
| preprocessing.py — Clean & preprocess text for sentiment analysis. |
| Only contains utility functions; no Colab/notebook code. |
| """ |
| import re |
| import html as html_lib |
|
|
| from bs4 import BeautifulSoup |
|
|
# Optional third-party resources.
# Neither Sastrawi nor stop_words is required: when one of them cannot be
# loaded, the corresponding stop-word list stays empty and (for Sastrawi)
# no stemmer is created.
_stemmer = None
_sastrawi_stopwords: list = []

try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
except ImportError:
    _sastrawi_available = False
else:
    _sastrawi_available = True
    # Built once at import time and reused by every preprocessing call.
    _stemmer = StemmerFactory().create_stemmer()
    _sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()

try:
    from stop_words import get_stop_words
    _stopwords_id = get_stop_words('indonesian')
except Exception:
    # Deliberately broad: any failure to load the list (missing package or
    # an error raised by get_stop_words) falls back to an empty list.
    _stopwords_id = []
|
|
# Hand-picked extra stop words, kept as two groups that are concatenated
# below. Order is preserved from the original single list.

# Group 1: these appear to be informal Indonesian spellings/abbreviations
# plus words that are very frequent in the target corpus.
# NOTE(review): the group contains words that look sentiment-bearing
# ('bagus', 'baik', 'senang', ...) and negations ('ga', 'gak', 'nggak');
# confirm that dropping them is intended for the downstream model.
_COLLOQUIAL_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya', 'oke',
    'ok', 'bgt', 'jg', 'utk',
    'deh', 'sih', 'kok', 'dong', 'udah', 'sdh', 'blm', 'bgmn',
    'dgn', 'lgi',
    'ya', 'lbh', 'digunakan', 'semangat', 'dah', 'sangat', 'penting',
    'lancar', 'cepat', 'senang', 'makasih', 'bermanfaat', 'keren', 'baik',
    'terimakasih', 'bagus', 'semoga', 'aplikasi', 'transaksi', 'banget',
    'pakai',
    'hp', 'tolong', 'gimana', 'iya', 'jadi', 'ambil', 'buka', 'butuh',
    'masuk',
    'baru', 'jelas', 'yuk', 'mohon', 'punya', 'cara', 'hari', 'kota',
    'berita',
]

# Group 2: mostly HTML tag/attribute names, presumably residue from
# scraped markup that survives tag stripping.
_MARKUP_STOPWORDS = [
    'class', 'id', 'span', 'div', 'href', 'src', 'style', 'alt',
    'aria', 'role',
    'tabindex', 'button', 'label', 'img', 'input', 'placeholder', 'form',
    'field', 'hidden', 'value', 'by', 'link', 'tags',
]

_ADDITIONAL_STOPWORDS = _COLLOQUIAL_STOPWORDS + _MARKUP_STOPWORDS
|
|
# Short junk tokens to drop. They carry no evident meaning and are
# presumably scraping artefacts (e.g. 'amp' looks like the remainder of an
# '&amp;' entity) — TODO confirm against the source data.
_NOISE_STOPWORDS = [
    'xd', 'xyri', 'yu', 'uobl', 'ypdohk', 'xt',
    'pz', 'lziwak', 'rp', 'xdj', 'xggy', 'xjbqb',
    'xstzfhl', 'hfl', 'xat', 'qhh', 'dhg', 'cr',
    'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq',
    'psk', 'hl', 'hd', 'sy', 'amp', 'fbf',
]
|
|
# Every single lowercase ASCII letter is treated as a stop word.
_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

# The stop-word set actually used for filtering: the union of the
# stop_words list, the Sastrawi list, the two hand-maintained lists and
# the single letters. The first two may be empty when their packages are
# unavailable.
FINAL_STOPWORDS: set = set().union(
    _stopwords_id,
    _sastrawi_stopwords,
    _ADDITIONAL_STOPWORDS,
    _NOISE_STOPWORDS,
    _SINGLE_LETTERS,
)
|
|
|
|
| |
|
|
# Matches the shortest span that starts at the word "author" and ends at
# the next occurrence of "comment" (case-insensitive; '.' also matches
# newlines, so the span may cross line breaks).
_AUTHOR_COMMENT_PATTERN = re.compile(
    r"author\b.*?\bcomment",
    re.I | re.S,
)
|
|
def clean_html(text: str) -> str:
    """Return *text* as plain text with markup removed and entities decoded.

    Falsy input yields ``""``. Any other value is converted with ``str()``,
    parsed with BeautifulSoup's ``html.parser``, stripped of ``<script>``
    and ``<style>`` elements, and flattened to text. If parsing fails for
    any reason the raw string is used instead. HTML entities are then
    unescaped and every whitespace run is collapsed to a single space.
    """
    if not text:
        return ""

    raw = str(text)
    try:
        document = BeautifulSoup(raw, "html.parser")
        # Script and style bodies are code, not prose: drop them entirely.
        for node in document.find_all(["script", "style"]):
            node.decompose()
        plain = document.get_text(separator=" ")
    except Exception:
        # Best effort: keep going with the unparsed input.
        plain = raw

    unescaped = html_lib.unescape(plain)
    return re.sub(r"\s+", " ", unescaped).strip()
|
|
|
|
def clean_text(text: str) -> str:
    """Reduce a string to lowercase ASCII letters, digits and single spaces.

    Falsy input yields ``""``. Otherwise the value is converted with
    ``str()`` and lowercased, then, in order: spans matched by
    ``_AUTHOR_COMMENT_PATTERN`` are removed, URL-like tokens (starting at
    ``http`` or ``www``) are removed, every character that is not an ASCII
    letter, a digit or whitespace becomes a space, and whitespace runs are
    collapsed. Note that digits are kept.
    """
    if not text:
        return ""

    # Applied in order; each entry is (regex, replacement).
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'[^a-zA-Z0-9\s]', ' '),
        (r'\s+', ' '),
    )

    result = _AUTHOR_COMMENT_PATTERN.sub("", str(text).lower())
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
|
|
|
|
def _preprocess_single(text: str) -> str:
    """Run the whole cleaning pipeline on one string.

    Steps, in order: ``clean_html``, ``clean_text``, stemming (only when a
    stemmer was created at import time), then token filtering. A token is
    kept only if it is longer than one character, is not in
    ``FINAL_STOPWORDS`` and contains at least one letter ``a``-``z`` (so
    purely numeric tokens are dropped). The kept tokens are returned
    joined by single spaces.
    """
    normalized = clean_text(clean_html(text))

    # NOTE(review): stemming runs before the stop-word filter, so inflected
    # entries in the stop-word lists (e.g. 'digunakan') may no longer match
    # once stemmed — confirm this order is intended before changing it, as
    # any change alters the tokens a trained model has seen.
    if _stemmer:
        normalized = _stemmer.stem(normalized)

    kept = []
    for word in normalized.split():
        if len(word) <= 1 or word in FINAL_STOPWORDS:
            continue
        if re.search(r'[a-z]', word) is None:
            continue
        kept.append(word)
    return " ".join(kept)
|
|
|
|
| |
|
|
def preprocess_text(texts) -> list:
    """Run the full preprocessing pipeline over one string or many.

    Args:
        texts: A single string, an iterable of strings, or ``None``.
            A single string is treated as a one-element list. ``None`` is
            treated as "no input", in line with the other helpers in this
            module, which map falsy input to an empty result.

    Returns:
        A list with one cleaned string per *string* item in ``texts``.
        Items that are not ``str`` (``None``, NaN floats, numbers, ...)
        are skipped, so the result can be shorter than the input; callers
        that need positional alignment with labels must filter their
        labels the same way.
    """
    if texts is None:
        # Previously this raised TypeError when iterating over None.
        return []
    if isinstance(texts, str):
        texts = [texts]
    return [_preprocess_single(t) for t in texts if isinstance(t, str)]