""" wordcloud_service.py – Generate a word-cloud image from a list of texts. Stripped from the original Colab notebook; only the generation function remains. """ from __future__ import annotations import io import os import re import numpy as np import matplotlib matplotlib.use("Agg") # Must be before pyplot import — headless/no-display import matplotlib.pyplot as plt from wordcloud import WordCloud # ── Stopwords (same set as preprocessing.py) ────────────────────────────────── try: from stop_words import get_stop_words _stopwords_id = get_stop_words('indonesian') except Exception: _stopwords_id = [] try: from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory _sastrawi_sw = StopWordRemoverFactory().get_stop_words() except Exception: _sastrawi_sw = [] _EXTRA_STOPWORDS = [ 'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg', 'deh','sih','kok','dong','udah','ya','banget','pakai','jadi','baru', ] _BLOCKLIST = set(_stopwords_id + _sastrawi_sw + _EXTRA_STOPWORDS) _SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz') WORDCLOUD_STOPWORDS = _BLOCKLIST | _SINGLE_LETTERS # ── Internal helpers ─────────────────────────────────────────────────────────── def _merge_texts(texts: list) -> str: """Join a list of strings, keeping only alphabetic tokens.""" joined = " ".join(str(t) for t in texts if t) tokens = joined.lower().split() tokens = [ w for w in tokens if re.match(r'^[a-z]+$', w) and w not in WORDCLOUD_STOPWORDS and len(w) > 2 ] return " ".join(tokens) def _circular_mask(size: int = 400) -> np.ndarray: x, y = np.ogrid[:size, :size] center = size // 2 radius = center - 10 mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2 return (255 * mask).astype(np.uint8) # ── Public API ───────────────────────────────────────────────────────────────── def generate_wordcloud(texts: list, output_dest) -> bool: """ Generate a circular wordcloud from a list of text strings. Args: texts: list of strings (raw or pre-processed) output_dest: file path string OR a BytesIO buffer. If a string path is given, the PNG is saved to disk. If a BytesIO buffer is given, the PNG is written there (no file is created on disk). Returns: True on success, False on failure. """ if not texts: print("[WordCloud] No texts provided.") return False text_data = _merge_texts(texts) if not text_data.strip(): print("[WordCloud] All text was filtered out by stopwords; nothing to plot.") return False # If saving to a file path, ensure the directory exists if isinstance(output_dest, str): output_dir = os.path.dirname(output_dest) if output_dir: os.makedirs(output_dir, exist_ok=True) try: mask = _circular_mask(400) wc = WordCloud( width=800, height=800, background_color="white", colormap="viridis", mask=mask, contour_width=2, contour_color="steelblue", stopwords=WORDCLOUD_STOPWORDS, max_words=100, ).generate(text_data) fig, ax = plt.subplots(figsize=(8, 8)) ax.imshow(wc, interpolation="bilinear") ax.axis("off") plt.tight_layout(pad=0) plt.savefig(output_dest, dpi=150, bbox_inches="tight", format="png") plt.close(fig) if isinstance(output_dest, str): print(f"[WordCloud] Saved to {output_dest}") else: print("[WordCloud] Written to in-memory buffer (temporal).") return True except Exception as e: print(f"[WordCloud] Error generating wordcloud: {e}") return False