# Sentiment/services/wordcloud_service.py
# NzTama's picture
# Initial clean deploy: Sentiment Analysis
# fa8ff66
"""
wordcloud_service.py – Generate a word-cloud image from a list of texts.
Stripped from the original Colab notebook; only the generation function remains.
"""
from __future__ import annotations
import io
import os
import re
import numpy as np
import matplotlib
matplotlib.use("Agg")  # Must be before pyplot import — headless/no-display
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# ── Stopwords (same set as preprocessing.py) ──────────────────────────────────
# Both stopword packages are optional. Each list starts empty and is only
# replaced when the package imports AND returns its word list; any failure is
# ignored on purpose so the service still starts without them.
_stopwords_id = []
try:
    from stop_words import get_stop_words
    _stopwords_id = get_stop_words('indonesian')
except Exception:
    pass

_sastrawi_sw = []
try:
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    _sastrawi_sw = StopWordRemoverFactory().get_stop_words()
except Exception:
    pass

# Extra filler words added on top of the two library lists.
_EXTRA_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya',
    'oke', 'ok', 'bgt', 'jg',
    'deh', 'sih', 'kok', 'dong', 'udah', 'ya', 'banget',
    'pakai', 'jadi', 'baru',
]

# Union of every word source, plus each single ASCII letter.
_BLOCKLIST = {*_stopwords_id, *_sastrawi_sw, *_EXTRA_STOPWORDS}
_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
WORDCLOUD_STOPWORDS = _BLOCKLIST.union(_SINGLE_LETTERS)
# ── Internal helpers ───────────────────────────────────────────────────────────
def _merge_texts(texts: list) -> str:
    """Flatten *texts* into one space-separated string of usable words.

    Falsy entries are skipped; the rest are stringified, joined with spaces,
    lower-cased and split on whitespace. A token is kept only if it consists
    solely of the letters a-z, is longer than two characters and is not in
    ``WORDCLOUD_STOPWORDS``.
    """
    corpus = " ".join(str(item) for item in texts if item).lower()
    kept = []
    for word in corpus.split():
        # Tokens containing digits, punctuation or non-ASCII letters are
        # dropped as a whole rather than cleaned.
        if not re.match(r'^[a-z]+$', word):
            continue
        if word in WORDCLOUD_STOPWORDS or len(word) <= 2:
            continue
        kept.append(word)
    return " ".join(kept)
def _circular_mask(size: int = 400) -> np.ndarray:
x, y = np.ogrid[:size, :size]
center = size // 2
radius = center - 10
mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2
return (255 * mask).astype(np.uint8)
# ── Public API ─────────────────────────────────────────────────────────────────
def generate_wordcloud(texts: list, output_dest) -> bool:
    """
    Generate a circular word cloud from a list of text strings.

    Args:
        texts: list of strings (raw or pre-processed).
        output_dest: file path (``str`` or ``os.PathLike``) OR a writable
            binary buffer such as ``BytesIO``.
            If a path is given, missing parent directories are created and
            the PNG is saved to disk.
            If a buffer is given, the PNG is written there
            (no file is created on disk).

    Returns:
        True on success, False on failure (no input, every token filtered
        out, or any error while preparing the destination, rendering or
        saving). Failures are reported on stdout rather than raised.
    """
    if not texts:
        print("[WordCloud] No texts provided.")
        return False

    text_data = _merge_texts(texts)
    if not text_data.strip():
        print("[WordCloud] All text was filtered out by stopwords; nothing to plot.")
        return False

    # pathlib.Path and other path-like objects are handled like string paths.
    saving_to_path = isinstance(output_dest, (str, os.PathLike))

    fig = None
    try:
        # Directory creation lives inside the try so an OSError is reported
        # as a False return, matching the documented contract.
        if saving_to_path:
            output_dir = os.path.dirname(os.fspath(output_dest))
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        mask = _circular_mask(400)
        # NOTE(review): per the wordcloud docs, width/height are ignored when
        # a mask is supplied, so the canvas takes the mask's 400x400 shape —
        # confirm that is the intended resolution.
        wc = WordCloud(
            width=800,
            height=800,
            background_color="white",
            colormap="viridis",
            mask=mask,
            contour_width=2,
            contour_color="steelblue",
            stopwords=WORDCLOUD_STOPWORDS,
            max_words=100,
        ).generate(text_data)

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure", which is shared global state.
        fig.tight_layout(pad=0)
        fig.savefig(output_dest, dpi=150, bbox_inches="tight", format="png")

        if saving_to_path:
            print(f"[WordCloud] Saved to {output_dest}")
        else:
            print("[WordCloud] Written to in-memory buffer (temporal).")
        return True
    except Exception as e:
        print(f"[WordCloud] Error generating wordcloud: {e}")
        return False
    finally:
        # Always release the figure, even when rendering or saving failed;
        # pyplot keeps figures alive until they are closed.
        if fig is not None:
            plt.close(fig)