File size: 4,199 Bytes
fa8ff66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
wordcloud_service.py  –  Generate a word-cloud image from a list of texts.
Stripped from the original Colab notebook; only the generation function remains.
"""
from __future__ import annotations

import io
import os
import re
import numpy as np
import matplotlib
matplotlib.use("Agg")  # Must be before pyplot import — headless/no-display
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# ── Stopwords (same set as preprocessing.py) ──────────────────────────────────
# Best-effort load of the `stop_words` package list; any failure (package
# missing, language unavailable, ...) leaves this source empty.
try:
    from stop_words import get_stop_words
    _stopwords_id = get_stop_words('indonesian')
except Exception:
    _stopwords_id = []

# Best-effort load of the Sastrawi list, with the same empty fallback.
try:
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    _sastrawi_sw = StopWordRemoverFactory().get_stop_words()
except Exception:
    _sastrawi_sw = []

# Additional tokens blocked on top of whatever the libraries provide.
_EXTRA_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya',
    'oke', 'ok', 'bgt', 'jg', 'deh', 'sih', 'kok',
    'dong', 'udah', 'ya', 'banget', 'pakai', 'jadi', 'baru',
]

# Union of every word source, plus all single ASCII letters.
_BLOCKLIST = set().union(_stopwords_id, _sastrawi_sw, _EXTRA_STOPWORDS)
_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
WORDCLOUD_STOPWORDS = _BLOCKLIST.union(_SINGLE_LETTERS)


# ── Internal helpers ───────────────────────────────────────────────────────────

def _merge_texts(texts: list) -> str:
    """
    Flatten *texts* into one space-separated string of usable words.

    Falsy entries are skipped and the rest are stringified, joined and
    lower-cased. A whitespace-delimited token survives only if it consists
    purely of the letters a-z, is longer than two characters, and is not in
    WORDCLOUD_STOPWORDS. Tokens carrying digits or punctuation are dropped
    whole rather than cleaned.
    """
    combined = " ".join(str(item) for item in texts if item).lower()

    kept = []
    for word in combined.split():
        # Cheap rejections first: too short, or explicitly blocked.
        if len(word) <= 2 or word in WORDCLOUD_STOPWORDS:
            continue
        if re.match(r'^[a-z]+$', word):
            kept.append(word)
    return " ".join(kept)


def _circular_mask(size: int = 400) -> np.ndarray:
    x, y = np.ogrid[:size, :size]
    center = size // 2
    radius = center - 10
    mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2
    return (255 * mask).astype(np.uint8)


# ── Public API ─────────────────────────────────────────────────────────────────

def generate_wordcloud(texts: list, output_dest) -> bool:
    """
    Generate a circular wordcloud from a list of text strings.

    The texts are merged and filtered by ``_merge_texts``, rendered with
    ``WordCloud`` inside the mask from ``_circular_mask(400)``, drawn on a
    matplotlib figure and written out as PNG.

    Args:
        texts:       list of strings (raw or pre-processed).
        output_dest: file path (``str`` or ``os.PathLike``) OR a writable
                     binary buffer such as ``io.BytesIO``.
                     If a path is given, missing parent directories are
                     created and the PNG is saved to disk.
                     If a buffer is given, the PNG is written there
                     (no file is created on disk).

    Returns:
        True on success. False when ``texts`` is empty, when every token is
        filtered out, or when directory creation, rendering or saving raises;
        the error is printed rather than propagated.
    """
    if not texts:
        print("[WordCloud] No texts provided.")
        return False

    text_data = _merge_texts(texts)
    if not text_data.strip():
        print("[WordCloud] All text was filtered out by stopwords; nothing to plot.")
        return False

    # pathlib.Path and other path-likes are file destinations too, not buffers.
    saving_to_path = isinstance(output_dest, (str, os.PathLike))

    fig = None
    try:
        # Inside the try so a filesystem error yields False, as documented.
        if saving_to_path:
            output_dir = os.path.dirname(output_dest)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        mask = _circular_mask(400)
        # NOTE(review): per the wordcloud docs, width/height are ignored when
        # a mask is supplied and the mask's shape is used instead — confirm
        # whether an 800x800 canvas was actually intended.
        wc = WordCloud(
            width=800,
            height=800,
            background_color="white",
            colormap="viridis",
            mask=mask,
            contour_width=2,
            contour_color="steelblue",
            stopwords=WORDCLOUD_STOPWORDS,
            max_words=100,
        ).generate(text_data)

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure", which other code may have changed.
        fig.tight_layout(pad=0)
        fig.savefig(output_dest, dpi=150, bbox_inches="tight", format="png")

    except Exception as e:
        print(f"[WordCloud] Error generating wordcloud: {e}")
        return False

    finally:
        # Always release the figure; pyplot keeps figures alive until closed,
        # so skipping this on the error path would leak one per failed call.
        if fig is not None:
            plt.close(fig)

    if saving_to_path:
        print(f"[WordCloud] Saved to {output_dest}")
    else:
        print("[WordCloud] Written to in-memory buffer (temporal).")
    return True