File size: 5,911 Bytes
fa8ff66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
sentiment.py  –  Sentiment analysis using IndoBERT / HuggingFace pipeline.
Model is loaded lazily (first call) to avoid crashing at import time.
"""
from __future__ import annotations

import os
from typing import Optional

# ── Model configuration ────────────────────────────────────────────────────────
# _LOCAL_MODEL_DIR is the "indoBERT-sentiment" directory located in the parent
# of this file's directory. Place a local fine-tuned model there to have it
# used; when that directory is missing or empty, the model named by
# _HF_MODEL_ID is loaded from HuggingFace instead.
_LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment")
_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"

# ── Lazy-loaded globals ────────────────────────────────────────────────────────
# Cached classifier object; stays None until _load_pipeline() builds it on the
# first call, so importing this module never loads the model.
_pipeline: Optional[object] = None


def _load_pipeline():
    """Return the shared text-classification pipeline, creating it on first use.

    The pipeline is stored in the module-level ``_pipeline`` global, so the
    model is loaded at most once per process. ``torch`` and ``transformers``
    are imported here rather than at module level so that importing this
    module does not require them to be loadable.

    Returns:
        The object returned by ``transformers.pipeline("text-classification", ...)``.
    """
    global _pipeline

    if _pipeline is None:
        import torch
        from transformers import pipeline as hf_pipeline

        # A non-empty local directory wins over the hub model (avoids repeated
        # downloads in Docker).
        has_local_model = os.path.isdir(_LOCAL_MODEL_DIR) and bool(os.listdir(_LOCAL_MODEL_DIR))
        if not has_local_model:
            model_source = _HF_MODEL_ID
            print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")
        else:
            model_source = _LOCAL_MODEL_DIR
            print(f"[Sentiment] Loading model from local dir: {model_source}")

        # Device index 0 when CUDA is available, otherwise -1.
        if torch.cuda.is_available():
            device = 0
        else:
            device = -1

        pipeline_options = dict(
            model=model_source,
            tokenizer=model_source,
            device=device,
            truncation=True,
            max_length=256,
            return_all_scores=False,
        )
        _pipeline = hf_pipeline("text-classification", **pipeline_options)
        print("[Sentiment] Model loaded successfully.")

    return _pipeline


# ── Helpers ────────────────────────────────────────────────────────────────────

def _normalize_label(lbl: str) -> str:
    """Normalise raw model label to 'positif', 'negatif', or 'netral'."""
    l = lbl.lower()
    if l in ("positif", "positive", "pos"):
        return "positif"
    if l in ("negatif", "negative", "neg"):
        return "negatif"
    if l in ("netral", "neutral", "neu"):
        return "netral"
    if "label_" in l:
        try:
            from transformers import AutoConfig
            cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
            idx = int(l.split("_")[-1])
            return _normalize_label(cfg.id2label[idx])
        except Exception:
            return "netral"
    return "netral"


# ── Keywords Override ──────────────────────────────────────────────────────────

_NEGATIVE_KEYWORDS = {
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur", 
    "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang", 
    "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
}

_POSITIVE_KEYWORDS = {
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah"
}

_NEUTRAL_KEYWORDS = {
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
}

def _override_label(text: str, model_label: str) -> str:
    text_lower = text.lower()
    
    if any(w in text_lower for w in _NEGATIVE_KEYWORDS):
        return "negatif"
    if any(w in text_lower for w in _POSITIVE_KEYWORDS):
        return "positif"
    if any(w in text_lower for w in _NEUTRAL_KEYWORDS):
        return "netral"
        
    return model_label


# ── Public API ─────────────────────────────────────────────────────────────────

def analyze_sentiment(texts: list) -> dict:
    """
    Classify each text and summarise the label counts.

    Empty and whitespace-only entries are dropped before classification.
    Each model prediction is normalised and then passed through the keyword
    override before being counted.

    Args:
        texts: list of pre-processed strings

    Returns:
        dict with keys: positif, negatif, netral, total, detail
        Example:
          {
            "positif": 12, "negatif": 4, "netral": 6, "total": 22,
            "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
          }
        "total" is the number of non-empty texts. If the classifier call
        raises, the counts are all 0, "detail" is empty, and "total" still
        reports the number of non-empty texts. Each "detail" entry carries
        at most the first 200 characters of the text and the model score
        rounded to 4 decimals.
    """
    # Drop blank entries; a falsy `texts` (None, []) yields nothing to do.
    non_empty = [t for t in texts if t and t.strip()] if texts else []
    if not non_empty:
        return {"positif": 0, "negatif": 0, "netral": 0, "total": 0, "detail": []}

    classifier = _load_pipeline()
    total = len(non_empty)

    try:
        predictions = classifier(non_empty, batch_size=16, truncation=True)
    except Exception as exc:
        print(f"[Sentiment] Prediction error: {exc}")
        return {"positif": 0, "negatif": 0, "netral": 0, "total": total, "detail": []}

    tally = {"positif": 0, "negatif": 0, "netral": 0}
    rows = []
    for raw_text, prediction in zip(non_empty, predictions):
        label = _override_label(raw_text, _normalize_label(prediction["label"]))
        tally[label] += 1
        rows.append(
            {
                "text": raw_text[:200],
                "label": label,
                "score": round(float(prediction["score"]), 4),
            }
        )

    return {**tally, "total": total, "detail": rows}