"""
sentiment.py — Sentiment analysis using IndoBERT / HuggingFace pipeline.
Model is loaded lazily (first call) to avoid crashing at import time.
"""
from __future__ import annotations
import os
from typing import Optional
# ── Model configuration ───────────────────────────────────────────────────────
# If you have a local fine-tuned model, place it in the "indoBERT-sentiment"
# directory one level above this file's directory (the path stored in
# _LOCAL_MODEL_DIR). Otherwise the model is downloaded from HuggingFace.
# Location checked for a local fine-tuned checkpoint: "indoBERT-sentiment",
# one directory above the directory that contains this file.
_LOCAL_MODEL_DIR = os.path.join(
    os.path.dirname(__file__),
    "..",
    "indoBERT-sentiment",
)

# HuggingFace Hub model id.
_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"
# ── Lazy-loaded globals ───────────────────────────────────────────────────────
# Module-wide cache for the classification pipeline. It stays None until
# _load_pipeline() builds the pipeline on its first call.
_pipeline: Optional[object] = None
def _load_pipeline():
    """Return the shared text-classification pipeline, building it on first use.

    The pipeline is stored in the module-level ``_pipeline`` global, so the
    model is loaded at most once per process; later calls return the cached
    object immediately.

    The model is taken from ``_LOCAL_MODEL_DIR`` when that directory exists and
    is non-empty, otherwise from the HuggingFace Hub id ``_HF_MODEL_ID``.

    Returns:
        The HuggingFace ``text-classification`` pipeline object.

    Raises:
        Whatever ``torch`` / ``transformers`` raise if they cannot be imported
        or the model cannot be loaded; nothing is caught here.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    # Heavy dependencies are imported here rather than at module level so that
    # importing this module never fails or stalls on them.
    import torch
    from transformers import pipeline as hf_pipeline

    # Prefer local model if it exists (avoids repeated downloads in Docker)
    if os.path.isdir(_LOCAL_MODEL_DIR) and os.listdir(_LOCAL_MODEL_DIR):
        model_source = _LOCAL_MODEL_DIR
        print(f"[Sentiment] Loading model from local dir: {model_source}")
    else:
        model_source = _HF_MODEL_ID
        print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")

    # Pipeline convention: device 0 is the first CUDA GPU, -1 is the CPU.
    device = 0 if torch.cuda.is_available() else -1

    # NOTE: the deprecated ``return_all_scores=False`` flag is intentionally
    # not passed. Returning only the top label per input is already the
    # pipeline default, and the flag merely triggers a deprecation warning.
    _pipeline = hf_pipeline(
        "text-classification",
        model=model_source,
        tokenizer=model_source,
        device=device,
        truncation=True,
        max_length=256,
    )
    print("[Sentiment] Model loaded successfully.")
    return _pipeline
# ── Helpers ───────────────────────────────────────────────────────────────────
# Memoised ``id2label`` table of the Hub model config. It is filled on the
# first successful lookup so that generic "LABEL_<n>" outputs do not reload
# the config for every single prediction.
_hub_id2label_cache: dict = {}


def _hub_id2label() -> dict:
    """Return the ``id2label`` mapping of the ``_HF_MODEL_ID`` config.

    The config is loaded once and cached. A failed load is not cached, so a
    later call tries again.

    Raises:
        Whatever importing ``transformers`` or ``AutoConfig.from_pretrained``
        raises; the caller decides how to degrade.
    """
    if not _hub_id2label_cache:
        from transformers import AutoConfig

        cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
        _hub_id2label_cache.update(cfg.id2label)
    return _hub_id2label_cache


def _normalize_label(lbl: str) -> str:
    """Normalise raw model label to 'positif', 'negatif', or 'netral'.

    Matching is case-insensitive and ignores surrounding whitespace.
    Indonesian and English names and their three-letter abbreviations are
    recognised. A generic ``LABEL_<n>`` name is translated through the
    ``id2label`` table of the ``_HF_MODEL_ID`` config.

    Args:
        lbl: label string as emitted by the classifier.

    Returns:
        One of ``"positif"``, ``"negatif"`` or ``"netral"``. Anything that is
        unknown or cannot be resolved falls back to ``"netral"``.
    """
    key = lbl.strip().lower()
    if key in ("positif", "positive", "pos"):
        return "positif"
    if key in ("negatif", "negative", "neg"):
        return "negatif"
    if key in ("netral", "neutral", "neu"):
        return "netral"

    if "label_" in key:
        try:
            idx = int(key.split("_")[-1])
            resolved = str(_hub_id2label()[idx]).strip().lower()
        except Exception:
            # Best effort: a missing library, an unreachable or absent config,
            # or a malformed index all degrade to the neutral class.
            return "netral"
        if "label_" in resolved:
            # The config itself only has generic names; resolving again would
            # loop forever, so stop here.
            return "netral"
        return _normalize_label(resolved)

    return "netral"
# ── Keywords Override ─────────────────────────────────────────────────────────
# Keyword lists used by _override_label(). Entries are lower-case; an entry
# may be a single word or a space-separated phrase (e.g. "luar biasa").
_NEGATIVE_KEYWORDS = {
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur",
    "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang",
    "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan",
}
_POSITIVE_KEYWORDS = {
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah",
}
_NEUTRAL_KEYWORDS = {
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak",
}


def _has_keyword(padded_text: str, keywords) -> bool:
    """Return True if any keyword occurs in *padded_text* as a whole word/phrase.

    *padded_text* must be lower-case, have its words separated by single
    spaces, and start and end with a space, so that ``" kw "`` can only match
    on word boundaries.
    """
    return any(f" {kw} " in padded_text for kw in keywords)


def _override_label(text: str, model_label: str) -> str:
    """Override the model's label when the text contains a known keyword.

    Keywords are matched case-insensitively as whole words (or whole phrases),
    not as raw substrings. A short keyword such as "ok" therefore no longer
    fires inside unrelated words like "dokter" or "besok", nor "salah" inside
    "masalah". Affixed forms (e.g. "jeleknya") are not matched and fall
    through to the model's own prediction.

    Precedence is negative, then positive, then neutral: a text that contains
    both a negative and a positive keyword is labelled "negatif".

    Args:
        text: the text that was classified.
        model_label: the (already normalised) label predicted by the model.

    Returns:
        "negatif", "positif" or "netral" when a keyword matches, otherwise
        *model_label* unchanged.
    """
    # Turn every non-alphanumeric character into a separator, then rebuild the
    # text as " word word word " so each keyword test is a boundary-safe
    # substring check that also works for multi-word phrases.
    lowered = text.lower()
    words = "".join(ch if ch.isalnum() else " " for ch in lowered).split()
    padded = f" {' '.join(words)} "

    if _has_keyword(padded, _NEGATIVE_KEYWORDS):
        return "negatif"
    if _has_keyword(padded, _POSITIVE_KEYWORDS):
        return "positif"
    if _has_keyword(padded, _NEUTRAL_KEYWORDS):
        return "netral"
    return model_label
# ── Public API ────────────────────────────────────────────────────────────────
def analyze_sentiment(texts: list) -> dict:
    """
    Run sentiment analysis on a list of text strings.

    Args:
        texts: list of pre-processed strings. ``None``, blank strings and
            non-string entries (for example NaN values coming from a
            DataFrame column) are skipped instead of raising.

    Returns:
        dict with keys: positif, negatif, netral, total, detail.
        ``total`` is the number of usable (non-blank string) texts. ``detail``
        holds one entry per usable text with the text truncated to 200
        characters, the final label, and the model score rounded to 4
        decimals. The final label is the normalised model label after the
        keyword override has been applied.

        If the classifier call raises, the error is printed and a result with
        zero counts, ``total`` set to the number of usable texts and an empty
        ``detail`` list is returned.

    Example:
        {
            "positif": 12, "negatif": 4, "netral": 6, "total": 22,
            "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
        }
    """
    # Keep only real, non-blank strings. The isinstance check guards against
    # truthy non-string values (NaN, numbers) that have no .strip().
    usable = [t for t in (texts or []) if isinstance(t, str) and t.strip()]
    if not usable:
        # Return before touching the model so empty input never loads it.
        return {"positif": 0, "negatif": 0, "netral": 0, "total": 0, "detail": []}

    clf = _load_pipeline()
    try:
        preds = clf(usable, batch_size=16, truncation=True)
    except Exception as e:
        # Best effort: report the failure and return an empty breakdown.
        print(f"[Sentiment] Prediction error: {e}")
        return {"positif": 0, "negatif": 0, "netral": 0, "total": len(usable), "detail": []}

    counts = {"positif": 0, "negatif": 0, "netral": 0}
    detail = []
    for text, pred in zip(usable, preds):
        model_label = _normalize_label(pred["label"])
        final_label = _override_label(text, model_label)
        counts[final_label] += 1
        detail.append({
            "text": text[:200],
            "label": final_label,
            "score": round(float(pred["score"]), 4),
        })

    return {
        "positif": counts["positif"],
        "negatif": counts["negatif"],
        "netral": counts["netral"],
        "total": len(usable),
        "detail": detail,
    }