| """ |
| sentiment.py — Sentiment analysis using IndoBERT / HuggingFace pipeline. |
| Model is loaded lazily (first call) to avoid crashing at import time. |
| """ |
| from __future__ import annotations |
|
|
| import os |
| from typing import Optional |
|
|
| |
| |
| |
| _LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment") |
| _HF_MODEL_ID = "taufiqdp/indonesian-sentiment" |
|
|
| |
| _pipeline: Optional[object] = None |
|
|
|
|
def _load_pipeline():
    """Return the shared HuggingFace text-classification pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``_pipeline``; every later call returns that same object. ``torch`` and
    ``transformers`` are imported inside this function so that merely
    importing the module does not require them.

    Returns:
        The object created by ``transformers.pipeline``.
    """
    global _pipeline
    if _pipeline is None:
        import torch
        from transformers import pipeline as hf_pipeline

        # A non-empty local model directory takes precedence over the Hub id.
        local_copy_present = (
            os.path.isdir(_LOCAL_MODEL_DIR) and os.listdir(_LOCAL_MODEL_DIR)
        )
        if local_copy_present:
            source = _LOCAL_MODEL_DIR
            print(f"[Sentiment] Loading model from local dir: {source}")
        else:
            source = _HF_MODEL_ID
            print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {source}")

        pipeline_options = {
            "model": source,
            "tokenizer": source,
            # 0 selects the first CUDA device, -1 selects the CPU.
            "device": 0 if torch.cuda.is_available() else -1,
            "truncation": True,
            "max_length": 256,
            "return_all_scores": False,
        }
        _pipeline = hf_pipeline("text-classification", **pipeline_options)
        print("[Sentiment] Model loaded successfully.")

    return _pipeline
|
|
|
|
| |
|
|
| def _normalize_label(lbl: str) -> str: |
| """Normalise raw model label to 'positif', 'negatif', or 'netral'.""" |
| l = lbl.lower() |
| if l in ("positif", "positive", "pos"): |
| return "positif" |
| if l in ("negatif", "negative", "neg"): |
| return "negatif" |
| if l in ("netral", "neutral", "neu"): |
| return "netral" |
| if "label_" in l: |
| try: |
| from transformers import AutoConfig |
| cfg = AutoConfig.from_pretrained(_HF_MODEL_ID) |
| idx = int(l.split("_")[-1]) |
| return _normalize_label(cfg.id2label[idx]) |
| except Exception: |
| return "netral" |
| return "netral" |
|
|
|
|
| |
|
|
| _NEGATIVE_KEYWORDS = { |
| "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol", |
| "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug", |
| "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur", |
| "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang", |
| "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan", |
| "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan" |
| } |
|
|
| _POSITIVE_KEYWORDS = { |
| "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih", |
| "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul", |
| "sempurna", "berhasil", "luas", "indah" |
| } |
|
|
| _NEUTRAL_KEYWORDS = { |
| "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak" |
| } |
|
|
| def _override_label(text: str, model_label: str) -> str: |
| text_lower = text.lower() |
| |
| if any(w in text_lower for w in _NEGATIVE_KEYWORDS): |
| return "negatif" |
| if any(w in text_lower for w in _POSITIVE_KEYWORDS): |
| return "positif" |
| if any(w in text_lower for w in _NEUTRAL_KEYWORDS): |
| return "netral" |
| |
| return model_label |
|
|
|
|
| |
|
|
def _empty_result(total: int = 0) -> dict:
    """Return a result dict with zeroed counters, *total*, and no detail rows."""
    return {"positif": 0, "negatif": 0, "netral": 0, "total": total, "detail": []}


def analyze_sentiment(texts: list) -> dict:
    """
    Run sentiment analysis on a list of text strings.

    Entries that are not strings, or that are empty / whitespace-only, are
    dropped before classification and are not counted in ``total``.

    Args:
        texts: list of pre-processed strings

    Returns:
        dict with keys: positif, negatif, netral, total, detail
        Example:
            {
                "positif": 12, "negatif": 4, "netral": 6, "total": 22,
                "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
            }
        Each detail row holds the first 200 characters of the text, the final
        label (after keyword override) and the model's score for its own
        prediction. If the pipeline call itself fails, the counters are 0,
        ``detail`` is empty and ``total`` is the number of texts that were
        submitted to the model.
    """
    if not texts:
        return _empty_result()

    # Keep only non-blank strings; the isinstance check stops stray values
    # (None, numbers, NaN) from raising AttributeError on .strip().
    texts = [t for t in texts if isinstance(t, str) and t.strip()]
    if not texts:
        return _empty_result()

    clf = _load_pipeline()

    try:
        preds = clf(texts, batch_size=16, truncation=True)
    except Exception as e:
        # Best-effort: report the failure and return an empty tally.
        print(f"[Sentiment] Prediction error: {e}")
        return _empty_result(total=len(texts))

    counts = {"positif": 0, "negatif": 0, "netral": 0}
    detail = []
    for text, pred in zip(texts, preds):
        model_label = _normalize_label(pred["label"])
        final_label = _override_label(text, model_label)

        counts[final_label] += 1
        detail.append({
            "text": text[:200],
            "label": final_label,
            "score": round(float(pred["score"]), 4),
        })

    return {
        "positif": counts["positif"],
        "negatif": counts["negatif"],
        "netral": counts["netral"],
        "total": len(texts),
        "detail": detail,
    }