Spaces:

NzTama
/

Sentiment

Runtime error

File size: 5,911 Bytes

fa8ff66

"""
sentiment.py  –  Sentiment analysis using IndoBERT / HuggingFace pipeline.
Model is loaded lazily (first call) to avoid crashing at import time.
"""
from __future__ import annotations

import os
from typing import Optional

# ── Model configuration ────────────────────────────────────────────────────────
# A local fine-tuned model is looked up in the `indoBERT-sentiment` directory
# that sits one level above this file's directory. If that directory is missing
# or empty, the model identified by _HF_MODEL_ID is loaded from HuggingFace.
# NOTE(review): an earlier version of this comment mentioned setting MODEL_DIR;
# nothing in this module reads such a setting, the path below is fixed.
_LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment")
_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"

# ── Lazy-loaded globals ────────────────────────────────────────────────────────
# Holds the pipeline object once _load_pipeline() has built it; None until then.
_pipeline: Optional[object] = None


def _load_pipeline():
    """Return the shared text-classification pipeline, building it on first use.

    The pipeline is stored in the module-level ``_pipeline`` variable, so the
    model is loaded at most once per process and every later call returns the
    same object. ``torch`` and ``transformers`` are imported here rather than
    at module level so that importing this module stays cheap.

    Model source: ``_LOCAL_MODEL_DIR`` when it is an existing, non-empty
    directory; otherwise ``_HF_MODEL_ID``.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    global _pipeline
    if _pipeline is None:
        import torch
        from transformers import pipeline as hf_pipeline

        # A non-empty local directory wins (avoids repeated downloads in Docker).
        local_model_present = os.path.isdir(_LOCAL_MODEL_DIR) and bool(
            os.listdir(_LOCAL_MODEL_DIR)
        )
        if local_model_present:
            model_source = _LOCAL_MODEL_DIR
            print(f"[Sentiment] Loading model from local dir: {model_source}")
        else:
            model_source = _HF_MODEL_ID
            print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")

        # Device index 0 when torch reports CUDA, otherwise -1.
        device = 0 if torch.cuda.is_available() else -1

        # NOTE(review): `return_all_scores` is reportedly deprecated in newer
        # transformers releases in favour of `top_k` — confirm against the
        # installed version before changing it, since `top_k` alters the
        # shape of the returned predictions.
        pipeline_options = {
            "model": model_source,
            "tokenizer": model_source,
            "device": device,
            "truncation": True,
            "max_length": 256,
            "return_all_scores": False,
        }
        _pipeline = hf_pipeline("text-classification", **pipeline_options)
        print("[Sentiment] Model loaded successfully.")

    return _pipeline


# ── Helpers ────────────────────────────────────────────────────────────────────

# Lowercased raw label -> canonical label.
_LABEL_ALIASES = {
    "positif": "positif", "positive": "positif", "pos": "positif",
    "negatif": "negatif", "negative": "negatif", "neg": "negatif",
    "netral": "netral", "neutral": "netral", "neu": "netral",
}

# index -> raw label name, copied from the _HF_MODEL_ID config on the first
# successful load so the config is not fetched again for every prediction.
_ID2LABEL_CACHE: dict = {}


def _normalize_label(lbl: str) -> str:
    """Normalise a raw model label to 'positif', 'negatif', or 'netral'.

    Known aliases (Indonesian/English names and their three-letter
    abbreviations, case-insensitive) map directly. Generic labels containing
    ``label_`` followed by a trailing integer (e.g. ``LABEL_2``) are resolved
    through the ``id2label`` table of the ``_HF_MODEL_ID`` config, which is
    loaded once and cached. Anything that cannot be resolved is 'netral'.

    Args:
        lbl: Label string as returned by the classifier.

    Returns:
        One of 'positif', 'negatif', 'netral'.
    """
    key = lbl.lower()
    alias = _LABEL_ALIASES.get(key)
    if alias is not None:
        return alias
    if "label_" not in key:
        return "netral"

    # Parse the index first so a malformed label never triggers a config load.
    try:
        idx = int(key.split("_")[-1])
    except ValueError:
        return "netral"

    if not _ID2LABEL_CACHE:
        try:
            from transformers import AutoConfig

            cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
            _ID2LABEL_CACHE.update(
                {int(k): str(v) for k, v in cfg.id2label.items()}
            )
        except Exception:
            # Best effort: a missing library, offline host or unexpected config
            # must not break classification. Nothing is cached on failure, so
            # a later call can try again.
            return "netral"

    # Resolve through the alias table only. The resolved name is never fed
    # back into this function, so a config that maps index N to "LABEL_N"
    # cannot cause unbounded recursion.
    resolved = _ID2LABEL_CACHE.get(idx, "")
    return _LABEL_ALIASES.get(resolved.lower(), "netral")


# ── Keywords Override ──────────────────────────────────────────────────────────
# Keyword lists consulted by _override_label(). Each entry is tested as a
# substring of the lowercased input text, and a hit replaces the model's label.
# Precedence when several groups match: negative, then positive, then neutral.
# NOTE(review): because matching is substring-based, short entries such as
# "ok" or "kik" also match inside longer, unrelated words.

# Any hit forces the label "negatif".
_NEGATIVE_KEYWORDS = {
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur", 
    "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang", 
    "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
}

# Any hit (when no negative keyword matched) forces the label "positif".
# "luar biasa" is a two-word phrase and is matched as a single substring.
_POSITIVE_KEYWORDS = {
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah"
}

# Any hit (when neither of the groups above matched) forces the label "netral".
_NEUTRAL_KEYWORDS = {
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
}

def _override_label(text: str, model_label: str) -> str:
    """Let keyword rules take precedence over the model's label.

    The lowercased text is scanned for keyword substrings, group by group, in
    a fixed priority order: negative, positive, neutral. The first group with
    a hit decides the label. Matching is by substring, so a keyword also hits
    when it appears inside a longer word.

    Args:
        text: The text that was classified.
        model_label: Normalised label produced by the model.

    Returns:
        'negatif', 'positif' or 'netral' when a keyword group matches,
        otherwise ``model_label`` unchanged.
    """
    haystack = text.lower()
    rules = (
        ("negatif", _NEGATIVE_KEYWORDS),
        ("positif", _POSITIVE_KEYWORDS),
        ("netral", _NEUTRAL_KEYWORDS),
    )
    for forced_label, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return forced_label
    return model_label


# ── Public API ─────────────────────────────────────────────────────────────────

def _empty_result(total: int = 0) -> dict:
    """Return a result dict with zero counts, no detail and the given total."""
    return {"positif": 0, "negatif": 0, "netral": 0, "total": total, "detail": []}


def analyze_sentiment(texts: list) -> dict:
    """
    Run sentiment analysis on a list of text strings.

    Entries that are not strings (e.g. None, NaN, numbers) and strings that
    are empty or whitespace-only are skipped before classification. Each
    remaining text is classified by the pipeline, its label is normalised,
    and the keyword override is applied on top.

    Args:
        texts: list of pre-processed strings

    Returns:
        dict with keys: positif, negatif, netral, total, detail
        Example:
          {
            "positif": 12, "negatif": 4, "netral": 6, "total": 22,
            "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
          }
        "total" counts the texts that were kept after filtering. Each detail
        entry carries the first 200 characters of the text, the final label
        and the model's score rounded to 4 decimals (the score is the model's
        even when a keyword override changed the label).
        If the pipeline call raises, the error is printed and the counts are
        all zero with an empty "detail", while "total" still reports the
        number of kept texts.
    """
    # The isinstance guard keeps truthy non-strings (NaN, ints, ...) from
    # reaching .strip(), which would raise AttributeError.
    cleaned = [t for t in (texts or ()) if isinstance(t, str) and t.strip()]
    if not cleaned:
        return _empty_result()

    # Deliberately outside the try: a model that cannot be loaded is a
    # configuration error and propagates to the caller.
    clf = _load_pipeline()

    try:
        preds = clf(cleaned, batch_size=16, truncation=True)
    except Exception as e:
        print(f"[Sentiment] Prediction error: {e}")
        return _empty_result(total=len(cleaned))

    counts = {"positif": 0, "negatif": 0, "netral": 0}
    detail = []
    for text, pred in zip(cleaned, preds):
        model_label = _normalize_label(pred["label"])
        final_label = _override_label(text, model_label)

        counts[final_label] += 1
        detail.append({
            "text": text[:200],
            "label": final_label,
            "score": round(float(pred["score"]), 4),
        })

    return {**counts, "total": len(cleaned), "detail": detail}