File size: 2,648 Bytes
f7a8d72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

# Hugging Face model id to load; the HF_MODEL environment variable overrides
# the built-in default.
HF_MODEL = os.environ.get("HF_MODEL", "IMSyPP/hate_speech_multilingual")

# Human-readable names for the generic LABEL_<n> ids emitted by the model.
# The ordering is taken from the original model's card.
LABEL_NAMES = {
    "LABEL_0": "Appropriate",
    "LABEL_1": "Inappropriate",
    "LABEL_2": "Offensive",
    "LABEL_3": "Violent",
}

# Process-wide pipeline cache; stays None until _get_pipeline() builds it.
_pipeline = None


def _get_pipeline():
    """
    Return the shared text-classification pipeline, building it on first use.

    The first call loads the tokenizer and the sequence-classification model
    named by HF_MODEL, wraps them in a TextClassificationPipeline and stores
    the result in the module-level _pipeline. Every later call returns that
    cached object without reloading anything.
    """
    global _pipeline
    if _pipeline is None:
        hf_tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
        hf_model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)

        # The pipeline's device argument is -1 for CPU and a non-negative
        # CUDA index for GPU; use the first GPU whenever one is available.
        if torch.cuda.is_available():
            target_device = 0
        else:
            target_device = -1

        _pipeline = TextClassificationPipeline(
            tokenizer=hf_tokenizer,
            model=hf_model,
            device=target_device,
            task="sentiment_analysis",
            function_to_apply="softmax",
            top_k=None,
        )
    return _pipeline


def _normalize_scores(raw_output):
    """
    Normalize Hugging Face inference output into a sorted list:
    [{"label": str, "display_label": str, "score": float}, ...]

    The list is ordered by descending score. "display_label" comes from
    LABEL_NAMES when the label is listed there; otherwise it is the label
    with underscores turned into spaces and title-cased.

    Malformed entries are skipped rather than aborting the whole call:
    anything that is not a dict, has a missing, empty or non-string label,
    or carries a score that float() cannot convert. An entry with no
    "score" key at all keeps the default score of 0.0.

    Raises:
        ValueError: if raw_output is not a non-empty list, or if no usable
            entry remains after the malformed ones are skipped.
    """
    if not isinstance(raw_output, list) or not raw_output:
        raise ValueError("Unexpected model response format")

    # HF text-classification can return either:
    # - [{"label": "...", "score": ...}] or
    # - [[{"label": "...", "score": ...}, ...]] with return_all_scores=True
    # In the nested form only the first inner list is read.
    scores = raw_output[0] if isinstance(raw_output[0], list) else raw_output

    normalized = []
    for item in scores:
        if not isinstance(item, dict):
            continue

        # Check the label first: a non-string label would otherwise blow up
        # in label.replace() below with an AttributeError.
        label = item.get("label")
        if not isinstance(label, str) or not label:
            continue

        # A None / non-numeric score makes float() raise; treat that entry
        # like any other malformed one instead of leaking a TypeError.
        try:
            score = float(item.get("score", 0.0))
        except (TypeError, ValueError):
            continue

        normalized.append(
            {
                "label": label,
                "display_label": LABEL_NAMES.get(label, label.replace("_", " ").title()),
                "score": score,
            }
        )

    if not normalized:
        raise ValueError("Model returned no class scores")

    return sorted(normalized, key=lambda x: x["score"], reverse=True)


def predict(text):
    """
    Classify text with the local model and summarize the result.

    Returns a dict with these keys:
    - "model": the HF_MODEL id in use
    - "label" / "display_label" / "probability": the top-scoring class
    - "classes": every normalized class entry, highest score first

    Raises:
        RuntimeError: if loading the pipeline or running inference fails;
            the original exception is chained as the cause.
        ValueError: propagated from _normalize_scores when the model output
            cannot be normalized.
    """
    try:
        raw_result = _get_pipeline()(text)
    except Exception as exc:
        raise RuntimeError(f"Local model inference failed: {exc}") from exc

    # _normalize_scores returns entries sorted by descending score, so the
    # first one is the winning class.
    ranked = _normalize_scores(raw_result)
    best = ranked[0]

    result = {"model": HF_MODEL}
    result["label"] = best["label"]
    result["display_label"] = best["display_label"]
    result["probability"] = best["score"]
    result["classes"] = ranked
    return result