File size: 2,394 Bytes
b744f77
 
 
 
 
 
 
 
 
 
 
8d590ec
b744f77
8d590ec
 
 
 
b744f77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17d3919
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification



# Maximum number of tokens kept per input; longer texts are truncated by the tokenizer.
MAX_LEN: int = 64
# Class-index -> human-readable label. Order matters: index 0/1/2 must match
# the fine-tuned models' output logits (Negative, Neutral, Positive).
labels: list[str] = ["Negative", "Neutral", "Positive"]



# Fine-tuned model repo ids, supplied via environment variables.
# NOTE(review): os.getenv returns None when a variable is unset — presumably
# these are HuggingFace Hub repo ids; verify the env vars are set at deploy time.
MODEL_REPOS: dict[str, str | None] = {
    "roberta":       os.getenv("ROBERTA_MODEL"),
    "distilroberta": os.getenv("DISTILROBERTA_MODEL"),
    "bert":          os.getenv("BERT_MODEL"),
    "albert":        os.getenv("ALBERT_MODEL"),
}


# Base checkpoints used only for their tokenizers; the fine-tuned repos above
# supply the classification weights.
BASE_TOKENIZERS: dict[str, str] = {
    "roberta": "roberta-base",
    "distilroberta": "distilroberta-base",
    "bert": "bert-base-uncased",
    "albert": "albert-base-v2"
}

# Process-wide cache: model_name -> (tokenizer, model, device), filled lazily
# by load_model() so each model is downloaded/initialized at most once.
MODEL_CACHE: dict = {}




def load_model(model_name):
    """Load (and memoize) the tokenizer, model, and device for *model_name*.

    Args:
        model_name: Key into MODEL_REPOS / BASE_TOKENIZERS
            (e.g. "roberta", "bert").

    Returns:
        Tuple of (tokenizer, model, device); the model is moved to the
        selected device and set to eval mode.

    Raises:
        KeyError: If *model_name* is not a configured model.
        RuntimeError: If the model's environment variable is unset, so no
            HuggingFace repo id is available.
    """
    if model_name in MODEL_CACHE:
        return MODEL_CACHE[model_name]

    # Fail fast with an actionable message instead of letting from_pretrained
    # choke on an unknown key or a None repo id.
    if model_name not in MODEL_REPOS:
        raise KeyError(
            f"Unknown model '{model_name}'; expected one of {sorted(MODEL_REPOS)}"
        )
    repo_id = MODEL_REPOS[model_name]
    if not repo_id:
        raise RuntimeError(
            f"No repo configured for '{model_name}': the corresponding "
            f"environment variable (e.g. {model_name.upper()}_MODEL) is unset."
        )

    print(f"🔄 Loading {model_name} from HuggingFace...")

    # Tokenizer comes from the base checkpoint; weights from the fine-tuned repo.
    tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZERS[model_name])
    model = AutoModelForSequenceClassification.from_pretrained(repo_id)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    MODEL_CACHE[model_name] = (tokenizer, model, device)
    return tokenizer, model, device


def predict(text, model_name="roberta"):
    """Classify *text* with the given model.

    Args:
        text: Input string to classify.
        model_name: Key into MODEL_REPOS (default "roberta").

    Returns:
        Tuple of (label, probabilities) where label is one of `labels`
        and probabilities is a list of per-class floats.
    """
    tokenizer, model, device = load_model(model_name)

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
    ).to(device)

    # Inference only — no gradient tracking needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1)[0].cpu().numpy()

    best = int(np.argmax(probs))
    return labels[best], probs.tolist()


def compare_all_models(text):
    """Run *text* through every configured model and collect the results.

    Args:
        text: Input string to classify.

    Returns:
        List of dicts, one per model, each with keys: "model", "prediction",
        "confidence", and one lowercased entry per class label
        ("negative", "neutral", "positive") holding that class's probability.
    """
    results = []

    for model_name in MODEL_REPOS:
        # Delegate to predict() so the tokenize/softmax pipeline lives in one
        # place instead of being duplicated here.
        prediction, probs = predict(text, model_name)

        entry = {
            "model": model_name,
            "prediction": prediction,
            "confidence": float(max(probs)),
        }
        # Derive per-class keys from `labels` rather than hard-coding
        # probs[0..2], so the class ordering has a single source of truth.
        entry.update(
            {label.lower(): float(p) for label, p in zip(labels, probs)}
        )
        results.append(entry)

    return results