File size: 4,074 Bytes
7701077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    MarianMTModel,
    MarianTokenizer,
)
from langdetect import detect, LangDetectException
import os

# Path to the fine-tuned FinBERT checkpoint; overridable via the MODEL_DIR env var.
FINBERT_DIR     = os.getenv("MODEL_DIR", "models/finbert-finetuned")
# Hugging Face model id for Turkish -> English machine translation.
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-tr-en"
# Max token length for FinBERT inputs (batches are padded/truncated to this).
MAX_LENGTH      = 128

# Best available torch device: CUDA GPU, then Apple MPS, then CPU fallback.
DEVICE_STR = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
# FinBERT class-index -> human-readable sentiment label.
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}

# Lazily-initialized singletons; populated once by load_all_models().
_finbert_tokenizer  = None
_finbert_model      = None
_marian_tokenizer   = None
_marian_model       = None

def load_all_models():
    """Load the FinBERT classifier and the Marian TR->EN translation model.

    Populates the module-level singletons (_finbert_tokenizer, _finbert_model,
    _marian_tokenizer, _marian_model) and switches both models to eval mode.
    FinBERT is moved to DEVICE_STR; the Marian model is left on its default
    (CPU) device, matching the CPU tensors translate_to_english feeds it.

    NOTE: status messages printed below are user-facing Turkish strings
    ("yükleniyor" = loading, "hazır" = ready).
    """
    global _finbert_tokenizer, _finbert_model, _marian_tokenizer, _marian_model

    print(f"FinBERT yükleniyor: {FINBERT_DIR}")
    _finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_DIR)
    _finbert_model     = AutoModelForSequenceClassification.from_pretrained(
        FINBERT_DIR
    ).to(DEVICE_STR)
    _finbert_model.eval()  # inference only: disable dropout etc.
    print(f"  ✓ FinBERT hazır [{DEVICE_STR}]")

    print(f"Çeviri modeli yükleniyor: {TRANSLATE_MODEL}")
    _marian_tokenizer = MarianTokenizer.from_pretrained(TRANSLATE_MODEL)
    _marian_model     = MarianMTModel.from_pretrained(TRANSLATE_MODEL)
    _marian_model.eval()
    print(f"  ✓ Çeviri modeli hazır")

def detect_language(text: str) -> str:
    """Best-effort language detection for *text*.

    Returns the ISO 639-1 code reported by langdetect, falling back to
    "en" when detection fails (e.g. empty or too-short input).
    """
    try:
        lang = detect(text)
    except LangDetectException:
        lang = "en"
    return lang

def translate_to_english(texts: list[str]) -> list[str]:
    """Translate a batch of (Turkish) texts to English with the Marian model.

    Requires load_all_models() to have been called first.

    Args:
        texts: Raw source sentences.

    Returns:
        One English translation per input, in the same order; [] for an
        empty batch.
    """
    # Guard: the tokenizer/generate pipeline cannot handle an empty batch.
    if not texts:
        return []
    inputs = _marian_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )
    # Keep inputs on the same device as the model, so this works whether the
    # Marian model lives on CPU (current default) or is later moved to GPU.
    inputs = inputs.to(_marian_model.device)
    with torch.no_grad():
        outputs = _marian_model.generate(**inputs)
    return _marian_tokenizer.batch_decode(outputs, skip_special_tokens=True)

def run_finbert(texts: list[str]) -> list[dict]:
    """Run FinBERT sentiment classification over a batch of English texts.

    Requires load_all_models() to have been called first.

    Args:
        texts: English input texts.

    Returns:
        One dict per input (same order) with keys:
        - text: the original input string
        - sentiment: label from ID2LABEL for the argmax class
        - confidence: softmax probability of the predicted class (4 dp)
        - scores: per-label softmax probabilities (4 dp)
        Returns [] for an empty batch.
    """
    # Guard: tokenizing/forwarding an empty batch would fail.
    if not texts:
        return []

    enc = _finbert_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(DEVICE_STR)

    with torch.no_grad():
        logits = _finbert_model(**enc).logits

    # Move probabilities back to CPU for numpy post-processing.
    probs = torch.softmax(logits, dim=-1).cpu().numpy()

    results = []
    for text, p in zip(texts, probs):
        pred_id = int(np.argmax(p))
        results.append({
            "text"      : text,
            "sentiment" : ID2LABEL[pred_id],
            "confidence": round(float(p[pred_id]), 4),
            # Build from ID2LABEL so label order/naming has a single source.
            "scores"    : {
                ID2LABEL[j]: round(float(p[j]), 4) for j in range(len(ID2LABEL))
            },
        })
    return results

def run_inference_multilingual(texts: list[str]) -> list[dict]:
    langs   = [detect_language(t) for t in texts]
    results = [None] * len(texts)

    tr_indices = [i for i, l in enumerate(langs) if l == "tr"]
    en_indices = [i for i, l in enumerate(langs) if l != "tr"]

    if tr_indices:
        tr_texts   = [texts[i] for i in tr_indices]
        translated = translate_to_english(tr_texts)
        tr_results = run_finbert(translated)

        for j, idx in enumerate(tr_indices):
            r = tr_results[j]
            results[idx] = {
                "text"           : texts[idx],
                "translated_text": translated[j],
                "sentiment"      : r["sentiment"],
                "confidence"     : r["confidence"],
                "language"       : "tr",
                "scores"         : r["scores"],
            }

    if en_indices:
        en_texts   = [texts[i] for i in en_indices]
        en_results = run_finbert(en_texts)

        for j, idx in enumerate(en_indices):
            r = en_results[j]
            results[idx] = {
                "text"           : texts[idx],
                "translated_text": None,
                "sentiment"      : r["sentiment"],
                "confidence"     : r["confidence"],
                "language"       : "en",
                "scores"         : r["scores"],
            }

    return results