import torch
import numpy as np
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
MarianMTModel,
MarianTokenizer,
)
from langdetect import detect, LangDetectException
import os
# Directory of the fine-tuned FinBERT checkpoint; overridable via MODEL_DIR env var.
FINBERT_DIR = os.getenv("MODEL_DIR", "models/finbert-finetuned")
# MarianMT checkpoint used to translate Turkish inputs to English before scoring.
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-tr-en"
# Max token length for FinBERT inputs (translation uses its own 256 limit below).
MAX_LENGTH = 128
# Best available accelerator: CUDA, then Apple MPS, then CPU fallback.
DEVICE_STR = (
"cuda" if torch.cuda.is_available()
else "mps" if torch.backends.mps.is_available()
else "cpu"
)
# Class-index -> label mapping for the 3-way sentiment head.
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
# Lazily-initialized singletons; populated by load_all_models() before inference.
_finbert_tokenizer = None
_finbert_model = None
_marian_tokenizer = None
_marian_model = None
def load_all_models():
    """Load the FinBERT classifier and the TR->EN Marian translator.

    Populates the module-level singletons (`_finbert_*`, `_marian_*`).
    Must be called once before any of the inference helpers in this module.
    FinBERT is moved to DEVICE_STR; the Marian model stays where
    `from_pretrained` puts it (CPU by default).
    """
    global _finbert_tokenizer, _finbert_model, _marian_tokenizer, _marian_model

    print(f"FinBERT yükleniyor: {FINBERT_DIR}")
    _finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_DIR)
    classifier = AutoModelForSequenceClassification.from_pretrained(FINBERT_DIR)
    _finbert_model = classifier.to(DEVICE_STR)
    _finbert_model.eval()
    print(f"  ✓ FinBERT hazır [{DEVICE_STR}]")

    print(f"Çeviri modeli yükleniyor: {TRANSLATE_MODEL}")
    _marian_tokenizer = MarianTokenizer.from_pretrained(TRANSLATE_MODEL)
    _marian_model = MarianMTModel.from_pretrained(TRANSLATE_MODEL)
    _marian_model.eval()
    print("  ✓ Çeviri modeli hazır")
def detect_language(text: str) -> str:
    """Best-effort language detection via langdetect.

    Returns the detected ISO 639-1 code, or "en" when detection fails
    (langdetect raises on empty / non-linguistic input).
    """
    try:
        lang = detect(text)
    except LangDetectException:
        lang = "en"
    return lang
def translate_to_english(texts: list[str]) -> list[str]:
    """Translate a batch of Turkish strings to English with MarianMT.

    Requires load_all_models() to have populated the Marian singletons.

    Args:
        texts: Turkish source strings.

    Returns:
        English translations, one per input, in the original order.
    """
    inputs = _marian_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,  # translation inputs get a larger budget than FinBERT's 128
    )
    # FIX: keep the encodings on the same device as the model. The original
    # left them on CPU, which fails as soon as the Marian model is moved to
    # GPU/MPS — and is inconsistent with run_finbert's .to(DEVICE_STR) pattern.
    inputs = inputs.to(_marian_model.device)
    with torch.no_grad():
        outputs = _marian_model.generate(**inputs)
    return _marian_tokenizer.batch_decode(outputs, skip_special_tokens=True)
def run_finbert(texts: list[str]) -> list[dict]:
    """Score a batch of English texts with the fine-tuned FinBERT model.

    Requires load_all_models() to have been called first.

    Args:
        texts: English input strings.

    Returns:
        One dict per input with keys "text", "sentiment", "confidence",
        and per-class "scores" (probabilities rounded to 4 decimals).
    """
    encoded = _finbert_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(DEVICE_STR)

    with torch.no_grad():
        logits = _finbert_model(**encoded).logits
    # Move probabilities back to host memory for numpy post-processing.
    probs = torch.softmax(logits, dim=-1).cpu().numpy()

    results = []
    for text, row in zip(texts, probs):
        winner = int(np.argmax(row))
        results.append({
            "text": text,
            "sentiment": ID2LABEL[winner],
            "confidence": round(float(row[winner]), 4),
            "scores": {
                "negative": round(float(row[0]), 4),
                "neutral": round(float(row[1]), 4),
                "positive": round(float(row[2]), 4),
            },
        })
    return results
def run_inference_multilingual(texts: list[str]) -> list[dict]:
    """Run sentiment inference on a mixed-language batch.

    Turkish inputs are translated to English with MarianMT before being
    scored with FinBERT; all other inputs are scored directly.

    Args:
        texts: Input strings in any language.

    Returns:
        One result dict per input, in the original order, with keys
        "text", "translated_text" (None when no translation happened),
        "sentiment", "confidence", "language" (detected code), "scores".
    """
    langs = [detect_language(t) for t in texts]
    results: list[dict | None] = [None] * len(texts)

    tr_indices = [i for i, lang in enumerate(langs) if lang == "tr"]
    other_indices = [i for i, lang in enumerate(langs) if lang != "tr"]

    if tr_indices:
        translated = translate_to_english([texts[i] for i in tr_indices])
        tr_results = run_finbert(translated)
        for j, idx in enumerate(tr_indices):
            r = tr_results[j]
            results[idx] = {
                "text": texts[idx],
                "translated_text": translated[j],
                "sentiment": r["sentiment"],
                "confidence": r["confidence"],
                "language": "tr",
                "scores": r["scores"],
            }

    if other_indices:
        other_results = run_finbert([texts[i] for i in other_indices])
        for idx, r in zip(other_indices, other_results):
            results[idx] = {
                "text": texts[idx],
                "translated_text": None,
                "sentiment": r["sentiment"],
                "confidence": r["confidence"],
                # FIX: the original hard-coded "en" here even though the
                # detected language was already computed in `langs`; report
                # the actual detection (e.g. "de", "fr") instead.
                "language": langs[idx],
                "scores": r["scores"],
            }

    return results