Spaces:
Sleeping
Sleeping
| import torch | |
| import numpy as np | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| MarianMTModel, | |
| MarianTokenizer, | |
| ) | |
| from langdetect import detect, LangDetectException | |
| import os | |
| FINBERT_DIR = os.getenv("MODEL_DIR", "models/finbert-finetuned") | |
| TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-tr-en" | |
| MAX_LENGTH = 128 | |
| DEVICE_STR = ( | |
| "cuda" if torch.cuda.is_available() | |
| else "mps" if torch.backends.mps.is_available() | |
| else "cpu" | |
| ) | |
| ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"} | |
| _finbert_tokenizer = None | |
| _finbert_model = None | |
| _marian_tokenizer = None | |
| _marian_model = None | |
| def load_all_models(): | |
| global _finbert_tokenizer, _finbert_model, _marian_tokenizer, _marian_model | |
| print(f"FinBERT yükleniyor: {FINBERT_DIR}") | |
| _finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_DIR) | |
| _finbert_model = AutoModelForSequenceClassification.from_pretrained( | |
| FINBERT_DIR | |
| ).to(DEVICE_STR) | |
| _finbert_model.eval() | |
| print(f" ✓ FinBERT hazır [{DEVICE_STR}]") | |
| print(f"Çeviri modeli yükleniyor: {TRANSLATE_MODEL}") | |
| _marian_tokenizer = MarianTokenizer.from_pretrained(TRANSLATE_MODEL) | |
| _marian_model = MarianMTModel.from_pretrained(TRANSLATE_MODEL) | |
| _marian_model.eval() | |
| print(f" ✓ Çeviri modeli hazır") | |
| def detect_language(text: str) -> str: | |
| try: | |
| return detect(text) | |
| except LangDetectException: | |
| return "en" | |
| def translate_to_english(texts: list[str]) -> list[str]: | |
| inputs = _marian_tokenizer( | |
| texts, | |
| return_tensors="pt", | |
| padding=True, | |
| truncation=True, | |
| max_length=256, | |
| ) | |
| with torch.no_grad(): | |
| outputs = _marian_model.generate(**inputs) | |
| return _marian_tokenizer.batch_decode(outputs, skip_special_tokens=True) | |
| def run_finbert(texts: list[str]) -> list[dict]: | |
| enc = _finbert_tokenizer( | |
| texts, | |
| padding=True, | |
| truncation=True, | |
| max_length=MAX_LENGTH, | |
| return_tensors="pt", | |
| ).to(DEVICE_STR) | |
| with torch.no_grad(): | |
| logits = _finbert_model(**enc).logits | |
| probs = torch.softmax(logits, dim=-1).cpu().numpy() | |
| results = [] | |
| for i, text in enumerate(texts): | |
| p = probs[i] | |
| pred_id = int(np.argmax(p)) | |
| results.append({ | |
| "text" : text, | |
| "sentiment" : ID2LABEL[pred_id], | |
| "confidence": round(float(p[pred_id]), 4), | |
| "scores" : { | |
| "negative": round(float(p[0]), 4), | |
| "neutral" : round(float(p[1]), 4), | |
| "positive": round(float(p[2]), 4), | |
| }, | |
| }) | |
| return results | |
| def run_inference_multilingual(texts: list[str]) -> list[dict]: | |
| langs = [detect_language(t) for t in texts] | |
| results = [None] * len(texts) | |
| tr_indices = [i for i, l in enumerate(langs) if l == "tr"] | |
| en_indices = [i for i, l in enumerate(langs) if l != "tr"] | |
| if tr_indices: | |
| tr_texts = [texts[i] for i in tr_indices] | |
| translated = translate_to_english(tr_texts) | |
| tr_results = run_finbert(translated) | |
| for j, idx in enumerate(tr_indices): | |
| r = tr_results[j] | |
| results[idx] = { | |
| "text" : texts[idx], | |
| "translated_text": translated[j], | |
| "sentiment" : r["sentiment"], | |
| "confidence" : r["confidence"], | |
| "language" : "tr", | |
| "scores" : r["scores"], | |
| } | |
| if en_indices: | |
| en_texts = [texts[i] for i in en_indices] | |
| en_results = run_finbert(en_texts) | |
| for j, idx in enumerate(en_indices): | |
| r = en_results[j] | |
| results[idx] = { | |
| "text" : texts[idx], | |
| "translated_text": None, | |
| "sentiment" : r["sentiment"], | |
| "confidence" : r["confidence"], | |
| "language" : "en", | |
| "scores" : r["scores"], | |
| } | |
| return results | |