import os # access OS environment variables (PORT)
import re # metin isleme icin regexp kutuphanesi
import time # gecikme olcumu icin zaman fonksiyonlari
import gc # bellek temizligi icin garbage collector
from typing import Dict, Tuple, Optional, List # tip ipuclari icin
import gradio as gr # Hugging Face Spaces arayuzunu kurmak icin
import torch # pytorch modellerini calistirmak icin
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig
# ====== MODEL REGISTRY (plain and student friendly) ======
# Maps a short internal key to a display name, a Hugging Face hub id, and the
# kind of classification head ("2class"/"3class"/"5star") used later by
# normalize_label() to map raw labels onto negative/neutral/positive.
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output head type
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output head type
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output head type
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output head type
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # 1-5 star head, mapped to 3 classes later
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        # NOTE(review): the id below points at a BERT SST checkpoint, not an
        # ALBERT model — confirm the intended checkpoint.
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id
        "kind": "3class"  # output head type
    },
}
# ====== PER-LANGUAGE TOP3 CHOICES ======
LANG_TOP3 = {  # three candidate model keys (into MODELS) per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback trio for any other language
}
# ====== LAZY CACHE (load models only when needed) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}  # pipeline objects keyed by HF model id
_CFG_CACHE: Dict[str, AutoConfig] = {}  # model config objects keyed by HF model id
MAX_CACHE_SIZE = 4  # keep at most 4 distinct models resident at once
def cleanup_cache() -> None:
    """Evict the oldest cached models until the cache fits MAX_CACHE_SIZE, then free memory.

    Eviction is FIFO by insertion order of _PIPE_CACHE; the matching config
    entry is dropped alongside each pipeline.
    """
    excess = len(_PIPE_CACHE) - MAX_CACHE_SIZE  # how many entries over the limit
    for victim in list(_PIPE_CACHE)[:max(0, excess)]:  # oldest-first slice
        _PIPE_CACHE.pop(victim, None)  # drop the pipeline
        _CFG_CACHE.pop(victim, None)  # drop its config
    gc.collect()  # reclaim Python-side memory
    if torch.cuda.is_available():  # if a GPU is present
        torch.cuda.empty_cache()  # release cached GPU memory too
def get_pipe_and_cfg(model_key: str) -> "Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]":
    """Lazily load and cache the (pipeline, config) pair for a registry key.

    Looks up `model_key` in MODELS, reuses a cached pipeline when available,
    otherwise downloads model + tokenizer and caches both. Returns
    (None, None) when loading fails for any reason.
    """
    model_id = MODELS[model_key]["id"]  # resolve HF hub id from the registry
    if model_id in _PIPE_CACHE:  # fast path: already loaded
        return _PIPE_CACHE[model_id], _CFG_CACHE.get(model_id)
    try:
        classifier = TextClassificationPipeline(
            model=AutoModelForSequenceClassification.from_pretrained(model_id),  # download/load weights
            tokenizer=AutoTokenizer.from_pretrained(model_id),  # matching tokenizer
            framework="pt",  # PyTorch backend
            return_all_scores=True,  # scores for every class, not just the argmax
            device=-1  # CPU only
        )
        _PIPE_CACHE[model_id] = classifier  # remember the pipeline
        _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)  # remember the config (for id2label)
        cleanup_cache()  # keep the cache bounded
        return classifier, _CFG_CACHE[model_id]
    except Exception as e:  # any load failure degrades gracefully
        print(f"model yukleme hatasi: {model_key} -> {e}")
        return None, None
# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language-detection helper
except Exception:
    detect = None  # optional dependency: detect_lang() falls back to "other" when missing
def detect_lang(text: str) -> str:
    """Best-effort language detection; returns "en", "tr", or "other".

    Very short inputs, a missing langdetect library, and detector errors
    all map to "other".
    """
    cleaned = (text or "").strip()  # tolerate None and surrounding whitespace
    if len(cleaned) < 2 or detect is None:  # too short, or detector unavailable
        return "other"
    try:
        code = detect(cleaned)  # run the detector
    except Exception:  # langdetect can raise on odd input
        return "other"
    return code if code in ("en", "tr") else "other"  # only en/tr are supported
# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: "Optional[AutoConfig]", kind: str) -> str:
    """Map a model-specific label onto the canonical negative/neutral/positive set.

    Generic "LABEL_<n>" names are resolved through cfg.id2label when a config
    is supplied; "5star" models are bucketed 1-2 -> negative, 3 -> neutral,
    4-5 -> positive; anything unrecognized falls back to neutral.
    """
    label = (raw_label or "").lower()  # case-insensitive matching
    # resolve generic LABEL_<n> names via the model config's id2label map
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the raw label on any mapping failure
    if kind == "5star":  # star-rating heads
        star = re.search(r"([1-5])", label)  # find the star count
        if star:
            stars = int(star.group(1))
            return "negative" if stars <= 2 else ("neutral" if stars == 3 else "positive")
    # substring markers cover POSITIVE/NEGATIVE/NEUTRAL, POS/NEG/NEU, etc.
    for marker, canonical in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if marker in label:
            return canonical
    return "neutral"  # safe fallback (also the safe choice for 2-class heads)
# ====== ENGLISH PRE-PROCESSING ======
def preprocess_en(text: str) -> str:
    """Light normalization for English text before sentiment scoring.

    Collapses whitespace, squashes character runs of 4+ down to 2, masks
    URLs as "URL" and @mentions as "@USER", strips the '#' from hashtags,
    and expands common contractions.

    Fix vs. the previous version: contraction expansion now uses anchored,
    case-insensitive regexes, so "Can't"/"WON'T"/"Don't" are expanded too
    (plain str.replace was case-sensitive and unanchored).

    Args:
        text: raw English input; falsy values are returned unchanged.

    Returns:
        The normalized text.
    """
    if not text:  # empty or None: nothing to do
        return text
    t = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
    t = re.sub(r"(.)\1{3,}", r"\1\1", t)  # "soooo" -> "soo"
    t = re.sub(r"http[s]?://\S+", "URL", t)  # mask links
    t = re.sub(r"@\w+", "@USER", t)  # mask mentions
    t = re.sub(r"#(\w+)", r"\1", t)  # keep hashtag word, drop '#'
    # whole-word forms first so the generic "n't" rule cannot mangle them
    contractions = [
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "cannot"),
        (r"n't\b", " not"),
        (r"'re\b", " are"),
        (r"'ve\b", " have"),
        (r"'ll\b", " will"),
        (r"'d\b", " would"),
        (r"'m\b", " am"),
    ]
    for pattern, expansion in contractions:  # expand each contraction
        t = re.sub(pattern, expansion, t, flags=re.IGNORECASE)
    return t
# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language: "roberta" for English, "xlmr" otherwise."""
    defaults = {"en": "roberta", "tr": "xlmr"}  # explicit per-language choices
    return defaults.get(lang, "xlmr")  # everything else also gets the multilingual model
# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False) -> Dict:
    """Classify the sentiment of `text` and return a JSON-serializable dict.

    Args:
        text: input text; empty/whitespace-only input short-circuits to neutral.
        force_lang: optional override, one of "en"/"tr"/"other"; any other
            value falls back to automatic detection via detect_lang().
        benchmark: when True, run every TOP3 candidate model for the language
            and pick the winner by highest score, then lowest latency.

    Returns:
        Dict with label, score, confidence, lang, model_used,
        processing_time_ms; plus "candidates" when a benchmark ran, or an
        "error" key when the winning model could not be loaded.
    """
    txt = (text or "").strip()  # normalize raw input
    if not txt:  # empty input: answer neutral without loading any model
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)  # resolve language
    proc = preprocess_en(txt) if lang == "en" else txt  # English-only preprocessing
    candidates: List[Dict] = []  # benchmark results, if requested
    if benchmark:  # mini benchmark over this language's TOP3
        keys = LANG_TOP3.get(lang, LANG_TOP3["other"])  # candidate model keys
        for k in keys:
            pipe, cfg = get_pipe_and_cfg(k)  # load (or fetch cached) candidate
            if pipe is None:  # skip models that failed to load
                continue
            t0 = time.perf_counter()
            out = pipe(proc)[0]  # scores for every class
            ms = (time.perf_counter() - t0) * 1000.0  # latency in ms
            top = max(out, key=lambda s: s["score"])  # best-scoring class
            lab = normalize_label(top["label"], cfg, MODELS[k]["kind"])  # canonical label
            candidates.append({
                "model": k,
                "label": lab,
                "score": float(top["score"]),
                "latency_ms": round(ms, 2)
            })
        if candidates:  # winner: highest score, ties broken by lowest latency
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
            winner_key = candidates[0]["model"]
        else:  # every candidate failed to load
            winner_key = pick_default_key_for_lang(lang)
    else:  # no benchmark: per-language default straight away
        winner_key = pick_default_key_for_lang(lang)
    pipe, cfg = get_pipe_and_cfg(winner_key)  # load (or fetch cached) winner
    if pipe is None:  # winner could not be loaded either
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    t0 = time.perf_counter()
    out = pipe(proc)[0]  # final prediction (re-run even after a benchmark)
    ms = (time.perf_counter() - t0) * 1000.0
    top = max(out, key=lambda s: s["score"])  # best-scoring class
    label = normalize_label(top["label"], cfg, MODELS[winner_key]["kind"])  # canonical label
    score = float(top["score"])
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")  # coarse confidence bands
    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name only
        "processing_time_ms": round(ms, 2)
    }
    if benchmark and candidates:  # expose the full comparison too
        resp["candidates"] = candidates
    return resp
# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str) -> Tuple[str, List[List]]:
    """Run the per-language TOP3 benchmark over newline-separated sample texts.

    Each input line is treated as one sample, bucketed by detected language,
    and scored by every candidate model for that bucket.

    Returns:
        (summary_text, table_rows) for the Gradio Markdown/Dataframe outputs;
        each row is [text, bucket/model, label, score, latency_ms, confidence].
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # one sample per non-empty line
    if not texts:  # nothing to benchmark
        return "Uyari: metin alani bos.", []  # user-facing warning (Turkish) + empty table
    buckets = {"en": [], "tr": [], "other": []}  # language buckets
    for t in texts:
        buckets[detect_lang(t)].append(t)  # detect_lang only returns these three keys
    rows: List[List] = []  # output table rows (mutated by bench_set below)
    errors: List[str] = []  # collected error messages (likewise)
    def bench_set(text_list: List[str], keys: List[str], tag: str) -> None:
        # Run every candidate model over one language bucket, appending to rows/errors.
        if not text_list:  # empty bucket
            return
        for k in keys:
            spec = MODELS[k]  # registry entry
            pipe, cfg = get_pipe_and_cfg(k)  # load (or fetch cached) model
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load: mark every sample as ERROR
                errors.append(f"yuklenemedi: {modelname}")
                for t in text_list:
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next candidate
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # preprocess English only
            t0 = time.perf_counter()
            outs = pipe(proc)  # one batched prediction call
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # mean latency per sample
            for orig, out in zip(text_list, outs):
                try:
                    top = max(out, key=lambda s: s["score"])  # best-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # canonical label
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # coarse confidence
                    rows.append([
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated sample text
                        modelname,
                        lab,
                        round(sc, 4),
                        round(avg_ms, 1),
                        conf
                    ])
                except Exception as ex:  # keep going on per-sample failures
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # everything else
    # build the short summary text
    summary_lines: List[str] = []
    if errors:  # list every collected error
        summary_lines.append("Hatalar:")  # user-facing header (Turkish), kept as-is
        for e in errors:
            summary_lines.append(f"- {e}")
    if not summary_lines:  # no errors at all
        summary_lines.append("Benchmark tamamlandi.")  # user-facing success message
    return "\n".join(summary_lines), rows
# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # handler function
    inputs=[
        gr.Textbox(lines=3, label="Text"),  # input text
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # run the short TOP3 comparison
    ],
    outputs=gr.JSON(label="Result"),  # JSON output view
    title="Sentiment API (Production)",  # tab/page title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint contract summary
)
# NOTE(review): assigning api_name after construction may be a no-op in newer
# Gradio versions (it is normally passed to Interface or the event) — confirm.
api_intf.api_name = "analyze"  # intended endpoint path: /api/predict/analyze
with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # user-facing hint (Turkish), kept as-is
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line sample input
    btn = gr.Button("Calistir")  # run button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # rows grow with results
        col_count=(6, "fixed"),  # fixed column count
        interactive=False,  # read-only for the user
        wrap=True  # wrap long cell text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire button -> benchmark
demo = gr.TabbedInterface(  # two-tab top-level app
    [api_intf, bench_ui],  # tab 1: API, tab 2: benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)
if __name__ == "__main__":  # script entry point
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start the Gradio server