# NOTE(review): the three lines below are Hugging Face web-page residue, not
# Python — commented out so the module parses. Original text preserved:
# sherdd's picture
# updating
# 4912b0b verified
import os # isletim sistemi degiskenlerine erismek icin
import re # metin isleme icin regexp kutuphanesi
import time # gecikme olcumu icin zaman fonksiyonlari
import gc # bellek temizligi icin garbage collector
from typing import Dict, Tuple, Optional, List # tip ipuclari icin
import gradio as gr # Hugging Face Spaces arayuzunu kurmak icin
import torch # pytorch modellerini calistirmak icin
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig
# ====== MODEL REGISTRY (simple and beginner-friendly) ======
# Maps a short internal key to a Hugging Face model spec:
#   name: human-readable display name
#   id:   Hugging Face Hub model identifier
#   kind: output scheme ("2class", "3class", or "5star"), consumed by normalize_label
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output scheme
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output scheme
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output scheme
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output scheme
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # output scheme
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        # NOTE(review): despite the "albert" key/name, this id points at a
        # BERT SST checkpoint — confirm this mapping is intentional.
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id
        "kind": "3class"  # output scheme
    },
}
# ====== PER-LANGUAGE TOP3 CHOICES ======
LANG_TOP3 = {  # three candidate MODELS keys per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback 3 for other languages
}
# ====== LAZY CACHE (modelleri ihtiyac olunca yukleme) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {} # pipeline nesneleri icin cache sozlugu
_CFG_CACHE: Dict[str, AutoConfig] = {} # model config nesneleri icin cache sozlugu
MAX_CACHE_SIZE = 4 # en fazla 4 farkli model cachede tut
def cleanup_cache() -> None: # cache buyurse eskileri silmek icin fonksiyon
while len(_PIPE_CACHE) > MAX_CACHE_SIZE: # eger siniri astiysa
oldest_key = next(iter(_PIPE_CACHE.keys())) # ilk ekleneni bul
_PIPE_CACHE.pop(oldest_key, None) # pipeline sil
_CFG_CACHE.pop(oldest_key, None) # config sil
gc.collect() # python cop toplayici cagir
if torch.cuda.is_available(): # eger gpu varsa
torch.cuda.empty_cache() # gpu bellegini de bosalt
def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return a (pipeline, config) pair for a registered model key.

    Loads tokenizer/model/config lazily on first use and serves subsequent
    calls from the module-level caches (keyed by HF model id). Returns
    (None, None) when any part of the load fails.
    """
    hub_id = MODELS[model_key]["id"]  # HF Hub identifier from the registry
    cached = _PIPE_CACHE.get(hub_id)
    if cached is not None:  # cache hit: no loading needed
        return cached, _CFG_CACHE.get(hub_id)
    try:
        tokenizer = AutoTokenizer.from_pretrained(hub_id)
        classifier = AutoModelForSequenceClassification.from_pretrained(hub_id)
        pipeline = TextClassificationPipeline(
            model=classifier,
            tokenizer=tokenizer,
            framework="pt",  # PyTorch backend
            # NOTE(review): return_all_scores is deprecated in newer
            # transformers; callers rely on the nested-list output shape,
            # so switching to top_k=None would require changing them too.
            return_all_scores=True,
            device=-1  # CPU only
        )
        _PIPE_CACHE[hub_id] = pipeline
        _CFG_CACHE[hub_id] = AutoConfig.from_pretrained(hub_id)
        cleanup_cache()  # evict oldest entries if over the limit
        return pipeline, _CFG_CACHE[hub_id]
    except Exception as exc:  # any load failure
        print(f"model yukleme hatasi: {model_key} -> {exc}")
        return None, None
# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language-detection library
except Exception:
    detect = None  # degrade gracefully when the library is unavailable

def detect_lang(text: str) -> str:
    """Classify *text* into "en", "tr", or "other" (the only supported buckets).

    Returns "other" for empty/too-short input, when langdetect is missing,
    or when detection raises (e.g. on digit-only strings).
    """
    stripped = (text or "").strip()
    if len(stripped) < 2:  # empty or too short to detect reliably
        return "other"
    if detect is None:  # library not installed
        return "other"
    try:
        code = detect(stripped)
    except Exception:
        return "other"  # detection failed
    return code if code in ("en", "tr") else "other"  # only en/tr are supported
# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional["AutoConfig"], kind: str) -> str:
    """Map a model-specific label onto {"negative", "neutral", "positive"}.

    Handles generic "LABEL_<n>" ids (resolved through cfg.id2label when
    available), 1-5 star outputs (kind == "5star"), and textual labels
    containing neg/neu/pos. Anything unrecognized falls back to "neutral"
    (a safe default for 2-class models as well).
    """
    label = (raw_label or "").lower()
    # Resolve generic "label_<idx>" names via the model config, when possible.
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the generic name if the lookup fails
    if kind == "5star":  # 1..5 star rating models
        star = re.search(r"([1-5])", label)
        if star:
            rating = int(star.group(1))
            if rating <= 2:
                return "negative"  # 1-2 stars
            return "neutral" if rating == 3 else "positive"  # 3 vs 4-5 stars
    for fragment, normalized in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if fragment in label:
            return normalized
    return "neutral"  # unrecognized label -> safe default
# ====== ENGLISH PREPROCESSING ======
def preprocess_en(text: str) -> str:
    """Lightweight English normalization for social-media-style text.

    Collapses whitespace, shortens elongated characters (4+ repeats -> 2),
    masks URLs as "URL" and mentions as "@USER", strips the '#' off
    hashtags, and expands common contractions. Falsy input is returned
    unchanged.
    """
    if not text:  # None or empty string
        return text
    cleaned = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)  # "soooo" -> "soo"
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)  # mask links
    cleaned = re.sub(r"@\w+", "@USER", cleaned)  # mask mentions
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)  # drop the hashtag sign
    # Contractions — specific forms first so "won't"/"can't" win over "n't".
    for contraction, expansion in (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    ):
        cleaned = cleaned.replace(contraction, expansion)
    return cleaned
# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language bucket.

    English gets "roberta"; Turkish and every other language fall back to
    the multilingual "xlmr".
    """
    return "roberta" if lang == "en" else "xlmr"
# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Run sentiment analysis on *text* and return a JSON-serializable dict.

    Args:
        text: input text; empty/whitespace-only input short-circuits to a
            neutral placeholder response without any inference.
        force_lang: optional "en"/"tr"/"other" override; any other value
            (including "") falls back to automatic detection.
        benchmark: when True, run the language's TOP3 candidate models,
            pick a winner (highest score, then lowest latency), and include
            the per-candidate results in the response.

    Returns:
        Dict with label, score, confidence, lang, model_used,
        processing_time_ms (and candidates in benchmark mode), or an error
        payload when the winning model cannot be loaded.
    """
    txt = (text or "").strip()  # sanitize the incoming text
    if not txt:  # empty input: skip inference entirely
        return {  # neutral placeholder response
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)  # resolve the language
    proc = preprocess_en(txt) if lang == "en" else txt  # English gets light preprocessing
    candidates: List[Dict] = []  # per-model benchmark results
    if benchmark:  # mini benchmark requested
        keys = LANG_TOP3.get(lang, LANG_TOP3["other"])  # TOP3 candidates for this language
        for k in keys:  # each candidate model
            pipe, cfg = get_pipe_and_cfg(k)  # lazy-load pipeline and config
            if pipe is None:  # model failed to load
                continue  # skip it
            t0 = time.perf_counter()  # start the latency timer
            out = pipe(proc)[0]  # run the model (per-class score list)
            ms = (time.perf_counter() - t0) * 1000.0  # latency in milliseconds
            top = max(out, key=lambda s: s["score"])  # best-scoring class
            lab = normalize_label(top["label"], cfg, MODELS[k]["kind"])  # standardize the label
            candidates.append({  # record this candidate
                "model": k,
                "label": lab,
                "score": float(top["score"]),
                "latency_ms": round(ms, 2)
            })
        if candidates:  # at least one candidate ran
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))  # score desc, then latency asc
            winner_key = candidates[0]["model"]  # best candidate wins
        else:  # nothing loaded
            winner_key = pick_default_key_for_lang(lang)  # fall back to the default
    else:  # no benchmark
        winner_key = pick_default_key_for_lang(lang)  # use the language default directly
    # NOTE(review): in benchmark mode the winner is re-run below even though
    # its prediction was just computed above — confirm the duplicate
    # inference is intended (it doubles the winner's latency cost).
    pipe, cfg = get_pipe_and_cfg(winner_key)  # pipeline/config of the winner
    if pipe is None:  # winner could not be loaded
        return {  # explicit error payload
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    t0 = time.perf_counter()  # start the latency timer
    out = pipe(proc)[0]  # single-shot prediction
    ms = (time.perf_counter() - t0) * 1000.0  # latency in milliseconds
    top = max(out, key=lambda s: s["score"])  # best-scoring class
    label = normalize_label(top["label"], cfg, MODELS[winner_key]["kind"])  # standardize the label
    score = float(top["score"])  # plain float for JSON
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")  # bucketed confidence
    resp = {  # assemble the response
        "label": label,  # final label
        "score": round(score, 4),  # score to 4 decimals
        "confidence": confidence,  # confidence bucket
        "lang": lang,  # resolved language
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name (id tail)
        "processing_time_ms": round(ms, 2)  # inference latency in ms
    }
    if benchmark and candidates:  # benchmark mode with results
        resp["candidates"] = candidates  # expose per-model results too
    return resp
# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):
    """Benchmark the TOP3 models per language over newline-separated texts.

    Each non-empty input line is bucketed by detected language, then scored
    by every candidate model of that bucket. Returns (summary_text, rows)
    where each row is [text, bucket/model, label, score, latency_ms,
    confidence]; load/prediction failures produce "ERROR" rows and are
    listed in the summary.
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # split lines, drop blanks
    if not texts:  # nothing to do
        return "Uyari: metin alani bos.", []  # user-facing warning (kept in Turkish) + empty table
    buckets = {"en": [], "tr": [], "other": []}  # per-language buckets
    for t in texts:  # route each text
        buckets[detect_lang(t)].append(t)  # into its language bucket
    rows: List[List] = []  # output table rows
    errors: List[str] = []  # collected error messages
    def bench_set(text_list: List[str], keys: List[str], tag: str):
        # Run every candidate model in *keys* over *text_list*,
        # appending results to the enclosing rows/errors lists.
        if not text_list:  # empty bucket
            return
        for k in keys:  # each candidate model
            spec = MODELS[k]  # registry entry
            pipe, cfg = get_pipe_and_cfg(k)  # lazy-load pipeline and config
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load
                errors.append(f"yuklenemedi: {modelname}")  # record the failure
                for t in text_list:  # one error row per text
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next model
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # preprocess English only
            t0 = time.perf_counter()  # start the latency timer
            outs = pipe(proc)  # batch prediction
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # mean latency per text
            for orig, out in zip(text_list, outs):  # pair originals with outputs
                try:
                    top = max(out, key=lambda s: s["score"])  # best-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # standardize the label
                    sc = float(top["score"])  # plain float
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # confidence bucket
                    rows.append([  # one table row per text
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,  # bucket/model name
                        lab,  # label
                        round(sc, 4),  # score
                        round(avg_ms, 1),  # mean latency
                        conf  # confidence
                    ])
                except Exception as ex:  # per-text prediction failure
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")  # record truncated error
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])  # error row
    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # run the English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # run the Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # run the remaining bucket
    # build the plain-text summary
    summary_lines: List[str] = []  # summary accumulator
    if errors:  # any failures?
        summary_lines.append("Hatalar:")  # "Errors:" header (user-facing, kept in Turkish)
        for e in errors:  # list each one
            summary_lines.append(f"- {e}")
    if not summary_lines:  # clean run
        summary_lines.append("Benchmark tamamlandi.")  # "Benchmark finished." (user-facing)
    return "\n".join(summary_lines), rows  # summary text + table rows
# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # handler function
    inputs=[  # input components
        gr.Textbox(lines=3, label="Text"),  # main text box
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # quick TOP3 comparison toggle
    ],
    outputs=gr.JSON(label="Result"),  # JSON output view
    title="Sentiment API (Production)",  # tab title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint summary (user-facing)
)
# NOTE(review): assigning .api_name after construction may be ignored by
# current Gradio versions (it is normally passed as a constructor/event
# argument) — verify the /api/predict/analyze route is actually exposed.
api_intf.api_name = "analyze"  # intended endpoint path: /api/predict/analyze
with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # usage hint (user-facing, Turkish)
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line input, one example per line
    btn = gr.Button("Calistir")  # "Run" button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # rows grow with results
        col_count=(6, "fixed"),  # fixed column count
        interactive=False,  # read-only for the user
        wrap=True  # wrap long text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire button -> benchmark
demo = gr.TabbedInterface(  # two-tab top-level app
    [api_intf, bench_ui],  # first tab API, second tab benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)
if __name__ == "__main__":  # script entry point
    # Bind to all interfaces; port comes from $PORT (HF Spaces convention), defaulting to 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start Gradio