Spaces:
Sleeping
Sleeping
| import os # isletim sistemi degiskenlerine erismek icin | |
| import re # metin isleme icin regexp kutuphanesi | |
| import time # gecikme olcumu icin zaman fonksiyonlari | |
| import gc # bellek temizligi icin garbage collector | |
| from typing import Dict, Tuple, Optional, List # tip ipuclari icin | |
| import gradio as gr # Hugging Face Spaces arayuzunu kurmak icin | |
| import torch # pytorch modellerini calistirmak icin | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig | |
# ====== MODEL REGISTRY (simple and student-friendly) ======
# Maps a short internal key -> Hugging Face model id plus its output scheme.
# "kind" tells normalize_label() how to map raw labels back to the canonical
# set: "2class" (pos/neg), "3class" (pos/neu/neg) or "5star" (1..5 stars).
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output scheme
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output scheme
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output scheme
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output scheme
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # output scheme
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        # NOTE(review): this id is a BERT SST checkpoint, not ALBERT —
        # confirm the intended model for the "albert" entry.
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id
        "kind": "3class"  # output scheme
    },
}
# ====== PER-LANGUAGE TOP3 PICKS ======
# Keys must match MODELS; used by analyze(benchmark=True) and run_benchmark_auto.
LANG_TOP3 = {  # three candidate model keys per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback 3 for other languages
}
# ====== LAZY CACHE (load models only when needed) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}  # pipelines keyed by HF model id
_CFG_CACHE: Dict[str, AutoConfig] = {}  # model configs keyed by HF model id (same keys as above)
MAX_CACHE_SIZE = 4  # keep at most 4 distinct models resident at once
def cleanup_cache() -> None:
    """Trim the lazy model caches back to MAX_CACHE_SIZE and free memory.

    Relies on dict insertion order: the first keys are the oldest entries,
    so eviction is FIFO. Always runs a GC pass afterwards and empties the
    CUDA cache when a GPU is available.
    """
    excess = len(_PIPE_CACHE) - MAX_CACHE_SIZE
    if excess > 0:
        # snapshot the oldest keys first; both caches share the same keys
        for stale_key in list(_PIPE_CACHE)[:excess]:
            _PIPE_CACHE.pop(stale_key, None)
            _CFG_CACHE.pop(stale_key, None)
    gc.collect()  # reclaim the dropped model objects
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # release cached GPU memory as well
def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return a (pipeline, config) pair for `model_key`, loading lazily.

    Results are cached per HF model id; a load failure is reported on stdout
    and surfaced to the caller as (None, None).
    """
    model_id = MODELS[model_key]["id"]
    cached_pipe = _PIPE_CACHE.get(model_id)
    if cached_pipe is not None:
        return cached_pipe, _CFG_CACHE.get(model_id)
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        pipeline = TextClassificationPipeline(
            model=model,
            tokenizer=tokenizer,
            framework="pt",  # PyTorch backend
            return_all_scores=True,  # callers expect a score for every class
            device=-1  # CPU only (free Spaces hardware)
        )
        _PIPE_CACHE[model_id] = pipeline
        _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)
        cleanup_cache()  # evict oldest entries if the cache grew too large
        return pipeline, _CFG_CACHE[model_id]
    except Exception as exc:
        print(f"model yukleme hatasi: {model_key} -> {exc}")
        return None, None
# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language detection library
except Exception:
    detect = None  # fall back to None when the package is unavailable
def detect_lang(text: str) -> str:
    """Best-effort language detection; returns "en", "tr" or "other".

    Very short inputs, a missing langdetect package, and detection errors
    all fall back to "other".
    """
    candidate = (text or "").strip()
    if len(candidate) < 2 or detect is None:
        return "other"
    try:
        code = detect(candidate)
    except Exception:
        return "other"
    # only English and Turkish are treated specially downstream
    return code if code in ("en", "tr") else "other"
# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: "Optional[AutoConfig]", kind: str) -> str:
    """Map a raw model label onto the canonical set {negative, neutral, positive}.

    Generic "LABEL_<n>" ids are resolved through cfg.id2label when a config
    is available; "5star" models are mapped by star count (1-2 negative,
    3 neutral, 4-5 positive); everything else matches on neg/neu/pos
    substrings. Unrecognized labels fall back to "neutral", which is the
    safe choice for 2-class models.
    """
    label = (raw_label or "").lower()
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the generic label if the mapping fails
    if kind == "5star":
        match = re.search(r"([1-5])", label)
        if match:
            stars = int(match.group(1))
            if stars <= 2:
                return "negative"
            return "neutral" if stars == 3 else "positive"
    for token, canonical in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if token in label:
            return canonical
    return "neutral"
# ====== ENGLISH PREPROCESSING ======
def preprocess_en(text: str) -> str:
    """Lightweight tweet-style cleanup for English text before classification.

    Collapses whitespace, squeezes runs of 4+ repeated characters down to 2,
    masks URLs as "URL" and mentions as "@USER", strips hashtag markers, and
    expands common contractions (lowercase forms only). Empty input is
    returned unchanged.
    """
    if not text:
        return text
    cleaned = re.sub(r"\s+", " ", text).strip()
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)
    cleaned = re.sub(r"@\w+", "@USER", cleaned)
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)
    # contraction expansions; order matters ("won't"/"can't" before generic "n't")
    contractions = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    )
    for pattern, expansion in contractions:
        cleaned = cleaned.replace(pattern, expansion)
    return cleaned
# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language: "en" -> roberta, anything else -> xlmr."""
    return "roberta" if lang == "en" else "xlmr"
# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def _classify(pipe, cfg, kind: str, text: str):
    """Run one pipeline on one text; return (label, score, latency_ms)."""
    t0 = time.perf_counter()
    scores = pipe(text)[0]  # pipeline is built with return_all_scores=True
    latency_ms = (time.perf_counter() - t0) * 1000.0
    best = max(scores, key=lambda s: s["score"])
    return normalize_label(best["label"], cfg, kind), float(best["score"]), latency_ms


def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Production endpoint: classify the sentiment of `text`.

    Parameters:
        text: input text; empty/whitespace-only input yields a neutral result.
        force_lang: optional language override ("en"|"tr"|"other"). Case and
            surrounding whitespace are normalized, so "EN" also works; any
            other value (including "" from the UI textbox) triggers automatic
            detection via detect_lang().
        benchmark: when True, run the language's TOP3 models and pick the one
            with the highest score (ties broken by lower latency); all
            candidates are returned under "candidates".

    Returns a dict with label, score, confidence, lang, model_used and
    processing_time_ms — plus "candidates" when benchmarking, or
    "error": "model_load_failed" when the winning model cannot be loaded.
    """
    txt = (text or "").strip()
    if not txt:
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    # tolerate UI noise in the override ("", "EN", " tr ") before validating it
    forced = (force_lang or "").strip().lower()
    lang = forced if forced in ("en", "tr", "other") else detect_lang(txt)
    proc = preprocess_en(txt) if lang == "en" else txt
    candidates: List[Dict] = []
    if benchmark:
        for key in LANG_TOP3.get(lang, LANG_TOP3["other"]):
            pipe, cfg = get_pipe_and_cfg(key)
            if pipe is None:
                continue  # skip candidates that failed to load
            label, score, ms = _classify(pipe, cfg, MODELS[key]["kind"], proc)
            candidates.append({
                "model": key,
                "label": label,
                "score": score,
                "latency_ms": round(ms, 2)
            })
    if candidates:
        # highest score first, lower latency breaks ties
        candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
        winner_key = candidates[0]["model"]
    else:
        # no benchmark (or all candidates failed): fall back to the default model
        winner_key = pick_default_key_for_lang(lang)
    pipe, cfg = get_pipe_and_cfg(winner_key)
    if pipe is None:
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    label, score, ms = _classify(pipe, cfg, MODELS[winner_key]["kind"], proc)
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")
    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name
        "processing_time_ms": round(ms, 2)
    }
    if benchmark and candidates:
        resp["candidates"] = candidates
    return resp
# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):  # multi-text benchmark driver for the UI
    """Benchmark the TOP3 models over many texts, one example per input line.

    Texts are bucketed by detected language (en/tr/other) and each bucket is
    run through its LANG_TOP3 models in a single batched pipeline call.
    Returns (summary_text, table_rows) where each row is
    [text, bucket/model, label, score, avg_latency_ms, confidence].
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # split lines, drop blanks
    if not texts:  # nothing to benchmark
        return "Uyari: metin alani bos.", []  # Turkish UI warning + empty table
    buckets = {"en": [], "tr": [], "other": []}  # one bucket per language
    for t in texts:
        buckets[detect_lang(t)].append(t)  # detect_lang only ever returns these three keys
    rows: List[List] = []  # output table rows (mutated by bench_set below)
    errors: List[str] = []  # collected error messages (mutated by bench_set below)
    def bench_set(text_list: List[str], keys: List[str], tag: str):  # run one language bucket
        if not text_list:  # empty bucket, nothing to do
            return
        for k in keys:  # each candidate model
            spec = MODELS[k]  # registry entry
            pipe, cfg = get_pipe_and_cfg(k)  # lazy-load pipeline + config
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load: mark every text as ERROR
                errors.append(f"yuklenemedi: {modelname}")
                for t in text_list:
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next model
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # preprocess English only
            t0 = time.perf_counter()
            outs = pipe(proc)  # one batched prediction for the whole bucket
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # per-text average latency
            for orig, out in zip(text_list, outs):  # pair originals with predictions
                try:
                    top = max(out, key=lambda s: s["score"])  # best-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # canonical label
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # same thresholds as analyze()
                    rows.append([
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,
                        lab,
                        round(sc, 4),
                        round(avg_ms, 1),  # note: batch average, not per-text timing
                        conf
                    ])
                except Exception as ex:  # per-text failure: record and keep going
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # run the English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # run the Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # run the fallback bucket
    # build the summary text shown above the table
    summary_lines: List[str] = []
    if errors:
        summary_lines.append("Hatalar:")  # Turkish "Errors:" heading
        for e in errors:
            summary_lines.append(f"- {e}")
    if not summary_lines:  # no errors at all
        summary_lines.append("Benchmark tamamlandi.")  # Turkish "benchmark finished"
    return "\n".join(summary_lines), rows
# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # function to expose
    inputs=[  # input components
        gr.Textbox(lines=3, label="Text"),  # text to classify
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # quick TOP3 comparison toggle
    ],
    outputs=gr.JSON(label="Result"),  # render the response dict as JSON
    title="Sentiment API (Production)",  # tab/page title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint contract
)
# NOTE(review): assigning api_name after construction may not register the
# endpoint path in current Gradio versions — api_name is normally passed to
# gr.Interface(...) or to event listeners; confirm /api/predict/analyze works.
api_intf.api_name = "analyze"  # intended endpoint path: /api/predict/analyze
with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI tab
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # short usage hint (Turkish)
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line text input, one example per line
    btn = gr.Button("Calistir")  # "Run" button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # grow rows as needed
        col_count=(6, "fixed"),  # fixed number of columns
        interactive=False,  # read-only for the user
        wrap=True  # wrap long cell text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire the button to the benchmark
demo = gr.TabbedInterface(  # combined two-tab app
    [api_intf, bench_ui],  # first tab API, second tab benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)
if __name__ == "__main__":  # script entry point
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start Gradio; PORT env overrides 7860