import os  # environment variables (PORT for the server)
import re  # regex-based text processing
import time  # latency measurement
import gc  # garbage collection for cache cleanup
from typing import Dict, Tuple, Optional, List  # type hints
import gradio as gr  # Hugging Face Spaces UI
import torch  # backend for running the models
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig

# ====== MODEL REGISTRY (simple and student friendly) ======
# Each entry maps a short key to a display name, a Hugging Face model id and
# the output head type: "2class" (pos/neg), "3class" (neg/neu/pos) or
# "5star" (1..5 star rating).
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "kind": "3class",
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",
        "id": "distilbert-base-uncased-finetuned-sst-2-english",
        "kind": "2class",
    },
    "bertweet": {
        "name": "BERTweet 3class EN",
        "id": "finiteautomata/bertweet-base-sentiment-analysis",
        "kind": "3class",
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
        "kind": "3class",
    },
    "bert_5star": {
        "name": "BERT Multi 5star",
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",
        "kind": "5star",
    },
    "albert": {
        "name": "ALBERT v2 3class Light",
        # NOTE(review): this checkpoint is a BERT SST model, not ALBERT — confirm the intended id
        "id": "barissayil/bert-sentiment-analysis-sst",
        "kind": "3class",
    },
}

# ====== PER-LANGUAGE TOP3 CANDIDATES ======
LANG_TOP3 = {
    "en": ["roberta", "distilbert", "bertweet"],   # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],        # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"],    # fallback trio for any other language
}

# ====== LAZY CACHE (load models only when needed) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}  # pipelines keyed by HF model id
_CFG_CACHE: Dict[str, AutoConfig] = {}  # configs keyed by HF model id
MAX_CACHE_SIZE = 4  # keep at most 4 distinct models cached


def cleanup_cache() -> None:
    """Evict oldest entries until at most MAX_CACHE_SIZE pipelines remain cached."""
    evicted = False
    while len(_PIPE_CACHE) > MAX_CACHE_SIZE:
        oldest_key = next(iter(_PIPE_CACHE))  # dicts keep insertion order -> first key is oldest
        _PIPE_CACHE.pop(oldest_key, None)
        _CFG_CACHE.pop(oldest_key, None)
        evicted = True
    if evicted:  # collect once after eviction, not per iteration
        gc.collect()  # release dropped model tensors promptly
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # also release cached GPU memory


def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return (pipeline, config) for a MODELS key, loading and caching lazily.

    Results are cached by HF model id. Returns (None, None) when the model
    cannot be downloaded or loaded.
    """
    spec = MODELS[model_key]
    model_id = spec["id"]
    if model_id in _PIPE_CACHE:
        return _PIPE_CACHE[model_id], _CFG_CACHE.get(model_id)
    try:
        tok = AutoTokenizer.from_pretrained(model_id)
        mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
        # Load the config BEFORE touching the caches so a config failure
        # cannot leave a pipeline cached without its config.
        cfg = AutoConfig.from_pretrained(model_id)
        pipe = TextClassificationPipeline(
            model=mdl,
            tokenizer=tok,
            framework="pt",  # PyTorch backend
            return_all_scores=True,  # scores for every class; deprecated alias of top_k=None
            device=-1,  # CPU
        )
        _PIPE_CACHE[model_id] = pipe
        _CFG_CACHE[model_id] = cfg
        cleanup_cache()  # enforce the cache bound after inserting
        return pipe, cfg
    except Exception as e:  # any load error (network, missing files, ...) is non-fatal
        print(f"model yukleme hatasi: {model_key} -> {e}")
        return None, None


# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language detection
except Exception:  # optional dependency: degrade gracefully when unavailable
    detect = None
def detect_lang(text: str) -> str:
    """Detect the language of *text*; returns 'en', 'tr' or 'other'."""
    t = (text or "").strip()
    if not t or len(t) < 2:  # too short to detect reliably
        return "other"
    if detect is None:  # langdetect not installed
        return "other"
    try:
        lang = detect(t)
        return lang if lang in ("en", "tr") else "other"  # only en/tr handled explicitly
    except Exception:  # langdetect raises on e.g. digit-only input
        return "other"


# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional[AutoConfig], kind: str) -> str:
    """Map a model-specific label to 'negative' / 'neutral' / 'positive'.

    Resolves generic 'LABEL_<i>' names via cfg.id2label, maps 1-5 star
    outputs (kind == "5star") and neg/neu/pos substrings; defaults to
    'neutral' (safe choice for unseen 2-class labels).
    """
    lbl = (raw_label or "").lower()
    if lbl.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            idx = int(lbl.split("_")[-1])
            lbl = str(cfg.id2label[idx]).lower()  # resolve the real class name
        except Exception:
            pass  # keep the raw label on any lookup error
    if kind == "5star":
        m = re.search(r"([1-5])", lbl)
        if m:
            stars = int(m.group(1))
            if stars <= 2:
                return "negative"
            if stars == 3:
                return "neutral"
            return "positive"  # 4 or 5 stars
    if "neg" in lbl:
        return "negative"
    if "neu" in lbl:
        return "neutral"
    if "pos" in lbl:
        return "positive"
    return "neutral"


# ====== ENGLISH PREPROCESSING ======
# Contraction expansions; "won't"/"can't" must precede the generic "n't".
# Hoisted to module level so the dict is not rebuilt on every call.
_EN_CONTRACTIONS = {
    "won't": "will not", "can't": "cannot", "n't": " not", "'re": " are",
    "'ve": " have", "'ll": " will", "'d": " would", "'m": " am",
}


def preprocess_en(text: str) -> str:
    """Light normalization for English text before classification."""
    if not text:
        return text
    t = re.sub(r"\s+", " ", text).strip()  # collapse runs of whitespace
    t = re.sub(r"(.)\1{3,}", r"\1\1", t)  # shorten 4+ repeated chars to 2
    t = re.sub(r"http[s]?://\S+", "URL", t)  # replace links with a placeholder
    t = re.sub(r"@\w+", "@USER", t)  # anonymize mentions
    t = re.sub(r"#(\w+)", r"\1", t)  # drop the hashtag sign, keep the word
    for old, new in _EN_CONTRACTIONS.items():
        t = t.replace(old, new)
    return t


# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Default MODELS key per language: RoBERTa for English, XLM-R otherwise."""
    return "roberta" if lang == "en" else "xlmr"


# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Classify the sentiment of *text*.

    force_lang: optional 'en'|'tr'|'other' override (case/whitespace
        tolerant); any other value falls back to automatic detection.
    benchmark: when True, run the language's TOP3 models and pick the winner
        (highest score, ties broken by lowest latency); the candidate list is
        included in the response.
    Returns a dict with label, score, confidence, lang, model_used,
    processing_time_ms and optionally candidates / error.
    """
    txt = (text or "").strip()
    if not txt:  # empty input -> neutral without touching a model
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0,
        }
    # Normalize the override: the UI passes a raw Textbox value, so values
    # like "EN " previously failed the exact-match check and were ignored.
    fl = (force_lang or "").strip().lower()
    lang = fl if fl in ("en", "tr", "other") else detect_lang(txt)
    proc = preprocess_en(txt) if lang == "en" else txt

    candidates: List[Dict] = []
    if benchmark:
        for k in LANG_TOP3.get(lang, LANG_TOP3["other"]):
            pipe, cfg = get_pipe_and_cfg(k)
            if pipe is None:  # skip models that failed to load
                continue
            t0 = time.perf_counter()
            out = pipe(proc)[0]
            ms = (time.perf_counter() - t0) * 1000.0
            top = max(out, key=lambda s: s["score"])
            lab = normalize_label(top["label"], cfg, MODELS[k]["kind"])
            candidates.append({
                "model": k,
                "label": lab,
                "score": float(top["score"]),
                "latency_ms": round(ms, 2),
            })
        if candidates:
            # Highest score first, then lowest latency.
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
            winner_key = candidates[0]["model"]
        else:
            winner_key = pick_default_key_for_lang(lang)
    else:
        winner_key = pick_default_key_for_lang(lang)

    pipe, cfg = get_pipe_and_cfg(winner_key)
    if pipe is None:
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed",
        }
    t0 = time.perf_counter()
    out = pipe(proc)[0]  # single-shot prediction with the winner
    ms = (time.perf_counter() - t0) * 1000.0
    top = max(out, key=lambda s: s["score"])
    label = normalize_label(top["label"], cfg, MODELS[winner_key]["kind"])
    score = float(top["score"])
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")
    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name
        "processing_time_ms": round(ms, 2),
    }
    if benchmark and candidates:
        resp["candidates"] = candidates
    return resp


# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):
    """Run each language bucket's TOP3 models over newline-separated examples.

    Returns (summary_text, table_rows); each row is
    [text, bucket/model, label, score, avg_latency_ms, confidence].
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]
    if not texts:
        return "Uyari: metin alani bos.", []

    buckets = {"en": [], "tr": [], "other": []}
    for t in texts:
        buckets[detect_lang(t)].append(t)  # detect_lang only returns these three keys

    rows: List[List] = []
    errors: List[str] = []

    def bench_set(text_list: List[str], keys: List[str], tag: str) -> None:
        # Run every candidate model in *keys* over one language bucket,
        # appending result/error rows to the shared `rows`/`errors` lists.
        if not text_list:
            return
        for k in keys:
            spec = MODELS[k]
            pipe, cfg = get_pipe_and_cfg(k)
            modelname = f"{tag}/{spec['name']}"
            if pipe is None:
                errors.append(f"yuklenemedi: {modelname}")
                for t in text_list:
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]
            t0 = time.perf_counter()
            outs = pipe(proc)  # batched inference
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))
            for orig, out in zip(text_list, outs):
                try:
                    top = max(out, key=lambda s: s["score"])
                    lab = normalize_label(top["label"], cfg, spec["kind"])
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")
                    rows.append([
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,
                        lab,
                        round(sc, 4),
                        round(avg_ms, 1),
                        conf,
                    ])
                except Exception as ex:
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])

    bench_set(buckets["en"], LANG_TOP3["en"], "EN")
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")

    # Build the short summary text.
    summary_lines: List[str] = []
    if errors:
        summary_lines.append("Hatalar:")
        summary_lines.extend(f"- {e}" for e in errors)
    if not summary_lines:
        summary_lines.append("Benchmark tamamlandi.")
    return "\n".join(summary_lines), rows


# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(
    fn=analyze,
    inputs=[
        gr.Textbox(lines=3, label="Text"),
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),
    ],
    outputs=gr.JSON(label="Result"),
    title="Sentiment API (Production)",
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}",
)
# NOTE(review): attribute set post-construction; confirm the installed Gradio
# version honours this for the /api/predict/analyze endpoint path.
api_intf.api_name = "analyze"

with gr.Blocks(title="Sentiment Benchmark") as bench_ui:
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")
    btn = gr.Button("Calistir")
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],
        row_count=(0, "dynamic"),
        col_count=(6, "fixed"),
        interactive=False,  # read-only table
        wrap=True,
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])

demo = gr.TabbedInterface(
    [api_intf, bench_ui],  # first tab: API, second tab: benchmark
    tab_names=["API", "Benchmark"],
)

if __name__ == "__main__":
    # Bind to all interfaces; Spaces provides PORT, default 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)