File size: 16,651 Bytes
4912b0b
 
 
 
 
 
 
 
 
 
016dbc6
4912b0b
 
 
 
 
5d18194
4912b0b
 
 
 
9b370ba
4912b0b
 
 
 
016dbc6
4912b0b
 
 
 
 
016dbc6
4912b0b
 
 
 
016dbc6
4912b0b
 
 
 
016dbc6
 
 
4912b0b
 
 
 
 
721b9e7
016dbc6
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
721b9e7
4912b0b
 
 
 
 
 
 
 
721b9e7
4912b0b
 
721b9e7
4912b0b
721b9e7
4912b0b
 
 
 
016dbc6
4912b0b
 
016dbc6
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b370ba
 
 
721b9e7
 
 
9b370ba
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
9b370ba
 
721b9e7
 
4912b0b
721b9e7
4912b0b
9b370ba
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
93578fc
4912b0b
016dbc6
4912b0b
 
 
016dbc6
e1377d6
4912b0b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import os  # isletim sistemi degiskenlerine erismek icin
import re  # metin isleme icin regexp kutuphanesi
import time  # gecikme olcumu icin zaman fonksiyonlari
import gc  # bellek temizligi icin garbage collector
from typing import Dict, Tuple, Optional, List  # tip ipuclari icin
import gradio as gr  # Hugging Face Spaces arayuzunu kurmak icin
import torch  # pytorch modellerini calistirmak icin
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig  

# ====== MODEL REGISTRY (simple and student-friendly) ======
# Maps a short registry key -> {name: display name, id: HF hub id, kind: output head type}.
# "kind" drives label normalization: "2class"/"3class" emit neg/(neu)/pos labels,
# "5star" emits 1-5 star ratings that get folded into neg/neu/pos downstream.
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output head type
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output head type
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output head type
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output head type
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # output head type
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id -- NOTE(review): id is a BERT SST model, display name says ALBERT; confirm
        "kind": "3class"  # output head type
    },
}

# ====== PER-LANGUAGE TOP3 PICKS ======
LANG_TOP3 = {  # three candidate model keys (into MODELS) per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback 3 for any other language
}

# ====== LAZY CACHE (load models only when first needed) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}  # pipelines keyed by HF model id
_CFG_CACHE: Dict[str, AutoConfig] = {}  # model configs keyed by HF model id
MAX_CACHE_SIZE = 4  # keep at most 4 distinct models resident at once

def cleanup_cache() -> None:
    """Evict oldest cached pipelines until the cache fits MAX_CACHE_SIZE, then free memory."""
    # dicts preserve insertion order, so the first key is always the oldest entry
    while len(_PIPE_CACHE) > MAX_CACHE_SIZE:
        victim = next(iter(_PIPE_CACHE))
        _PIPE_CACHE.pop(victim, None)  # drop the pipeline
        _CFG_CACHE.pop(victim, None)  # and its config
    gc.collect()  # reclaim python-level garbage
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # also release cached GPU memory, if any

def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return (pipeline, config) for a registry key, loading lazily and caching by HF id.

    On any load failure the error is printed and (None, None) is returned, so
    callers must check for None before using the pipeline.
    """
    model_id = MODELS[model_key]["id"]  # resolve the registry key to its HF hub id
    cached = _PIPE_CACHE.get(model_id)
    if cached is not None:
        return cached, _CFG_CACHE.get(model_id)  # cache hit: reuse the loaded objects
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        pipeline = TextClassificationPipeline(
            model=model,
            tokenizer=tokenizer,
            framework="pt",  # run under pytorch
            return_all_scores=True,  # request scores for every class, not just the top one
            device=-1  # force CPU inference
        )
        _PIPE_CACHE[model_id] = pipeline
        _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)  # needed for id2label lookups
        cleanup_cache()  # enforce the cache size bound after inserting
        return pipeline, _CFG_CACHE[model_id]
    except Exception as e:
        print(f"model yukleme hatasi: {model_key} -> {e}")
        return None, None

# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language-id library
except Exception:
    detect = None  # degrade gracefully when the dependency is missing

def detect_lang(text: str) -> str:
    """Classify *text* as "en", "tr", or "other" (the only buckets we route on)."""
    stripped = (text or "").strip()
    if len(stripped) < 2 or detect is None:
        # too short to classify reliably, or detector unavailable -> generic bucket
        return "other"
    try:
        code = detect(stripped)
    except Exception:
        return "other"  # langdetect can raise on odd input; treat as generic
    return code if code in ("en", "tr") else "other"

# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional[AutoConfig], kind: str) -> str:
    """Map heterogeneous model labels onto {"negative", "neutral", "positive"}.

    Resolves generic "LABEL_<n>" names through the model config's id2label
    table, folds 1-5 star outputs ("5star" kind) into three classes, and
    otherwise matches neg/neu/pos substrings. Defaults to "neutral" when
    nothing matches (a safe choice for 2-class models).
    """
    label = (raw_label or "").lower()
    # resolve opaque "label_<n>" names via the config, when available
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the raw label if the index is malformed
    if kind == "5star":
        stars = re.search(r"([1-5])", label)
        if stars:
            value = int(stars.group(1))
            if value <= 2:
                return "negative"  # 1-2 stars
            return "neutral" if value == 3 else "positive"  # 3 stars / 4-5 stars
    # substring matching, in the same precedence order as before: neg, neu, pos
    for marker, canonical in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if marker in label:
            return canonical
    return "neutral"  # fallback for anything unrecognized

# ====== ENGLISH PRE-PROCESSING ======
def preprocess_en(text: str) -> str:
    """Light tweet-style normalization for English input.

    Collapses whitespace and long character runs, masks URLs and @mentions,
    strips the '#' from hashtags, and expands common contractions.
    """
    if not text:
        return text  # pass empty/None input through untouched
    cleaned = re.sub(r"\s+", " ", text).strip()          # collapse runs of whitespace
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)     # "soooo" -> "soo" style shortening
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)  # mask links
    cleaned = re.sub(r"@\w+", "@USER", cleaned)          # mask @mentions
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)          # drop the hashtag marker
    # contraction expansions; specific forms first so "won't"/"can't" win over bare "n't"
    for contraction, expansion in (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    ):
        cleaned = cleaned.replace(contraction, expansion)
    return cleaned

# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language ("xlmr" covers tr and everything else)."""
    return "roberta" if lang == "en" else "xlmr"

# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Run sentiment analysis on *text* and return a JSON-serializable dict.

    Detects (or accepts a forced) language, optionally mini-benchmarks the
    language's TOP3 models to pick a winner, then answers with the winner's
    normalized label, score, confidence band, and processing latency.
    """
    txt = (text or "").strip()
    if not txt:
        # nothing to classify: answer neutral without loading any model
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)
    proc = preprocess_en(txt) if lang == "en" else txt  # only English gets preprocessing
    candidates: List[Dict] = []  # filled only in benchmark mode
    if not benchmark:
        winner_key = pick_default_key_for_lang(lang)  # fast path: per-language default
    else:
        # race the language's TOP3 and keep the most confident (ties -> fastest)
        for key in LANG_TOP3.get(lang, LANG_TOP3["other"]):
            pipe, cfg = get_pipe_and_cfg(key)
            if pipe is None:
                continue  # skip models that failed to load
            start = time.perf_counter()
            scores = pipe(proc)[0]
            elapsed_ms = (time.perf_counter() - start) * 1000.0
            best = max(scores, key=lambda s: s["score"])
            candidates.append({
                "model": key,
                "label": normalize_label(best["label"], cfg, MODELS[key]["kind"]),
                "score": float(best["score"]),
                "latency_ms": round(elapsed_ms, 2)
            })
        if candidates:
            # highest score first; equal scores resolved by lower latency
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
            winner_key = candidates[0]["model"]
        else:
            winner_key = pick_default_key_for_lang(lang)  # all candidates failed to load
    pipe, cfg = get_pipe_and_cfg(winner_key)
    if pipe is None:
        # even the winner could not be loaded: report a structured error
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    start = time.perf_counter()
    scores = pipe(proc)[0]  # single inference with the winning model
    elapsed_ms = (time.perf_counter() - start) * 1000.0
    best = max(scores, key=lambda s: s["score"])
    label = normalize_label(best["label"], cfg, MODELS[winner_key]["kind"])
    score = float(best["score"])
    # confidence bands: >0.8 high, >0.6 medium, otherwise low
    if score > 0.8:
        confidence = "high"
    elif score > 0.6:
        confidence = "medium"
    else:
        confidence = "low"
    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name only
        "processing_time_ms": round(elapsed_ms, 2)
    }
    if benchmark and candidates:
        resp["candidates"] = candidates  # expose the full comparison
    return resp

# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):  # multi-text benchmark entry point for the UI
    """Benchmark every per-language TOP3 model over newline-separated texts.

    Returns a (summary_text, table_rows) pair for the Gradio outputs; each row
    is [text preview, bucket/model, label, score, avg latency ms, confidence].
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # split lines, drop blanks
    if not texts:  # nothing to benchmark
        return "Uyari: metin alani bos.", []  # warning text plus an empty table
    buckets = {"en": [], "tr": [], "other": []}  # language buckets
    for t in texts:  # route each text by detected language
        buckets[detect_lang(t)].append(t)
    rows: List[List] = []  # output table rows (shared, appended by bench_set)
    errors: List[str] = []  # collected error messages (shared, appended by bench_set)

    def bench_set(text_list: List[str], keys: List[str], tag: str):  # run one language bucket
        """Run one bucket through its candidate models, appending to rows/errors."""
        if not text_list:  # empty bucket: nothing to do
            return
        for k in keys:  # each candidate model key
            spec = MODELS[k]
            pipe, cfg = get_pipe_and_cfg(k)
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load
                errors.append(f"yuklenemedi: {modelname}")  # record the load failure
                for t in text_list:  # emit one error row per input text
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next model
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # EN-only preprocessing
            t0 = time.perf_counter()
            outs = pipe(proc)  # batched prediction over the whole bucket
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # mean latency per text
            for orig, out in zip(text_list, outs):  # pair originals with their score lists
                try:
                    top = max(out, key=lambda s: s["score"])  # highest-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # canonical label
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # confidence band
                    rows.append([  # one result row
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,  # bucket/model display name
                        lab,  # normalized label
                        round(sc, 4),  # score
                        round(avg_ms, 1),  # batch-average latency
                        conf  # confidence band
                    ])
                except Exception as ex:  # per-text prediction failure
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")  # record, truncated
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])  # error row

    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # everything else

    # build the short summary text (Turkish user-facing strings kept as-is)
    summary_lines: List[str] = []
    if errors:  # list any failures first
        summary_lines.append("Hatalar:")  # "Errors:" heading
        for e in errors:
            summary_lines.append(f"- {e}")
    if not summary_lines:  # no failures at all
        summary_lines.append("Benchmark tamamlandi.")  # "benchmark finished" notice
    return "\n".join(summary_lines), rows  # summary text plus table rows

# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # function behind the endpoint
    inputs=[  # input widgets (map 1:1 to analyze's parameters)
        gr.Textbox(lines=3, label="Text"),  # main text input
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # TOP3 benchmark toggle
    ],
    outputs=gr.JSON(label="Result"),  # JSON result view
    title="Sentiment API (Production)",  # tab title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint contract
)
api_intf.api_name = "analyze"  # endpoint path: /api/predict/analyze

with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # usage hint: one example per line
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line text input
    btn = gr.Button("Calistir")  # run button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # rows grow with results
        col_count=(6, "fixed"),  # fixed column count
        interactive=False,  # read-only table
        wrap=True  # wrap long cell text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire button to benchmark

demo = gr.TabbedInterface(  # two-tab top-level app
    [api_intf, bench_ui],  # tab 1: API, tab 2: benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)

if __name__ == "__main__":  # script entry point
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start the gradio server