File size: 16,651 Bytes
4912b0b
 
 
 
 
 
 
 
 
 
016dbc6
4912b0b
 
 
 
 
5d18194
4912b0b
 
 
 
9b370ba
4912b0b
 
 
 
016dbc6
4912b0b
 
 
 
 
016dbc6
4912b0b
 
 
 
016dbc6
4912b0b
 
 
 
016dbc6
 
 
4912b0b
 
 
 
 
721b9e7
016dbc6
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
721b9e7
4912b0b
 
 
 
 
 
 
 
721b9e7
4912b0b
 
721b9e7
4912b0b
721b9e7
4912b0b
 
 
 
016dbc6
4912b0b
 
016dbc6
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b370ba
 
 
721b9e7
 
 
9b370ba
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
9b370ba
 
721b9e7
 
4912b0b
721b9e7
4912b0b
9b370ba
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721b9e7
4912b0b
 
 
721b9e7
4912b0b
 
 
 
 
 
 
 
 
 
 
 
 
93578fc
4912b0b
016dbc6
4912b0b
 
 
016dbc6
e1377d6
4912b0b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import os  # isletim sistemi degiskenlerine erismek icin
import re  # metin isleme icin regexp kutuphanesi
import time  # gecikme olcumu icin zaman fonksiyonlari
import gc  # bellek temizligi icin garbage collector
from typing import Dict, Tuple, Optional, List  # tip ipuclari icin
import gradio as gr  # Hugging Face Spaces arayuzunu kurmak icin
import torch  # pytorch modellerini calistirmak icin
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig  

# ====== MODEL REGISTRY (simple and student-friendly) ======
# Maps a short registry key -> {name: display name, id: HF hub id, kind: output head type}.
# "kind" drives label normalization: "2class"/"3class" emit neg/(neu)/pos labels,
# "5star" emits 1-5 star ratings that get folded into neg/neu/pos downstream.
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output head type
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output head type
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output head type
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output head type
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # output head type
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id -- NOTE(review): id is a BERT SST model, display name says ALBERT; confirm
        "kind": "3class"  # output head type
    },
}

# ====== PER-LANGUAGE TOP3 PICKS ======
LANG_TOP3 = {  # three candidate model keys (into MODELS) per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback 3 for any other language
}

# ====== LAZY CACHE (load models only when first needed) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}  # pipelines keyed by HF model id
_CFG_CACHE: Dict[str, AutoConfig] = {}  # model configs keyed by HF model id
MAX_CACHE_SIZE = 4  # keep at most 4 distinct models resident at once

def cleanup_cache() -> None:
    """Evict oldest cached pipelines until the cache fits MAX_CACHE_SIZE, then free memory."""
    # dicts preserve insertion order, so the first key is always the oldest entry
    while len(_PIPE_CACHE) > MAX_CACHE_SIZE:
        victim = next(iter(_PIPE_CACHE))
        _PIPE_CACHE.pop(victim, None)  # drop the pipeline
        _CFG_CACHE.pop(victim, None)  # and its config
    gc.collect()  # reclaim python-level garbage
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # also release cached GPU memory, if any

def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return (pipeline, config) for a registry key, loading lazily and caching by HF id.

    On any load failure the error is printed and (None, None) is returned, so
    callers must check for None before using the pipeline.
    """
    model_id = MODELS[model_key]["id"]  # resolve the registry key to its HF hub id
    cached = _PIPE_CACHE.get(model_id)
    if cached is not None:
        return cached, _CFG_CACHE.get(model_id)  # cache hit: reuse the loaded objects
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        pipeline = TextClassificationPipeline(
            model=model,
            tokenizer=tokenizer,
            framework="pt",  # run under pytorch
            return_all_scores=True,  # request scores for every class, not just the top one
            device=-1  # force CPU inference
        )
        _PIPE_CACHE[model_id] = pipeline
        _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)  # needed for id2label lookups
        cleanup_cache()  # enforce the cache size bound after inserting
        return pipeline, _CFG_CACHE[model_id]
    except Exception as e:
        print(f"model yukleme hatasi: {model_key} -> {e}")
        return None, None

# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language-id library
except Exception:
    detect = None  # degrade gracefully when the dependency is missing

def detect_lang(text: str) -> str:
    """Classify *text* as "en", "tr", or "other" (the only buckets we route on)."""
    stripped = (text or "").strip()
    if len(stripped) < 2 or detect is None:
        # too short to classify reliably, or detector unavailable -> generic bucket
        return "other"
    try:
        code = detect(stripped)
    except Exception:
        return "other"  # langdetect can raise on odd input; treat as generic
    return code if code in ("en", "tr") else "other"

# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional[AutoConfig], kind: str) -> str:
    """Map heterogeneous model labels onto {"negative", "neutral", "positive"}.

    Resolves generic "LABEL_<n>" names through the model config's id2label
    table, folds 1-5 star outputs ("5star" kind) into three classes, and
    otherwise matches neg/neu/pos substrings. Defaults to "neutral" when
    nothing matches (a safe choice for 2-class models).
    """
    label = (raw_label or "").lower()
    # resolve opaque "label_<n>" names via the config, when available
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the raw label if the index is malformed
    if kind == "5star":
        stars = re.search(r"([1-5])", label)
        if stars:
            value = int(stars.group(1))
            if value <= 2:
                return "negative"  # 1-2 stars
            return "neutral" if value == 3 else "positive"  # 3 stars / 4-5 stars
    # substring matching, in the same precedence order as before: neg, neu, pos
    for marker, canonical in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if marker in label:
            return canonical
    return "neutral"  # fallback for anything unrecognized

# ====== ENGLISH PRE-PROCESSING ======
def preprocess_en(text: str) -> str:
    """Light tweet-style normalization for English input.

    Collapses whitespace and long character runs, masks URLs and @mentions,
    strips the '#' from hashtags, and expands common contractions.
    """
    if not text:
        return text  # pass empty/None input through untouched
    cleaned = re.sub(r"\s+", " ", text).strip()          # collapse runs of whitespace
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)     # "soooo" -> "soo" style shortening
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)  # mask links
    cleaned = re.sub(r"@\w+", "@USER", cleaned)          # mask @mentions
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)          # drop the hashtag marker
    # contraction expansions; specific forms first so "won't"/"can't" win over bare "n't"
    for contraction, expansion in (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    ):
        cleaned = cleaned.replace(contraction, expansion)
    return cleaned

# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language ("xlmr" covers tr and everything else)."""
    return "roberta" if lang == "en" else "xlmr"

# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Run sentiment analysis on *text* and return a JSON-serializable dict.

    Detects (or accepts a forced) language, optionally mini-benchmarks the
    language's TOP3 models to pick a winner, then answers with the winner's
    normalized label, score, confidence band, and processing latency.
    """
    txt = (text or "").strip()
    if not txt:
        # nothing to classify: answer neutral without loading any model
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)
    proc = preprocess_en(txt) if lang == "en" else txt  # only English gets preprocessing
    candidates: List[Dict] = []  # filled only in benchmark mode
    if not benchmark:
        winner_key = pick_default_key_for_lang(lang)  # fast path: per-language default
    else:
        # race the language's TOP3 and keep the most confident (ties -> fastest)
        for key in LANG_TOP3.get(lang, LANG_TOP3["other"]):
            pipe, cfg = get_pipe_and_cfg(key)
            if pipe is None:
                continue  # skip models that failed to load
            start = time.perf_counter()
            scores = pipe(proc)[0]
            elapsed_ms = (time.perf_counter() - start) * 1000.0
            best = max(scores, key=lambda s: s["score"])
            candidates.append({
                "model": key,
                "label": normalize_label(best["label"], cfg, MODELS[key]["kind"]),
                "score": float(best["score"]),
                "latency_ms": round(elapsed_ms, 2)
            })
        if candidates:
            # highest score first; equal scores resolved by lower latency
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
            winner_key = candidates[0]["model"]
        else:
            winner_key = pick_default_key_for_lang(lang)  # all candidates failed to load
    pipe, cfg = get_pipe_and_cfg(winner_key)
    if pipe is None:
        # even the winner could not be loaded: report a structured error
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    start = time.perf_counter()
    scores = pipe(proc)[0]  # single inference with the winning model
    elapsed_ms = (time.perf_counter() - start) * 1000.0
    best = max(scores, key=lambda s: s["score"])
    label = normalize_label(best["label"], cfg, MODELS[winner_key]["kind"])
    score = float(best["score"])
    # confidence bands: >0.8 high, >0.6 medium, otherwise low
    if score > 0.8:
        confidence = "high"
    elif score > 0.6:
        confidence = "medium"
    else:
        confidence = "low"
    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name only
        "processing_time_ms": round(elapsed_ms, 2)
    }
    if benchmark and candidates:
        resp["candidates"] = candidates  # expose the full comparison
    return resp

# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):  # multi-text benchmark entry point for the UI
    """Benchmark every per-language TOP3 model over newline-separated texts.

    Returns a (summary_text, table_rows) pair for the Gradio outputs; each row
    is [text preview, bucket/model, label, score, avg latency ms, confidence].
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # split lines, drop blanks
    if not texts:  # nothing to benchmark
        return "Uyari: metin alani bos.", []  # warning text plus an empty table
    buckets = {"en": [], "tr": [], "other": []}  # language buckets
    for t in texts:  # route each text by detected language
        buckets[detect_lang(t)].append(t)
    rows: List[List] = []  # output table rows (shared, appended by bench_set)
    errors: List[str] = []  # collected error messages (shared, appended by bench_set)

    def bench_set(text_list: List[str], keys: List[str], tag: str):  # run one language bucket
        """Run one bucket through its candidate models, appending to rows/errors."""
        if not text_list:  # empty bucket: nothing to do
            return
        for k in keys:  # each candidate model key
            spec = MODELS[k]
            pipe, cfg = get_pipe_and_cfg(k)
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load
                errors.append(f"yuklenemedi: {modelname}")  # record the load failure
                for t in text_list:  # emit one error row per input text
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next model
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # EN-only preprocessing
            t0 = time.perf_counter()
            outs = pipe(proc)  # batched prediction over the whole bucket
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # mean latency per text
            for orig, out in zip(text_list, outs):  # pair originals with their score lists
                try:
                    top = max(out, key=lambda s: s["score"])  # highest-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # canonical label
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # confidence band
                    rows.append([  # one result row
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,  # bucket/model display name
                        lab,  # normalized label
                        round(sc, 4),  # score
                        round(avg_ms, 1),  # batch-average latency
                        conf  # confidence band
                    ])
                except Exception as ex:  # per-text prediction failure
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")  # record, truncated
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])  # error row

    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # everything else

    # build the short summary text (Turkish user-facing strings kept as-is)
    summary_lines: List[str] = []
    if errors:  # list any failures first
        summary_lines.append("Hatalar:")  # "Errors:" heading
        for e in errors:
            summary_lines.append(f"- {e}")
    if not summary_lines:  # no failures at all
        summary_lines.append("Benchmark tamamlandi.")  # "benchmark finished" notice
    return "\n".join(summary_lines), rows  # summary text plus table rows

# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # function behind the endpoint
    inputs=[  # input widgets (map 1:1 to analyze's parameters)
        gr.Textbox(lines=3, label="Text"),  # main text input
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # TOP3 benchmark toggle
    ],
    outputs=gr.JSON(label="Result"),  # JSON result view
    title="Sentiment API (Production)",  # tab title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint contract
)
api_intf.api_name = "analyze"  # endpoint path: /api/predict/analyze

with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # usage hint: one example per line
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line text input
    btn = gr.Button("Calistir")  # run button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # rows grow with results
        col_count=(6, "fixed"),  # fixed column count
        interactive=False,  # read-only table
        wrap=True  # wrap long cell text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire button to benchmark

demo = gr.TabbedInterface(  # two-tab top-level app
    [api_intf, bench_ui],  # tab 1: API, tab 2: benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)

if __name__ == "__main__":  # script entry point
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start the gradio server