sherdd committed on
Commit
4912b0b
·
verified ·
1 Parent(s): 721b9e7
Files changed (1) hide show
  1. app.py +285 -426
app.py CHANGED
@@ -1,232 +1,172 @@
1
- # app.py Hugging Face Space (Gradio) for 2-Aşamalı (Dil Tespiti ➜ Model Yönlendirme)
2
- # + EN/TR en iyi 3 model ile Benchmark
3
- #
4
- # Gereksinimler (Spaces > Files > "requirements.txt"):
5
- # transformers>=4.43.0
6
- # torch
7
- # gradio>=4.44.0
8
- # langdetect
9
- #
10
- # Çalışma Modları:
11
- # 1) 🔌 API (Production): /api/predict/analyze
12
- # Body örn:
13
- # {
14
- # "text": "Harika bir ürün!",
15
- # "force_lang": null, // opsiyonel: "en" | "tr" | "other"
16
- # "benchmark": false // true ise en iyi 3 adaydan mini-benchmark sonucu döner
17
- # }
18
- #
19
- # 2) 🧪 Benchmark (Auto EN/TR/Other): Çoklu metni satır satır test edip özetler
20
- #
21
- # Notlar:
22
- # - EN için basit ön-işleme uygulanır (TR için uygulanmaz).
23
- # - Label standardizasyonu: positive/neutral/negative
24
- # - Cache + lazy load: Modeller ihtiyaç oldukça yüklenir, bellek sınırı aşıldığında eskiler çıkarılır.
25
-
26
- import os, re, time, gc, traceback
27
- from typing import List, Dict, Tuple, Optional
28
-
29
- import gradio as gr
30
- import torch
31
- from transformers import (
32
- AutoTokenizer,
33
- AutoModelForSequenceClassification,
34
- TextClassificationPipeline,
35
- AutoConfig,
36
- )
37
-
38
- # =========================================
39
- # MODEL HAVUZU (ID’ler Hugging Face’ten)
40
- # =========================================
41
  MODELS: Dict[str, Dict] = {
42
- # ——— EN Önerilen 3 ———
43
- "roberta": {
44
- "name": "RoBERTa Twitter (3-class) [EN]",
45
- "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
46
- "kind": "3class",
47
- "size_mb": 476,
48
  },
49
- "distilbert": {
50
- "name": "DistilBERT SST-2 (2-class) [EN]",
51
- "id": "distilbert-base-uncased-finetuned-sst-2-english",
52
- "kind": "2class",
53
- "size_mb": 255,
54
  },
55
- "bertweet": {
56
- "name": "BERTweet Sentiment (3-class) [EN]",
57
- "id": "finiteautomata/bertweet-base-sentiment-analysis",
58
- "kind": "3class",
59
- "size_mb": 540,
60
  },
61
-
62
- # ——— TR (şimdilik çok dilli ağırlıklı 3) ———
63
- "xlmr": {
64
- "name": "XLM-R Multilingual (3-class) [TR/Multi]",
65
- "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
66
- "kind": "3class",
67
- "size_mb": 278,
68
  },
69
- "bert_5star": {
70
- "name": "BERT Multilingual Reviews (5-star) [TR/Multi]",
71
- "id": "nlptown/bert-base-multilingual-uncased-sentiment",
72
- "kind": "5star",
73
- "size_mb": 425,
74
  },
75
- # Hafif bir seçenek (EN verisiyle eğitilmiş olsa da fallback amaçlı)
76
- "albert": {
77
- "name": "ALBERT v2 (3-class) [Light/Fallback]",
78
- "id": "barissayil/bert-sentiment-analysis-sst",
79
- "kind": "3class",
80
- "size_mb": 46,
81
  },
82
  }
83
 
84
- # Dil bazlı “top-3” kümeleri (ileride TR-özel model eklersen burayı güncelle)
85
- LANG_TOP3 = {
86
- "en": ["roberta", "distilbert", "bertweet"],
87
- "tr": ["xlmr", "bert_5star", "albert"], # TR için: çok dilli + hafif model
88
- "other": ["xlmr", "bert_5star", "roberta"], # fallback
89
  }
90
 
91
- # =========================================
92
- # LAZY CACHE (Bellek Yönetimi)
93
- # =========================================
94
- _PIPE_CACHE: Dict[str, TextClassificationPipeline] = {}
95
- _CFG_CACHE: Dict[str, AutoConfig] = {}
96
- MAX_CACHE_SIZE = int(os.getenv("MAX_CACHE_SIZE", "4")) # Hugging Face CPU/VRAM'e göre ayarla
97
-
98
- def _cleanup_cache():
99
- """Eski modelleri temizle (FIFO)."""
 
 
 
 
 
 
 
 
 
 
100
  try:
101
- while len(_PIPE_CACHE) > MAX_CACHE_SIZE:
102
- oldest_key = next(iter(_PIPE_CACHE.keys()))
103
- _PIPE_CACHE.pop(oldest_key, None)
104
- _CFG_CACHE.pop(oldest_key, None)
105
- gc.collect()
106
- if torch.cuda.is_available():
107
- torch.cuda.empty_cache()
108
- except Exception:
109
- pass
110
-
111
- def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
112
- """model_key -> (pipeline, config). Hata varsa (None, None)."""
113
- try:
114
- spec = MODELS[model_key]
115
- model_id = spec["id"]
116
- if model_id not in _PIPE_CACHE:
117
- _cleanup_cache()
118
- tok = AutoTokenizer.from_pretrained(model_id)
119
- mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
120
- pipe = TextClassificationPipeline(
121
- model=mdl,
122
- tokenizer=tok,
123
- framework="pt",
124
- return_all_scores=True,
125
- device=-1 # CPU
126
- )
127
- _PIPE_CACHE[model_id] = pipe
128
- _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)
129
- return _PIPE_CACHE[model_id], _CFG_CACHE[model_id]
130
- except Exception as e:
131
- print(f"[load-error] {model_key} -> {e}")
132
- return None, None
133
-
134
- # =========================================
135
- # DİL TESPİTİ
136
- # =========================================
137
  try:
138
- from langdetect import detect # hızlı ve hafif; kısa metinlerde bazen şaşabilir
139
  except Exception:
140
- detect = None
141
-
142
- def detect_lang(text: str) -> str:
143
- """'en' | 'tr' | 'other'. Boş/çok kısa metin 'other' döner."""
144
- txt = (text or "").strip()
145
- if not txt or len(txt) < 2:
146
- return "other"
147
- if detect is None:
148
- return "other"
149
  try:
150
- lang = detect(txt)
151
- return lang if lang in ("en", "tr") else "other"
152
  except Exception:
153
- return "other"
154
-
155
- # =========================================
156
- # LABEL NORMALİZASYONU
157
- # =========================================
158
- def normalize_label(raw_label: str, cfg: Optional[AutoConfig], kind: str) -> str:
159
- """Çeşitli model etiketlerini: negative / neutral / positive standardına çevir."""
160
- lbl = (raw_label or "").lower()
161
 
162
- # Bazı modeller LABEL_0/1/2 verir -> id2label ile çöz
163
- if lbl.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
 
 
164
  try:
165
- idx = int(lbl.split("_")[-1])
166
- lbl = str(cfg.id2label[idx]).lower()
167
  except Exception:
168
- pass
169
-
170
- # 5-yıldızlılar: 1..5
171
- if kind == "5star":
172
- m = re.search(r"([1-5])", lbl)
173
- if m:
174
- s = int(m.group(1))
175
- if s <= 2: return "negative"
176
- if s == 3: return "neutral"
177
- return "positive"
178
-
179
- # Metin tabanlı eşleşmeler
180
- if "neg" in lbl: return "negative"
181
- if "neu" in lbl: return "neutral"
182
- if "pos" in lbl: return "positive"
183
-
184
- # 2-class modellerde nötr yoksa temkinli varsayılan
185
- return "neutral"
186
-
187
- # =========================================
188
- # ÖN-İŞLEME (ENGLISH)
189
- # =========================================
190
- def preprocess_en(text: str) -> str:
191
- """EN için hafif normalize. TR için dokunmuyoruz."""
192
- if not text:
193
- return text
194
- t = re.sub(r"\s+", " ", text).strip()
195
- t = re.sub(r"(.)\1{3,}", r"\1\1", t) # goooood -> good
196
- t = re.sub(r"http[s]?://\S+", "URL", t) # linkleri normalize
197
- t = re.sub(r"@\w+", "@USER", t) # mention normalize
198
- t = re.sub(r"#(\w+)", r"\1", t) # hashtag kelimesini koru
199
- # Basit contraction açma
200
- for old, new in {
201
- "won't":"will not","can't":"cannot","n't":" not",
202
- "'re":" are","'ve":" have","'ll":" will","'d":" would","'m":" am"
203
- }.items():
204
- t = t.replace(old, new)
205
- return t
206
-
207
- # =========================================
208
- # ANALYZE (API) — 2 Aşama: Dil Tespiti ➜ Uygun Model
209
- # =========================================
210
- def _pick_default_key_for_lang(lang: str) -> str:
211
- if lang == "en":
212
- return "roberta"
213
- if lang == "tr":
214
- return "xlmr"
215
- return "xlmr" # other -> çok dilli güvenli seçim
216
-
217
- def analyze(
218
- text: str,
219
- force_lang: Optional[str] = None,
220
- benchmark: bool = False
221
- ):
222
- """
223
- Production API.
224
- - force_lang: "en" | "tr" | "other" | None
225
- - benchmark: True ise dil kümesindeki en iyi 3 adaydan mini karşılaştırma döner
226
- """
227
- text = (text or "").strip()
228
- if not text:
229
- return {
230
  "label": "neutral",
231
  "score": 1.0,
232
  "confidence": "high",
@@ -234,235 +174,154 @@ def analyze(
234
  "model_used": "none",
235
  "processing_time_ms": 0.0
236
  }
237
-
238
- lang = force_lang if force_lang in ("en","tr","other") else detect_lang(text)
239
- text_proc = preprocess_en(text) if lang == "en" else text
240
-
241
- # Mini-benchmark istendiyse: LANG_TOP3'te dolaş
242
- candidates = []
243
- if benchmark:
244
- for key in LANG_TOP3.get(lang, LANG_TOP3["other"]):
245
- pipe, cfg = get_pipe_and_cfg(key)
246
- if pipe is None:
247
- continue
248
- t0 = time.perf_counter()
249
- out = pipe(text_proc)[0]
250
- latency_ms = (time.perf_counter() - t0) * 1000.0
251
- top = max(out, key=lambda s: s["score"])
252
- label = normalize_label(top["label"], cfg, MODELS[key]["kind"])
253
- candidates.append({
254
- "model": key,
255
- "label": label,
256
- "score": round(float(top["score"]), 4),
257
- "latency_ms": round(latency_ms, 2)
258
  })
259
-
260
- # En iyi aday: skor farkı <0.03 ise daha hızlı olanı seç
261
- if candidates:
262
- best = sorted(
263
- candidates,
264
- key=lambda c: (-c["score"], c["latency_ms"])
265
- )[0]
266
- final_model_key = best["model"]
267
- else:
268
- final_model_key = _pick_default_key_for_lang(lang)
269
- else:
270
- final_model_key = _pick_default_key_for_lang(lang)
271
-
272
- # Tek atış (veya benchmark sonrası kazanan)
273
- pipe, cfg = get_pipe_and_cfg(final_model_key)
274
- if pipe is None:
275
- return {
276
  "label": "error",
277
  "score": 0.0,
278
  "confidence": "low",
279
  "lang": lang,
280
- "model_used": final_model_key,
281
  "processing_time_ms": 0.0,
282
- "error": f"model_load_failed:{final_model_key}"
283
  }
284
-
285
- t0 = time.perf_counter()
286
- out = pipe(text_proc)[0]
287
- latency_ms = (time.perf_counter() - t0) * 1000.0
288
- top = max(out, key=lambda s: s["score"])
289
- label = normalize_label(top["label"], cfg, MODELS[final_model_key]["kind"])
290
- score = float(top["score"])
291
- conf = "high" if score > 0.8 else "medium" if score > 0.6 else "low"
292
-
293
- resp = {
294
- "label": label,
295
- "score": round(score, 4),
296
- "confidence": conf,
297
- "lang": lang,
298
- "model_used": MODELS[final_model_key]["id"].split("/")[-1],
299
- "processing_time_ms": round(latency_ms, 2),
300
- "text_len": len(text)
301
  }
302
- if benchmark and candidates:
303
- resp["candidates"] = candidates
304
- return resp
305
-
306
- # =========================================
307
- # BENCHMARK (UI) EN/TR/OTHER otomatik kova
308
- # =========================================
309
- def _summarize_rows(rows: List[List], errors: List[str]) -> str:
310
- # rows: ["text", "bucket/modelname", "label", "score", "latency_ms", "confidence"]
311
- by_model: Dict[str, Dict] = {}
312
- for r in rows:
313
- if len(r) < 6:
314
- continue
315
- _, mname, lab, sc, lat, conf = r
316
- agg = by_model.setdefault(mname, {
317
- "n": 0, "lat_sum": 0.0, "score_sum": 0.0,
318
- "neg": 0, "neu": 0, "pos": 0, "err": 0,
319
- "high": 0, "med": 0, "low": 0
320
- })
321
- agg["n"] += 1
322
- agg["lat_sum"] += (lat or 0.0)
323
- agg["score_sum"] += (sc or 0.0)
324
- if lab == "ERROR":
325
- agg["err"] += 1
326
- elif lab.startswith("neg"):
327
- agg["neg"] += 1
328
- elif lab.startswith("neu"):
329
- agg["neu"] += 1
330
- elif lab.startswith("pos"):
331
- agg["pos"] += 1
332
- if conf == "high":
333
- agg["high"] += 1
334
- elif conf == "medium":
335
- agg["med"] += 1
336
- elif conf == "low":
337
- agg["low"] += 1
338
-
339
- lines = ["## 📊 Benchmark Results\n"]
340
- if errors:
341
- lines.append("### ⚠️ Errors:")
342
- for e in errors:
343
- lines.append(f"- {e}")
344
- lines.append("")
345
-
346
- lines.append("### 🏆 Model Performance (sorted by avg latency):")
347
- order = sorted(by_model.items(), key=lambda kv: kv[1]["lat_sum"]/max(1,kv[1]["n"]))
348
- for mname, agg in order:
349
- n = max(1, agg["n"])
350
- avg_lat = agg["lat_sum"]/n
351
- avg_score = agg["score_sum"]/n
352
- lines.append(f"\n#### {mname}")
353
- lines.append(f"- **Speed:** {avg_lat:.1f} ms (avg)")
354
- lines.append(f"- **Avg Confidence:** {avg_score:.2%}")
355
- lines.append(f"- **Sentiment:** 😞 {agg['neg']} | 😐 {agg['neu']} | 😊 {agg['pos']}" +
356
- (f" | ❌ {agg['err']}" if agg['err'] else ""))
357
- lines.append(f"- **Conf:** High {agg['high']} / Med {agg['med']} / Low {agg['low']}")
358
-
359
- return "\n".join(lines)
360
-
361
- def run_benchmark_auto(texts_blob: str):
362
- texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]
363
- if not texts:
364
- return "⚠️ Metin alanı boş. Her satıra bir örnek yaz.", []
365
-
366
- buckets = {"en": [], "tr": [], "other": []}
367
- for t in texts:
368
- buckets[detect_lang(t)].append(t)
369
-
370
- rows, errors = [], []
371
-
372
- def bench_set(text_list: List[str], keys: List[str], tag: str):
373
- nonlocal rows, errors
374
- if not text_list:
375
- return
376
- for key in keys:
377
- spec = MODELS[key]
378
- pipe, cfg = get_pipe_and_cfg(key)
379
- modelname = f"{tag}/{spec['name']}"
380
- if pipe is None:
381
- errors.append(f"❌ {modelname} yüklenemedi")
382
- for t in text_list:
383
- rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
384
- continue
385
- # EN kümesi için basit ön-işleme
386
- proc = [preprocess_en(x) if tag=="EN" else x for x in text_list]
387
- t0 = time.perf_counter()
388
- outs = pipe(proc)
389
- avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))
390
- for orig, out in zip(text_list, outs):
391
  try:
392
- top = max(out, key=lambda s: s["score"])
393
- lab = normalize_label(top["label"], cfg, spec["kind"])
394
- sc = float(top["score"])
395
- conf = "high" if sc > 0.8 else "medium" if sc > 0.6 else "low"
396
- rows.append([
397
- orig[:50] + ("..." if len(orig) > 50 else ""),
398
- modelname,
399
- lab,
400
- round(sc, 4),
401
- round(avg_ms, 1),
402
- conf
403
  ])
404
- except Exception as ex:
405
- errors.append(f"⚠️ {modelname}: {str(ex)[:100]}")
406
- rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
407
-
408
- bench_set(buckets["en"], LANG_TOP3["en"], "EN")
409
- bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")
410
- bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")
411
-
412
- summary = _summarize_rows(rows, errors)
413
- return summary, rows
414
-
415
- # =========================================
416
- # GRADIO ARAYÜZLERİ
417
- # =========================================
418
- api_intf = gr.Interface(
419
- fn=analyze,
420
- inputs=[
421
- gr.Textbox(lines=3, label="Text", placeholder="Type a message..."),
422
- gr.Textbox(lines=1, label="force_lang (optional: en|tr|other)", value="", visible=False),
423
- gr.Checkbox(label="benchmark (return candidates)", value=False, visible=False),
 
 
 
 
 
424
  ],
425
- outputs=gr.JSON(label="Result"),
426
- title="🔌 Sentiment API (Production)",
427
- description="POST /api/predict/analyze Returns {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}",
428
  )
429
- api_intf.api_name = "analyze" # /api/predict/analyze
430
-
431
- with gr.Blocks(title="Sentiment Analysis — EN/TR Auto Routing") as bench_ui:
432
- gr.Markdown("""
433
- ## 🧪 Multi-Model Benchmark (Auto EN/TR/Other)
434
- Her satıra bir cümle gir. Uygulama her cümlenin dilini otomatik saptar:
435
- - **EN** en iyi 3 EN modeli
436
- - **TR** ➜ en iyi 3 TR (çok dilli ağırlıklı) modeli
437
- - **Other** fallback 3'lüsü
438
-
439
- **Önerilen hedefler:**
440
- - P95 latency < 200ms
441
- - Doğruluk/Skor yüksek, fark < 0.03 ise daha hızlı modeli seç
442
- """)
443
- txt = gr.Textbox(
444
- lines=12,
445
- label="Test Sentences (one per line, TR ve EN karışık olabilir)",
446
- placeholder="I absolutely love this product!\nHizmet çok yavaş, memnun kalmadım.\nIt's okay, not great.\nFiyatına göre idare eder.\nWorst experience ever."
447
- )
448
- run_btn = gr.Button("Run benchmark (auto EN/TR/Other)")
449
- out_md = gr.Markdown()
450
- out_tbl = gr.Dataframe(
451
- headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],
452
- row_count=(0, "dynamic"),
453
- col_count=(6, "fixed"),
454
- interactive=False,
455
- wrap=True,
456
  )
457
- run_btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])
458
 
459
- demo = gr.TabbedInterface(
460
- [api_intf, bench_ui],
461
- tab_names=["🔌 API (Production)", "🧪 Model Comparison"]
462
  )
463
 
464
- if __name__ == "__main__":
465
- # İsteğe bağlı: Sıcak başlatma için en olası 2-3 modeli önceden dokundurabilirsin
466
- # for k in ["roberta", "xlmr"]:
467
- # get_pipe_and_cfg(k)
468
- demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)
 
1
+ import os # isletim sistemi degiskenlerine erismek icin
2
+ import re # metin isleme icin regexp kutuphanesi
3
+ import time # gecikme olcumu icin zaman fonksiyonlari
4
+ import gc # bellek temizligi icin garbage collector
5
+ from typing import Dict, Tuple, Optional, List # tip ipuclari icin
6
+ import gradio as gr # Hugging Face Spaces arayuzunu kurmak icin
7
+ import torch # pytorch modellerini calistirmak icin
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig
9
+
10
+ # ====== MODEL KAYITLARI (sade ve ogrenci dostu) ======
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
# ====== MODEL REGISTRY ======
# Maps a short internal key to a Hugging Face model spec.
# Fields: "name" (display label), "id" (HF hub identifier),
# "kind" (label scheme: 2class / 3class / 5star).
MODELS: Dict[str, Dict] = {
    # Three candidates for English input.
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "kind": "3class",
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",
        "id": "distilbert-base-uncased-finetuned-sst-2-english",
        "kind": "2class",
    },
    "bertweet": {
        "name": "BERTweet 3class EN",
        "id": "finiteautomata/bertweet-base-sentiment-analysis",
        "kind": "3class",
    },
    # Three candidates for Turkish and other languages (multilingual-heavy).
    "xlmr": {
        "name": "XLM-R 3class Multi",
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
        "kind": "3class",
    },
    "bert_5star": {
        "name": "BERT Multi 5star",
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",
        "kind": "5star",
    },
    "albert": {
        "name": "ALBERT v2 3class Light",
        "id": "barissayil/bert-sentiment-analysis-sst",
        "kind": "3class",
    },
}
45
 
46
+ # ====== DIL BAZLI TOP3 SECIMLERI ======
47
# ====== PER-LANGUAGE TOP-3 CANDIDATE SETS ======
# For each language bucket, the three model keys tried during benchmarking.
LANG_TOP3 = {
    "en": ["roberta", "distilbert", "bertweet"],    # best three for English
    "tr": ["xlmr", "bert_5star", "albert"],         # multilingual-heavy picks for Turkish
    "other": ["xlmr", "bert_5star", "roberta"],     # fallback set for everything else
}
52
 
53
# ====== LAZY CACHE (load models only on demand) ======
_PIPE_CACHE: Dict[str, "TextClassificationPipeline"] = {}  # model_id -> loaded pipeline
_CFG_CACHE: Dict[str, "AutoConfig"] = {}  # model_id -> model config
# Maximum number of distinct models kept in memory at once.
# Configurable via the MAX_CACHE_SIZE env var (default 4) so the limit can
# be tuned to the host's RAM without editing code.
MAX_CACHE_SIZE = int(os.getenv("MAX_CACHE_SIZE", "4"))

def cleanup_cache() -> None:
    """Evict the oldest cached models (FIFO) until the cache fits MAX_CACHE_SIZE.

    Dicts preserve insertion order, so the first key is the least recently
    loaded model. After each eviction, Python garbage collection runs and,
    when CUDA is available, the GPU allocator cache is released too.
    """
    while len(_PIPE_CACHE) > MAX_CACHE_SIZE:
        oldest_key = next(iter(_PIPE_CACHE))  # first inserted = oldest entry
        _PIPE_CACHE.pop(oldest_key, None)
        _CFG_CACHE.pop(oldest_key, None)
        gc.collect()  # reclaim the dropped model's memory promptly
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # also release cached GPU blocks
66
+
67
def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Resolve a registry key to a (pipeline, config) pair, loading lazily.

    Results are cached by Hugging Face model id. On any load failure the
    error is printed and (None, None) is returned instead of raising, so
    callers can skip the broken model.
    """
    model_id = MODELS[model_key]["id"]  # look up the HF hub id for this key

    # Serve from the cache when the pipeline was loaded before.
    if model_id in _PIPE_CACHE:
        return _PIPE_CACHE[model_id], _CFG_CACHE.get(model_id)

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        pipeline = TextClassificationPipeline(
            model=model,
            tokenizer=tokenizer,
            framework="pt",          # run with PyTorch
            return_all_scores=True,  # keep scores for every class
            device=-1,               # CPU only
        )
        _PIPE_CACHE[model_id] = pipeline
        _CFG_CACHE[model_id] = AutoConfig.from_pretrained(model_id)
        cleanup_cache()  # evict older entries if the cache grew past its limit
        return pipeline, _CFG_CACHE[model_id]
    except Exception as e:
        print(f"model yukleme hatasi: {model_key} -> {e}")
        return None, None
89
+
90
# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language detector
except Exception:
    detect = None  # keep running without it; detect_lang then always says "other"

def detect_lang(text: str) -> str:
    """Classify *text* as "en", "tr" or "other" (the only routed buckets)."""
    stripped = (text or "").strip()
    if len(stripped) < 2 or detect is None:
        # Too short to detect reliably, or langdetect is unavailable.
        return "other"
    try:
        code = detect(stripped)
    except Exception:
        return "other"  # detector choked on odd input
    return code if code in ("en", "tr") else "other"
 
 
 
 
 
 
 
107
 
108
# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional["AutoConfig"], kind: str) -> str:
    """Map a model-specific label onto negative / neutral / positive.

    Handles three schemes: opaque "LABEL_<n>" ids (resolved through the
    config's id2label mapping), 1-5 star ratings (kind == "5star"), and
    plain-text labels containing neg/neu/pos. Anything unrecognized becomes
    "neutral" — a safe default for 2-class models without a neutral class.
    """
    label = (raw_label or "").lower()

    # Resolve "LABEL_<n>" to the config's human-readable label when possible.
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the raw label and fall through

    # Star-rating models: 1-2 -> negative, 3 -> neutral, 4-5 -> positive.
    if kind == "5star":
        match = re.search(r"([1-5])", label)
        if match:
            stars = int(match.group(1))
            if stars <= 2:
                return "negative"
            return "neutral" if stars == 3 else "positive"

    # Plain-text labels: substring match, checked in this priority order.
    for needle, normalized in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if needle in label:
            return normalized

    return "neutral"  # conservative default for anything else
133
+
134
# ====== ENGLISH PREPROCESSING ======
def preprocess_en(text: str) -> str:
    """Lightly normalize English text before classification.

    Collapses whitespace and runs of repeated characters, replaces URLs and
    @mentions with placeholder tokens, strips hashtag signs, and expands
    common contractions. Turkish input is deliberately left untouched by
    the callers of this function.
    """
    if not text:
        return text  # nothing to do for empty / None input

    cleaned = re.sub(r"\s+", " ", text).strip()          # squeeze whitespace
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)     # "goooood" -> "good"
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)  # normalize links
    cleaned = re.sub(r"@\w+", "@USER", cleaned)          # normalize mentions
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)          # keep the hashtag word only

    # Expand frequent contractions; "won't"/"can't" must run before "n't".
    contractions = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    )
    for contraction, expansion in contractions:
        cleaned = cleaned.replace(contraction, expansion)
    return cleaned
156
+
157
# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default model key for a language bucket.

    English gets the dedicated RoBERTa model; Turkish and every other
    language fall back to the multilingual XLM-R model.
    """
    return "roberta" if lang == "en" else "xlmr"
164
+
165
# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Two-stage sentiment analysis: language detection, then model routing.

    Args:
        text: input sentence to classify.
        force_lang: optional override ("en" | "tr" | "other"); any other
            value falls back to automatic detection.
        benchmark: when True, run the language's TOP-3 candidate models and
            pick the winner (highest score, ties broken by lower latency);
            the per-candidate results are attached to the response.

    Returns:
        dict with label / score / confidence / lang / model_used /
        processing_time_ms, plus "candidates" when benchmark produced any,
        or an error payload when the chosen model could not be loaded.
    """
    txt = (text or "").strip()
    if not txt:
        # Empty input: return a neutral stub. Include "lang" so the response
        # schema matches the success and error paths below.
        return {
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }

    # Stage 1: decide the language (a valid explicit override wins).
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)
    proc = preprocess_en(txt) if lang == "en" else txt  # EN-only preprocessing

    # Stage 2: optionally mini-benchmark the language's TOP-3 candidates.
    candidates: List[Dict] = []
    if benchmark:
        for key in LANG_TOP3.get(lang, LANG_TOP3["other"]):
            pipe, cfg = get_pipe_and_cfg(key)
            if pipe is None:
                continue  # skip candidates that failed to load
            t0 = time.perf_counter()
            out = pipe(proc)[0]
            ms = (time.perf_counter() - t0) * 1000.0
            top = max(out, key=lambda s: s["score"])
            candidates.append({
                "model": key,
                "label": normalize_label(top["label"], cfg, MODELS[key]["kind"]),
                # Round like the final response so the payload is consistent.
                "score": round(float(top["score"]), 4),
                "latency_ms": round(ms, 2)
            })

    if candidates:
        # Highest score first; ties broken by lower latency.
        candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))
        winner_key = candidates[0]["model"]
    else:
        winner_key = pick_default_key_for_lang(lang)

    pipe, cfg = get_pipe_and_cfg(winner_key)
    if pipe is None:
        return {
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }

    # Final single-shot prediction with the winning model.
    t0 = time.perf_counter()
    out = pipe(proc)[0]
    ms = (time.perf_counter() - t0) * 1000.0
    top = max(out, key=lambda s: s["score"])
    label = normalize_label(top["label"], cfg, MODELS[winner_key]["kind"])
    score = float(top["score"])
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")

    resp = {
        "label": label,
        "score": round(score, 4),
        "confidence": confidence,
        "lang": lang,
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name
        "processing_time_ms": round(ms, 2)
    }
    if benchmark and candidates:
        resp["candidates"] = candidates
    return resp
233
+
234
# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):
    """Benchmark every non-empty input line against its language's TOP-3 models.

    Lines are bucketed by detected language, each bucket is run in batch
    through its candidate models, and a (summary_text, table_rows) pair is
    returned for the Gradio UI.
    """
    texts = [line.strip() for line in (texts_blob or "").splitlines() if line.strip()]
    if not texts:
        return "Uyari: metin alani bos.", []

    # Split the inputs into per-language buckets.
    buckets = {"en": [], "tr": [], "other": []}
    for sentence in texts:
        buckets[detect_lang(sentence)].append(sentence)

    rows: List[List] = []   # table rows for the UI dataframe
    errors: List[str] = []  # human-readable load/prediction failures

    def bench_set(text_list: List[str], keys: List[str], tag: str):
        """Run one language bucket through its candidate models."""
        if not text_list:
            return
        for key in keys:
            spec = MODELS[key]
            pipe, cfg = get_pipe_and_cfg(key)
            modelname = f"{tag}/{spec['name']}"
            if pipe is None:
                # Record the load failure once, plus an ERROR row per input.
                errors.append(f"yuklenemedi: {modelname}")
                for sentence in text_list:
                    rows.append([sentence[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue
            # Only the English bucket gets the light preprocessing pass.
            proc = [preprocess_en(s) if tag == "EN" else s for s in text_list]
            t0 = time.perf_counter()
            outs = pipe(proc)  # batched prediction for the whole bucket
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))
            for orig, out in zip(text_list, outs):
                try:
                    top = max(out, key=lambda s: s["score"])
                    lab = normalize_label(top["label"], cfg, spec["kind"])
                    sc = float(top["score"])
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")
                    rows.append([
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated text
                        modelname,
                        lab,
                        round(sc, 4),
                        round(avg_ms, 1),
                        conf
                    ])
                except Exception as ex:
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])

    bench_set(buckets["en"], LANG_TOP3["en"], "EN")
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")

    # Build the plain-text summary: list errors, or a simple done message.
    summary_lines: List[str] = []
    if errors:
        summary_lines.append("Hatalar:")
        summary_lines.extend(f"- {e}" for e in errors)
    if not summary_lines:
        summary_lines.append("Benchmark tamamlandi.")
    return "\n".join(summary_lines), rows
292
+
293
# ====== GRADIO INTERFACES ======
# Production API interface. NOTE: the endpoint name must be passed to the
# gr.Interface constructor — assigning `api_intf.api_name = "analyze"` after
# construction does not rename the already-registered /api/predict route in
# Gradio 4, so the documented /api/predict/analyze path would not exist.
api_intf = gr.Interface(
    fn=analyze,
    inputs=[
        gr.Textbox(lines=3, label="Text"),
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),
    ],
    outputs=gr.JSON(label="Result"),
    title="Sentiment API (Production)",
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}",
    api_name="analyze",  # endpoint path: /api/predict/analyze
)
306
+
307
# Benchmark tab: paste many sentences, one per line, and compare models.
with gr.Blocks(title="Sentiment Benchmark") as bench_ui:
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # one example per line
    btn = gr.Button("Calistir")  # run button
    out_md = gr.Markdown()       # summary text output
    out_tbl = gr.Dataframe(      # result table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],
        row_count=(0, "dynamic"),  # rows grow with the results
        col_count=(6, "fixed"),    # always six columns
        interactive=False,         # read-only for the user
        wrap=True,                 # wrap long text in cells
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])
320
 
321
# Combine both tabs into the final two-tab app.
demo = gr.TabbedInterface(
    [api_intf, bench_ui],      # first tab: API, second tab: benchmark
    tab_names=["API", "Benchmark"],
)

if __name__ == "__main__":
    # Bind to all interfaces; Spaces injects the port via the PORT env var.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)