# NOTE(review): the three lines below are Hugging Face web-page residue, not
# Python — commented out so the module parses. Original text preserved:
# sherdd's picture
# updating
# 4912b0b verified
import os # isletim sistemi degiskenlerine erismek icin
import re # metin isleme icin regexp kutuphanesi
import time # gecikme olcumu icin zaman fonksiyonlari
import gc # bellek temizligi icin garbage collector
from typing import Dict, Tuple, Optional, List # tip ipuclari icin
import gradio as gr # Hugging Face Spaces arayuzunu kurmak icin
import torch # pytorch modellerini calistirmak icin
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoConfig
# ====== MODEL REGISTRY (simple and beginner-friendly) ======
# Maps a short internal key to a Hugging Face model spec:
#   name: human-readable display name
#   id:   Hugging Face Hub model identifier
#   kind: output scheme ("2class", "3class", or "5star"), consumed by normalize_label
MODELS: Dict[str, Dict] = {
    # three models for English
    "roberta": {
        "name": "RoBERTa Twitter 3class EN",  # display name
        "id": "cardiffnlp/twitter-roberta-base-sentiment-latest",  # HF model id
        "kind": "3class"  # output scheme
    },
    "distilbert": {
        "name": "DistilBERT SST2 2class EN",  # display name
        "id": "distilbert-base-uncased-finetuned-sst-2-english",  # HF model id
        "kind": "2class"  # output scheme
    },
    "bertweet": {
        "name": "BERTweet 3class EN",  # display name
        "id": "finiteautomata/bertweet-base-sentiment-analysis",  # HF model id
        "kind": "3class"  # output scheme
    },
    # three models for Turkish / other languages (multilingual-heavy)
    "xlmr": {
        "name": "XLM-R 3class Multi",  # display name
        "id": "cardiffnlp/twitter-xlm-roberta-base-sentiment",  # HF model id
        "kind": "3class"  # output scheme
    },
    "bert_5star": {
        "name": "BERT Multi 5star",  # display name
        "id": "nlptown/bert-base-multilingual-uncased-sentiment",  # HF model id
        "kind": "5star"  # output scheme
    },
    "albert": {
        "name": "ALBERT v2 3class Light",  # display name
        # NOTE(review): despite the "albert" key/name, this id points at a
        # BERT SST checkpoint — confirm this mapping is intentional.
        "id": "barissayil/bert-sentiment-analysis-sst",  # HF model id
        "kind": "3class"  # output scheme
    },
}
# ====== PER-LANGUAGE TOP3 CHOICES ======
LANG_TOP3 = {  # three candidate MODELS keys per language bucket
    "en": ["roberta", "distilbert", "bertweet"],  # top 3 for English
    "tr": ["xlmr", "bert_5star", "albert"],  # top 3 for Turkish (multilingual-heavy)
    "other": ["xlmr", "bert_5star", "roberta"]  # fallback 3 for other languages
}
# ====== LAZY CACHE (modelleri ihtiyac olunca yukleme) ======
_PIPE_CACHE: Dict[str, TextClassificationPipeline] = {} # pipeline nesneleri icin cache sozlugu
_CFG_CACHE: Dict[str, AutoConfig] = {} # model config nesneleri icin cache sozlugu
MAX_CACHE_SIZE = 4 # en fazla 4 farkli model cachede tut
def cleanup_cache() -> None: # cache buyurse eskileri silmek icin fonksiyon
while len(_PIPE_CACHE) > MAX_CACHE_SIZE: # eger siniri astiysa
oldest_key = next(iter(_PIPE_CACHE.keys())) # ilk ekleneni bul
_PIPE_CACHE.pop(oldest_key, None) # pipeline sil
_CFG_CACHE.pop(oldest_key, None) # config sil
gc.collect() # python cop toplayici cagir
if torch.cuda.is_available(): # eger gpu varsa
torch.cuda.empty_cache() # gpu bellegini de bosalt
def get_pipe_and_cfg(model_key: str) -> Tuple[Optional[TextClassificationPipeline], Optional[AutoConfig]]:
    """Return a (pipeline, config) pair for a registered model key.

    Loads tokenizer/model/config lazily on first use and serves subsequent
    calls from the module-level caches (keyed by HF model id). Returns
    (None, None) when any part of the load fails.
    """
    hub_id = MODELS[model_key]["id"]  # HF Hub identifier from the registry
    cached = _PIPE_CACHE.get(hub_id)
    if cached is not None:  # cache hit: no loading needed
        return cached, _CFG_CACHE.get(hub_id)
    try:
        tokenizer = AutoTokenizer.from_pretrained(hub_id)
        classifier = AutoModelForSequenceClassification.from_pretrained(hub_id)
        pipeline = TextClassificationPipeline(
            model=classifier,
            tokenizer=tokenizer,
            framework="pt",  # PyTorch backend
            # NOTE(review): return_all_scores is deprecated in newer
            # transformers; callers rely on the nested-list output shape,
            # so switching to top_k=None would require changing them too.
            return_all_scores=True,
            device=-1  # CPU only
        )
        _PIPE_CACHE[hub_id] = pipeline
        _CFG_CACHE[hub_id] = AutoConfig.from_pretrained(hub_id)
        cleanup_cache()  # evict oldest entries if over the limit
        return pipeline, _CFG_CACHE[hub_id]
    except Exception as exc:  # any load failure
        print(f"model yukleme hatasi: {model_key} -> {exc}")
        return None, None
# ====== LANGUAGE DETECTION ======
try:
    from langdetect import detect  # lightweight language-detection library
except Exception:
    detect = None  # degrade gracefully when the library is unavailable

def detect_lang(text: str) -> str:
    """Classify *text* into "en", "tr", or "other" (the only supported buckets).

    Returns "other" for empty/too-short input, when langdetect is missing,
    or when detection raises (e.g. on digit-only strings).
    """
    stripped = (text or "").strip()
    if len(stripped) < 2:  # empty or too short to detect reliably
        return "other"
    if detect is None:  # library not installed
        return "other"
    try:
        code = detect(stripped)
    except Exception:
        return "other"  # detection failed
    return code if code in ("en", "tr") else "other"  # only en/tr are supported
# ====== LABEL NORMALIZATION ======
def normalize_label(raw_label: str, cfg: Optional["AutoConfig"], kind: str) -> str:
    """Map a model-specific label onto {"negative", "neutral", "positive"}.

    Handles generic "LABEL_<n>" ids (resolved through cfg.id2label when
    available), 1-5 star outputs (kind == "5star"), and textual labels
    containing neg/neu/pos. Anything unrecognized falls back to "neutral"
    (a safe default for 2-class models as well).
    """
    label = (raw_label or "").lower()
    # Resolve generic "label_<idx>" names via the model config, when possible.
    if label.startswith("label_") and cfg is not None and hasattr(cfg, "id2label"):
        try:
            label = str(cfg.id2label[int(label.split("_")[-1])]).lower()
        except Exception:
            pass  # keep the generic name if the lookup fails
    if kind == "5star":  # 1..5 star rating models
        star = re.search(r"([1-5])", label)
        if star:
            rating = int(star.group(1))
            if rating <= 2:
                return "negative"  # 1-2 stars
            return "neutral" if rating == 3 else "positive"  # 3 vs 4-5 stars
    for fragment, normalized in (("neg", "negative"), ("neu", "neutral"), ("pos", "positive")):
        if fragment in label:
            return normalized
    return "neutral"  # unrecognized label -> safe default
# ====== ENGLISH PREPROCESSING ======
def preprocess_en(text: str) -> str:
    """Lightweight English normalization for social-media-style text.

    Collapses whitespace, shortens elongated characters (4+ repeats -> 2),
    masks URLs as "URL" and mentions as "@USER", strips the '#' off
    hashtags, and expands common contractions. Falsy input is returned
    unchanged.
    """
    if not text:  # None or empty string
        return text
    cleaned = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)  # "soooo" -> "soo"
    cleaned = re.sub(r"http[s]?://\S+", "URL", cleaned)  # mask links
    cleaned = re.sub(r"@\w+", "@USER", cleaned)  # mask mentions
    cleaned = re.sub(r"#(\w+)", r"\1", cleaned)  # drop the hashtag sign
    # Contractions — specific forms first so "won't"/"can't" win over "n't".
    for contraction, expansion in (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    ):
        cleaned = cleaned.replace(contraction, expansion)
    return cleaned
# ====== LANGUAGE -> DEFAULT MODEL RULE ======
def pick_default_key_for_lang(lang: str) -> str:
    """Return the default MODELS key for a language bucket.

    English gets "roberta"; Turkish and every other language fall back to
    the multilingual "xlmr".
    """
    return "roberta" if lang == "en" else "xlmr"
# ====== MAIN API FUNCTION (/api/predict/analyze) ======
def analyze(text: str, force_lang: Optional[str] = None, benchmark: bool = False):
    """Run sentiment analysis on *text* and return a JSON-serializable dict.

    Args:
        text: input text; empty/whitespace-only input short-circuits to a
            neutral placeholder response without any inference.
        force_lang: optional "en"/"tr"/"other" override; any other value
            (including "") falls back to automatic detection.
        benchmark: when True, run the language's TOP3 candidate models,
            pick a winner (highest score, then lowest latency), and include
            the per-candidate results in the response.

    Returns:
        Dict with label, score, confidence, lang, model_used,
        processing_time_ms (and candidates in benchmark mode), or an error
        payload when the winning model cannot be loaded.
    """
    txt = (text or "").strip()  # sanitize the incoming text
    if not txt:  # empty input: skip inference entirely
        return {  # neutral placeholder response
            "label": "neutral",
            "score": 1.0,
            "confidence": "high",
            "lang": force_lang or "other",
            "model_used": "none",
            "processing_time_ms": 0.0
        }
    lang = force_lang if force_lang in ("en", "tr", "other") else detect_lang(txt)  # resolve the language
    proc = preprocess_en(txt) if lang == "en" else txt  # English gets light preprocessing
    candidates: List[Dict] = []  # per-model benchmark results
    if benchmark:  # mini benchmark requested
        keys = LANG_TOP3.get(lang, LANG_TOP3["other"])  # TOP3 candidates for this language
        for k in keys:  # each candidate model
            pipe, cfg = get_pipe_and_cfg(k)  # lazy-load pipeline and config
            if pipe is None:  # model failed to load
                continue  # skip it
            t0 = time.perf_counter()  # start the latency timer
            out = pipe(proc)[0]  # run the model (per-class score list)
            ms = (time.perf_counter() - t0) * 1000.0  # latency in milliseconds
            top = max(out, key=lambda s: s["score"])  # best-scoring class
            lab = normalize_label(top["label"], cfg, MODELS[k]["kind"])  # standardize the label
            candidates.append({  # record this candidate
                "model": k,
                "label": lab,
                "score": float(top["score"]),
                "latency_ms": round(ms, 2)
            })
        if candidates:  # at least one candidate ran
            candidates.sort(key=lambda c: (-c["score"], c["latency_ms"]))  # score desc, then latency asc
            winner_key = candidates[0]["model"]  # best candidate wins
        else:  # nothing loaded
            winner_key = pick_default_key_for_lang(lang)  # fall back to the default
    else:  # no benchmark
        winner_key = pick_default_key_for_lang(lang)  # use the language default directly
    # NOTE(review): in benchmark mode the winner is re-run below even though
    # its prediction was just computed above — confirm the duplicate
    # inference is intended (it doubles the winner's latency cost).
    pipe, cfg = get_pipe_and_cfg(winner_key)  # pipeline/config of the winner
    if pipe is None:  # winner could not be loaded
        return {  # explicit error payload
            "label": "error",
            "score": 0.0,
            "confidence": "low",
            "lang": lang,
            "model_used": winner_key,
            "processing_time_ms": 0.0,
            "error": "model_load_failed"
        }
    t0 = time.perf_counter()  # start the latency timer
    out = pipe(proc)[0]  # single-shot prediction
    ms = (time.perf_counter() - t0) * 1000.0  # latency in milliseconds
    top = max(out, key=lambda s: s["score"])  # best-scoring class
    label = normalize_label(top["label"], cfg, MODELS[winner_key]["kind"])  # standardize the label
    score = float(top["score"])  # plain float for JSON
    confidence = "high" if score > 0.8 else ("medium" if score > 0.6 else "low")  # bucketed confidence
    resp = {  # assemble the response
        "label": label,  # final label
        "score": round(score, 4),  # score to 4 decimals
        "confidence": confidence,  # confidence bucket
        "lang": lang,  # resolved language
        "model_used": MODELS[winner_key]["id"].split("/")[-1],  # short model name (id tail)
        "processing_time_ms": round(ms, 2)  # inference latency in ms
    }
    if benchmark and candidates:  # benchmark mode with results
        resp["candidates"] = candidates  # expose per-model results too
    return resp
# ====== BENCHMARK UI (automatic EN/TR/OTHER bucketing) ======
def run_benchmark_auto(texts_blob: str):
    """Benchmark the TOP3 models per language over newline-separated texts.

    Each non-empty input line is bucketed by detected language, then scored
    by every candidate model of that bucket. Returns (summary_text, rows)
    where each row is [text, bucket/model, label, score, latency_ms,
    confidence]; load/prediction failures produce "ERROR" rows and are
    listed in the summary.
    """
    texts = [t.strip() for t in (texts_blob or "").splitlines() if t.strip()]  # split lines, drop blanks
    if not texts:  # nothing to do
        return "Uyari: metin alani bos.", []  # user-facing warning (kept in Turkish) + empty table
    buckets = {"en": [], "tr": [], "other": []}  # per-language buckets
    for t in texts:  # route each text
        buckets[detect_lang(t)].append(t)  # into its language bucket
    rows: List[List] = []  # output table rows
    errors: List[str] = []  # collected error messages
    def bench_set(text_list: List[str], keys: List[str], tag: str):
        # Run every candidate model in *keys* over *text_list*,
        # appending results to the enclosing rows/errors lists.
        if not text_list:  # empty bucket
            return
        for k in keys:  # each candidate model
            spec = MODELS[k]  # registry entry
            pipe, cfg = get_pipe_and_cfg(k)  # lazy-load pipeline and config
            modelname = f"{tag}/{spec['name']}"  # display name for the table
            if pipe is None:  # model failed to load
                errors.append(f"yuklenemedi: {modelname}")  # record the failure
                for t in text_list:  # one error row per text
                    rows.append([t[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])
                continue  # move on to the next model
            proc = [preprocess_en(x) if tag == "EN" else x for x in text_list]  # preprocess English only
            t0 = time.perf_counter()  # start the latency timer
            outs = pipe(proc)  # batch prediction
            avg_ms = (time.perf_counter() - t0) * 1000.0 / max(1, len(proc))  # mean latency per text
            for orig, out in zip(text_list, outs):  # pair originals with outputs
                try:
                    top = max(out, key=lambda s: s["score"])  # best-scoring class
                    lab = normalize_label(top["label"], cfg, spec["kind"])  # standardize the label
                    sc = float(top["score"])  # plain float
                    conf = "high" if sc > 0.8 else ("medium" if sc > 0.6 else "low")  # confidence bucket
                    rows.append([  # one table row per text
                        orig[:50] + ("..." if len(orig) > 50 else ""),  # truncated preview
                        modelname,  # bucket/model name
                        lab,  # label
                        round(sc, 4),  # score
                        round(avg_ms, 1),  # mean latency
                        conf  # confidence
                    ])
                except Exception as ex:  # per-text prediction failure
                    errors.append(f"hata: {modelname}: {str(ex)[:80]}")  # record truncated error
                    rows.append([orig[:50], modelname, "ERROR", 0.0, 0.0, "N/A"])  # error row
    bench_set(buckets["en"], LANG_TOP3["en"], "EN")  # run the English bucket
    bench_set(buckets["tr"], LANG_TOP3["tr"], "TR")  # run the Turkish bucket
    bench_set(buckets["other"], LANG_TOP3["other"], "OTHER")  # run the remaining bucket
    # build the plain-text summary
    summary_lines: List[str] = []  # summary accumulator
    if errors:  # any failures?
        summary_lines.append("Hatalar:")  # "Errors:" header (user-facing, kept in Turkish)
        for e in errors:  # list each one
            summary_lines.append(f"- {e}")
    if not summary_lines:  # clean run
        summary_lines.append("Benchmark tamamlandi.")  # "Benchmark finished." (user-facing)
    return "\n".join(summary_lines), rows  # summary text + table rows
# ====== GRADIO INTERFACES ======
api_intf = gr.Interface(  # production API interface
    fn=analyze,  # handler function
    inputs=[  # input components
        gr.Textbox(lines=3, label="Text"),  # main text box
        gr.Textbox(lines=1, label="force_lang (en|tr|other, opsiyonel)", value=""),  # optional language override
        gr.Checkbox(label="benchmark (kisa TOP3 karsilastirma)", value=False),  # quick TOP3 comparison toggle
    ],
    outputs=gr.JSON(label="Result"),  # JSON output view
    title="Sentiment API (Production)",  # tab title
    description="POST /api/predict/analyze doner: {label, score, confidence, lang, model_used, processing_time_ms[, candidates]}"  # endpoint summary (user-facing)
)
# NOTE(review): assigning .api_name after construction may be ignored by
# current Gradio versions (it is normally passed as a constructor/event
# argument) — verify the /api/predict/analyze route is actually exposed.
api_intf.api_name = "analyze"  # intended endpoint path: /api/predict/analyze
with gr.Blocks(title="Sentiment Benchmark") as bench_ui:  # benchmark UI
    gr.Markdown("Coklu metin benchmark. Her satir ayri bir ornek olmalidir.")  # usage hint (user-facing, Turkish)
    txt = gr.Textbox(lines=10, label="Ornekler (satir satir)")  # multi-line input, one example per line
    btn = gr.Button("Calistir")  # "Run" button
    out_md = gr.Markdown()  # summary text area
    out_tbl = gr.Dataframe(  # results table
        headers=["text", "bucket/model", "label", "score", "latency_ms", "confidence"],  # column headers
        row_count=(0, "dynamic"),  # rows grow with results
        col_count=(6, "fixed"),  # fixed column count
        interactive=False,  # read-only for the user
        wrap=True  # wrap long text
    )
    btn.click(fn=run_benchmark_auto, inputs=[txt], outputs=[out_md, out_tbl])  # wire button -> benchmark
demo = gr.TabbedInterface(  # two-tab top-level app
    [api_intf, bench_ui],  # first tab API, second tab benchmark
    tab_names=["API", "Benchmark"]  # tab labels
)
if __name__ == "__main__":  # script entry point
    # Bind to all interfaces; port comes from $PORT (HF Spaces convention), defaulting to 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), show_error=True)  # start Gradio