import os import re import torch import pandas as pd import gradio as gr from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # ========================================================================= # 1. Sabitler ve Model Yükleme # ========================================================================= HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V3" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" try: tok = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True) mdl = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL_ID).to(DEVICE).eval() print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)") except Exception as e: print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {e}") raise SystemExit(1) # ========================================================================= # 2. Arka Plan İşlemleri # ========================================================================= def get_lemma_for_word(word: str) -> str: """Tek kelimeyi temizler, sayıysa sayıyı bırakır, değilse modele yollar.""" clean_word = word.strip(".,!?();:\"'’") if not clean_word: return word num_match = re.match(r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$", clean_word) if num_match: return num_match.group(1) inputs = tok(clean_word, return_tensors="pt", truncation=True, max_length=128).to(DEVICE) outputs = mdl.generate(**inputs, max_length=128) lemma = tok.decode(outputs[0], skip_special_tokens=True).strip() return lemma if lemma else clean_word @torch.inference_mode() def lemmatize_rows(multiline_text: str): rows = [] sentences = [s.strip() for s in multiline_text.splitlines() if s.strip()] if not sentences: return pd.DataFrame(columns=["Full_Sentence", "Word", "Lemma"]) for sent in sentences: words = sent.split() for w in words: l = get_lemma_for_word(w) rows.append({ "Full_Sentence": sent, "Word": w, "Lemma": l }) return pd.DataFrame(rows) def add_sentence_separators(df: pd.DataFrame, char: str = "-", repeat: int = 10) -> pd.DataFrame: if df.empty: return df rows = [] prev = None for _, r in df.iterrows(): if prev is not None and r["Full_Sentence"] != prev: sep = char * repeat rows.append({ "Full_Sentence": sep, "Word": sep, "Lemma": sep }) rows.append(r.to_dict()) prev = r["Full_Sentence"] return pd.DataFrame(rows) def run_and_save(text): df = lemmatize_rows(text) df_view = add_sentence_separators(df, char="-", repeat=10) out_path = "lemma_output.csv" df.to_csv(out_path, index=False, encoding="utf-8-sig") return df_view, out_path examples = [ "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.", "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar.", "Benimki seninkinden daha güzelmiş, dedi usulca." ] # ========================================================================= # 3. Gradio Arayüzü # ========================================================================= theme = gr.themes.Soft( primary_hue="blue", secondary_hue="slate", neutral_hue="slate" ) custom_css = """ .gradio-container { max-width: 1100px !important; margin: 0 auto !important; padding-top: 20px !important; } #input_text textarea { min-height: 190px !important; font-size: 15px !important; line-height: 1.5 !important; } #results_table { max-height: 420px !important; overflow: auto !important; } #results_table table { table-layout: fixed !important; width: 100% !important; } #results_table th, #results_table td { white-space: normal !important; word-break: break-word !important; } .main-title { text-align: center; margin-bottom: 4px; } .sub-text { text-align: center; opacity: 0.9; margin-bottom: 18px; } .info-box { border: 1px solid #cbd5e1; border-radius: 14px; padding: 14px 16px; margin-top: 12px; margin-bottom: 16px; background: rgba(148,163,184,0.08); } footer { visibility: hidden !important; } """ with gr.Blocks(title="Türkçe Lemmatizer") as demo: gr.HTML("""