import os
import re
import tempfile

import gradio as gr
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# =========================================================================
# 1. Constants and model loading
# =========================================================================
HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Column layout shared by the result table and the exported CSV.
_RESULT_COLUMNS = ["Full_Sentence", "Word", "Lemma"]

# Punctuation removed from both ends of a token before lemmatization.
_STRIP_CHARS = ".,!?();:\"'’"

# A number (optionally with one "." or "," group) followed by an optional
# apostrophe/period and letters; group 1 captures the number alone.
# Compiled once because it is applied to every word.
_NUMBER_RE = re.compile(
    r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$"
)

# The model is loaded at import time; without it the app cannot do anything,
# so a load failure terminates the process with exit status 1.
try:
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL_ID).to(DEVICE).eval()
    print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)")
except Exception as e:
    print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {e}")
    raise SystemExit(1) from e


# =========================================================================
# 2. Background processing
# =========================================================================
def get_lemma_for_word(word: str) -> str:
    """Return the lemma for a single token.

    Leading/trailing punctuation is stripped first. A token that is only
    punctuation is returned unchanged. A token that is a number with an
    optional letter suffix yields just the number. Anything else is sent
    to the seq2seq model; if the model decodes to an empty string, the
    stripped token is returned instead.
    """
    clean_word = word.strip(_STRIP_CHARS)
    if not clean_word:
        return word

    num_match = _NUMBER_RE.match(clean_word)
    if num_match:
        return num_match.group(1)

    inputs = tok(
        clean_word, return_tensors="pt", truncation=True, max_length=128
    ).to(DEVICE)
    outputs = mdl.generate(**inputs, max_length=128)
    lemma = tok.decode(outputs[0], skip_special_tokens=True).strip()
    return lemma if lemma else clean_word


@torch.inference_mode()
def lemmatize_rows(multiline_text: str) -> pd.DataFrame:
    """Lemmatize text line by line and word by word.

    Each non-blank line is treated as one sentence and split on
    whitespace. The result has one row per word with the columns
    ``Full_Sentence``, ``Word`` (the token as typed, punctuation included)
    and ``Lemma``. Blank or missing input yields an empty frame that still
    carries those columns.
    """
    # ``or ""`` keeps a None value (e.g. a cleared textbox) from raising.
    sentences = [
        line.strip()
        for line in (multiline_text or "").splitlines()
        if line.strip()
    ]
    if not sentences:
        return pd.DataFrame(columns=_RESULT_COLUMNS)

    rows = [
        {
            "Full_Sentence": sent,
            "Word": word,
            "Lemma": get_lemma_for_word(word),
        }
        for sent in sentences
        for word in sent.split()
    ]
    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def add_sentence_separators(
    df: pd.DataFrame, char: str = "-", repeat: int = 10
) -> pd.DataFrame:
    """Return a copy of *df* with a separator row between sentences.

    A separator row (``char * repeat`` in every column) is inserted each
    time the ``Full_Sentence`` value differs from the previous row's.
    Sentences are told apart by their text only, so two identical
    sentences on consecutive lines are not separated. An empty frame is
    returned as-is.
    """
    if df.empty:
        return df

    sep = char * repeat
    sep_row = {"Full_Sentence": sep, "Word": sep, "Lemma": sep}

    rows = []
    prev = None
    for record in df.to_dict("records"):
        if prev is not None and record["Full_Sentence"] != prev:
            rows.append(dict(sep_row))
        rows.append(record)
        prev = record["Full_Sentence"]
    return pd.DataFrame(rows)


def run_and_save(text):
    """Lemmatize *text*, write the CSV, and return ``(preview, csv_path)``.

    The preview frame contains sentence separator rows; the CSV contains
    only the word rows.
    """
    df = lemmatize_rows(text)
    df_view = add_sentence_separators(df, char="-", repeat=10)

    # Each request gets its own directory so simultaneous requests cannot
    # overwrite one another's file; the download keeps the same file name.
    out_dir = tempfile.mkdtemp(prefix="lemma_")
    out_path = os.path.join(out_dir, "lemma_output.csv")
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(out_path, index=False, encoding="utf-8-sig")
    return df_view, out_path


examples = [
    "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.",
    "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar.",
    "Benimki seninkinden daha güzelmiş, dedi usulca.",
]

# =========================================================================
# 3. Gradio interface
# =========================================================================
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
)

custom_css = """
.gradio-container {
    max-width: 1100px !important;
    margin: 0 auto !important;
    padding-top: 20px !important;
}
#input_text textarea {
    min-height: 190px !important;
    font-size: 15px !important;
    line-height: 1.5 !important;
}
#results_table {
    max-height: 420px !important;
    overflow: auto !important;
}
#results_table table {
    table-layout: fixed !important;
    width: 100% !important;
}
#results_table th, #results_table td {
    white-space: normal !important;
    word-break: break-word !important;
}
.main-title {
    text-align: center;
    margin-bottom: 4px;
}
.sub-text {
    text-align: center;
    opacity: 0.9;
    margin-bottom: 18px;
}
.info-box {
    border: 1px solid #cbd5e1;
    border-radius: 14px;
    padding: 14px 16px;
    margin-top: 12px;
    margin-bottom: 16px;
    background: rgba(148,163,184,0.08);
}
footer {
    visibility: hidden !important;
}
"""

with gr.Blocks(title="Türkçe Lemmatizer") as demo:
    # The markup carries the .main-title / .sub-text / .info-box classes
    # declared in custom_css; <br> keeps the three info lines separate.
    gr.HTML("""
<h1 class="main-title">Türkçe Lemmatization Aracı</h1>
<p class="sub-text">Türkçe cümleleri kelime kelime işleyerek köklerini çıkarır ve CSV olarak indirmenizi sağlar.</p>
""")
    gr.HTML(f"""
<div class="info-box">
Model: {HF_MODEL_ID}<br>
Çalışma mantığı: Metin satır satır, her satır da kelime kelime işlenir.<br>
Not: Arayüzde nadiren Türkçe karakter görüntüleme farkları olabilir; model mantığında Türkçe desteği korunur.
</div>
""")

    with gr.Row():
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="Metin Girişi",
                placeholder="Buraya bir veya birden fazla Türkçe cümle yazın...",
                lines=8,
                elem_id="input_text",
            )
            gr.Examples(
                examples=[[e] for e in examples],
                inputs=inp,
                label="Örnek girdiler",
            )
        with gr.Column(scale=1):
            btn = gr.Button("Kökleri Bul", variant="primary")
            clr = gr.Button("Temizle", variant="secondary")

    out_tbl = gr.Dataframe(
        headers=["Full_Sentence", "Word", "Lemma"],
        label="Sonuç Önizleme",
        interactive=False,
        wrap=True,
        elem_id="results_table",
    )
    out_file = gr.File(label="CSV Çıktısı")

    # The button and pressing Enter in the textbox run the same pipeline.
    btn.click(
        fn=run_and_save,
        inputs=inp,
        outputs=[out_tbl, out_file],
    )
    inp.submit(
        fn=run_and_save,
        inputs=inp,
        outputs=[out_tbl, out_file],
    )
    # Reset the textbox, the preview table and the file component.
    clr.click(
        fn=lambda: ("", None, None),
        inputs=None,
        outputs=[inp, out_tbl, out_file],
    )

if __name__ == "__main__":
    # NOTE(review): theme/css are passed to launch() as in the original;
    # confirm the installed Gradio version accepts them there.
    demo.launch(theme=theme, css=custom_css)