| | import os |
| | import re |
| | import torch |
| | import pandas as pd |
| | import gradio as gr |
| | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
| |
|
| | |
| | |
| | |
| |
|
# Hugging Face Hub identifier of the checkpoint used for lemmatization.
HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V3"

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Tokenizer and model are loaded once, at import time. Any failure here is
# treated as fatal: the error is printed and the process exits with status 1.
try:
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL_ID)
    mdl = mdl.to(DEVICE)
    mdl = mdl.eval()
    print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)")
except Exception as load_error:
    print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {load_error}")
    raise SystemExit(1)
| |
|
| | |
| | |
| | |
| |
|
def get_lemma_for_word(word: str) -> str:
    """Return the lemma for a single token.

    Leading and trailing punctuation is stripped first. If nothing is left,
    the token is returned exactly as given. If the stripped token is a number
    (optionally with a decimal part and an apostrophe/period plus letter
    suffix), only the numeric part is returned. Anything else is sent to the
    model; when the model output decodes to an empty string, the stripped
    token is returned instead.
    """
    stripped = word.strip(".,!?();:\"'’")
    if stripped == "":
        # Pure punctuation (or an empty token): hand it back untouched.
        return word

    # Numbers such as "15.30'da" or "2'şerli" keep only their numeric prefix
    # and never reach the model.
    numeric = re.match(r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$", stripped)
    if numeric is not None:
        return numeric.group(1)

    encoded = tok(stripped, return_tensors="pt", truncation=True, max_length=128).to(DEVICE)
    generated = mdl.generate(**encoded, max_length=128)
    decoded = tok.decode(generated[0], skip_special_tokens=True).strip()

    # Fall back to the cleaned token when decoding yields nothing.
    return decoded or stripped
| |
|
| |
|
@torch.inference_mode()
def lemmatize_rows(multiline_text: str):
    """Lemmatize every word of every non-blank line of the input.

    Each non-blank line is treated as one sentence and split on whitespace;
    every resulting word is passed to ``get_lemma_for_word``.

    Args:
        multiline_text: Text with one sentence per line. ``None`` is treated
            the same as an empty string.

    Returns:
        A DataFrame with the columns ``Full_Sentence``, ``Word`` and
        ``Lemma``, one row per word. It is empty (but still has those
        columns) when the input contains no non-blank line.
    """
    columns = ["Full_Sentence", "Word", "Lemma"]

    # Guard against a missing value so an empty submission yields an empty
    # table instead of an AttributeError on ``None.splitlines()``.
    text = multiline_text or ""
    sentences = [line.strip() for line in text.splitlines() if line.strip()]

    if not sentences:
        return pd.DataFrame(columns=columns)

    rows = []
    for sent in sentences:
        for w in sent.split():
            rows.append({
                "Full_Sentence": sent,
                "Word": w,
                "Lemma": get_lemma_for_word(w),
            })

    # Pin the column order so both return paths share the same schema.
    return pd.DataFrame(rows, columns=columns)
| |
|
| |
|
def add_sentence_separators(df: pd.DataFrame, char: str = "-", repeat: int = 10) -> pd.DataFrame:
    """Insert a divider row wherever ``Full_Sentence`` changes between rows.

    Args:
        df: Table with ``Full_Sentence``, ``Word`` and ``Lemma`` columns.
        char: Character the divider is built from.
        repeat: How many times ``char`` is repeated in each divider cell.

    Returns:
        ``df`` itself when it is empty; otherwise a new DataFrame holding the
        original rows with a divider row (every cell ``char * repeat``)
        between consecutive rows whose ``Full_Sentence`` values differ.
    """
    if df.empty:
        return df

    divider = char * repeat
    output_rows = []
    last_sentence = None

    for _, record in df.iterrows():
        current_sentence = record["Full_Sentence"]
        # Only a change of sentence text triggers a divider, so two adjacent
        # identical sentences are not separated.
        sentence_changed = last_sentence is not None and current_sentence != last_sentence
        if sentence_changed:
            output_rows.append(dict.fromkeys(("Full_Sentence", "Word", "Lemma"), divider))
        output_rows.append(record.to_dict())
        last_sentence = current_sentence

    return pd.DataFrame(output_rows)
| |
|
| |
|
def run_and_save(text):
    """Lemmatize ``text``, write the result to CSV and return preview + path.

    Returns:
        A ``(preview, path)`` tuple: the lemma table with divider rows added
        for display, and the path of the CSV file. The CSV is written from
        the table without the divider rows.
    """
    # NOTE(review): the output path is fixed and relative, so every call
    # overwrites the same file — confirm this is acceptable when several
    # requests may run at once.
    csv_path = "lemma_output.csv"

    lemma_table = lemmatize_rows(text)
    preview_table = add_sentence_separators(lemma_table, char="-", repeat=10)

    # "utf-8-sig" is the encoding handed to pandas for the export.
    lemma_table.to_csv(csv_path, index=False, encoding="utf-8-sig")

    return preview_table, csv_path
| |
|
| |
|
# Sample sentences offered as clickable examples in the UI (passed to
# gr.Examples further down in this file).
examples = [
    "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.",
    "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar.",
    "Benimki seninkinden daha güzelmiş, dedi usulca."
]
| |
|
| | |
| | |
| | |
| |
|
# Gradio "Soft" theme with the color hues used by the app.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate"
)

# Extra CSS for the page. The #input_text and #results_table selectors match
# the elem_id values given to the textbox and the result table in the UI
# definition; .main-title, .sub-text and .info-box match the classes used in
# the gr.HTML snippets. The last rule hides the page footer.
custom_css = """
.gradio-container {
max-width: 1100px !important;
margin: 0 auto !important;
padding-top: 20px !important;
}

#input_text textarea {
min-height: 190px !important;
font-size: 15px !important;
line-height: 1.5 !important;
}

#results_table {
max-height: 420px !important;
overflow: auto !important;
}

#results_table table {
table-layout: fixed !important;
width: 100% !important;
}

#results_table th, #results_table td {
white-space: normal !important;
word-break: break-word !important;
}

.main-title {
text-align: center;
margin-bottom: 4px;
}

.sub-text {
text-align: center;
opacity: 0.9;
margin-bottom: 18px;
}

.info-box {
border: 1px solid #cbd5e1;
border-radius: 14px;
padding: 14px 16px;
margin-top: 12px;
margin-bottom: 16px;
background: rgba(148,163,184,0.08);
}

footer {
visibility: hidden !important;
}
"""
| |
|
# Build the Gradio interface: header, info box, input column with examples,
# action buttons, result table, CSV download, and the event wiring.
with gr.Blocks(title="Türkçe Lemmatizer") as demo:

    # Page title and one-line description.
    gr.HTML("""
<div class="main-title">
<h1>Türkçe Lemmatization Aracı</h1>
</div>
<div class="sub-text">
Türkçe cümleleri kelime kelime işleyerek köklerini çıkarır ve CSV olarak indirmenizi sağlar.
</div>
""")

    # Info box; the model identifier is interpolated from HF_MODEL_ID.
    gr.HTML(f"""
<div class="info-box">
<b>Model:</b> {HF_MODEL_ID}<br>
<b>Çalışma mantığı:</b> Metin satır satır, her satır da kelime kelime işlenir.<br>
<b>Not:</b> Arayüzde nadiren Türkçe karakter görüntüleme farkları olabilir; model mantığında Türkçe desteği korunur.
</div>
""")

    with gr.Row():
        # Wide column: the text input and the example sentences.
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="Metin Girişi",
                placeholder="Buraya bir veya birden fazla Türkçe cümle yazın...",
                lines=8,
                elem_id="input_text"
            )

            # Each example string is wrapped in a one-element list.
            gr.Examples(
                examples=[[e] for e in examples],
                inputs=inp,
                label="Örnek girdiler"
            )

        # Narrow column: the run and clear buttons.
        with gr.Column(scale=1):
            btn = gr.Button("Kökleri Bul", variant="primary")
            clr = gr.Button("Temizle", variant="secondary")

    # Read-only preview of the result table.
    out_tbl = gr.Dataframe(
        headers=["Full_Sentence", "Word", "Lemma"],
        label="Sonuç Önizleme",
        interactive=False,
        wrap=True,
        elem_id="results_table"
    )

    # Download slot for the CSV path returned by run_and_save.
    out_file = gr.File(label="CSV Çıktısı")

    # Clicking the button and submitting the textbox both run the same
    # handler and fill the same two outputs.
    btn.click(
        fn=run_and_save,
        inputs=inp,
        outputs=[out_tbl, out_file]
    )

    inp.submit(
        fn=run_and_save,
        inputs=inp,
        outputs=[out_tbl, out_file]
    )

    # Clear: empty string for the textbox, None for the table and the file.
    clr.click(
        fn=lambda: ("", None, None),
        inputs=None,
        outputs=[inp, out_tbl, out_file]
    )
| |
|
# Start the app only when this file is run as a script.
# NOTE(review): theme and css are passed to launch() rather than gr.Blocks();
# presumably this targets a Gradio release whose launch() accepts them —
# confirm against the pinned Gradio version.
if __name__ == "__main__":
    demo.launch(theme=theme, css=custom_css)