Sarpyy committed on
Commit
e9de5fb
·
verified ·
1 Parent(s): 9120242

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -52
app.py CHANGED
@@ -9,7 +9,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
  # 1. Sabitler ve Model Yükleme
10
  # =========================================================================
11
 
12
- HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V2"
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
  try:
@@ -18,37 +18,36 @@ try:
18
  print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)")
19
  except Exception as e:
20
  print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {e}")
21
- exit(1)
22
 
23
  # =========================================================================
24
- # 2. Arka Plan İşlemleri (Kelime Kelime İşlem & Excel Düzeltmesi)
25
  # =========================================================================
26
 
27
  def get_lemma_for_word(word: str) -> str:
28
  """Tek kelimeyi temizler, sayıysa sayıyı bırakır, değilse modele yollar."""
29
- # Kelimenin başındaki ve sonundaki noktalama işaretlerini temizle
30
  clean_word = word.strip(".,!?();:\"'’")
31
-
32
  if not clean_word:
33
  return word
34
-
35
- # Sayı kontrolü (Örn: 15.30'da -> 15.30)
36
  num_match = re.match(r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$", clean_word)
37
  if num_match:
38
  return num_match.group(1)
39
 
40
- # Modeli tekil kelime için çalıştır (Modelin doğasına en uygun yöntem)
41
  inputs = tok(clean_word, return_tensors="pt", truncation=True, max_length=128).to(DEVICE)
42
  outputs = mdl.generate(**inputs, max_length=128)
43
  lemma = tok.decode(outputs[0], skip_special_tokens=True).strip()
44
-
45
  return lemma if lemma else clean_word
46
 
 
47
  @torch.inference_mode()
48
  def lemmatize_rows(multiline_text: str):
49
  rows = []
50
  sentences = [s.strip() for s in multiline_text.splitlines() if s.strip()]
51
-
52
  if not sentences:
53
  return pd.DataFrame(columns=["Full_Sentence", "Word", "Lemma"])
54
 
@@ -56,91 +55,184 @@ def lemmatize_rows(multiline_text: str):
56
  words = sent.split()
57
  for w in words:
58
  l = get_lemma_for_word(w)
59
- rows.append({"Full_Sentence": sent, "Word": w, "Lemma": l})
 
 
 
 
60
 
61
  return pd.DataFrame(rows)
62
 
 
63
  def add_sentence_separators(df: pd.DataFrame, char: str = "-", repeat: int = 10) -> pd.DataFrame:
64
- if df.empty: return df
65
- rows, prev = [], None
 
 
 
 
66
  for _, r in df.iterrows():
67
  if prev is not None and r["Full_Sentence"] != prev:
68
  sep = char * repeat
69
- rows.append({"Full_Sentence": sep, "Word": sep, "Lemma": sep})
 
 
 
 
70
  rows.append(r.to_dict())
71
  prev = r["Full_Sentence"]
 
72
  return pd.DataFrame(rows)
73
 
 
74
  def run_and_save(text):
75
  df = lemmatize_rows(text)
76
  df_view = add_sentence_separators(df, char="-", repeat=10)
77
 
78
- out_path = "lemma_output.csv"
79
- # EXCEL TÜRKÇE KARAKTER DÜZELTMESİ (utf-8-sig)
80
- df.to_csv(out_path, index=False, encoding="utf-8-sig")
81
 
82
  return df_view, out_path
83
 
 
84
  examples = [
85
  "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.",
86
- "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar."
 
87
  ]
88
 
89
  # =========================================================================
90
  # 3. Gradio Arayüzü
91
  # =========================================================================
92
 
93
- theme = gr.themes.Soft(primary_hue="slate", neutral_hue="slate")
 
 
 
 
 
94
  custom_css = """
95
- .gradio-container { background: #000000 !important; color: #FFE8DB !important; font-family: Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, "Helvetica Neue", Arial, sans-serif; }
96
- .prose h1, .prose h2, .prose h3, .prose p, label { color: #FFE8DB !important; }
97
- .gr-box, .gr-panel, .border, .container { background: #0b0b0b !important; border: 1.5px solid #739EC9 !important; border-radius: 14px !important; }
98
- textarea, input, .gr-textbox, .gr-file, .gr-form input, .gr-form textarea { background: #0f1a26 !important; color: #FFE8DB !important; border: 2px solid #5682B1 !important; border-radius: 12px !important; }
99
- button { transition: background 0.15s ease, filter 0.15s ease, box-shadow 0.15s ease; }
100
- button.primary, .btn-primary { background: #FFE8DB !important; color: #000000 !important; }
101
- button.primary:hover, .btn-primary:hover { filter: brightness(0.92); }
102
- button.secondary, .btn-secondary { background: rgba(86,130,177,0.15) !important; color: #FFE8DB !important; }
103
- button.secondary:hover, .btn-secondary:hover { background: rgba(86,130,177,0.38) !important; border-color: #5682B1 !important; }
104
- table { border-collapse: separate !important; border-spacing: 0 !important; }
105
- th { background: #5682B1 !important; color: #FFE8DB !important; }
106
- td { background: #0f1a26 !important; color: #FFE8DB !important; }
107
- tbody tr:nth-child(2n) td { background: #122434 !important; }
108
- #results_table { max-height: 360px !important; overflow: auto !important; }
109
- #results_table table { table-layout: fixed !important; width: 100% !important; }
110
- #results_table th, #results_table td { white-space: normal !important; word-break: break-word !important; }
111
- #input_text textarea { min-height: 150px !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  """
113
 
114
- with gr.Blocks(title="TR Lemmatizer", theme=theme, css=custom_css, fill_height=True) as demo:
115
- gr.Markdown("# 🇹🇷 Türkçe Lemmatization (Kök Bulma)")
116
- gr.Markdown(f"Model: `{HF_MODEL_ID.split('/')[-1]}`. İşlem modelin doğası gereği kelime kelime yapılır. Bilgilendirme: Kullanmakta olduğunuz web arayüzündeki (Gradio) sistemsel bir kısıtlama sebebiyle Türkçe karakterlerde bozulmalar görülebilir. Ana yapay zeka modelimizin altyapısında herhangi bir Türkçe karakter sorunu bulunmamaktadır; Eğitilmiş modelimizde ascii hatası bulunmamaktadır.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  with gr.Row():
119
  with gr.Column(scale=3):
120
  inp = gr.Textbox(
121
- lines=6,
122
- placeholder="Örn:\nYolcular yollarda yürüdü.",
123
- show_label=False,
124
  elem_id="input_text"
125
  )
 
 
 
 
 
 
 
126
  with gr.Column(scale=1):
127
- btn = gr.Button("Kökleri Bul ve CSV indir", variant="primary", elem_id="run_btn")
128
- clr = gr.Button("Temizle", variant="secondary", elem_id="clear_btn")
129
 
130
  out_tbl = gr.Dataframe(
131
- headers=["Full_Sentence","Word","Lemma"],
132
- label="Önizleme",
133
  interactive=False,
 
134
  elem_id="results_table"
135
  )
136
 
137
- out_file = gr.File(label="Çıktı CSV")
138
 
139
- gr.Examples(examples=[[e] for e in examples], inputs=inp)
 
 
 
 
140
 
141
- btn.click(run_and_save, inputs=inp, outputs=[out_tbl, out_file])
142
- inp.submit(run_and_save, inputs=inp, outputs=[out_tbl, out_file])
143
- clr.click(lambda: ("", None, None), outputs=[inp, out_tbl, out_file])
 
 
 
 
 
 
 
 
144
 
145
  if __name__ == "__main__":
146
- demo.launch(debug=True)
 
9
  # 1. Sabitler ve Model Yükleme
10
  # =========================================================================
11
 
12
+ HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V3"
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
  try:
 
18
  print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)")
19
  except Exception as e:
20
  print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {e}")
21
+ raise SystemExit(1)
22
 
23
  # =========================================================================
24
+ # 2. Arka Plan İşlemleri
25
  # =========================================================================
26
 
27
  def get_lemma_for_word(word: str) -> str:
28
  """Tek kelimeyi temizler, sayıysa sayıyı bırakır, değilse modele yollar."""
 
29
  clean_word = word.strip(".,!?();:\"'’")
30
+
31
  if not clean_word:
32
  return word
33
+
34
+ # Sayı kontrolü
35
  num_match = re.match(r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$", clean_word)
36
  if num_match:
37
  return num_match.group(1)
38
 
 
39
  inputs = tok(clean_word, return_tensors="pt", truncation=True, max_length=128).to(DEVICE)
40
  outputs = mdl.generate(**inputs, max_length=128)
41
  lemma = tok.decode(outputs[0], skip_special_tokens=True).strip()
42
+
43
  return lemma if lemma else clean_word
44
 
45
+
46
  @torch.inference_mode()
47
  def lemmatize_rows(multiline_text: str):
48
  rows = []
49
  sentences = [s.strip() for s in multiline_text.splitlines() if s.strip()]
50
+
51
  if not sentences:
52
  return pd.DataFrame(columns=["Full_Sentence", "Word", "Lemma"])
53
 
 
55
  words = sent.split()
56
  for w in words:
57
  l = get_lemma_for_word(w)
58
+ rows.append({
59
+ "Full_Sentence": sent,
60
+ "Word": w,
61
+ "Lemma": l
62
+ })
63
 
64
  return pd.DataFrame(rows)
65
 
66
+
67
  def add_sentence_separators(df: pd.DataFrame, char: str = "-", repeat: int = 10) -> pd.DataFrame:
68
+ if df.empty:
69
+ return df
70
+
71
+ rows = []
72
+ prev = None
73
+
74
  for _, r in df.iterrows():
75
  if prev is not None and r["Full_Sentence"] != prev:
76
  sep = char * repeat
77
+ rows.append({
78
+ "Full_Sentence": sep,
79
+ "Word": sep,
80
+ "Lemma": sep
81
+ })
82
  rows.append(r.to_dict())
83
  prev = r["Full_Sentence"]
84
+
85
  return pd.DataFrame(rows)
86
 
87
+
88
  def run_and_save(text):
89
  df = lemmatize_rows(text)
90
  df_view = add_sentence_separators(df, char="-", repeat=10)
91
 
92
+ out_path = "lemma_output.csv"
93
+ df.to_csv(out_path, index=False, encoding="utf-8-sig")
 
94
 
95
  return df_view, out_path
96
 
97
+
98
  examples = [
99
  "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.",
100
+ "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar.",
101
+ "Benimki seninkinden daha güzelmiş, dedi usulca."
102
  ]
103
 
104
  # =========================================================================
105
  # 3. Gradio Arayüzü
106
  # =========================================================================
107
 
108
+ theme = gr.themes.Soft(
109
+ primary_hue="blue",
110
+ secondary_hue="slate",
111
+ neutral_hue="slate"
112
+ )
113
+
114
  custom_css = """
115
+ .gradio-container {
116
+ max-width: 1100px !important;
117
+ margin: 0 auto !important;
118
+ padding-top: 20px !important;
119
+ }
120
+
121
+ #input_text textarea {
122
+ min-height: 190px !important;
123
+ font-size: 15px !important;
124
+ line-height: 1.5 !important;
125
+ }
126
+
127
+ #results_table {
128
+ max-height: 420px !important;
129
+ overflow: auto !important;
130
+ }
131
+
132
+ #results_table table {
133
+ table-layout: fixed !important;
134
+ width: 100% !important;
135
+ }
136
+
137
+ #results_table th, #results_table td {
138
+ white-space: normal !important;
139
+ word-break: break-word !important;
140
+ }
141
+
142
+ .main-title {
143
+ text-align: center;
144
+ margin-bottom: 4px;
145
+ }
146
+
147
+ .sub-text {
148
+ text-align: center;
149
+ opacity: 0.9;
150
+ margin-bottom: 18px;
151
+ }
152
+
153
+ .info-box {
154
+ border: 1px solid #cbd5e1;
155
+ border-radius: 14px;
156
+ padding: 14px 16px;
157
+ margin-top: 12px;
158
+ margin-bottom: 16px;
159
+ background: rgba(148,163,184,0.08);
160
+ }
161
+
162
+ footer {
163
+ visibility: hidden !important;
164
+ }
165
  """
166
 
167
+ with gr.Blocks(
168
+ title="Türkçe Lemmatizer",
169
+ theme=theme,
170
+ css=custom_css
171
+ ) as demo:
172
+
173
+ gr.HTML("""
174
+ <div class="main-title">
175
+ <h1>Türkçe Lemmatization Aracı</h1>
176
+ </div>
177
+ <div class="sub-text">
178
+ Türkçe cümleleri kelime kelime işleyerek köklerini çıkarır ve CSV olarak indirmenizi sağlar.
179
+ </div>
180
+ """)
181
+
182
+ gr.HTML(f"""
183
+ <div class="info-box">
184
+ <b>Model:</b> {HF_MODEL_ID}<br>
185
+ <b>Çalışma mantığı:</b> Metin satır satır, her satır da kelime kelime işlenir.<br>
186
+ <b>Not:</b> Arayüzde nadiren Türkçe karakter görüntüleme farkları olabilir; model mantığında Türkçe desteği korunur.
187
+ </div>
188
+ """)
189
 
190
  with gr.Row():
191
  with gr.Column(scale=3):
192
  inp = gr.Textbox(
193
+ label="Metin Girişi",
194
+ placeholder="Buraya bir veya birden fazla Türkçe cümle yazın...",
195
+ lines=8,
196
  elem_id="input_text"
197
  )
198
+
199
+ gr.Examples(
200
+ examples=[[e] for e in examples],
201
+ inputs=inp,
202
+ label="Örnek girdiler"
203
+ )
204
+
205
  with gr.Column(scale=1):
206
+ btn = gr.Button("Kökleri Bul", variant="primary")
207
+ clr = gr.Button("Temizle", variant="secondary")
208
 
209
  out_tbl = gr.Dataframe(
210
+ headers=["Full_Sentence", "Word", "Lemma"],
211
+ label="Sonuç Önizleme",
212
  interactive=False,
213
+ wrap=True,
214
  elem_id="results_table"
215
  )
216
 
217
+ out_file = gr.File(label="CSV Çıktısı")
218
 
219
+ btn.click(
220
+ fn=run_and_save,
221
+ inputs=inp,
222
+ outputs=[out_tbl, out_file]
223
+ )
224
 
225
+ inp.submit(
226
+ fn=run_and_save,
227
+ inputs=inp,
228
+ outputs=[out_tbl, out_file]
229
+ )
230
+
231
+ clr.click(
232
+ fn=lambda: ("", None, None),
233
+ inputs=None,
234
+ outputs=[inp, out_tbl, out_file]
235
+ )
236
 
237
  if __name__ == "__main__":
238
+ demo.launch()