yalishanda committed on
Commit
bc70c67
·
verified ·
1 Parent(s): 79be820
Files changed (1) hide show
  1. streamlit_app.py +506 -0
streamlit_app.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ import streamlit as st
4
+
5
+
6
def process_polish_text(
    text: str,
    iotazation_mode: Literal["separate", "iotized"],
    i_marker: Literal["ь", "j", "ı"] = "ı",  # Used when iotazation_mode="separate"
    j_marker: Literal["й", "j", "ï"] = "й",  # Used when iotazation_mode="separate"
    replace_nasals: bool = False,
    rz_as_r: bool = False,
    replace_o_with_uk: bool = False,
) -> str:
    """Convert Polish text to Cyrillic using specific character mappings and options.

    The conversion is a sequence of plain substring replacements applied in a
    fixed order: digraphs first, then i/j + vowel combinations, then single
    letters, and finally the Cyrillic "дз" pair is collapsed into dze.

    Args:
        text: The Polish text to convert.
        iotazation_mode: "separate" to write a separate marker for i/j,
            "iotized" to merge i/j + vowel into one iotized vowel.
        i_marker: Marker replacing "i" in ie/ia/iu/io/ię/ią
            (only used when iotazation_mode="separate").
        j_marker: Replacement for the letter "j"
            (only used when iotazation_mode="separate").
        replace_nasals: Whether to replace Ąą with Ѫѫ and Ęę with Ѧѧ.
        rz_as_r: Whether to replace RZ with Р instead of Ж.
        replace_o_with_uk: Whether to replace Óó with the monograph Uk
            (capital U+A64A, small U+A64B) instead of Уу.

    Returns:
        The converted text. In "iotized" mode a "j" that is not followed by
        one of the handled vowels is left as a Latin "j".
    """
    i_lower = i_marker
    i_upper = i_marker.upper()

    # When "i" is marked with a Latin "j" but the letter "j" itself maps to
    # something else, the later j -> j_marker pass would also rewrite the
    # freshly inserted i-markers. Park them on private-use code points (assumed
    # absent from real input) and restore them after all replacements ran.
    protect_i_marker = (
        iotazation_mode == "separate" and i_marker == "j" and j_marker != "j"
    )
    if protect_i_marker:
        i_lower, i_upper = "\ue000", "\ue001"

    # Order matters! Longer sequences must be replaced first.
    # DZ is intentionally NOT handled here; see the end of the table.
    replacements = [
        ("SZ", "Ш"), ("Sz", "Ш"), ("sz", "ш"),
        ("CZ", "Ч"), ("Cz", "Ч"), ("cz", "ч"),
        ("DŻ", "Џ"), ("Dż", "Џ"), ("dż", "џ"),
        ("CH", "Х"), ("Ch", "Х"), ("ch", "х"),
        ("DŹ", "Ђ"), ("Dź", "Ђ"), ("dź", "ђ"),
    ]

    # RZ replacement - conditional based on rz_as_r option
    rz_upper, rz_lower = ("Р", "р") if rz_as_r else ("Ж", "ж")
    replacements += [("RZ", rz_upper), ("Rz", rz_upper), ("rz", rz_lower)]

    if iotazation_mode == "separate":
        # Swap only the "i" for the chosen marker; the vowel stays Latin here
        # and is converted by the single-letter table further down.
        for vowel in "eauoęą":
            upper_vowel = vowel.upper()
            replacements += [
                (f"I{upper_vowel}", f"{i_upper}{upper_vowel}"),
                (f"I{vowel}", f"{i_lower}{vowel}"),
                (f"i{vowel}", f"{i_lower}{vowel}"),
            ]
    else:
        # Merge i/j + vowel into one iotized letter. The nasal combinations
        # are always converted in this mode, regardless of replace_nasals.
        iotized_vowels = [
            ("ę", "Ѩ", "ѩ"),
            ("ą", "Ѭ", "ѭ"),
            ("e", "Є", "є"),
            ("a", "Я", "я"),
            ("u", "Ю", "ю"),
            ("o", "Ё", "ё"),
        ]
        for glide in "ij":
            upper_glide = glide.upper()
            for vowel, cyr_upper, cyr_lower in iotized_vowels:
                replacements += [
                    (f"{upper_glide}{vowel.upper()}", cyr_upper),
                    (f"{upper_glide}{vowel}", cyr_upper),
                    (f"{glide}{vowel}", cyr_lower),
                ]

    # Single letters with diacritics
    replacements += [
        ("Ż", "Ж"), ("ż", "ж"),
        ("Ł", "Л"), ("ł", "л"),
        ("Ś", "Щ"), ("ś", "щ"),
        ("Ć", "Ћ"), ("ć", "ћ"),
        ("Ź", "Җ"), ("ź", "җ"),
        ("Ń", "Њ"), ("ń", "њ"),
    ]

    # Ó/ó replacement - conditional based on replace_o_with_uk option
    if replace_o_with_uk:
        # Escapes keep the case unambiguous: U+A64A is the capital
        # monograph Uk, U+A64B the small one.
        replacements += [("Ó", "\ua64a"), ("ó", "\ua64b")]
    else:
        replacements += [("Ó", "У"), ("ó", "у")]

    # Nasal vowels - replaced only on request, otherwise Ąą/Ęę are kept as is
    if replace_nasals:
        replacements += [("Ą", "Ѫ"), ("ą", "ѫ"), ("Ę", "Ѧ"), ("ę", "ѧ")]

    # Regular Latin to Cyrillic
    replacements += [
        ("A", "А"), ("a", "а"),
        ("B", "Б"), ("b", "б"),
        ("C", "Ц"), ("c", "ц"),
        ("D", "Д"), ("d", "д"),
        ("E", "Е"), ("e", "е"),
        ("F", "Ф"), ("f", "ф"),
        ("G", "Г"), ("g", "г"),
        ("H", "Х"), ("h", "х"),
        ("I", "І"), ("i", "і"),
        ("K", "К"), ("k", "к"),
        ("L", "Љ"), ("l", "љ"),
        ("M", "М"), ("m", "м"),
        ("N", "Н"), ("n", "н"),
        ("O", "О"), ("o", "о"),
        ("P", "П"), ("p", "п"),
        ("Q", "К"), ("q", "к"),
        ("R", "Р"), ("r", "р"),
        ("S", "С"), ("s", "с"),
        ("T", "Т"), ("t", "т"),
        ("U", "У"), ("u", "у"),
        ("V", "В"), ("v", "в"),
        ("W", "В"), ("w", "в"),
        ("X", "КС"), ("x", "кс"),
        ("Y", "И"), ("y", "и"),
        ("Z", "З"), ("z", "з"),
    ]

    # J/j handling: only "separate" mode maps the bare letter; in "iotized"
    # mode any remaining J/j is left untouched.
    if iotazation_mode == "separate":
        replacements += [("J", j_marker.upper()), ("j", j_marker)]

    # DZ is collapsed last, on the already-converted Cyrillic "дз": the target
    # letter dze (U+0405/U+0455) must be written only after every Latin S/s in
    # the text has been converted.
    replacements += [("ДЗ", "\u0405"), ("Дз", "\u0405"), ("дз", "\u0455")]

    result = text
    for old, new in replacements:
        result = result.replace(old, new)

    if protect_i_marker:
        result = result.replace("\ue000", "j").replace("\ue001", "J")

    return result
272
+
273
+
274
def process_bulgarian_text(
    text: str, use_macedonian_digraphs: bool = False, use_iota_letter: bool = False
) -> str:
    """Apply the selected spelling substitutions to Bulgarian text.

    Args:
        text: The Bulgarian text to rewrite.
        use_macedonian_digraphs: Replace ДЖ/дж with Џ/џ and ДЗ/дз with Ѕ/ѕ.
        use_iota_letter: Replace ю/я with ı + у/а, and ь/й with ı. An
            uppercase source letter yields "I" followed by a lowercase vowel.

    Returns:
        The rewritten text; unchanged when both options are off.
    """
    substitutions = []

    if use_macedonian_digraphs:
        substitutions += [
            ("ДЖ", "Џ"),
            ("Дж", "Џ"),
            ("дж", "џ"),
            ("ДЗ", "Ѕ"),
            ("Дз", "Ѕ"),
            ("дз", "ѕ"),
        ]

    if use_iota_letter:
        substitutions += [
            # Iotized vowels are split into iota + plain vowel
            ("Ю", "Iу"),
            ("ю", "ıу"),
            ("Я", "Iа"),
            ("я", "ıа"),
            # Soft sign and short i both become the iota itself
            ("Ь", "I"),
            ("ь", "ı"),
            ("Й", "I"),
            ("й", "ı"),
        ]

    converted = text
    for source, target in substitutions:
        converted = converted.replace(source, target)
    return converted
306
+
307
+
308
# Sample Polish text used as the initial value of the input box on the Polish page.
DEFAULT_POLISH = """
W miasteczku Złotobrzeg czas płynął wolniej niż gdziekolwiek indziej. Zegary na rynku zawsze spieszyły się o pięć minut, a mimo to nikt nigdy nie przychodził punktualnie. Pewnego jesiennego poranka Janek, bibliotekarz o wiecznie poplamionych atramentem palcach, znalazł pod drzwiami biblioteki małą, drewnianą szkatułkę. Nie było na niej żadnego napisu, tylko wyryty symbol ptaka bez skrzydeł.

Zabrał ją do środka i postawił między regałami z zapomnianymi kronikami. Gdy ją otworzył, nie znalazł złota ani listu, lecz mapę miasteczka, na której zaznaczono miejsca już nieistniejące: starą piekarnię, kino „Echo”, dom zielarki spłonięty przed laty. Gdy dotknął mapy, usłyszał cichy szmer, jakby ktoś szeptał wspomnienia.

Od tego dnia Janek zaczął odwiedzać zaznaczone punkty. W ruinach kina poczuł zapach popcornu i usłyszał śmiech dzieci. Na miejscu piekarni znalazł ciepły kamień, jakby dopiero co wyjęty z pieca. Zrozumiał, że szkatułka nie pokazuje miejsc, lecz emocje, które wciąż krążyły po Złotobrzegu.

Wieść o jego wędrówkach szybko się rozeszła. Ludzie zaczęli prosić Janka, by odnalazł ich własne wspomnienia. On jednak pewnego dnia zamknął szkatułkę i schował ją na najwyższej półce biblioteki. Uznał, że nie wszystko musi być odnalezione.

Od tamtej pory Złotobrzeg znów ucichł, ale czasem, gdy wiatr wieje od rynku, można usłyszeć trzepot ptaka bez skrzydeł, przypominający, że przeszłość nigdy całkiem nie znika.


Grzegorz Brzęczyszczykiewicz"""
321
+
322
+
323
# Sample Bulgarian text used as the initial value of the input box on the Bulgarian page.
DEFAULT_BULGARIAN = """
В един слънчев юни, Янко и Джон (английският му приятел) тръгнаха с джип към големия гьол, още известен като язовир "Кючюк Атлантик".
— Дзън! — звънна телефонът, докато слушаха приятен джаз и обсъждаха дзен философията.
— Йо, аверчета! Направила съм ви вкусна топла сьомга. Нося и ядки за уискито. - провика се развълнувано Мария през телефона.
"""
328
+
329
+
330
def polish_page():
    """Render the Polish page: conversion options, input box and converted output."""
    st.header("🇵🇱 POLSZCZYZNA")

    # Polish-specific options
    st.subheader("Options")

    # Iotazation and palatization options
    st.markdown("**Iotazation and palatization**")
    iotazation_mode = st.radio(
        "Choose mode:",
        options=["iotized", "separate"],
        format_func=lambda x: (
            "Soft letters (Ь/J/I + Й/J/Ï)"
            if x == "separate"
            else "Iotized vowels (Я, Ю, Є, Ё, Ѩ, Ѭ)"
        ),
        help="Separate markers: Use customizable markers for palatalization/iotization. "
        "Iotized vowels: Use standard Cyrillic iotized vowels (IA→Я, etc.)",
        label_visibility="collapsed",
    )

    # Show marker selects only when "separate" mode is selected
    if iotazation_mode == "separate":
        col1, col2 = st.columns(2)
        with col1:
            i_marker = st.selectbox(
                "Replace 'i' in combinations like ie, ia, iu, etc. with this letter:",
                options=["ь", "j", "ı"],
                index=2,  # Default to ı
                help="Choose the marker to replace 'i' in combinations like ie, ia, iu, etc.",
            )
        with col2:
            j_marker = st.selectbox(
                "Replace 'j' with this letter:",
                options=["й", "j", "ï"],
                index=0,  # Default to й
                help="Choose the replacement for the letter 'j'",
            )
    else:
        # The markers are ignored in iotized mode; fall back to the same
        # defaults process_polish_text declares so the values stay within
        # its allowed choices.
        i_marker = "ı"
        j_marker = "й"

    st.markdown("---")

    replace_nasals = st.checkbox(
        "Ą → Ѫ and Ę → Ѧ",
        value=True,
        help="When enabled, replaces Ąą with Ѫѫ and Ęę with Ѧѧ. "
        "When disabled, keeps the nasal letters as is. "
        "Note: When 'Iotized vowels' mode is selected, J+nasal combinations (JĄ, JĘ) are always converted to iotized nasals (Ѭ, Ѩ) regardless of this setting.",
    )

    rz_as_r = st.checkbox(
        "RZ → Р (instead of Ж)",
        value=False,
        help="When enabled, replaces RZ/rz with Рр instead of the default Жж.",
    )

    replace_o_with_uk = st.checkbox(
        "Ó → Ꙋ (instead of У)",
        value=False,
        help="When enabled, replaces Óó with Ꙋꙋ instead of the default Уу.",
    )

    # Create two columns for input and output
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Input")
        input_text = st.text_area(
            "Enter Polish text:",
            value=DEFAULT_POLISH,
            height=800,
            placeholder="Type or paste your Polish text here...",
            label_visibility="collapsed",
        )

    # Display output automatically when there's input text
    with col2:
        st.subheader("Output")
        if input_text:
            # Keyword arguments keep the option-to-parameter mapping explicit.
            result = process_polish_text(
                input_text,
                iotazation_mode=iotazation_mode,  # type: ignore
                i_marker=i_marker,  # type: ignore
                j_marker=j_marker,  # type: ignore
                replace_nasals=replace_nasals,
                rz_as_r=rz_as_r,
                replace_o_with_uk=replace_o_with_uk,
            )
            st.text_area(
                "Processed text:",
                value=result,
                height=800,
                disabled=True,
                label_visibility="collapsed",
            )
        else:
            st.text_area(
                "Processed text:",
                value="",
                height=800,
                disabled=True,
                label_visibility="collapsed",
                placeholder="Processed text will appear here...",
            )
431
+
432
+
433
def bulgarian_page():
    """Render the Bulgarian page: options, input box and converted output."""
    st.header("🇧🇬 БЪЛГАРСКИ")

    # Options specific to Bulgarian
    st.subheader("Options")

    use_macedonian_digraphs = st.checkbox(
        "ДЖ → Џ, ДЗ → Ѕ",
        value=False,
        help="When enabled, replaces ДЖдж with Џџ and ДЗдз with Ѕѕ.",
    )

    use_iota_letter = st.checkbox(
        "Use iota letter (ю → ıу, я → ıа, ь/й → ı)",
        value=False,
        help="This will be buggy if text is written in all-caps, as there is no easy way to see the surrounding context of the letters.",
    )

    # Input on the left, output on the right
    input_column, output_column = st.columns(2)

    with input_column:
        st.subheader("Input")
        source_text = st.text_area(
            "Enter Bulgarian text:",
            value=DEFAULT_BULGARIAN,
            height=400,
            placeholder="Type or paste your Bulgarian text here...",
            label_visibility="collapsed",
        )

    # Settings shared by both variants of the read-only output box
    output_settings = {
        "height": 400,
        "disabled": True,
        "label_visibility": "collapsed",
    }

    # The output refreshes on every rerun; an empty input shows a placeholder
    with output_column:
        st.subheader("Output")
        if not source_text:
            st.text_area(
                "Processed text:",
                value="",
                placeholder="Processed text will appear here...",
                **output_settings,
            )
        else:
            converted = process_bulgarian_text(
                source_text, use_macedonian_digraphs, use_iota_letter
            )
            st.text_area("Processed text:", value=converted, **output_settings)
487
+
488
+
489
def main():
    """Configure the Streamlit page and render the language page picked in the sidebar."""
    st.set_page_config(page_title="Language 'Fixer'", page_icon="🌍", layout="wide")

    st.title("'Fixing' languages")

    # Each sidebar choice maps to the function that renders its page
    page_renderers = {
        "Polish": polish_page,
        "Bulgarian": bulgarian_page,
    }

    # Sidebar navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Select Language:", ["Polish", "Bulgarian"])

    # Render the selected page; an unrecognised selection renders nothing
    render_page = page_renderers.get(page)
    if render_page is not None:
        render_page()
503
+
504
+
505
# Start the app only when this file is run as the main module, not when imported.
if __name__ == "__main__":
    main()