Spaces:

yalishanda
/

fixing-langs

Sleeping

App Files Files Community

yalishanda committed on Jan 25

Commit

bc70c67

verified ·

1 Parent(s): 79be820

Add code

Browse files

Files changed (1) hide show

streamlit_app.py +506 -0

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,506 @@

+from typing import Literal
+import streamlit as st
def _case_variants(latin, upper, lower):
    """Build rules for the ALL-CAPS, Title-case and lowercase spellings of *latin*.

    Both capitalised spellings map to *upper*; the lowercase one maps to *lower*.
    """
    return [
        (latin.upper(), upper),
        (latin.capitalize(), upper),
        (latin.lower(), lower),
    ]


def process_polish_text(
    text: str,
    iotazation_mode: Literal["separate", "iotized"],
    i_marker: Literal["ь", "j", "ı"] = "ı",  # Used when iotazation_mode="separate"
    j_marker: Literal["й", "j", "ï"] = "й",  # Used when iotazation_mode="separate"
    replace_nasals: bool = False,
    rz_as_r: bool = False,
    replace_o_with_uk: bool = False,
) -> str:
    """Convert Polish text to Cyrillic using an ordered list of substitutions.

    The rules are applied one after another with ``str.replace``, so every
    rule sees the output of the rules before it. Multi-letter sequences are
    therefore listed before the single letters they contain.

    Args:
        text: The Polish text to convert.
        iotazation_mode: "separate" writes i+vowel as ``i_marker`` + vowel and
            j as ``j_marker``; "iotized" writes i/j+vowel as one iotized
            Cyrillic vowel and leaves any other j untouched.
        i_marker: Marker for i in i+vowel combinations (ь/j/ı); only used
            when iotazation_mode="separate".
        j_marker: Replacement for the letter j (й/j/ï); only used when
            iotazation_mode="separate".
        replace_nasals: Whether to replace Ąą with Ѫѫ and Ęę with Ѧѧ. In
            "iotized" mode i/j+nasal is converted to Ѩ/Ѭ regardless.
        rz_as_r: Whether to replace RZ with Р instead of Ж.
        replace_o_with_uk: Whether to replace Óó with Ꙋꙋ instead of Уу.

    Returns:
        The converted text.
    """
    replacements = []

    # Digraphs must be consumed before their letters are converted one by one.
    for latin, upper, lower in (
        ("sz", "Ш", "ш"),
        ("cz", "Ч", "ч"),
        ("dż", "Џ", "џ"),
        ("ch", "Х", "х"),
        ("dź", "Ђ", "ђ"),
    ):
        replacements.extend(_case_variants(latin, upper, lower))

    if rz_as_r:
        replacements.extend(_case_variants("rz", "Р", "р"))
    else:
        replacements.extend(_case_variants("rz", "Ж", "ж"))

    if iotazation_mode == "separate":
        # Convert the original J/j *before* any i-marker is inserted.
        # Otherwise an i_marker of "j" would later be mistaken for a Polish
        # j and overwritten with j_marker, making the i_marker choice moot.
        replacements.extend(
            [
                ("J", j_marker.upper()),
                ("j", j_marker),
            ]
        )
        # i + vowel: swap the i for the marker and keep the vowel; the vowel
        # itself is converted by the nasal / plain-letter rules further down.
        # Only the ALL-CAPS spelling gets an upper-case marker.
        i_upper = i_marker.upper()
        for vowel in "eauoęą":
            replacements.extend(
                [
                    ("I" + vowel.upper(), i_upper + vowel.upper()),
                    ("I" + vowel, i_marker + vowel),
                    ("i" + vowel, i_marker + vowel),
                ]
            )
    else:
        # i/j + vowel collapses into a single iotized vowel. Nasal
        # combinations are always converted here, whatever replace_nasals is.
        for latin, upper, lower in (
            ("ię", "Ѩ", "ѩ"),
            ("ią", "Ѭ", "ѭ"),
            ("ie", "Є", "є"),
            ("ia", "Я", "я"),
            ("iu", "Ю", "ю"),
            ("io", "Ё", "ё"),
            ("je", "Є", "є"),
            ("ja", "Я", "я"),
            ("ju", "Ю", "ю"),
            ("jo", "Ё", "ё"),
            ("ję", "Ѩ", "ѩ"),
            ("ją", "Ѭ", "ѭ"),
        ):
            replacements.extend(_case_variants(latin, upper, lower))
        # A j that is not followed by a vowel is left as Latin j in this mode.

    # Single letters with diacritics
    replacements.extend(
        [
            ("Ż", "Ж"),
            ("ż", "ж"),
            ("Ł", "Л"),
            ("ł", "л"),
            ("Ś", "Щ"),
            ("ś", "щ"),
            ("Ć", "Ћ"),
            ("ć", "ћ"),
            ("Ź", "Җ"),
            ("ź", "җ"),
            ("Ń", "Њ"),
            ("ń", "њ"),
        ]
    )

    if replace_o_with_uk:
        # The two monograph-uk glyphs look almost identical in most fonts,
        # so they are spelled by code point to keep the case pairing right.
        replacements.extend(
            [
                ("Ó", "\ua64a"),  # CYRILLIC CAPITAL LETTER MONOGRAPH UK
                ("ó", "\ua64b"),  # CYRILLIC SMALL LETTER MONOGRAPH UK
            ]
        )
    else:
        replacements.extend(
            [
                ("Ó", "У"),
                ("ó", "у"),
            ]
        )

    if replace_nasals:
        replacements.extend(
            [
                ("Ą", "Ѫ"),
                ("ą", "ѫ"),
                ("Ę", "Ѧ"),
                ("ę", "ѧ"),
            ]
        )
    # else: keep Ąą and Ęę as is

    # Plain Latin letters; the upper-case rule is derived from the lower-case
    # pair. J/j is deliberately absent (handled per mode above).
    for latin, cyrillic in (
        ("a", "а"),
        ("b", "б"),
        ("c", "ц"),
        ("d", "д"),
        ("e", "е"),
        ("f", "ф"),
        ("g", "г"),
        ("h", "х"),
        ("i", "і"),
        ("k", "к"),
        ("l", "љ"),
        ("m", "м"),
        ("n", "н"),
        ("o", "о"),
        ("p", "п"),
        ("q", "к"),
        ("r", "р"),
        ("s", "с"),
        ("t", "т"),
        ("u", "у"),
        ("v", "в"),
        ("w", "в"),
        ("x", "кс"),
        ("y", "и"),
        ("z", "з"),
    ):
        replacements.append((latin.upper(), cyrillic.upper()))
        replacements.append((latin, cyrillic))

    # DZ is handled last, on the already-converted Cyrillic ДЗ, so that the
    # result cannot collide with a Latin S/s still waiting to be converted.
    replacements.extend(_case_variants("дз", "Ѕ", "ѕ"))

    result = text
    for old, new in replacements:
        result = result.replace(old, new)
    return result
def process_bulgarian_text(
    text: str, use_macedonian_digraphs: bool = False, use_iota_letter: bool = False
) -> str:
    """Apply the selected optional respellings to Bulgarian text.

    Args:
        text: The Bulgarian text to convert.
        use_macedonian_digraphs: Replace ДЖ/дж with Џ/џ and ДЗ/дз with Ѕ/ѕ.
        use_iota_letter: Replace ю, я, ь and й with spellings based on ı.

    Returns:
        The converted text; unchanged when both options are off.
    """
    rules = []
    if use_macedonian_digraphs:
        rules += [
            ("ДЖ", "Џ"),
            ("Дж", "Џ"),
            ("дж", "џ"),
            ("ДЗ", "Ѕ"),
            ("Дз", "Ѕ"),
            ("дз", "ѕ"),
        ]
    if use_iota_letter:
        rules += [
            # ю and я are rewritten before the bare ь / й substitutions
            ("Ю", "Iу"),
            ("ю", "ıу"),
            ("Я", "Iа"),
            ("я", "ıа"),
            ("Ь", "I"),
            ("ь", "ı"),
            ("Й", "I"),
            ("й", "ı"),
        ]

    converted = text
    for source, target in rules:
        converted = converted.replace(source, target)
    return converted
# Sample Polish text used as the initial value of the input box on the Polish
# page. The closing line is a name packed with rz/cz/sz digraphs.
DEFAULT_POLISH = """
W miasteczku Złotobrzeg czas płynął wolniej niż gdziekolwiek indziej. Zegary na rynku zawsze spieszyły się o pięć minut, a mimo to nikt nigdy nie przychodził punktualnie. Pewnego jesiennego poranka Janek, bibliotekarz o wiecznie poplamionych atramentem palcach, znalazł pod drzwiami biblioteki małą, drewnianą szkatułkę. Nie było na niej żadnego napisu, tylko wyryty symbol ptaka bez skrzydeł.
Zabrał ją do środka i postawił między regałami z zapomnianymi kronikami. Gdy ją otworzył, nie znalazł złota ani listu, lecz mapę miasteczka, na której zaznaczono miejsca już nieistniejące: starą piekarnię, kino „Echo”, dom zielarki spłonięty przed laty. Gdy dotknął mapy, usłyszał cichy szmer, jakby ktoś szeptał wspomnienia.
Od tego dnia Janek zaczął odwiedzać zaznaczone punkty. W ruinach kina poczuł zapach popcornu i usłyszał śmiech dzieci. Na miejscu piekarni znalazł ciepły kamień, jakby dopiero co wyjęty z pieca. Zrozumiał, że szkatułka nie pokazuje miejsc, lecz emocje, które wciąż krążyły po Złotobrzegu.
Wieść o jego wędrówkach szybko się rozeszła. Ludzie zaczęli prosić Janka, by odnalazł ich własne wspomnienia. On jednak pewnego dnia zamknął szkatułkę i schował ją na najwyższej półce biblioteki. Uznał, że nie wszystko musi być odnalezione.
Od tamtej pory Złotobrzeg znów ucichł, ale czasem, gdy wiatr wieje od rynku, można usłyszeć trzepot ptaka bez skrzydeł, przypominający, że przeszłość nigdy całkiem nie znika.
Grzegorz Brzęczyszczykiewicz"""
# Sample Bulgarian text used as the initial value of the input box on the
# Bulgarian page. It contains дж, дз, ю, я, ь and й, i.e. every sequence the
# Bulgarian options rewrite.
DEFAULT_BULGARIAN = """
В един слънчев юни, Янко и Джон (английският му приятел) тръгнаха с джип към големия гьол, още известен като язовир "Кючюк Атлантик".
— Дзън! — звънна телефонът, докато слушаха приятен джаз и обсъждаха дзен философията.
— Йо, аверчета! Направила съм ви вкусна топла сьомга. Нося и ядки за уискито. - провика се развълнувано Мария през телефона.
"""
def polish_page():
    """Render the Polish page: conversion options, input box and live output."""

    def describe_mode(mode):
        # Human-readable label for each radio option.
        if mode == "separate":
            return "Soft letters (Ь/J/I + Й/J/Ï)"
        return "Iotized vowels (Я, Ю, Є, Ё, Ѩ, Ѭ)"

    st.header("🇵🇱 POLSZCZYZNA")
    st.subheader("Options")

    # --- Iotation / palatalisation ---------------------------------------
    st.markdown("**Iotazation and palatization**")
    iotazation_mode = st.radio(
        "Choose mode:",
        options=["iotized", "separate"],
        format_func=describe_mode,
        help="Separate markers: Use customizable markers for palatalization/iotization. "
        "Iotized vowels: Use standard Cyrillic iotized vowels (IA→Я, etc.)",
        label_visibility="collapsed",
    )

    # Placeholder values; the converter ignores both markers in "iotized" mode.
    i_marker = "ı"
    j_marker = "ı"
    if iotazation_mode == "separate":
        # The marker pickers are only shown when they have an effect.
        i_col, j_col = st.columns(2)
        with i_col:
            i_marker = st.selectbox(
                "Replace 'i' in combinations like ie, ia, iu, etc. with this letter:",
                options=["ь", "j", "ı"],
                index=2,  # Default to ı
                help="Choose the marker to replace 'i' in combinations like ie, ia, iu, etc.",
            )
        with j_col:
            j_marker = st.selectbox(
                "Replace 'j' with this letter:",
                options=["й", "j", "ï"],
                index=0,  # Default to й
                help="Choose the replacement for the letter 'j'",
            )

    st.markdown("---")

    # --- Remaining toggles -------------------------------------------------
    replace_nasals = st.checkbox(
        "Ą → Ѫ and Ę → Ѧ",
        value=True,
        help="When enabled, replaces Ąą with Ѫѫ and Ęę with Ѧѧ. "
        "When disabled, keeps the nasal letters as is. "
        "Note: When 'Iotized vowels' mode is selected, J+nasal combinations (JĄ, JĘ) are always converted to iotized nasals (Ѭ, Ѩ) regardless of this setting.",
    )
    rz_as_r = st.checkbox(
        "RZ → Р (instead of Ж)",
        value=False,
        help="When enabled, replaces RZ/rz with Рр instead of the default Жж.",
    )
    replace_o_with_uk = st.checkbox(
        "Ó → Ꙋ (instead of У)",
        value=False,
        help="When enabled, replaces Óó with Ꙋꙋ instead of the default Уу.",
    )

    # --- Side-by-side input / output --------------------------------------
    input_col, output_col = st.columns(2)
    with input_col:
        st.subheader("Input")
        input_text = st.text_area(
            "Enter Polish text:",
            value=DEFAULT_POLISH,
            height=800,
            placeholder="Type or paste your Polish text here...",
            label_visibility="collapsed",
        )

    # Settings shared by both variants of the read-only output box.
    output_box = {"height": 800, "disabled": True, "label_visibility": "collapsed"}
    with output_col:
        st.subheader("Output")
        if not input_text:
            st.text_area(
                "Processed text:",
                value="",
                placeholder="Processed text will appear here...",
                **output_box,
            )
        else:
            converted = process_polish_text(
                input_text,
                iotazation_mode,  # type: ignore
                i_marker,  # type: ignore
                j_marker,  # type: ignore
                replace_nasals,
                rz_as_r,
                replace_o_with_uk,
            )
            st.text_area("Processed text:", value=converted, **output_box)
def bulgarian_page():
    """Render the Bulgarian page: conversion options, input box and live output."""
    st.header("🇧🇬 БЪЛГАРСКИ")
    st.subheader("Options")

    use_macedonian_digraphs = st.checkbox(
        "ДЖ → Џ, ДЗ → Ѕ",
        value=False,
        help="When enabled, replaces ДЖдж with Џџ and ДЗдз with Ѕѕ.",
    )
    use_iota_letter = st.checkbox(
        "Use iota letter (ю → ıу, я → ıа, ь/й → ı)",
        value=False,
        help="This will be buggy if text is written in all-caps, as there is no easy way to see the surrounding context of the letters.",
    )

    # --- Side-by-side input / output --------------------------------------
    input_col, output_col = st.columns(2)
    with input_col:
        st.subheader("Input")
        input_text = st.text_area(
            "Enter Bulgarian text:",
            value=DEFAULT_BULGARIAN,
            height=400,
            placeholder="Type or paste your Bulgarian text here...",
            label_visibility="collapsed",
        )

    # Settings shared by both variants of the read-only output box.
    output_box = {"height": 400, "disabled": True, "label_visibility": "collapsed"}
    with output_col:
        st.subheader("Output")
        if not input_text:
            st.text_area(
                "Processed text:",
                value="",
                placeholder="Processed text will appear here...",
                **output_box,
            )
        else:
            converted = process_bulgarian_text(
                input_text, use_macedonian_digraphs, use_iota_letter
            )
            st.text_area("Processed text:", value=converted, **output_box)
def main():
    """Configure the Streamlit app and render the page picked in the sidebar."""
    st.set_page_config(page_title="Language 'Fixer'", page_icon="🌍", layout="wide")
    st.title("'Fixing' languages")

    # Sidebar label -> function that renders that page.
    pages = {
        "Polish": polish_page,
        "Bulgarian": bulgarian_page,
    }

    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Select Language:", ["Polish", "Bulgarian"])

    render = pages.get(page)
    if render is not None:
        render()


if __name__ == "__main__":
    main()