Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import epitran | |
| import langcodes | |
| from langcodes import LanguageTagError | |
| from pathlib import Path | |
| from operator import itemgetter | |
| from collections import defaultdict | |
| # TODO: reverse transliterate? | |
def get_lang_description_from_mapping_name(string_to_check, add_original_code=True, add_iso_url=False):
    """Build a human-readable description for an Epitran mapping name.

    Args:
        string_to_check: Mapping name such as ``"swa-Latn"``; it may carry
            extra suffixes (``-red``, ``-suf``, ``-nosuf``, ``-np``).
        add_original_code: When true, prefix the description with the
            mapping name itself followed by ``": "``.
        add_iso_url: When true, render the language name as a Markdown link
            to its page on iso639-3.sil.org.

    Returns:
        The description string. ``"generic-Latn"`` maps to a fixed label.
        If no language can be resolved for the name, the raw mapping name is
        returned so that callers (e.g. a selectbox ``format_func``) always
        receive a string.
    """
    if string_to_check == "generic-Latn":
        return "Generic Latin Script text"

    lang = get_langcode_lang_from_mapping_name(string_to_check)
    if not lang:
        # Nothing to describe; showing the raw code beats returning None or
        # failing on string concatenation further down.
        return string_to_check

    items = []
    for key, value in lang.describe().items():
        if key == "language" and add_iso_url:
            iso_code = lang.to_alpha3()
            value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
        items.append(f"{key}: {value}")
    description = ", ".join(items)

    # Every note starts with a space and is parenthesised so it reads
    # cleanly when appended directly to the description.
    notes = {
        "-red": " (reduced mode)",
        "-suf": " (Based on data with suffixes attached)",
        "-nosuf": " (Based on data with suffixes removed)",
        "-np": " (naively assume phonemic orthography)",
    }
    description += "".join(
        note for marker, note in notes.items() if marker in string_to_check
    )

    if add_original_code:
        description = f"{string_to_check}: {description}"
    return description
def get_langcode_lang_from_mapping_name(string_to_check):
    """Resolve an Epitran mapping name to a ``langcodes`` language object.

    Only the first two dash-separated parts of the name are used; by the
    file's convention these are the ISO 639-3 language code and the
    ISO 15924 script code (e.g. ``"tur-Latn-red"`` -> ``"tur-Latn"``).

    Args:
        string_to_check: Mapping name to resolve.

    Returns:
        The object returned by ``langcodes.get`` for the language/script
        tag, or ``None`` when the name is shorter than two characters or is
        rejected by ``langcodes`` as an invalid tag.
    """
    if len(string_to_check) < 2:
        return None

    language_and_script = "-".join(string_to_check.split("-")[:2])
    try:
        return langcodes.get(language_and_script)
    except LanguageTagError:
        # Callers test the result for truthiness, so report an unparseable
        # tag as "no language" instead of letting the exception escape.
        return None
def get_valid_epitran_mappings_list(map_dir=None):
    """List the Epitran mapping names that this app offers.

    Mapping names are the stems of the files found in the map directory,
    plus the two Chinese mappings which are added as special cases, minus a
    fixed set of mappings known to cause problems.

    Args:
        map_dir: Directory to scan for mapping files. Defaults to the
            ``data/map`` directory inside the installed ``epitran`` package.

    Returns:
        A list of unique mapping names, in discovery order, with the special
        cases last (unless a file of the same name was already discovered).
    """
    if map_dir is None:
        map_dir = Path(epitran.__path__[0]) / "data" / "map"

    # See https://github.com/dmort27/epitran/issues/98
    problem_mappings = {
        "generic-Latn",
        "tur-Latn-bab",
        "ood-Latn-sax",
        "vie-Latn-so",
        "vie-Latn-ce",
        "vie-Latn-no",
        "kaz-Cyrl-bab",
    }

    candidates = [map_file.stem for map_file in Path(map_dir).glob("*.*")]
    candidates.append("cmn-Hans")  # special case
    candidates.append("cmn-Hant")  # Taiwan #1

    # dict.fromkeys drops duplicates while keeping first-seen order, so a
    # stem that occurs twice does not show up twice in the result.
    return [
        mapping
        for mapping in dict.fromkeys(candidates)
        if mapping not in problem_mappings
    ]
def get_epitran(selected_mapping):
    """Create an Epitran transliterator for the given mapping name.

    For the two Chinese mappings (``"cmn-Hans"`` / ``"cmn-Hant"``) an info
    message is shown and ``epitran.download.cedict()`` is called before the
    transliterator is constructed.

    Args:
        selected_mapping: Epitran mapping name, e.g. ``"swa-Latn"``.

    Returns:
        The ``epitran.Epitran`` instance for ``selected_mapping``.
    """
    if selected_mapping in ("cmn-Hans", "cmn-Hant"):
        st.info("Chinese requires a special dictionary. Downloading now")
        # A plain ``import epitran`` does not by itself guarantee that the
        # ``download`` submodule is loaded, so import it explicitly. The
        # ``from ... import ... as`` form avoids rebinding ``epitran`` as a
        # function-local name.
        from epitran import download as epitran_download

        epitran_download.cedict()
    return epitran.Epitran(selected_mapping)
def main():
    """Render the Streamlit page: intro text, mapping picker, input box and result."""
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick. [Click here to visit their repository!](https://github.com/dmort27/epitran)")
    # The link needs an explicit scheme; without it Markdown treats the
    # target as a path relative to the current page.
    st.write("I, [Colin Leong](https://cdleong.github.io) did not create Epitran, but I have created this web app (kindly hosted by Hugging Face) to make it convenient to use: simply type your text in the box below!")
    st.write("**Feedback:** Provide feedback regarding this web app at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

    valid_epitran_mappings = get_valid_epitran_mappings_list()
    st.write(f"It supports converting many writing systems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

    # Preselect Swahili when it is available; otherwise fall back to the
    # first entry instead of failing with ValueError.
    try:
        index_of_desired_default = valid_epitran_mappings.index("swa-Latn")
    except ValueError:
        index_of_desired_default = 0

    selected_mapping = st.selectbox(
        "Select input language/script:",
        valid_epitran_mappings,
        index=index_of_desired_default,
        format_func=get_lang_description_from_mapping_name,
    )

    description = get_lang_description_from_mapping_name(selected_mapping, add_iso_url=True)
    st.write(f"Selected input language/script: {description}")

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = get_epitran(str(selected_mapping))

    # Sample text per mapping; any mapping without an entry gets a generic prompt.
    examples = defaultdict(lambda: 'Try typing some words in the language you chose, and they will be transliterated.')
    examples['cmn-Hans'] = '太初有道,道与神同在,道就是神'  # https://www.biblegateway.com/passage/?search=John+1&version=CUVS
    examples['cmn-Hant'] = '太初有道,道與神同在,道就是神。'  # https://www.biblegateway.com/passage/?search=John+1&version=CUV
    examples['swa-Latn'] = 'Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno alikuwa pamoja na Mungu, na Neno alikuwa Mungu.'  # https://www.biblegateway.com/passage/?search=John+1&version=SNT
    examples['ara-Arab'] = 'فِي الْبَدْءِ كَانَ الْكَلِمَةُ، وَالْكَلِمَةُ كَانَ عِنْدَ اللهِ. وَكَانَ الْكَلِمَةُ اللهُ.'  # https://www.biblegateway.com/passage/?search=John+1&version=NAV
    examples['urd-Arab'] = 'دُنیا کی ابتدا ء سے پہلے کلام وہاں تھا کلام خدا کے ساتھ تھا اور کلام خدا تھا۔'  # https://www.biblegateway.com/passage/?search=John+1&version=ERV-UR

    st.write("### Input text below")
    input_text = st.text_area(label="Whatever you type here will be transliterated!", value=examples[selected_mapping])

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)

    output = {
        "original": input_text,
        "transliteration": transliteration,
    }
    st.write(output)


if __name__ == "__main__":
    main()