Spaces:
Runtime error
Runtime error
| from datasets import load_dataset | |
| import numpy as np | |
| from transformers import AutoTokenizer | |
| import gradio as gr | |
# Display name -> language code, one "<name> | <code>" pair per row.
# The codes are the suffixes of the `sentence_<code>` columns read from the
# facebook/flores dataset; the display names populate the UI dropdowns.
_LANGUAGE_TABLE = """Acehnese (Arabic script) | ace_Arab
Acehnese (Latin script) | ace_Latn
Mesopotamian Arabic | acm_Arab
Ta’izzi-Adeni Arabic | acq_Arab
Tunisian Arabic | aeb_Arab
Afrikaans | afr_Latn
South Levantine Arabic | ajp_Arab
Akan | aka_Latn
Amharic | amh_Ethi
North Levantine Arabic | apc_Arab
Modern Standard Arabic | arb_Arab
Modern Standard Arabic (Romanized) | arb_Latn
Najdi Arabic | ars_Arab
Moroccan Arabic | ary_Arab
Egyptian Arabic | arz_Arab
Assamese | asm_Beng
Asturian | ast_Latn
Awadhi | awa_Deva
Central Aymara | ayr_Latn
South Azerbaijani | azb_Arab
North Azerbaijani | azj_Latn
Bashkir | bak_Cyrl
Bambara | bam_Latn
Balinese | ban_Latn
Belarusian | bel_Cyrl
Bemba | bem_Latn
Bengali | ben_Beng
Bhojpuri | bho_Deva
Banjar (Arabic script) | bjn_Arab
Banjar (Latin script) | bjn_Latn
Standard Tibetan | bod_Tibt
Bosnian | bos_Latn
Buginese | bug_Latn
Bulgarian | bul_Cyrl
Catalan | cat_Latn
Cebuano | ceb_Latn
Czech | ces_Latn
Chokwe | cjk_Latn
Central Kurdish | ckb_Arab
Crimean Tatar | crh_Latn
Welsh | cym_Latn
Danish | dan_Latn
German | deu_Latn
Southwestern Dinka | dik_Latn
Dyula | dyu_Latn
Dzongkha | dzo_Tibt
Greek | ell_Grek
English | eng_Latn
Esperanto | epo_Latn
Estonian | est_Latn
Basque | eus_Latn
Ewe | ewe_Latn
Faroese | fao_Latn
Fijian | fij_Latn
Finnish | fin_Latn
Fon | fon_Latn
French | fra_Latn
Friulian | fur_Latn
Nigerian Fulfulde | fuv_Latn
Scottish Gaelic | gla_Latn
Irish | gle_Latn
Galician | glg_Latn
Guarani | grn_Latn
Gujarati | guj_Gujr
Haitian Creole | hat_Latn
Hausa | hau_Latn
Hebrew | heb_Hebr
Hindi | hin_Deva
Chhattisgarhi | hne_Deva
Croatian | hrv_Latn
Hungarian | hun_Latn
Armenian | hye_Armn
Igbo | ibo_Latn
Ilocano | ilo_Latn
Indonesian | ind_Latn
Icelandic | isl_Latn
Italian | ita_Latn
Javanese | jav_Latn
Japanese | jpn_Jpan
Kabyle | kab_Latn
Jingpho | kac_Latn
Kamba | kam_Latn
Kannada | kan_Knda
Kashmiri (Arabic script) | kas_Arab
Kashmiri (Devanagari script) | kas_Deva
Georgian | kat_Geor
Central Kanuri (Arabic script) | knc_Arab
Central Kanuri (Latin script) | knc_Latn
Kazakh | kaz_Cyrl
Kabiyè | kbp_Latn
Kabuverdianu | kea_Latn
Khmer | khm_Khmr
Kikuyu | kik_Latn
Kinyarwanda | kin_Latn
Kyrgyz | kir_Cyrl
Kimbundu | kmb_Latn
Northern Kurdish | kmr_Latn
Kikongo | kon_Latn
Korean | kor_Hang
Lao | lao_Laoo
Ligurian | lij_Latn
Limburgish | lim_Latn
Lingala | lin_Latn
Lithuanian | lit_Latn
Lombard | lmo_Latn
Latgalian | ltg_Latn
Luxembourgish | ltz_Latn
Luba-Kasai | lua_Latn
Ganda | lug_Latn
Luo | luo_Latn
Mizo | lus_Latn
Standard Latvian | lvs_Latn
Magahi | mag_Deva
Maithili | mai_Deva
Malayalam | mal_Mlym
Marathi | mar_Deva
Minangkabau (Arabic script) | min_Arab
Minangkabau (Latin script) | min_Latn
Macedonian | mkd_Cyrl
Plateau Malagasy | plt_Latn
Maltese | mlt_Latn
Meitei (Bengali script) | mni_Beng
Halh Mongolian | khk_Cyrl
Mossi | mos_Latn
Maori | mri_Latn
Burmese | mya_Mymr
Dutch | nld_Latn
Norwegian Nynorsk | nno_Latn
Norwegian Bokmål | nob_Latn
Nepali | npi_Deva
Northern Sotho | nso_Latn
Nuer | nus_Latn
Nyanja | nya_Latn
Occitan | oci_Latn
West Central Oromo | gaz_Latn
Odia | ory_Orya
Pangasinan | pag_Latn
Eastern Panjabi | pan_Guru
Papiamento | pap_Latn
Western Persian | pes_Arab
Polish | pol_Latn
Portuguese | por_Latn
Dari | prs_Arab
Southern Pashto | pbt_Arab
Ayacucho Quechua | quy_Latn
Romanian | ron_Latn
Rundi | run_Latn
Russian | rus_Cyrl
Sango | sag_Latn
Sanskrit | san_Deva
Santali | sat_Olck
Sicilian | scn_Latn
Shan | shn_Mymr
Sinhala | sin_Sinh
Slovak | slk_Latn
Slovenian | slv_Latn
Samoan | smo_Latn
Shona | sna_Latn
Sindhi | snd_Arab
Somali | som_Latn
Southern Sotho | sot_Latn
Spanish | spa_Latn
Tosk Albanian | als_Latn
Sardinian | srd_Latn
Serbian | srp_Cyrl
Swati | ssw_Latn
Sundanese | sun_Latn
Swedish | swe_Latn
Swahili | swh_Latn
Silesian | szl_Latn
Tamil | tam_Taml
Tatar | tat_Cyrl
Telugu | tel_Telu
Tajik | tgk_Cyrl
Tagalog | tgl_Latn
Thai | tha_Thai
Tigrinya | tir_Ethi
Tamasheq (Latin script) | taq_Latn
Tamasheq (Tifinagh script) | taq_Tfng
Tok Pisin | tpi_Latn
Tswana | tsn_Latn
Tsonga | tso_Latn
Turkmen | tuk_Latn
Tumbuka | tum_Latn
Turkish | tur_Latn
Twi | twi_Latn
Central Atlas Tamazight | tzm_Tfng
Uyghur | uig_Arab
Ukrainian | ukr_Cyrl
Umbundu | umb_Latn
Urdu | urd_Arab
Northern Uzbek | uzn_Latn
Venetian | vec_Latn
Vietnamese | vie_Latn
Waray | war_Latn
Wolof | wol_Latn
Xhosa | xho_Latn
Eastern Yiddish | ydd_Hebr
Yoruba | yor_Latn
Yue Chinese | yue_Hant
Chinese (Simplified) | zho_Hans
Chinese (Traditional) | zho_Hant
Standard Malay | zsm_Latn
Zulu | zul_Latn"""

# Split every row once and keep (name, code); insertion order follows the
# table, which is also the order shown in the dropdowns.
lang_codes = {
    fields[0]: fields[1]
    for fields in (row.split(" | ") for row in _LANGUAGE_TABLE.split("\n"))
}
# Load the "dev" split of the facebook/flores dataset ("all" configuration).
dataset = load_dataset("facebook/flores", "all", trust_remote_code=True)["dev"]

# Group the sentences by language display name so the UI selection can be used
# directly as the lookup key in `get_results`.
#
# Fix: the previous code stored under the display name but read the running
# list back with the language *code*, so the lookup always missed and every
# language ended up holding only the final sentence of the split.
_sentence_columns = {full: f"sentence_{code}" for full, code in lang_codes.items()}
data_per_lang = {full: [] for full in lang_codes}
for row in dataset:
    for full, column in _sentence_columns.items():
        # Append in place; rebuilding the list with `+` on every row is quadratic.
        data_per_lang[full].append(row[column])
def get_results(tokenizer_name, base_lang, comp_lang):
    """Report how many more (or fewer) tokens one language needs than another.

    Every sentence stored in ``data_per_lang`` for the two selected languages
    is tokenized with the requested tokenizer, the per-sentence token counts
    are averaged, and the relative difference of the two averages is
    formatted as a sentence.

    Args:
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        base_lang: Display name (a key of ``data_per_lang``) of the baseline
            language.
        comp_lang: Display name (a key of ``data_per_lang``) of the language
            compared against the baseline.

    Returns:
        A string of the form ``"You need X% more|less tokens to represent
        your text in <comp_lang> than in <base_lang>."``.

    Raises:
        gr.Error: If a language is not selected or unknown, the tokenizer
            cannot be loaded, or no usable sentences are available.
    """
    # An untouched dropdown passes None; fail with a readable message instead
    # of a bare KeyError.
    if base_lang not in data_per_lang or comp_lang not in data_per_lang:
        raise gr.Error("Please select both a base and a comparison language.")

    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    except (OSError, ValueError) as err:
        raise gr.Error(
            f"Could not load tokenizer {tokenizer_name!r}: {err}"
        ) from err

    base_data = data_per_lang[base_lang]
    comp_data = data_per_lang[comp_lang]

    # Only compare sentence pairs present on both sides (the original loop
    # zipped the two lists, which truncates to the shorter one).
    n_pairs = min(len(base_data), len(comp_data))
    if n_pairs == 0:
        raise gr.Error("No sentences are available for the selected languages.")

    # Read the ids through the "input_ids" key: integer indexing of the
    # tokenizer output is only supported by fast tokenizers and raises
    # KeyError for slow ones. Tokenizing each corpus in a single call also
    # avoids one Python-level tokenizer call per sentence.
    base_results = [len(ids) for ids in tokenizer(base_data[:n_pairs])["input_ids"]]
    comp_results = [len(ids) for ids in tokenizer(comp_data[:n_pairs])["input_ids"]]

    agg_base = float(np.mean(base_results))
    agg_comp = float(np.mean(comp_results))
    if agg_base == 0:
        # Avoid a NaN/inf ratio when the baseline produced no tokens at all.
        raise gr.Error(f"The tokenizer produced no tokens for {base_lang}.")

    token_ratio = agg_comp / agg_base
    print(token_ratio)

    # Express the ratio as a percentage above or below the baseline.
    if token_ratio < 1.:
        adverb = "less"
        token_ratio = (1. - token_ratio) * 100
    else:
        adverb = "more"
        token_ratio = (token_ratio - 1.) * 100

    output = f"You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}."
    return output
# UI: a tokenizer name, two language pickers and a button that renders the
# comparison sentence returned by `get_results` as Markdown.
# NOTE(review): the nesting below is reconstructed; confirm the intended layout.
with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
        with gr.Row():
            # The two dropdowns previously shared the label "Languages", so the
            # baseline and the compared language were indistinguishable.
            with gr.Column():
                base_lang = gr.Dropdown(
                    list(lang_codes.keys()), label="Base language"
                )
            with gr.Column():
                comp_lang = gr.Dropdown(
                    list(lang_codes.keys()), label="Comparison language"
                )
        with gr.Row():
            btn = gr.Button("Submit")
        out_text = gr.Markdown()

    # Input order must match the parameter order of `get_results`.
    btn.click(
        get_results,
        inputs=[tokenizer, base_lang, comp_lang],
        outputs=[out_text],
        api_name=False,
    )

demo.launch()