---
license: cc-by-sa-4.0
language:
- bn
pipeline_tag: translation
tags:
- ipa
- bengali
- text-to-ipa
- regional-dialect
- number
---

# 🚨 BanglaIPA 🚨

**BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali**

📝 [Paper](https://arxiv.org/abs/2601.01778), 🖥️ GitHub
**BanglaIPA** is a Bengali text-to-International Phonetic Alphabet (IPA) transcription system. It is trained on standard Bengali and six regional dialects of Bangladesh using the DUAL-IPA dataset from Bengali.AI.

## Load the BanglaIPA System

**Prerequisite**

```
!pip install tensorflow
```

**Log in to Hugging Face**

```python
from huggingface_hub import login

login("TOKEN")
```

**Load BanglaIPA model**
```python
## BanglaIPA
import os

from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="Jakir057/BanglaIPA"
)
print(local_dir)
MODEL_PATH = os.path.join(local_dir, "BanglaIPA")
print(f"Model path={MODEL_PATH}")
```

## Transcription Generation

```python
import os

# Set before importing TensorFlow so that it also applies to messages
# logged while TensorFlow itself is being imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Marker that seeds the decoder input.
_START_TOKEN = "[start]"
# Token on which decoding stops (see the note in decode_sequence).
_STOP_TOKEN = "[UNK]"


def get_vocab():
    """
    Returns sorted list of Bengali characters, IPA characters, special tokens
    and other characters seen in the training set.

    The two character lists are merged and de-duplicated before sorting, so
    every entry appears exactly once in the result.
    """
    vb = ['', '[UNK]', '[start]', '[end]', 'া', 'র', '্', 'ে', 'ি', 'ন', 'ক', 'ব', 'স', 'ল', 'ত', 'ম', 'প', 'ু', 'দ', 'ট', 'য়', 'জ', '।', 'ো', 'গ', 'হ', 'য', 'শ', 'ী', 'ই', 'চ', 'ভ', 'আ', 'ও', 'ছ', 'ষ', 'ড', 'ফ', 'অ', 'ধ', 'খ', 'ড়', 'উ', 'ণ', 'এ', 'থ', 'ং', 'ঁ', 'ূ', 'ৃ', 'ঠ', 'ঘ', 'ঞ', 'ঙ', 'ৌ', '‘', 'ৎ', 'ঝ', 'ৈ', '়', 'ঢ', 'ঃ', 'ঈ', '\u200c', 'ৗ', 'a', 'ঐ', 'd', 'w', 'ঋ', 'i', 'e', 't', 's', 'n', 'm', 'b', '“', 'u', 'r', 'œ', 'o', '–', 'ঊ', 'ঢ়', 'Í', 'g', 'p', '\xad', 'h', 'c', 'l', 'ঔ', 'ƒ', '”', 'Ñ', '¡', 'y', 'j', 'f', '→', '—', 'ø', 'è', '¦', '¥', 'x', 'v', 'k']
    vipa = ['', '[UNK]', '[start]', '[end]', 'ɐ', 'ɾ', 'i', 'o', 'e', '̪', 't', 'n', 'k', 'ɔ', 'ʃ', 'b', 'd', 'l', 'u', 'p', 'm', 'ʰ', 'ɟ', '͡', '̯', 'g', 'ʱ', '।', 'c', 'ʲ', 'h', 's', 'ŋ', 'ɛ', 'ɽ', '̃', 'ʷ', '‘', '“', '–', '”', '—', 'w', 'j']
    return sorted(set(vb + vipa))


def get_vectorization():
    """
    Builds the two TextVectorization layers used for inference.

    Both layers share the vocabulary returned by get_vocab(). The Bengali
    layer pads/truncates to 64 tokens; the IPA layer uses 65 because
    decode_sequence drops the last position before calling the model.

    Returns:
    - tuple: (bn_vectorization, ipa_vectorization)
    """
    vocab = get_vocab()
    vocab_size = len(vocab)
    sequence_length = 64
    bn_vectorization = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        output_sequence_length=sequence_length,
        vocabulary=vocab,
    )
    ipa_vectorization = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        output_sequence_length=sequence_length + 1,
        vocabulary=vocab,
    )
    return bn_vectorization, ipa_vectorization


def decode_sequence(input_sentence, bn_vectorization, ipa_vectorization, banglaipa_model):
    """
    Generate IPA for a subword by greedy decoding.

    Args:
    - input_sentence (str): Synthetic sentence in which adjacent characters
      are separated by a space.
    - bn_vectorization: TextVectorization for the Bengali input.
    - ipa_vectorization: TextVectorization for the IPA output.
    - banglaipa_model: Transformer model.

    Returns:
    - str: '[start]' followed by the predicted tokens, each separated from
      the previous one by a space. The stop token is included when the model
      produced it within the length limit.
    """
    max_decoded_sentence_length = 64
    ipa_vocab = ipa_vectorization.get_vocabulary()
    ipa_index_lookup = dict(enumerate(ipa_vocab))
    tokenized_input_sentence = bn_vectorization([input_sentence])

    decoded_sentence = _START_TOKEN
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ipa_vectorization([decoded_sentence])[:, :-1]
        predictions = banglaipa_model([tokenized_input_sentence, tokenized_target_sentence])
        # Greedy choice: most probable token at the current position.
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ipa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        # NOTE(review): decoding stops on '[UNK]', not '[end]'. The
        # vectorizers use TextVectorization's default standardization, which
        # strips ASCII punctuation such as square brackets, so the bracketed
        # markers are presumably looked up as out-of-vocabulary tokens.
        # Confirm against the training setup before changing this.
        if sampled_token == _STOP_TOKEN:
            break
    return decoded_sentence


def sentence_to_word(sentence):
    """
    Generate a word from a synthetic sentence by removing the spaces between
    adjacent characters.

    Args:
    - sentence (str): Synthetic sentence.

    Returns:
    - str: subword/word
    """
    return sentence.replace(" ", "")


def word_to_sentence(word):
    """
    Generate a synthetic sentence from a word by inserting a space after
    every character.

    Args:
    - word (str): subword/word segment

    Returns:
    - str: Synthetic sentence (each character is followed by one space).
    """
    return "".join(ch + " " for ch in word)


def get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model):
    """
    Transcribe a single word/subword to IPA.

    Args:
    - word (str): subword/word segment
    - bn_vectorization: TextVectorization for the Bengali input.
    - ipa_vectorization: TextVectorization for the IPA output.
    - banglaipa_model: Transformer model.

    Returns:
    - str: IPA transcription without the start marker and stop token.
    """
    translated = decode_sequence(word_to_sentence(word), bn_vectorization, ipa_vectorization, banglaipa_model)
    trg = sentence_to_word(translated)
    if trg.startswith(_START_TOKEN):
        trg = trg[len(_START_TOKEN):]
    # Strip the stop token only when it is really there: if decoding reached
    # the length limit without emitting it, a blind trg[:-5] would cut off
    # the last five IPA characters.
    if trg.endswith(_STOP_TOKEN):
        trg = trg[:-len(_STOP_TOKEN)]
    return trg


if __name__ == "__main__":
    # MODEL_PATH is defined in the "Load BanglaIPA model" snippet above.
    path = MODEL_PATH
    banglaipa_model = tf.saved_model.load(path)
    print("BanglaIPA model loaded.")
    bn_vectorization, ipa_vectorization = get_vectorization()

    text = "একটি বাছাই করুন গণিত প্রথম গণিত দ্বিতীয় পত্র"
    transcriptions = []
    # split() without arguments skips empty strings caused by repeated spaces.
    for word in text.split():
        trg = get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model)
        print(word, trg)
        transcriptions.append(trg)
    ipa = " ".join(transcriptions)
    print(f"IPA={ipa}")

## python inference.py
#
# Output:
# BanglaIPA model loaded.
# একটি ekti
# বাছাই bɐcʰɐ͡i̯
# করুন koɾun
# গণিত gonit̪o
# প্রথম pɾot̪ʰom
# গণিত gonit̪o
# দ্বিতীয় d̪it̪iʲo
# পত্র pɔt̪ɾo
# IPA=ekti bɐcʰɐ͡i̯ koɾun gonit̪o pɾot̪ʰom gonit̪o d̪it̪iʲo pɔt̪ɾo
```

## Citation

```
@misc{hasan2026banglaiparobusttexttoipatranscription,
      title={BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali},
      author={Jakir Hasan and Shrestha Datta and Md Saiful Islam and Shubhashis Roy Dipta and Ameya Debnath},
      year={2026},
      eprint={2601.01778},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2601.01778},
}
```