Jakir057 committed on
Commit
a526474
·
verified ·
1 Parent(s): 87299b5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +144 -55
README.md CHANGED
@@ -33,61 +33,150 @@ from huggingface_hub import login
33
  login("TOKEN")
34
  ```
35
 
36
- **Load base model and BRDialect**<br>
37
  ```python
38
- ## BRDialect
39
- from huggingface_hub import hf_hub_download
 
40
 
41
- kenlm_model_path = hf_hub_download(repo_id="Jakir057/BRDialect", filename="BRDialect/5gram_kenlm.arpa")
42
- state_dict_path = hf_hub_download(repo_id="Jakir057/BRDialect", filename="BRDialect/wav2vec2_bangla_regional_dialect.pth")
43
- ```
44
- ```python
45
- from transformers import AutoProcessor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
46
- import torch
47
- import numpy as np
48
- import pyctcdecode
49
- import librosa
50
-
51
- base_model_id = "ai4bharat/indicwav2vec_v1_bengali"
52
- processor = AutoProcessor.from_pretrained(base_model_id)
53
- model = AutoModelForCTC.from_pretrained(base_model_id)
54
- model.load_state_dict(torch.load(state_dict_path)["model"])
55
-
56
- vocab_dict = processor.tokenizer.get_vocab()
57
- sorted_vocab_dict = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
58
- decoder = pyctcdecode.build_ctcdecoder(
59
- list(sorted_vocab_dict.keys()),
60
- str(kenlm_model_path)
61
  )
62
- processor_with_lm = Wav2Vec2ProcessorWithLM(
63
- feature_extractor=processor.feature_extractor,
64
- tokenizer=processor.tokenizer,
65
- decoder=decoder
66
- )
67
- model.freeze_feature_encoder()
68
- model.eval()
69
  ```
70
 
71
  ## Transcription Generation
72
  ```python
73
- sampling_rate = 16000
74
- path = "AUDIO_PATH"
75
- frame, sr = librosa.load(path, sr=sampling_rate, mono=True)
76
-
77
- inputs = processor(
78
- frame,
79
- sampling_rate=sampling_rate,
80
- return_tensors="pt",
81
- padding=False
82
- )
83
-
84
- with torch.no_grad():
85
- logits = model(inputs.input_values.to("cpu")).logits
86
-
87
- np_logits = logits.squeeze(0).cpu().numpy()
88
- result = processor_with_lm.decode(np_logits, beam_width=256)
89
- text = result.text
90
- print(f"Transcription={text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  ```
92
 
93
  ## Citation
@@ -95,12 +184,12 @@ print(f"Transcription={text}")
95
  ```
96
  @misc
97
  {hasan2026banglaiparobusttexttoipatranscription,
98
- title={BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali},
99
- author={Jakir Hasan and Shrestha Datta and Md Saiful Islam and Shubhashis Roy Dipta and Ameya Debnath},
100
- year={2026},
101
- eprint={2601.01778},
102
- archivePrefix={arXiv},
103
- primaryClass={cs.CL},
104
- url={https://arxiv.org/abs/2601.01778},
105
  }
106
  ```
 
33
  login("TOKEN")
34
  ```
35
 
36
+ **Load BanglaIPA model**<br>
37
  ```python
38
+ ## BanglaIPA
39
+ from huggingface_hub import snapshot_download
40
+ import os
41
 
42
+ local_dir = snapshot_download(
43
+ repo_id="Jakir057/BanglaIPA"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
+ print(local_dir)
46
+
47
+ MODEL_PATH = os.path.join(local_dir, "BanglaIPA")
48
+ print(f"Model path={MODEL_PATH}")
 
 
 
49
  ```
50
 
51
  ## Transcription Generation
52
  ```python
53
+ import tensorflow as tf
54
+ from tensorflow.keras.layers import TextVectorization
55
+ import numpy as np
56
+ import os
57
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
58
+
59
+ def get_vocab():
60
+ """
61
+ Returns a sorted list of the Bengali characters, IPA characters, special tokens and other characters seen in the training set.
62
+ """
63
+ vb = ['', '[UNK]', '[start]', '[end]', 'া', 'র', '্', 'ে', 'ি', 'ন', 'ক', 'ব', 'স', 'ল', 'ত', 'ম', 'প', 'ু', 'দ', 'ট', 'য়', 'জ', '।', 'ো', 'গ', 'হ', 'য', 'শ', 'ী', 'ই', 'চ', 'ভ', 'আ', 'ও', 'ছ', 'ষ', 'ড', 'ফ', 'অ', 'ধ', 'খ', 'ড়', 'উ', 'ণ', 'এ', 'থ', 'ং', 'ঁ', 'ূ', 'ৃ', 'ঠ', 'ঘ', 'ঞ', 'ঙ', 'ৌ', '‘', 'ৎ', 'ঝ', 'ৈ', '়', 'ঢ', 'ঃ', 'ঈ', '\u200c', 'ৗ', 'a', 'ঐ', 'd', 'w', 'ঋ', 'i', 'e', 't', 's', 'n', 'm', 'b', '“', 'u', 'r', 'œ', 'o', '–', 'ঊ', 'ঢ়', 'Í', 'g', 'p', '\xad', 'h', 'c', 'l', 'ঔ', 'ƒ', '”', 'Ñ', '¡', 'y', 'j', 'f', '→', '—', 'ø', 'è', '¦', '¥', 'x', 'v', 'k']
64
+ vipa = ['', '[UNK]', '[start]', '[end]', 'ɐ', 'ɾ', 'i', 'o', 'e', '̪', 't', 'n', 'k', 'ɔ', 'ʃ', 'b', 'd', 'l', 'u', 'p', 'm', 'ʰ', 'ɟ', '͡', '̯', 'g', 'ʱ', '।', 'c', 'ʲ', 'h', 's', 'ŋ', 'ɛ', 'ɽ', '̃', 'ʷ', '‘', '“', '–', '”', '—', 'w', 'j']
65
+ v = vb + vipa
66
+ s = set()
67
+ for ch in v:
68
+ s.add(ch)
69
+ vocab = sorted(list(s))
70
+ return vocab
71
+
72
+ def get_vectorization():
73
+ """
74
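+ Builds the Bengali and IPA TextVectorization layers over the shared character vocabulary and returns them as (bn_vectorization, ipa_vectorization).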
75
+ """
76
+ vocab = get_vocab()
77
+ vocab_size = len(vocab)
78
+ sequence_length = 64
79
+ bn_vectorization = TextVectorization(
80
+ max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
81
+ vocabulary=vocab
82
+ )
83
+ ipa_vectorization = TextVectorization(
84
+ max_tokens=vocab_size,
85
+ output_mode="int",
86
+ output_sequence_length=sequence_length + 1,
87
+ vocabulary=vocab
88
+ )
89
+ return bn_vectorization, ipa_vectorization
90
+
91
+ def decode_sequence(input_sentence, bn_vectorization, ipa_vectorization, banglaipa_model):
92
+ """
93
+ Generate IPA for subword.
94
+
95
+ Args:
96
+ - input_sentence (str): Synthetic sentence in which every pair of adjacent characters is separated by a space.
97
+ - bn_vectorization: TextVectorization
98
+ - ipa_vectorization: TextVectorization
99
+ - banglaipa_model: Transformer model
100
+ Returns:
101
+ - str: String of IPA characters and special tokens where adjacent characters are separated with a space.
102
+ """
103
+ max_decoded_sentence_length = 64
104
+ spa_vocab = ipa_vectorization.get_vocabulary()
105
+ spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
106
+ tokenized_input_sentence = bn_vectorization([input_sentence])
107
+ decoded_sentence = '[start]'
108
+ for i in range(max_decoded_sentence_length):
109
+ tokenized_target_sentence = ipa_vectorization([decoded_sentence])[:, :-1]
110
+ predictions = banglaipa_model([tokenized_input_sentence, tokenized_target_sentence])
111
+ sampled_token_index = np.argmax(predictions[0, i, :])
112
+ sampled_token = spa_index_lookup[sampled_token_index]
113
+ decoded_sentence += " " + sampled_token
114
+ if sampled_token == '[UNK]':
115
+ break
116
+ return decoded_sentence
117
+
118
+ def sentence_to_word(sentence):
119
+ """
120
+ Generate word from synthetic sentence by removing spaces between adjacent characters.
121
+
122
+ Args:
123
+ - sentence (str): Synthetic sentence.
124
+ Returns:
125
+ - str: subword/word
126
+ """
127
+ trg=''
128
+ for ch in sentence:
129
+ if ch != " ":
130
+ trg += ch
131
+ return trg
132
+
133
+ def word_to_sentence(word):
134
+ """
135
+ Generate synthetic sentence from word by inserting spaces between adjacent characters.
136
+
137
+ Args:
138
+ - word (str): subword/word segment
139
+ Returns:
140
+ - str: Synthetic sentence
141
+ """
142
+ sentence = ""
143
+ for ch in word:
144
+ sentence += (ch + " ")
145
+ return sentence
146
+
147
+ def get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model):
148
+ translated = decode_sequence(word_to_sentence(word), bn_vectorization, ipa_vectorization, banglaipa_model)
149
+ trg = sentence_to_word(translated)
150
+ trg = trg[7:]
151
+ trg = trg[:-5]
152
+ return trg
153
+
154
+ if __name__ == "__main__":
155
+ path = MODEL_PATH
156
+ banglaipa_model=tf.saved_model.load(path)
157
+ print("BanglaIPA model loaded.")
158
+ bn_vectorization, ipa_vectorization = get_vectorization()
159
+ text = "একটি বাছাই করুন গণিত প্রথম গণিত দ্বিতীয় পত্র"
160
+ ipa = ""
161
+ words = text.split(" ")
162
+ for word in words:
163
+ trg = get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model)
164
+ print(word, trg)
165
+ ipa += (trg + " ")
166
+ print(f"IPA={ipa}")
167
+
168
+ ## python inference.py
169
+ # Output:
170
+ # BanglaIPA model loaded.
171
+ # একটি ekti
172
+ # বাছাই bɐcʰɐ͡i̯
173
+ # করুন koɾun
174
+ # গণিত gonit̪o
175
+ # প্রথম pɾot̪ʰom
176
+ # গণিত gonit̪o
177
+ # দ্বিতীয় d̪it̪iʲo
178
+ # পত্র pɔt̪ɾo
179
+ # IPA=ekti bɐcʰɐ͡i̯ koɾun gonit̪o pɾot̪ʰom gonit̪o d̪it̪iʲo pɔt̪ɾo
180
  ```
181
 
182
  ## Citation
 
184
  ```
185
  @misc
186
  {hasan2026banglaiparobusttexttoipatranscription,
187
+ title={BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali},
188
+ author={Jakir Hasan and Shrestha Datta and Md Saiful Islam and Shubhashis Roy Dipta and Ameya Debnath},
189
+ year={2026},
190
+ eprint={2601.01778},
191
+ archivePrefix={arXiv},
192
+ primaryClass={cs.CL},
193
+ url={https://arxiv.org/abs/2601.01778},
194
  }
195
  ```