Aditya02 committed on
Commit
1d87ed5
·
verified ·
1 Parent(s): 91fc676

Upload folder using huggingface_hub

Browse files
__pycache__/NumberToText.cpython-310.pyc ADDED
Binary file (3.04 kB). View file
 
__pycache__/text_preprocess_for_inference.cpython-310.pyc ADDED
Binary file (22.3 kB). View file
 
__pycache__/utilities.cpython-310.pyc ADDED
Binary file (8.17 kB). View file
 
hifigan/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (199 Bytes). View file
 
hifigan/__pycache__/env.cpython-310.pyc ADDED
Binary file (864 Bytes). View file
 
hifigan/__pycache__/meldataset.cpython-310.pyc ADDED
Binary file (5.43 kB). View file
 
hifigan/__pycache__/models.cpython-310.pyc ADDED
Binary file (8.71 kB). View file
 
hifigan/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.99 kB). View file
 
hindi_latest/female/model/config.yaml CHANGED
@@ -51,7 +51,7 @@ energy_extract_conf:
51
  win_length: 4096
52
  energy_normalize: global_mvn
53
  energy_normalize_conf:
54
- stats_file: /home/speech/Fastspeech2_latest models/Fastspeech2_HS/hindi_latest/female/model/energy_stats.npz
55
  exclude_weight_decay: false
56
  exclude_weight_decay_conf: {}
57
  feats_extract: fbank
@@ -90,7 +90,7 @@ no_forward_run: false
90
  non_linguistic_symbols: null
91
  normalize: global_mvn
92
  normalize_conf:
93
- stats_file: /home/speech/Fastspeech2_latest models/Fastspeech2_HS/hindi_latest/female/model/feats_stats.npz
94
  num_att_plot: 3
95
  num_cache_chunks: 1024
96
  num_iters_per_epoch: 800
@@ -111,7 +111,7 @@ pitch_extract_conf:
111
  reduction_factor: 1
112
  pitch_normalize: global_mvn
113
  pitch_normalize_conf:
114
- stats_file: /home/speech/Fastspeech2_latest models/Fastspeech2_HS/hindi_latest/female/model/pitch_stats.npz
115
  pretrain_path: null
116
  print_config: false
117
  required:
 
51
  win_length: 4096
52
  energy_normalize: global_mvn
53
  energy_normalize_conf:
54
+ stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/energy_stats.npz
55
  exclude_weight_decay: false
56
  exclude_weight_decay_conf: {}
57
  feats_extract: fbank
 
90
  non_linguistic_symbols: null
91
  normalize: global_mvn
92
  normalize_conf:
93
+ stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/feats_stats.npz
94
  num_att_plot: 3
95
  num_cache_chunks: 1024
96
  num_iters_per_epoch: 800
 
111
  reduction_factor: 1
112
  pitch_normalize: global_mvn
113
  pitch_normalize_conf:
114
+ stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/pitch_stats.npz
115
  pretrain_path: null
116
  print_config: false
117
  required:
hindi_latest/male/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81790566fd4660ca46c4692f6fbdb30d8e5c9cd657084a0eb40c804e6ec2b9ab
3
+ size 152128410
main_ov.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text_preprocess_for_inference import TTSDurAlignPreprocessor, CharTextPreprocessor, TTSPreprocessor
2
+ from espnet2.bin.tts_inference import Text2Speech
3
+ from scipy.io.wavfile import write
4
+ import json
5
+ import torch
6
+ import yaml
7
+ import sys
8
+ from utilities import SAMPLING_RATE, WARMUP_PARAGRAPHS
9
+ from datetime import datetime
10
+ import os
11
+ import time
12
+ import numpy as np
13
+ import openvino as ov
14
+
15
+ sys.path.append(os.getenv("HIFIGAN_PATH", f"hifigan"))
16
+ from hifigan.env import AttrDict
17
+ from hifigan.models import Generator
18
+ from hifigan.meldataset import MAX_WAV_VALUE
19
+ import torch.nn.functional as F
20
+
21
+ import nltk
22
+ nltk.download('averaged_perceptron_tagger_eng')
23
+
24
+ device = "cuda" if torch.cuda.is_available() else "cpu"
25
+ MAX_DEFAULT_VALUE = 600
26
+
27
+
28
def load_hifigan_vocoder(language: str, gender: str, device: str, dtype: str = "float32"):
    """
    Build the HiFi-GAN generator for one language/gender pair.

    Reads ``vocoder/<gender>/<language>/config.json``, seeds torch with the
    configured seed, loads the ``generator`` checkpoint stored next to it,
    switches the network to eval mode and removes weight normalisation.

    Args:
        language: Directory name of the language under ``vocoder/<gender>/``.
        gender: Directory name of the voice under ``vocoder/``.
        device: Torch device string the generator is moved to.
        dtype: ``"bfloat16"`` casts the generator to bfloat16; any other
            value leaves it as loaded.

    Returns:
        The ready-to-use generator module.

    Raises:
        FileNotFoundError: If the config or the checkpoint is missing.
    """
    vocoder_dir = f"vocoder/{gender}/{language}"
    vocoder_config = f"{vocoder_dir}/config.json"
    vocoder_generator = f"{vocoder_dir}/generator"

    if not (os.path.exists(vocoder_config) and os.path.exists(vocoder_generator)):
        raise FileNotFoundError(
            f"Vocoder files not found. Expected config: {vocoder_config}, generator: {vocoder_generator}")

    with open(vocoder_config, 'r') as config_file:
        hparams = AttrDict(json.load(config_file))

    torch.manual_seed(hparams.seed)
    target_device = torch.device(device)

    generator = Generator(hparams).to(target_device)
    checkpoint = torch.load(vocoder_generator, map_location=target_device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()

    return generator.to(torch.bfloat16) if dtype == "bfloat16" else generator
55
+
56
+
57
def load_fastspeech2_model(language: str, gender: str, device: str, dtype: str = "float32"):
    """
    Load the FastSpeech2 model for one language/gender pair.

    The training config stored next to the model references its normalisation
    statistics by absolute path. Those paths are rewritten to point at the
    ``*_stats.npz`` files in the local model directory. The patched config is
    written to a throw-away temporary file, so the ``config.yaml`` that ships
    with the model is never modified (previously it was overwritten in place,
    leaving machine-specific paths in the repository).

    Args:
        language: Directory name of the language (e.g. ``"hindi_latest"``).
        gender: Directory name of the voice under ``<language>/``.
        device: Device string forwarded to ``Text2Speech``.
        dtype: ``"bfloat16"`` casts the acoustic model to bfloat16; any other
            value leaves it as loaded.

    Returns:
        A ``Text2Speech`` instance with its vocoder disabled.

    Raises:
        FileNotFoundError: If the config or the model checkpoint is missing.
    """
    # Local import: only this function needs it.
    import tempfile

    config_path = f"{language}/{gender}/model/config.yaml"
    tts_model_path = f"{language}/{gender}/model/model.pth"

    if not os.path.exists(config_path) or not os.path.exists(tts_model_path):
        raise FileNotFoundError(
            f"FastSpeech2 model files not found. Expected config: {config_path}, model: {tts_model_path}")

    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # Resolve the statistics files relative to the current working directory,
    # exactly like the relative model paths above.
    model_dir = os.path.abspath(os.path.join(language, gender, "model"))
    config["normalize_conf"]["stats_file"] = os.path.join(model_dir, "feats_stats.npz")
    config["pitch_normalize_conf"]["stats_file"] = os.path.join(model_dir, "pitch_stats.npz")
    config["energy_normalize_conf"]["stats_file"] = os.path.join(model_dir, "energy_stats.npz")

    # delete=False so the file can be re-opened by name after it is closed
    # (required on Windows); it is removed explicitly below.
    patched_config = tempfile.NamedTemporaryFile(
        "w", suffix=".yaml", delete=False, encoding="utf-8")
    try:
        with patched_config:
            yaml.dump(config, patched_config)
        model = Text2Speech(train_config=patched_config.name, model_file=tts_model_path,
                            device=device, vocoder_config=None, vocoder_file=None)
    finally:
        try:
            os.remove(patched_config.name)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not mask the
            # real outcome of model loading.
            pass

    model.vocoder = None

    if dtype == "bfloat16":
        model.model = model.model.to(torch.bfloat16)

    return model
98
+
99
+
100
def split_into_chunks(text: str, words_per_chunk: int = 100) -> list[str]:
    """Split text into space-joined chunks of at most ``words_per_chunk`` words.

    Words are whitespace-separated tokens; runs of whitespace are collapsed.

    Args:
        text: Input text.
        words_per_chunk: Maximum number of words per chunk; must be >= 1.

    Returns:
        The chunks in order. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``words_per_chunk`` is smaller than 1. A negative value
            previously returned ``[]`` and silently discarded the text.
    """
    if words_per_chunk < 1:
        raise ValueError(f"words_per_chunk must be >= 1, got {words_per_chunk}")

    words = text.split()
    return [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]
106
+
107
+
108
class Text2SpeechApp:
    """Text-to-speech pipeline for one language.

    For each of the ``male`` and ``female`` voices it loads a FastSpeech2
    acoustic model and a HiFi-GAN vocoder; the vocoder is converted to and
    compiled with OpenVINO for CPU. Voices that fail to load are skipped and
    therefore absent from ``supported_genders``.
    """

    # Waveform samples taken per mel frame: the product of the factors the
    # original code documents as "[8,8,8,2]".
    _SAMPLES_PER_FRAME = 8 * 8 * 8 * 2

    def __init__(self, language: str, batch_size: int = 1, alpha: float = 1, dtype: str = "bfloat16"):
        """Load all available voices for ``language`` and run a warmup.

        Args:
            language: Language name; models are read from ``<language>_latest``.
            batch_size: Stored on the instance; not used by the methods here.
            alpha: Value passed to FastSpeech2 as ``decode_conf["alpha"]``.
            dtype: Forwarded to the model loaders.
        """
        self.alpha = alpha
        self.lang = language
        self.batch_size = batch_size
        self.dtype = dtype
        self.vocoder_model = {}
        self.fastspeech2_model = {}
        self.supported_genders = []

        self.preprocessor = TTSDurAlignPreprocessor()

        for gender in ("male", "female"):
            try:
                generator = load_hifigan_vocoder(
                    f"{language}_latest", gender, device, self.dtype)
                with torch.no_grad():
                    ov_model = ov.convert_model(
                        generator, example_input=torch.ones([1, 160, MAX_DEFAULT_VALUE]))
                compiled_vocoder = ov.compile_model(ov_model, device_name="CPU")
                print(
                    f"Loaded HiFi-GAN vocoder for {language}-{gender}")

                acoustic_model = load_fastspeech2_model(
                    f"{language}_latest", gender, device, self.dtype)
                print(
                    f"Loaded FastSpeech2 model for {language}-{gender}")
            except FileNotFoundError as e:
                print(
                    f"Error loading model for {language}-{gender}: {e}. This model key will not be available.")
                continue
            except Exception as e:
                print(
                    f"An unexpected error occurred while loading model for {language}-{gender}: {e}. This model key will not be available.")
                continue

            # Register the voice only once both halves loaded, so the model
            # dicts never hold a half-initialised gender.
            self.vocoder_model[gender] = compiled_vocoder
            self.fastspeech2_model[gender] = acoustic_model
            self.supported_genders.append(gender)
        self.warmup()

    def pre_print(self, print_str: str):
        """Print ``print_str`` framed by two separator lines."""
        print("=================================================")
        print(print_str)
        print("=================================================")

    def warmup(self):
        """Synthesize the language's warmup paragraph twice per loaded voice.

        Output goes to ``./warmup_outputs``. A failure for one voice is
        reported and does not stop the warmup of the other.
        """
        self.pre_print("TTS Warming up!")

        lang = self.lang.lower()
        text = WARMUP_PARAGRAPHS.get(lang)

        if not text:
            print(f"No warmup paragraph available for language: {lang}")
            return

        # Ensure warmup output directory exists
        output_dir = "./warmup_outputs"
        os.makedirs(output_dir, exist_ok=True)

        print(f"Running warmup for language: {lang}")
        print(f"Warmup text length: {len(text.split())} words")

        total_start_time = time.time()

        for gender in ["male", "female"]:
            if gender not in self.fastspeech2_model:
                print(f"Skipping warmup for {gender} - model not loaded.")
                continue

            print(f"Starting warmup for {lang}-{gender}")
            try:
                gender_start_time = time.time()
                for i in range(2):  # Run twice; adjust as needed
                    print(f"Warmup iteration {i + 1} for {gender}")
                    time_taken, _ = self.convert_and_save(
                        text=text,
                        speaker_gender=gender,
                        output_file_dir=output_dir
                    )
                    print(f"Iteration {i + 1} for {gender} completed in {time_taken:.2f} seconds")
                gender_total_time = time.time() - gender_start_time
                print(f"Total warmup time for {gender}: {gender_total_time:.2f} seconds")
            except Exception as e:
                print(f"Warmup failed for {lang}-{gender}: {e}")

        total_time = time.time() - total_start_time
        print(f"Total TTS warmup completed in {total_time:.2f} seconds")
        self.pre_print("TTS Warming finished!")

    def save_to_file(self, audio_arr, file_path):
        """Write ``audio_arr`` as a WAV file at ``SAMPLING_RATE``."""
        write(file_path, SAMPLING_RATE, audio_arr)
        print(f"Audio saved to {file_path}")

    @staticmethod
    def _to_int16(audio):
        """Convert a waveform to int16, clipping instead of wrapping around."""
        return np.clip(np.asarray(audio), -32768, 32767).astype(np.int16)

    def _text_to_mel(self, text: str, speaker_gender: str):
        """Preprocess ``text`` and run FastSpeech2; return the scaled mel tensor."""
        preprocessed_text, _ = self.preprocessor.preprocess(
            text, self.lang, speaker_gender)
        preprocessed_text = " ".join(preprocessed_text)

        with torch.no_grad():
            # Generate mel-spectrograms
            out = self.fastspeech2_model[speaker_gender](preprocessed_text,
                                                         decode_conf={"alpha": self.alpha})
            return out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262

    def _vocode(self, mel, speaker_gender: str):
        """Turn a mel tensor into a waveform scaled by ``MAX_WAV_VALUE``.

        The compiled vocoder is always fed windows of exactly
        ``MAX_DEFAULT_VALUE`` frames. Longer inputs are processed window by
        window and concatenated; previously the negative pad amount cropped
        them, silently truncating the audio. The last window is padded and
        its output trimmed back to the real number of frames.
        """
        vocoder = self.vocoder_model[speaker_gender]
        total_frames = mel.shape[-1]
        pieces = []
        for start in range(0, total_frames, MAX_DEFAULT_VALUE):
            window = mel[..., start:start + MAX_DEFAULT_VALUE]
            frames = window.shape[-1]
            window = F.pad(window, (0, MAX_DEFAULT_VALUE - frames), value=-12)
            y_g_hat = vocoder(window)
            trim_length = self._SAMPLES_PER_FRAME * frames
            pieces.append(np.asarray(y_g_hat[0][0][0][:trim_length]))

        if not pieces:
            return np.zeros(0, dtype=np.float32)
        return np.concatenate(pieces, axis=0) * MAX_WAV_VALUE

    def convert_and_save(self, text: str, speaker_gender="male", output_file_dir: str = "./outputs"):
        """Synthesize ``text`` in 100-word chunks and save one WAV file.

        Args:
            text: Text to synthesize.
            speaker_gender: Voice to use.
            output_file_dir: Directory for the WAV file; created if missing.

        Returns:
            ``(seconds_taken, output_file_path)``.

        Raises:
            ValueError: If ``text`` contains no words.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        output_file = f"{output_file_dir}/{self.lang}_{speaker_gender}_{timestamp}.wav"

        start = time.time()
        result_chunks = split_into_chunks(text)
        if not result_chunks:
            raise ValueError("text is empty; nothing to synthesize")

        os.makedirs(output_file_dir, exist_ok=True)

        # Same vocoder path as generate_audio_bytes: the compiled OpenVINO
        # model returns arrays, not torch tensors.
        audio_arr = [
            self._to_int16(self._vocode(self._text_to_mel(chunk_text, speaker_gender),
                                        speaker_gender))
            for chunk_text in result_chunks
        ]

        result_array = np.concatenate(audio_arr, axis=0)
        self.save_to_file(audio_arr=result_array, file_path=output_file)
        time_taken = time.time() - start

        return time_taken, output_file

    def generate_audio_bytes(self, text: str, speaker_gender="male", save_file: bool = False):
        """Synthesize ``text`` and return the waveform.

        Args:
            text: Text to synthesize.
            speaker_gender: Voice to use.
            save_file: Accepted for compatibility; nothing is written here.

        Returns:
            The waveform scaled by ``MAX_WAV_VALUE`` (not yet cast to int16).
        """
        mel = self._text_to_mel(text, speaker_gender)
        # Convert mel-spectrograms to raw audio waveforms
        return self._vocode(mel, speaker_gender)

    def evaluate_performance(self, input_sentences: list, save_file: bool = False):
        """Time synthesis of each sentence, optionally saving the results.

        Args:
            input_sentences: Sentences to synthesize one by one.
            save_file: When true, store each result as ``.npy`` and ``.wav``
                under ``audios_<dtype>/``.

        Returns:
            Total synthesis time in seconds over all sentences (``0.0`` for an
            empty list). File saving is not included in the timing.
        """
        total_sentences = len(input_sentences)
        print(f"\nTotal T2S to be done: {total_sentences}\n")
        total_time = 0.0
        for i, sentence in enumerate(input_sentences):
            start_time = time.perf_counter()
            audio = self.generate_audio_bytes(text=sentence)
            time_taken = time.perf_counter() - start_time
            total_time += time_taken
            print("=================================================")
            print(f"Sentence {i + 1}/{total_sentences}:{sentence} processed in {time_taken:.2f} seconds")

            if save_file:
                os.makedirs(f"audios_{self.dtype}/numpy_files", exist_ok=True)
                os.makedirs(f"audios_{self.dtype}/audio_files", exist_ok=True)

                output_file = f"audios_{self.dtype}/numpy_files/file_{i}.npy"

                audio = self._to_int16(audio)
                np.save(output_file, audio)

                audio_file_path = f"audios_{self.dtype}/audio_files/file_{i}.wav"
                with open(audio_file_path, "wb") as f:
                    write(f, SAMPLING_RATE, audio)
                print(f"Audio saved to {audio_file_path}")

        return total_time

    def save_to_files(self, byte_ios, file_prefix: str) -> list[str]:
        """Write each readable buffer to ``<prefix>_<timestamp>_<n>.wav``.

        Args:
            byte_ios: Objects with a ``read()`` method returning bytes.
            file_prefix: Path prefix for the generated file names.

        Returns:
            The written file paths, in input order.
        """
        file_paths = []
        for index, byte_io in enumerate(byte_ios, start=1):
            timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            file_path = f"{file_prefix}_{timestamp}_{index}.wav"
            file_paths.append(file_path)
            with open(file_path, "wb") as f:
                f.write(byte_io.read())
            print(f"Audio saved to {file_path}")
        return file_paths

    def batch_convert_and_save(self, input_sentences: list[str], speaker_gender="male", output_file_dir: str = "./outputs"):
        """Synthesize all sentences as one paragraph into a single WAV file.

        Returns:
            ``(seconds_taken, [output_file_path])``.
        """
        start_time = time.time()
        output_file_paths = []
        total_sentences = len(input_sentences)
        os.makedirs(output_file_dir, exist_ok=True)

        print(f"Total T2S to be done: {total_sentences}\n")
        # Join with a space so the last word of one sentence does not fuse
        # with the first word of the next.
        combined_para = ' '.join(input_sentences)
        paragraph_time, output_path = self.convert_and_save(
            combined_para, speaker_gender=speaker_gender, output_file_dir=output_file_dir)
        print(f"Paragraph Time: {paragraph_time}\n")
        output_file_paths.append(output_path)

        time_taken = time.time() - start_time
        return time_taken, output_file_paths
320
+
321
if __name__ == "__main__":
    # Benchmark entry point: load the models, synthesize a few sample
    # sentences and report the total and average time.
    import argparse

    parser = argparse.ArgumentParser(description="Text to Speech benchmarking")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size for TTS inference")
    parser.add_argument("--language", type=str, default="hindi", help="Language for TTS")
    parser.add_argument("--alpha", type=float, default=1.0, help="Alpha value for FastSpeech2 decoding")
    parser.add_argument("--dtype", type=str, default="float32", help="Data type for model inference")
    args = parser.parse_args()

    # Honour the command-line options; the defaults match the values that
    # used to be hard-coded here (batch size 1, hindi, alpha 1).
    tts = Text2SpeechApp(batch_size=args.batch_size, alpha=args.alpha,
                         language=args.language, dtype=args.dtype)
    st = time.perf_counter()
    # NOTE: the sample sentences are Hindi regardless of --language.
    texts = [
        "जीवन में सफलता पाने के लिए केवल सपने देखना ही नहीं, बल्कि उन्हें पूरा करने के लिए निरंतर प्रयास और आत्मविश्वास भी ज़रूरी होता है।",
        "कठिन परिस्थितियाँ हमें तोड़ने नहीं आतीं, बल्कि हमें मज़बूत बनाकर जीवन के असली अर्थ से परिचित कराती हैं।",
        "सकारात्मक सोच और सही दृष्टिकोण के साथ किया गया हर छोटा प्रयास भी एक दिन बड़ी उपलब्धि में बदल जाता है।",
        "जब हम निस्वार्थ भाव से दूसरों की मदद करते हैं, तब हमारे अपने जीवन में भी शांति और संतुलन अपने आप आ जाता है।"
    ]

    tts.evaluate_performance(texts, save_file=True)
    et = time.perf_counter()
    print(f"Total time for evaluating {len(texts)} sentences: {et - st:.2f} seconds")
    print(f"Average time per sentence: {(et - st)/len(texts):.2f} seconds")
server.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import io
4
+ import logging
5
+ import os
6
+ import time
7
+
8
+ import numpy as np
9
+ from fastapi import FastAPI, HTTPException
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel
12
+ from scipy.io.wavfile import write as wav_write
13
+
14
+ from main_ov import Text2SpeechApp
15
+ from utilities import SAMPLING_RATE, SUPPORTED_OUTPUT_LANGS
16
+
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # --- Language code mapping (Bhashini 2-letter <-> full name) ---
21
+
22
+ LANG_CODE_TO_NAME = {
23
+ "hi": "hindi",
24
+ "ta": "tamil",
25
+ "te": "telugu",
26
+ "kn": "kannada",
27
+ "ml": "malayalam",
28
+ "pa": "punjabi",
29
+ "bn": "bengali",
30
+ }
31
+ LANG_NAME_TO_CODE = {v: k for k, v in LANG_CODE_TO_NAME.items()}
32
+
33
+ # --- Pydantic models for Bhashini pipeline request/response ---
34
+
35
+
36
+ class LanguageConfig(BaseModel):
37
+ sourceLanguage: str
38
+ sourceScriptCode: str | None = None
39
+ targetLanguage: str | None = None
40
+
41
+
42
+ class TaskConfig(BaseModel):
43
+ language: LanguageConfig
44
+ serviceId: str | None = None
45
+ gender: str = "female"
46
+ samplingRate: int = 48000
47
+
48
+
49
+ class PipelineTask(BaseModel):
50
+ taskType: str
51
+ config: TaskConfig
52
+
53
+
54
+ class InputItem(BaseModel):
55
+ source: str
56
+
57
+
58
+ class InputData(BaseModel):
59
+ input: list[InputItem] | None = None
60
+
61
+
62
+ class PipelineRequest(BaseModel):
63
+ pipelineTasks: list[PipelineTask]
64
+ inputData: InputData
65
+
66
+
67
+ class AudioItem(BaseModel):
68
+ audioContent: str | None = None
69
+ audioUri: str | None = None
70
+
71
+
72
+ class ResponseConfig(BaseModel):
73
+ audioFormat: str = "wav"
74
+ language: LanguageConfig
75
+ encoding: str = "base64"
76
+ samplingRate: int = 48000
77
+
78
+
79
+ class PipelineResponseItem(BaseModel):
80
+ taskType: str
81
+ config: ResponseConfig
82
+ output: list | None = None
83
+ audio: list[AudioItem] | None = None
84
+ metrics: dict | None = None
85
+
86
+
87
+ class PipelineResponse(BaseModel):
88
+ pipelineResponse: list[PipelineResponseItem]
89
+
90
+
91
+ class SimpleTtsRequest(BaseModel):
92
+ text: str
93
+ language: str = "hi"
94
+ gender: str = "female"
95
+ samplingRate: int = 48000
96
+
97
+
98
+ # --- App setup ---
99
+
100
+ app = FastAPI(title="FastSpeech2 TTS API (Bhashini-compatible)")
101
+ app.add_middleware(
102
+ CORSMiddleware,
103
+ allow_origins=["*"],
104
+ allow_credentials=True,
105
+ allow_methods=["*"],
106
+ allow_headers=["*"],
107
+ )
108
+
109
+ # Dict of language_name -> Text2SpeechApp instance
110
+ tts_engines: dict[str, Text2SpeechApp] = {}
111
+
112
+
113
@app.on_event("startup")
def load_models():
    """Load TTS models for all configured languages at startup.

    Each name in ``SUPPORTED_OUTPUT_LANGS`` is normalised (trimmed,
    lower-cased) and skipped when it has no entry in ``LANG_NAME_TO_CODE``.
    A language is registered in ``tts_engines`` only when at least one voice
    loaded: ``Text2SpeechApp`` reports per-voice load failures itself instead
    of raising, so an engine can come back with no usable voice.
    """
    logger.info(f"SUPPORTED_OUTPUT_LANGS: {SUPPORTED_OUTPUT_LANGS}")
    logger.info(f"LANG_CODE_TO_NAME: {LANG_CODE_TO_NAME}")
    for lang_name in SUPPORTED_OUTPUT_LANGS:
        lang_name = lang_name.strip().lower()
        if lang_name not in LANG_NAME_TO_CODE:
            logger.warning(f"Unknown language '{lang_name}' in LANGUAGES env var, skipping.")
            continue
        logger.info(f"Loading TTS models for '{lang_name}'...")
        try:
            engine = Text2SpeechApp(language=lang_name, dtype=os.getenv("TTS_DTYPE", "float32"))
        except Exception:
            # logger.exception records the message and the traceback in one
            # entry; startup continues with the remaining languages.
            logger.exception(f"✗ Failed to load models for '{lang_name}'")
            continue

        if not engine.supported_genders:
            logger.error(f"✗ No voices could be loaded for '{lang_name}', skipping.")
            continue

        tts_engines[lang_name] = engine
        logger.info(f"✓ Successfully loaded '{lang_name}' with genders: {engine.supported_genders}")
    logger.info(f"Final loaded languages: {list(tts_engines.keys())}")
131
+
132
+
133
def _synthesize(tts_app: Text2SpeechApp, text: str, gender: str, requested_sr: int) -> tuple[str, float]:
    """Run TTS inference and return base64-encoded WAV string and audio duration in seconds.

    Args:
        tts_app: Engine used for synthesis.
        text: Text to synthesize.
        gender: Voice to use.
        requested_sr: Sampling rate of the returned audio; the audio is
            resampled when it differs from ``SAMPLING_RATE``.

    Returns:
        ``(base64_wav, duration_seconds)``.

    Raises:
        ValueError: If ``requested_sr`` is not positive.
    """
    if requested_sr <= 0:
        raise ValueError(f"requested_sr must be positive, got {requested_sr}")

    audio_tensor = tts_app.generate_audio_bytes(text=text, speaker_gender=gender)

    # Convert to int16 numpy. Clip first: a plain cast wraps around, turning
    # a slightly too loud peak into a full-scale click.
    if hasattr(audio_tensor, "numpy"):
        raw_audio = audio_tensor.numpy()
    else:
        raw_audio = np.asarray(audio_tensor)
    audio_np = np.clip(raw_audio, -32768, 32767).astype(np.int16)

    # Resample if requested rate differs from native rate
    output_sr = SAMPLING_RATE
    if requested_sr != SAMPLING_RATE:
        import librosa
        audio_float = audio_np.astype(np.float32) / 32768.0
        audio_float = librosa.resample(audio_float, orig_sr=SAMPLING_RATE, target_sr=requested_sr)
        # Resampling can overshoot the [-1, 1] range, so clip again.
        audio_np = np.clip(audio_float * 32768.0, -32768, 32767).astype(np.int16)
        output_sr = requested_sr

    # Write WAV to in-memory buffer
    buf = io.BytesIO()
    wav_write(buf, output_sr, audio_np)
    wav_bytes = buf.getvalue()
    audio_duration_s = float(len(audio_np) / output_sr)

    return base64.b64encode(wav_bytes).decode("ascii"), audio_duration_s
159
+
160
+
161
def _resolve_tts_engine(lang_code: str, gender: str) -> tuple[str, Text2SpeechApp, str]:
    """Map a language code and gender to a loaded engine.

    Both inputs are lower-cased before lookup.

    Returns:
        ``(language_name, engine, normalised_gender)``.

    Raises:
        HTTPException: Status 400 when the code is unknown, the language is
            not loaded, or the gender is not available for that language.
    """
    lang_code = lang_code.lower()
    lang_name = LANG_CODE_TO_NAME.get(lang_code)
    if not lang_name:
        raise HTTPException(status_code=400, detail=f"Unsupported language code: '{lang_code}'")

    engine = tts_engines.get(lang_name)
    if engine is None:
        raise HTTPException(status_code=400, detail=f"Language '{lang_name}' not loaded. Available: {list(tts_engines.keys())}")

    resolved_gender = gender.lower()
    if resolved_gender in engine.supported_genders:
        return lang_name, engine, resolved_gender

    raise HTTPException(
        status_code=400,
        detail=f"Gender '{resolved_gender}' not available for '{lang_name}'. Available: {engine.supported_genders}"
    )
180
+
181
+
182
@app.post("/services/inference/pipeline", response_model=PipelineResponse)
async def inference_pipeline(request: PipelineRequest):
    """Synthesize every input text of a pipeline request.

    Only the first entry of ``pipelineTasks`` is executed, and it must be a
    ``tts`` task. Each input item yields one base64-encoded WAV in the
    response, together with latency, audio duration and real-time factor.

    Raises:
        HTTPException: Status 400 for an empty task list, a non-``tts`` task,
            an unknown or unloaded language/gender, a non-positive sampling
            rate, or missing/blank input text.
    """
    t_start = time.perf_counter()
    if not request.pipelineTasks:
        raise HTTPException(status_code=400, detail="pipelineTasks is empty")

    task = request.pipelineTasks[0]

    if task.taskType != "tts":
        raise HTTPException(status_code=400, detail=f"Unsupported taskType: '{task.taskType}'. Only 'tts' is supported.")

    # Resolve language
    lang_code = task.config.language.sourceLanguage
    _, tts_app, gender = _resolve_tts_engine(lang_code, task.config.gender)

    requested_sr = task.config.samplingRate
    if requested_sr <= 0:
        # Reject here with a client error instead of failing during synthesis.
        raise HTTPException(status_code=400, detail=f"samplingRate must be a positive integer, got {requested_sr}")

    # Validate input
    if not request.inputData.input:
        raise HTTPException(status_code=400, detail="inputData.input is empty")

    # Same rule as the /tts endpoint: surrounding whitespace is dropped and
    # blank text is a client error.
    sentences = [item.source.strip() for item in request.inputData.input]
    if not all(sentences):
        raise HTTPException(status_code=400, detail="inputData.input contains an empty source text")

    # Process all input texts and collect audio
    audio_items = []
    total_audio_duration_s = 0.0
    for sentence in sentences:
        # Inference is blocking; run it in a worker thread.
        b64_audio, audio_duration_s = await asyncio.to_thread(_synthesize, tts_app, sentence, gender, requested_sr)
        total_audio_duration_s += audio_duration_s
        audio_items.append(AudioItem(audioContent=b64_audio, audioUri=None))

    latency_ms = round((time.perf_counter() - t_start) * 1000, 2)
    rtf = round((latency_ms / 1000) / total_audio_duration_s, 4) if total_audio_duration_s > 0 else 0.0

    response = PipelineResponse(
        pipelineResponse=[
            PipelineResponseItem(
                taskType="tts",
                config=ResponseConfig(
                    audioFormat="wav",
                    language=LanguageConfig(sourceLanguage=lang_code, sourceScriptCode=""),
                    encoding="base64",
                    samplingRate=requested_sr,
                ),
                output=None,
                audio=audio_items,
                metrics={
                    "latency_ms": latency_ms,
                    "audio_duration_s": round(total_audio_duration_s, 3),
                    "rtf": rtf,
                },
            )
        ]
    )
    return response
235
+
236
+
237
@app.post("/tts")
async def tts_compat(request: SimpleTtsRequest):
    """Compatibility endpoint for clients calling /tts on port 5000.

    Synthesizes ``request.text`` and returns the base64-encoded WAV together
    with latency, audio duration and real-time factor.

    Raises:
        HTTPException: Status 400 for blank text, a non-positive sampling
            rate, or an unknown or unloaded language/gender.
    """
    sentence = request.text.strip()
    if not sentence:
        raise HTTPException(status_code=400, detail="text is empty")

    if request.samplingRate <= 0:
        # Reject here with a client error instead of failing during synthesis.
        raise HTTPException(status_code=400, detail=f"samplingRate must be a positive integer, got {request.samplingRate}")

    lang_code = request.language
    _, tts_app, gender = _resolve_tts_engine(lang_code, request.gender)

    t_start = time.perf_counter()
    # Inference is blocking; run it in a worker thread.
    b64_audio, audio_duration_s = await asyncio.to_thread(_synthesize, tts_app, sentence, gender, request.samplingRate)
    latency_ms = round((time.perf_counter() - t_start) * 1000, 2)
    rtf = round((latency_ms / 1000) / audio_duration_s, 4) if audio_duration_s > 0 else 0.0

    return {
        "audioContent": b64_audio,
        "audioFormat": "wav",
        "encoding": "base64",
        "samplingRate": request.samplingRate,
        "metrics": {
            "latency_ms": latency_ms,
            "audio_duration_s": round(audio_duration_s, 3),
            "rtf": rtf,
        },
    }
263
+
264
+
265
@app.get("/health")
def health():
    """Report service status, the loaded languages with their voices, and all known language names."""
    loaded_langs = {}
    for lang, engine in tts_engines.items():
        loaded_langs[lang] = engine.supported_genders

    available = list(LANG_CODE_TO_NAME.values())
    return {"status": "ok", "loadedLanguages": loaded_langs, "availableLanguages": available}
start_server.bat ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ @echo off
2
+ set PYTHONUTF8=1
3
+ set TTS_DTYPE=float32
4
+ set LANGUAGES=hindi,punjabi,tamil
5
+ uvicorn server:app --host 0.0.0.0 --port 5000
6
+ pause
test_tts.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import base64
4
+
5
import os

# Smoke test: request Hindi speech from the pipeline endpoint and store the
# returned audio as test_output.wav.
# start_server.bat serves the API on port 5000; set TTS_SERVER_URL to target
# a server running elsewhere.
base_url = os.getenv("TTS_SERVER_URL", "http://localhost:5000")
url = f"{base_url.rstrip('/')}/services/inference/pipeline"

payload = {
    "pipelineTasks": [
        {
            "taskType": "tts",
            "config": {
                "language": {"sourceLanguage": "hi"},
                "gender": "female",
                "samplingRate": 48000,
            },
        }
    ],
    "inputData": {
        "input": [
            {"source": "मेरा नाम कशा है"}
        ]
    },
}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.post(url, json=payload, timeout=120)
print(f"Status: {response.status_code}")

if response.status_code == 200:
    data = response.json()
    audio_b64 = data["pipelineResponse"][0]["audio"][0]["audioContent"]
    audio_bytes = base64.b64decode(audio_b64)
    with open("test_output.wav", "wb") as f:
        f.write(audio_bytes)
    print(f"Audio saved to test_output.wav ({len(audio_bytes)} bytes)")
else:
    print(f"Error: {response.text}")
text_preprocess_for_inference.py CHANGED
@@ -1,949 +1,971 @@
1
- '''
2
- TTS Preprocessing
3
- Developed by Arun Kumar A(CS20S013) - November 2022
4
- Code Changes by Utkarsh - 2023
5
- '''
6
- import os
7
- import re
8
- import json
9
- import pandas as pd
10
- import string
11
- from collections import defaultdict
12
- import time
13
- import subprocess
14
- import shutil
15
- from multiprocessing import Process
16
- import traceback
17
-
18
- #imports of dependencies from environment.yml
19
- from num_to_words import num_to_word
20
- from g2p_en import G2p
21
-
22
- def add_to_dictionary(dict_to_add, dict_file):
23
- append_string = ""
24
- for key, value in dict_to_add.items():
25
- append_string += (str(key) + " " + str(value) + "\n")
26
-
27
- if os.path.isfile(dict_file):
28
- # make a copy of the dictionary
29
- source_dir = os.path.dirname(dict_file)
30
- dict_file_name = os.path.basename(dict_file)
31
- temp_file_name = "." + dict_file_name + ".temp"
32
- temp_dict_file = os.path.join(source_dir, temp_file_name)
33
- shutil.copy(dict_file, temp_dict_file)
34
- # append the new words in the dictionary to the temp file
35
- with open(temp_dict_file, "a") as f:
36
- f.write(append_string)
37
- # check if the write is successful and then replace the temp file as the dict file
38
- try:
39
- df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
40
- df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
41
- if len(df_temp) > len(df_orig):
42
- os.rename(temp_dict_file, dict_file)
43
- print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
44
- except:
45
- print(traceback.format_exc())
46
- else:
47
- # create a new dictionary
48
- with open(dict_file, "a") as f:
49
- f.write(append_string)
50
- print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
51
-
52
-
53
- class TextCleaner:
54
- def __init__(self):
55
- # this is a static set of cleaning rules to be applied
56
- self.cleaning_rules = {
57
- " +" : " ",
58
- "^ +" : "",
59
- " +$" : "",
60
- "#" : "",
61
- "[.,;।!](\r\n)*" : "# ",
62
- "[.,;।!](\n)*" : "# ",
63
- "(\r\n)+" : "# ",
64
- "(\n)+" : "# ",
65
- "(\r)+" : "# ",
66
- """[?;:)(!|&’‘,।\."]""": "",
67
- "[/']" : "",
68
- "[-–]" : " ",
69
- }
70
-
71
- def clean(self, text):
72
- for key, replacement in self.cleaning_rules.items():
73
- text = re.sub(key, replacement, text)
74
- return text
75
-
76
- def clean_list(self, text):
77
- # input is supposed to be a list of strings
78
- output_text = []
79
- for line in text:
80
- line = line.strip()
81
- for key, replacement in self.cleaning_rules.items():
82
- line = re.sub(key, replacement, line)
83
- output_text.append(line)
84
- return output_text
85
-
86
-
87
- class Phonifier:
88
- def __init__(self, dict_location=None):
89
- if dict_location is None:
90
- dict_location = "phone_dict"
91
- self.dict_location = dict_location
92
-
93
- # self.phone_dictionary = {}
94
- # # load dictionary for all the available languages
95
- # for dict_file in os.listdir(dict_location):
96
- # try:
97
- # if dict_file.startswith("."):
98
- # # ignore hidden files
99
- # continue
100
- # language = dict_file
101
- # dict_file_path = os.path.join(dict_location, dict_file)
102
- # df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
103
- # self.phone_dictionary[language] = df.set_index(0).to_dict('dict')[1]
104
- # except Exception as e:
105
- # print(traceback.format_exc())
106
-
107
- # print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))
108
-
109
- self.g2p = G2p()
110
- print('Loading G2P model... Done!')
111
- # Mapping between the cmu phones and the iitm cls
112
- self.cmu_2_cls_map = {
113
- "AA" : "aa",
114
- "AA0" : "aa",
115
- "AA1" : "aa",
116
- "AA2" : "aa",
117
- "AE" : "axx",
118
- "AE0" : "axx",
119
- "AE1" : "axx",
120
- "AE2" : "axx",
121
- "AH" : "a",
122
- "AH0" : "a",
123
- "AH1" : "a",
124
- "AH2" : "a",
125
- "AO" : "ax",
126
- "AO0" : "ax",
127
- "AO1" : "ax",
128
- "AO2" : "ax",
129
- "AW" : "ou",
130
- "AW0" : "ou",
131
- "AW1" : "ou",
132
- "AW2" : "ou",
133
- "AX" : "a",
134
- "AY" : "ei",
135
- "AY0" : "ei",
136
- "AY1" : "ei",
137
- "AY2" : "ei",
138
- "B" : "b",
139
- "CH" : "c",
140
- "D" : "dx",
141
- "DH" : "d",
142
- "EH" : "ee",
143
- "EH0" : "ee",
144
- "EH1" : "ee",
145
- "EH2" : "ee",
146
- "ER" : "a r",
147
- "ER0" : "a r",
148
- "ER1" : "a r",
149
- "ER2" : "a r",
150
- "EY" : "ee",
151
- "EY0" : "ee",
152
- "EY1" : "ee",
153
- "EY2" : "ee",
154
- "F" : "f",
155
- "G" : "g",
156
- "HH" : "h",
157
- "IH" : "i",
158
- "IH0" : "i",
159
- "IH1" : "i",
160
- "IH2" : "i",
161
- "IY" : "ii",
162
- "IY0" : "ii",
163
- "IY1" : "ii",
164
- "IY2" : "ii",
165
- "JH" : "j",
166
- "K" : "k",
167
- "L" : "l",
168
- "M" : "m",
169
- "N" : "n",
170
- "NG" : "ng",
171
- "OW" : "o",
172
- "OW0" : "o",
173
- "OW1" : "o",
174
- "OW2" : "o",
175
- "OY" : "ei",
176
- "OY0" : "ei",
177
- "OY1" : "ei",
178
- "OY2" : "ei",
179
- "P" : "p",
180
- "R" : "r",
181
- "S" : "s",
182
- "SH" : "sh",
183
- "T" : "tx",
184
- "TH" : "t",
185
- "UH" : "u",
186
- "UH0" : "u",
187
- "UH1" : "u",
188
- "UH2" : "u",
189
- "UW" : "uu",
190
- "UW0" : "uu",
191
- "UW1" : "uu",
192
- "UW2" : "uu",
193
- "V" : "w",
194
- "W" : "w",
195
- "Y" : "y",
196
- "Z" : "z",
197
- "ZH" : "sh",
198
- }
199
-
200
- # Mapping between the iitm cls and iitm char
201
- self.cls_2_chr_map = {
202
- "aa" : "A",
203
- "ii" : "I",
204
- "uu" : "U",
205
- "ee" : "E",
206
- "oo" : "O",
207
- "nn" : "N",
208
- "ae" : "",
209
- "ag" : "",
210
- "au" : "",
211
- "axx" : "",
212
- "ax" : "",
213
- "bh" : "B",
214
- "ch" : "C",
215
- "dh" : "",
216
- "dx" : "",
217
- "dxh" : "",
218
- "dxhq" : "T",
219
- "dxq" : "D",
220
- "ei" : "ऐ",
221
- "ai" : "ऐ",
222
- "eu" : "உ",
223
- "gh" : "",
224
- "gq" : "G",
225
- "hq" : "H",
226
- "jh" : "J",
227
- "kh" : "",
228
- "khq" : "K",
229
- "kq" : "",
230
- "ln" : "",
231
- "lw" : "",
232
- "lx" : "",
233
- "mq" : "M",
234
- "nd" : "",
235
- "ng" : "",
236
- "nj" : "",
237
- "nk" : "Y",
238
- "nw" : "",
239
- "nx" : "",
240
- "ou" : "",
241
- "ph" : "P",
242
- "rq" : "R",
243
- "rqw" : "",
244
- "rw" : "",
245
- "rx" : "",
246
- "sh" : "",
247
- "sx" : "",
248
- "th" : "",
249
- "tx" : "",
250
- "txh" : "",
251
- "wv" : "W",
252
- "zh" : "Z",
253
- }
254
-
255
- # Multilingual support for OOV characters
256
- oov_map_json_file = 'multilingualcharmap.json'
257
- with open(oov_map_json_file, 'r') as oov_file:
258
- self.oov_map = json.load(oov_file)
259
-
260
-
261
-
262
- def load_lang_dict(self, language, phone_dictionary):
263
- # load dictionary for requested language
264
- try:
265
-
266
- dict_file = language
267
- print("language", language)
268
- dict_file_path = os.path.join(self.dict_location, dict_file)
269
- print("dict_file_path", dict_file_path)
270
- df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
271
- phone_dictionary[language] = df.set_index(0).to_dict('dict')[1]
272
-
273
- dict_file = 'english'
274
- dict_file_path = os.path.join(self.dict_location, dict_file)
275
- df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
276
- phone_dictionary['english'] = df.set_index(0).to_dict('dict')[1]
277
-
278
- except Exception as e:
279
- print(traceback.format_exc())
280
-
281
- return phone_dictionary
282
-
283
- def __is_float(self, word):
284
- parts = word.split('.')
285
- if len(parts) != 2:
286
- return False
287
- return parts[0].isdecimal() and parts[1].isdecimal()
288
-
289
- def en_g2p(self, word):
290
- phn_out = self.g2p(word)
291
- # print(f"phn_out: {phn_out}")
292
- # iterate over the string list and replace each word with the corresponding value from the dictionary
293
- for i, phn in enumerate(phn_out):
294
- if phn in self.cmu_2_cls_map.keys():
295
- phn_out[i] = self.cmu_2_cls_map[phn]
296
- # cls_out = self.cmu_2_cls_map[phn]
297
- if phn_out[i] in self.cls_2_chr_map.keys():
298
- phn_out[i] = self.cls_2_chr_map[phn_out[i]]
299
- else:
300
- pass
301
- else:
302
- pass # ignore words that are not in the dictionary
303
- # print(f"i: {i}, phn: {phn}, cls_out: {cls_out}, phn_out: {phn_out[i]}")
304
- return ("".join(phn_out)).strip().replace(" ", "")
305
-
306
- def __post_phonify(self, text, language, gender):
307
- language_gender_id = language+'_'+gender
308
- if language_gender_id in self.oov_map.keys():
309
- output_string = ''
310
- for char in text:
311
- if char in self.oov_map[language_gender_id].keys():
312
- output_string += self.oov_map[language_gender_id][char]
313
- else:
314
- output_string += char
315
- # output_string += self.oov_map['language_gender_id']['char']
316
- return output_string
317
- else:
318
- return text
319
-
320
- def __is_english_word(self, word):
321
- maxchar = max(word)
322
- if u'\u0000' <= maxchar <= u'\u007f':
323
- return True
324
- return False
325
-
326
- def __phonify(self, text, language, gender, phone_dictionary):
327
- # text is expected to be a list of strings
328
- words = set((" ".join(text)).split(" "))
329
- #print(f"words test: {words}")
330
- non_dict_words = []
331
-
332
-
333
- if language in phone_dictionary:
334
- for word in words:
335
- # print(f"word: {word}")
336
- if word not in phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
337
- non_dict_words.append(word)
338
- #print('INSIDE IF CONDITION OF ADDING WORDS')
339
- else:
340
- non_dict_words = words
341
- print(f"word not in dict: {non_dict_words}")
342
-
343
- if len(non_dict_words) > 0:
344
- # unified parser has to be run for the non dictionary words
345
- os.makedirs("tmp", exist_ok=True)
346
- timestamp = str(time.time())
347
- non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
348
- out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
349
- with open(non_dict_words_file, "w") as f:
350
- f.write("\n".join(non_dict_words))
351
-
352
- if(language == 'tamil'):
353
- current_directory = os.getcwd()
354
- #tamil_parser_cmd = "tamil_parser.sh"
355
- tamil_parser_cmd = f"{current_directory}/ssn_parser_new/tamil_parser.py"
356
- #subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser"])
357
- subprocess.run(["python", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser_new"])
358
- elif(language == 'english'):
359
- phn_out_dict = {}
360
- for i in range(0,len(non_dict_words)):
361
- phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
362
- # Create a string representation of the dictionary
363
- data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
364
- print(f"data_str: {data_str}")
365
- with open(out_dict_file, "w") as f:
366
- f.write(data_str)
367
- else:
368
-
369
- out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
370
- from get_phone_mapped_python import TextReplacer
371
-
372
- from indic_unified_parser.uparser import wordparse
373
-
374
- text_replacer=TextReplacer()
375
- # def write_output_to_file(output_text, file_path):
376
- # with open(file_path, 'w') as f:
377
- # f.write(output_text)
378
- parsed_output_list = []
379
- for word in non_dict_words:
380
- parsed_word = wordparse(word, 0, 0, 1)
381
- parsed_output_list.append(parsed_word)
382
- replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
383
- with open(out_dict_file, 'w', encoding='utf-8') as file:
384
- for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
385
- line = f"{original_word}\t{formatted_word}\n"
386
- file.write(line)
387
- print(line, end='')
388
-
389
-
390
- try:
391
-
392
- df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
393
- #print('DATAFRAME OUTPUT FILE', df.head())
394
- new_dict = df.dropna().set_index(0).to_dict('dict')[1]
395
- #print("new dict",new_dict)
396
- if language not in phone_dictionary:
397
- phone_dictionary[language] = new_dict
398
- else:
399
- phone_dictionary[language].update(new_dict)
400
- # run a non-blocking child process to update the dictionary file
401
- #print("phone_dict", self.phone_dictionary)
402
- p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
403
- p.start()
404
- except Exception as err:
405
- print(f"Error: While loading {out_dict_file}")
406
- traceback.print_exc()
407
-
408
- # phonify text with dictionary
409
- text_phonified = []
410
- for phrase in text:
411
- phrase_phonified = []
412
- for word in phrase.split(" "):
413
- if self.__is_english_word(word):
414
- if word in phone_dictionary["english"]:
415
- phrase_phonified.append(str(phone_dictionary["english"][word]))
416
- else:
417
- phrase_phonified.append(str(self.en_g2p(word)))
418
- elif word in phone_dictionary[language]:
419
- # if a word could not be parsed, skip it
420
- phrase_phonified.append(str(phone_dictionary[language][word]))
421
- # text_phonified.append(self.__post_phonify(" ".join(phrase_phonified),language, gender))
422
- text_phonified.append(" ".join(phrase_phonified))
423
- return text_phonified
424
-
425
- def __merge_lists(self, lists):
426
- merged_string = ""
427
- for list in lists:
428
- for word in list:
429
- merged_string += word + " "
430
- return merged_string.strip()
431
-
432
- def __phonify_list(self, text, language, gender, phone_dictionary):
433
- # text is expected to be a list of list of strings
434
- words = set(self.__merge_lists(text).split(" "))
435
- non_dict_words = []
436
- if language in phone_dictionary:
437
- for word in words:
438
- if word not in phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
439
- non_dict_words.append(word)
440
- else:
441
- non_dict_words = words
442
-
443
- if len(non_dict_words) > 0:
444
- print(len(non_dict_words))
445
- print(non_dict_words)
446
- # unified parser has to be run for the non dictionary words
447
- os.makedirs("tmp", exist_ok=True)
448
- timestamp = str(time.time())
449
- non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
450
- out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
451
- with open(non_dict_words_file, "w") as f:
452
- f.write("\n".join(non_dict_words))
453
-
454
- if(language == 'tamil'):
455
- current_directory = os.getcwd()
456
- #tamil_parser_cmd = "tamil_parser.sh"
457
- tamil_parser_cmd = f"{current_directory}/ssn_parser_new/tamil_parser.py"
458
- #subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser"])
459
- subprocess.run(["python", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser_new"])
460
-
461
- elif(language == 'english'):
462
- phn_out_dict = {}
463
- for i in range(0,len(non_dict_words)):
464
- phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
465
- # Create a string representation of the dictionary
466
- data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
467
- print(f"data_str: {data_str}")
468
- with open(out_dict_file, "w") as f:
469
- f.write(data_str)
470
- else:
471
- out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
472
- from get_phone_mapped_python import TextReplacer
473
-
474
- from indic_unified_parser.uparser import wordparse
475
-
476
- text_replacer=TextReplacer()
477
-
478
- parsed_output_list = []
479
- for word in non_dict_words:
480
- parsed_word = wordparse(word, 0, 0, 1)
481
- parsed_output_list.append(parsed_word)
482
- replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
483
- with open(out_dict_file, 'w', encoding='utf-8') as file:
484
- for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
485
- line = f"{original_word}\t{formatted_word}\n"
486
- file.write(line)
487
- print(line, end='')
488
-
489
- try:
490
- df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
491
- new_dict = df.dropna().set_index(0).to_dict('dict')[1]
492
- print(new_dict)
493
- if language not in phone_dictionary:
494
- phone_dictionary[language] = new_dict
495
- else:
496
- phone_dictionary[language].update(new_dict)
497
- # run a non-blocking child process to update the dictionary file
498
- p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
499
- p.start()
500
- except Exception as err:
501
- traceback.print_exc()
502
-
503
- # phonify text with dictionary
504
- text_phonified = []
505
- for line in text:
506
- line_phonified = []
507
- for phrase in line:
508
- phrase_phonified = []
509
- for word in phrase.split(" "):
510
- if self.__is_english_word(word):
511
- if word in phone_dictionary["english"]:
512
- phrase_phonified.append(str(phone_dictionary["english"][word]))
513
- else:
514
- phrase_phonified.append(str(self.en_g2p(word)))
515
- elif word in phone_dictionary[language]:
516
- # if a word could not be parsed, skip it
517
- phrase_phonified.append(str(phone_dictionary[language][word]))
518
- # line_phonified.append(self.__post_phonify(" ".join(phrase_phonified), language, gender))
519
- line_phonified.append(" ".join(phrase_phonified))
520
- text_phonified.append(line_phonified)
521
- return text_phonified
522
-
523
- def phonify(self, text, language, gender, phone_dictionary):
524
- if not isinstance(text, list):
525
- out = self.__phonify([text], language, gender)
526
- return out[0]
527
- return self.__phonify(text, language, gender, phone_dictionary)
528
-
529
- def phonify_list(self, text, language, gender, phone_dictionary):
530
- if isinstance(text, list):
531
- return self.__phonify_list(text, language, gender, phone_dictionary)
532
- else:
533
- print("Error!! Expected to have a list as input.")
534
-
535
-
536
- class TextNormalizer:
537
- def __init__(self, char_map_location=None):
538
- # self.phonifier = phonifier
539
- if char_map_location is None:
540
- char_map_location = "charmap"
541
-
542
- # this is a static set of cleaning rules to be applied
543
- self.cleaning_rules = {
544
- " +" : " ",
545
- "^ +" : "",
546
- " +$" : "",
547
- "#$" : "",
548
- "# +$" : "",
549
- }
550
-
551
- # this is the list of languages supported by num_to_words
552
- self.keydict = {"english" : "en",
553
- "hindi" : "hi",
554
- "gujarati" : "gu",
555
- "marathi" : "mr",
556
- "bengali" : "bn",
557
- "telugu" : "te",
558
- "tamil" : "ta",
559
- "kannada" : "kn",
560
- "odia" : "or",
561
- "punjabi" : "pa"
562
- }
563
-
564
- # self.g2p = G2p()
565
- # print('Loading G2P model... Done!')
566
-
567
- def __post_cleaning(self, text):
568
- for key, replacement in self.cleaning_rules.items():
569
- text = re.sub(key, replacement, text)
570
- return text
571
-
572
- def __post_cleaning_list(self, text):
573
- # input is supposed to be a list of strings
574
- output_text = []
575
- for line in text:
576
- for key, replacement in self.cleaning_rules.items():
577
- line = re.sub(key, replacement, line)
578
- output_text.append(line)
579
- return output_text
580
-
581
- def __check_char_type(self, str_c):
582
- # Determine the type of the character
583
- if str_c.isnumeric():
584
- char_type = "number"
585
- elif str_c in string.punctuation:
586
- char_type = "punctuation"
587
- elif str_c in string.whitespace:
588
- char_type = "whitespace"
589
- elif str_c.isalpha() and str_c.isascii():
590
- char_type = "ascii"
591
- else:
592
- char_type = "non-ascii"
593
- return char_type
594
-
595
- def insert_space(self, text):
596
- '''
597
- Check if the text contains numbers and English words and if they are without space inserts space between them.
598
- '''
599
- # Initialize variables to track the previous character type and whether a space should be inserted
600
- prev_char_type = None
601
- next_char_type = None
602
- insert_space = False
603
-
604
- # Output string
605
- output_string = ""
606
-
607
- # Iterate through each character in the text
608
- for i, c in enumerate(text):
609
- # Determine the type of the character
610
- char_type = self.__check_char_type(c)
611
- if i == (len(text) - 1):
612
- next_char_type = None
613
- else:
614
- next_char_type = self.__check_char_type(text[i+1])
615
- # print(f"{i}: {c} is a {char_type} character and next character is a {next_char_type}")
616
-
617
- # If the character type has changed from the previous character, check if a space should be inserted
618
- if (char_type != prev_char_type and prev_char_type != None and char_type != "punctuation" and char_type != "whitespace"):
619
- if next_char_type != "punctuation" or next_char_type != "whitespace":
620
- insert_space = True
621
-
622
- # Insert a space if needed
623
- if insert_space:
624
- output_string += " "+c
625
- insert_space = False
626
- else:
627
- output_string += c
628
-
629
- # Update the previous character type
630
- prev_char_type = char_type
631
-
632
- # Print the modified text
633
- output_string = re.sub(r' +', ' ', output_string)
634
- return output_string
635
-
636
- def insert_space_list(self, text):
637
- '''
638
- Expect the input to be in form of list of string.
639
- Check if the text contains numbers and English words and if they are without space inserts space between them.
640
- '''
641
- # Output string list
642
- output_list = []
643
-
644
- for line in text:
645
- # Initialize variables to track the previous character type and whether a space should be inserted
646
- prev_char_type = None
647
- next_char_type = None
648
- insert_space = False
649
- # Output string
650
- output_string = ""
651
- # Iterate through each character in the line
652
- for i, c in enumerate(line):
653
- # Determine the type of the character
654
- char_type = self.__check_char_type(c)
655
- if i == (len(line) - 1):
656
- next_char_type = None
657
- else:
658
- next_char_type = self.__check_char_type(line[i+1])
659
- # print(f"{i}: {c} is a {char_type} character and next character is a {next_char_type}")
660
-
661
- # If the character type has changed from the previous character, check if a space should be inserted
662
- if (char_type != prev_char_type and prev_char_type != None and char_type != "punctuation" and char_type != "whitespace"):
663
- if next_char_type != "punctuation" or next_char_type != "whitespace":
664
- insert_space = True
665
-
666
- # Insert a space if needed
667
- if insert_space:
668
- output_string += " "+c
669
- insert_space = False
670
- else:
671
- output_string += c
672
-
673
- # Update the previous character type
674
- prev_char_type = char_type
675
-
676
- # Print the modified line
677
- output_string = re.sub(r' +', ' ', output_string)
678
- output_list.append(output_string)
679
- return output_list
680
-
681
- def num2text(self, text, language):
682
- if language in self.keydict.keys():
683
- digits = sorted(list(map(int, re.findall(r'\d+', text))),reverse=True)
684
- if digits:
685
- for digit in digits:
686
- text = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language])+' ', text)
687
- return self.__post_cleaning(text)
688
- else:
689
- print(f"No num-to-char for the given language {language}.")
690
- return self.__post_cleaning(text)
691
-
692
- def num2text_list(self, text, language):
693
- # input is supposed to be a list of strings
694
- if language in self.keydict.keys():
695
- output_text = []
696
- for line in text:
697
- digits = sorted(list(map(int, re.findall(r'\d+', line))),reverse=True)
698
- if digits:
699
- for digit in digits:
700
- line = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language])+' ', line)
701
- output_text.append(line)
702
- return self.__post_cleaning_list(output_text)
703
- else:
704
- print(f"No num-to-char for the given language {language}.")
705
- return self.__post_cleaning_list(text)
706
-
707
- def numberToTextConverter(self, text, language):
708
- if language in self.keydict.keys():
709
- matches = re.findall(r'\d+\.\d+|\d+', text)
710
- digits = sorted([int(match) if match.isdigit() else match if re.match(r'^\d+(\.\d+)?$', match) else str(match) for match in matches], key=lambda x: float(x) if isinstance(x, str) and '.' in x else x, reverse=True)
711
- if digits:
712
- for digit in digits:
713
-
714
- if isinstance(digit, int):
715
- text = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language]).replace(",", "")+' ', text)
716
- else:
717
- parts = str(digit).split('.')
718
- integer_part = int(parts[0])
719
- data1 = num_to_word(integer_part, self.keydict[language]).replace(",", "")
720
- decimal_part = str(parts[1])
721
- data2 = ''
722
- for i in decimal_part:
723
- data2 = data2+' '+num_to_word(i, self.keydict[language])
724
- if language == 'hindi':
725
- final_data = f'{data1} दशमलव {data2}'
726
- elif language == 'tamil':
727
- final_data = f'{data1} புள்ளி {data2}'
728
- else:
729
- final_data = f'{data1} point {data2}'
730
-
731
-
732
- text = re.sub(str(digit), ' '+final_data+' ', text)
733
-
734
- return self.__post_cleaning(text)
735
- else:
736
-
737
-
738
- words = {
739
- '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
740
- '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
741
- }
742
-
743
-
744
- # Use regular expression to find and replace decimal points in numbers
745
- text = re.sub(r'(?<=\d)\.(?=\d)', ' point ', text)
746
-
747
- # Find all occurrences of numbers with decimal points and convert them to words
748
- matches = re.findall(r'point (\d+)', text)
749
-
750
- for match in matches:
751
- replacement = ' '.join(words[digit] for digit in match)
752
- text = text.replace(f'point {match}', f'point {replacement}', 1)
753
-
754
-
755
- return self.__post_cleaning(text)
756
-
757
-
758
- def normalize(self, text, language):
759
- return self.__post_cleaning(text)
760
-
761
- def normalize_list(self, text, language):
762
- # input is supposed to be a list of strings
763
- return self.__post_cleaning_list(text)
764
-
765
-
766
- class TextPhrasifier:
767
- @classmethod
768
- def phrasify(cls, text):
769
- phrase_list = []
770
- for phrase in text.split("#"):
771
- phrase = phrase.strip()
772
- if phrase != "":
773
- phrase_list.append(phrase)
774
- return phrase_list
775
-
776
- class TextPhrasifier_List:
777
- @classmethod
778
- def phrasify(cls, text):
779
- # input is supposed to be a list of strings
780
- # output is list of list of strings
781
- output_list = []
782
- for line in text:
783
- phrase_list = []
784
- for phrase in line.split("#"):
785
- phrase = phrase.strip()
786
- if phrase != "":
787
- phrase_list.append(phrase)
788
- output_list.append(phrase_list)
789
- return output_list
790
-
791
- class DurAlignTextProcessor:
792
- def __init__(self):
793
- # this is a static set of cleaning rules to be applied
794
- self.cleaning_rules = {
795
- " +" : " ",
796
- "^" : "$",
797
- "$" : ".",
798
- }
799
- self.cleaning_rules_English = {
800
- " +" : " ",
801
- "$" : ".",
802
- }
803
- def textProcesor(self, text):
804
- for key, replacement in self.cleaning_rules.items():
805
- for idx in range(0,len(text)):
806
- text[idx] = re.sub(key, replacement, text[idx])
807
-
808
- return text
809
-
810
- def textProcesorForEnglish(self, text):
811
- for key, replacement in self.cleaning_rules_English.items():
812
- for idx in range(0,len(text)):
813
- text[idx] = re.sub(key, replacement, text[idx])
814
-
815
- return text
816
-
817
- def textProcesor_list(self, text):
818
- # input expected in 'list of list of string' format
819
- output_text = []
820
- for line in text:
821
- for key, replacement in self.cleaning_rules.items():
822
- for idx in range(0,len(line)):
823
- line[idx] = re.sub(key, replacement, line[idx])
824
- output_text.append(line)
825
-
826
- return output_text
827
-
828
-
829
-
830
-
831
- class SharedInit:
832
- def __init__(self,
833
- text_cleaner = TextCleaner(),
834
- text_normalizer=TextNormalizer(),
835
- phonifier = Phonifier(),
836
- text_phrasefier = TextPhrasifier(),
837
- post_processor = DurAlignTextProcessor()):
838
- self.text_cleaner = text_cleaner
839
- self.text_normalizer = text_normalizer
840
- self.phonifier = phonifier
841
- self.text_phrasefier = text_phrasefier
842
- self.post_processor = post_processor
843
-
844
-
845
-
846
- class TTSDurAlignPreprocessor(SharedInit):
847
-
848
- def preprocess(self, text, language, gender, phone_dictionary):
849
- # text = text.strip()
850
- #print(text)
851
- text = self.text_normalizer.numberToTextConverter(text, language)
852
- text = self.text_cleaner.clean(text)
853
- #print("cleaned text", text)
854
- # text = self.text_normalizer.insert_space(text)
855
- #text = self.text_normalizer.num2text(text, language)
856
- # print(text)
857
- text = self.text_normalizer.normalize(text, language)
858
- # print(text)
859
- phrasified_text = TextPhrasifier.phrasify(text)
860
- #print("phrased",phrasified_text)
861
-
862
- if language not in list(phone_dictionary.keys()):
863
- phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)
864
-
865
- #print(phone_dictionary.keys())
866
-
867
- phonified_text = self.phonifier.phonify(phrasified_text, language, gender, phone_dictionary)
868
- #print("phonetext",phonified_text)
869
- phonified_text = self.post_processor.textProcesor(phonified_text)
870
- #print(phonified_text)
871
- return phonified_text, phrasified_text
872
-
873
- class TTSDurAlignPreprocessor_VTT(SharedInit):
874
-
875
- def preprocess(self, text, language, gender):
876
- # text = text.strip()
877
- text = self.text_cleaner.clean_list(text)
878
- # text = self.text_normalizer.insert_space_list(text)
879
- text = self.text_normalizer.num2text_list(text, language)
880
- text = self.text_normalizer.normalize_list(text, language)
881
- phrasified_text = TextPhrasifier_List.phrasify(text)
882
- phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
883
- phonified_text = self.post_processor.textProcesor_list(phonified_text)
884
- return phonified_text, phrasified_text
885
-
886
-
887
- class CharTextPreprocessor(SharedInit):
888
-
889
- def preprocess(self, text, language, gender=None, phone_dictionary=None):
890
- text = text.strip()
891
- text = self.text_normalizer.numberToTextConverter(text, language)
892
- text = self.text_cleaner.clean(text)
893
- # text = self.text_normalizer.insert_space(text)
894
- #text = self.text_normalizer.num2text(text, language)
895
- text = self.text_normalizer.normalize(text, language)
896
- phrasified_text = TextPhrasifier.phrasify(text)
897
- phonified_text = phrasified_text # No phonification for character TTS models
898
- return phonified_text, phrasified_text
899
-
900
- class CharTextPreprocessor_VTT(SharedInit):
901
-
902
-
903
- def preprocess(self, text, language, gender=None):
904
- # text = text.strip()
905
- text = self.text_cleaner.clean_list(text)
906
- # text = self.text_normalizer.insert_space_list(text)
907
- text = self.text_normalizer.num2text_list(text, language)
908
- text = self.text_normalizer.normalize_list(text, language)
909
- phrasified_text = TextPhrasifier_List.phrasify(text)
910
- phonified_text = phrasified_text # No phonification for character TTS models
911
- return phonified_text, phrasified_text
912
-
913
-
914
class TTSPreprocessor(SharedInit):
    """Text front end that produces phone strings for phone-based TTS models."""

    def preprocess(self, text, language, gender, phone_dictionary):
        """Return ``(phonified_text, phrasified_text)`` for one input string.

        The language dictionary is loaded through the phonifier when
        ``phone_dictionary`` has no entry for ``language`` yet.
        """
        normalizer = self.text_normalizer
        worded = normalizer.numberToTextConverter(text.strip(), language)
        cleaned = self.text_cleaner.clean(worded)
        phrases = TextPhrasifier.phrasify(normalizer.normalize(cleaned, language))

        if language not in phone_dictionary.keys():
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phones = self.phonifier.phonify(phrases, language, gender, phone_dictionary)
        phones = self.post_processor.textProcesorForEnglish(phones)
        return phones, phrases
931
-
932
class TTSPreprocessor_VTT(SharedInit):
    """List-of-lines variant of the phone-based text front end."""

    def preprocess(self, text, language, gender):
        """Return ``(phonified_text, phrasified_text)`` for a list of lines.

        Lines are cleaned, have their numbers expanded, are normalised and are
        split into phrases; the phrases are then phonified as a nested list.
        """
        lines = self.text_cleaner.clean_list(text)
        lines = self.text_normalizer.num2text_list(lines, language)
        lines = self.text_normalizer.normalize_list(lines, language)
        phrases = TextPhrasifier_List.phrasify(lines)
        phones = self.phonifier.phonify_list(phrases, language, gender)
        return phones, phrases
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ TTS Preprocessing
3
+ Developed by Arun Kumar A(CS20S013) - November 2022
4
+ Updated by Utkarsh Pathak (DA24S011) - September 2023
5
+ '''
6
+ import os
7
+ import re
8
+ import sys
9
+ import json
10
+ import pandas as pd
11
+ import string
12
+ from collections import defaultdict
13
+ import time
14
+ import subprocess
15
+ import shutil
16
+ from multiprocessing import Process
17
+ import traceback
18
+
19
+ #imports of dependencies from environment.yml
20
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__))))
21
+
22
+ from num_to_words import num_to_word
23
+ from g2p_en import G2p
24
+
25
+
26
+
27
+ from NumberToText import NumberDictionary
28
+
29
+
30
def add_to_dictionary(dict_to_add, dict_file):
    """Append ``word phone`` entries to a space-delimited dictionary file.

    When the file already exists the entries are appended to a hidden copy
    (``.<name>.temp`` in the same directory). The copy replaces the original
    only if pandas can parse it and it has more rows than the original, so a
    malformed append never overwrites a working dictionary. When the file does
    not exist it is created.

    Args:
        dict_to_add: Mapping of word to phone string.
        dict_file: Path of the dictionary file for one language.
    """
    if not dict_to_add:
        # Nothing to write; avoids creating an empty, unparseable dictionary.
        return

    entries = "".join(f"{key} {value}\n" for key, value in dict_to_add.items())

    if not os.path.isfile(dict_file):
        # create a new dictionary
        # UTF-8 explicitly: pandas reads the file back as UTF-8, while the
        # platform default encoding may not be able to encode the entries.
        with open(dict_file, "a", encoding="utf-8") as f:
            f.write(entries)
        print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
        return

    temp_dict_file = os.path.join(
        os.path.dirname(dict_file), "." + os.path.basename(dict_file) + ".temp"
    )
    try:
        # make a copy of the dictionary and append the new words to the copy
        shutil.copy(dict_file, temp_dict_file)
        with open(dict_file, "rb") as f:
            f.seek(0, os.SEEK_END)
            missing_newline = False
            if f.tell() > 0:
                f.seek(-1, os.SEEK_END)
                missing_newline = f.read(1) != b"\n"
        with open(temp_dict_file, "a", encoding="utf-8") as f:
            if missing_newline:
                # otherwise the first new entry would fuse with the last line
                f.write("\n")
            f.write(entries)

        # check if the write is successful and then replace the dict file
        df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
        df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
        if len(df_temp) > len(df_orig):
            os.replace(temp_dict_file, dict_file)
            print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
    except Exception:
        # Best effort: the in-memory dictionary is already updated by the
        # caller, so a failed persist is reported and not raised.
        print(traceback.format_exc())
    finally:
        # The temp copy only still exists when it was not promoted.
        if os.path.exists(temp_dict_file):
            os.remove(temp_dict_file)
59
+
60
+
61
class TextCleaner:
    """Apply a fixed, ordered set of regex substitutions to raw input text.

    Sentence punctuation and line breaks become the ``#`` marker followed by
    a space; the remaining punctuation is removed and dashes become spaces.
    """

    def __init__(self):
        # this is a static set of cleaning rules to be applied, in this order
        self.cleaning_rules = {
            " +" : " ",
            "^ +" : "",
            " +$" : "",
            "#" : "",
            "[.,;।!](\r\n)*" : "# ",
            "[.,;।!](\n)*" : "# ",
            "(\r\n)+" : "# ",
            "(\n)+" : "# ",
            "(\r)+" : "# ",
            # raw string: "\." is not a valid escape in a normal string literal
            r'[?;:)(!|&’‘,।\."]': "",
            "[/']" : "",
            "[-–]" : " ",
        }

    def clean(self, text):
        """Return *text* after applying every cleaning rule in order."""
        for pattern, replacement in self.cleaning_rules.items():
            text = re.sub(pattern, replacement, text)
        return text

    def clean_list(self, text):
        """Strip and clean each string of the list *text*; return a new list."""
        return [self.clean(line.strip()) for line in text]
93
+
94
+
95
+ class Phonifier:
96
+ def __init__(self, dict_location=None):
97
+ if dict_location is None:
98
+ dict_location = "phone_dict"
99
+ self.dict_location = dict_location
100
+ self.devanagari_langs = {
101
+ "bodo",
102
+ "hindi",
103
+ "kashmiri",
104
+ "marathi",
105
+ "nepali",
106
+ "santali",
107
+ "rajasthani",
108
+ "sindhi",
109
+ "dogri",
110
+ "sanskrit",
111
+ "maithili",
112
+ "konkani",
113
+ }
114
+ self.phone_dictionary = {}
115
+ # load dictionary for all the available languages
116
+ for dict_file in os.listdir(dict_location):
117
+ try:
118
+ if dict_file.startswith("."):
119
+ # ignore hidden files
120
+ continue
121
+ language = dict_file
122
+ dict_file_path = os.path.join(dict_location, dict_file)
123
+ df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
124
+ self.phone_dictionary[language] = df.set_index(0).to_dict('dict')[1]
125
+ except Exception as e:
126
+ #print(traceback.format_exc())
127
+ print("")
128
+ print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))
129
+
130
+ self.g2p = G2p()
131
+ print('Loading G2P model... Done!')
132
+ # Mapping between the cmu phones and the iitm cls
133
+ self.cmu_2_cls_map = {
134
+ "AA" : "aa",
135
+ "AA0" : "aa",
136
+ "AA1" : "aa",
137
+ "AA2" : "aa",
138
+ "AE" : "axx",
139
+ "AE0" : "axx",
140
+ "AE1" : "axx",
141
+ "AE2" : "axx",
142
+ "AH" : "a",
143
+ "AH0" : "a",
144
+ "AH1" : "a",
145
+ "AH2" : "a",
146
+ "AO" : "ax",
147
+ "AO0" : "ax",
148
+ "AO1" : "ax",
149
+ "AO2" : "ax",
150
+ "AW" : "ou",
151
+ "AW0" : "ou",
152
+ "AW1" : "ou",
153
+ "AW2" : "ou",
154
+ "AX" : "a",
155
+ "AY" : "ei",
156
+ "AY0" : "ei",
157
+ "AY1" : "ei",
158
+ "AY2" : "ei",
159
+ "B" : "b",
160
+ "CH" : "c",
161
+ "D" : "dx",
162
+ "DH" : "d",
163
+ "EH" : "ee",
164
+ "EH0" : "ee",
165
+ "EH1" : "ee",
166
+ "EH2" : "ee",
167
+ "ER" : "a r",
168
+ "ER0" : "a r",
169
+ "ER1" : "a r",
170
+ "ER2" : "a r",
171
+ "EY" : "ee",
172
+ "EY0" : "ee",
173
+ "EY1" : "ee",
174
+ "EY2" : "ee",
175
+ "F" : "f",
176
+ "G" : "g",
177
+ "HH" : "h",
178
+ "IH" : "i",
179
+ "IH0" : "i",
180
+ "IH1" : "i",
181
+ "IH2" : "i",
182
+ "IY" : "ii",
183
+ "IY0" : "ii",
184
+ "IY1" : "ii",
185
+ "IY2" : "ii",
186
+ "JH" : "j",
187
+ "K" : "k",
188
+ "L" : "l",
189
+ "M" : "m",
190
+ "N" : "n",
191
+ "NG" : "ng",
192
+ "OW" : "o",
193
+ "OW0" : "o",
194
+ "OW1" : "o",
195
+ "OW2" : "o",
196
+ "OY" : "ei",
197
+ "OY0" : "ei",
198
+ "OY1" : "ei",
199
+ "OY2" : "ei",
200
+ "P" : "p",
201
+ "R" : "r",
202
+ "S" : "s",
203
+ "SH" : "sh",
204
+ "T" : "tx",
205
+ "TH" : "t",
206
+ "UH" : "u",
207
+ "UH0" : "u",
208
+ "UH1" : "u",
209
+ "UH2" : "u",
210
+ "UW" : "uu",
211
+ "UW0" : "uu",
212
+ "UW1" : "uu",
213
+ "UW2" : "uu",
214
+ "V" : "w",
215
+ "W" : "w",
216
+ "Y" : "y",
217
+ "Z" : "z",
218
+ "ZH" : "sh",
219
+ }
220
+
221
+ # Mapping between the iitm cls and iitm char
222
+ self.cls_2_chr_map = {
223
+ "aa" : "A",
224
+ "ii" : "I",
225
+ "uu" : "U",
226
+ "ee" : "E",
227
+ "oo" : "O",
228
+ "nn" : "N",
229
+ "ae" : "",
230
+ "ag" : "",
231
+ "au" : "",
232
+ "axx" : "",
233
+ "ax" : "",
234
+ "bh" : "B",
235
+ "ch" : "C",
236
+ "dh" : "",
237
+ "dx" : "",
238
+ "dxh" : "",
239
+ "dxhq" : "T",
240
+ "dxq" : "D",
241
+ "ei" : "",
242
+ "ai" : "",
243
+ "eu" : "",
244
+ "gh" : "",
245
+ "gq" : "G",
246
+ "hq" : "H",
247
+ "jh" : "J",
248
+ "kh" : "",
249
+ "khq" : "K",
250
+ "kq" : "",
251
+ "ln" : "",
252
+ "lw" : "",
253
+ "lx" : "ള",
254
+ "mq" : "M",
255
+ "nd" : "न",
256
+ "ng" : "ङ",
257
+ "nj" : "ञ",
258
+ "nk" : "Y",
259
+ "nw" : "ൺ",
260
+ "nx" : "ण",
261
+ "ou" : "औ",
262
+ "ph" : "P",
263
+ "rq" : "R",
264
+ "rqw" : "ॠ",
265
+ "rw" : "ർ",
266
+ "rx" : "र",
267
+ "sh" : "श",
268
+ "sx" : "ष",
269
+ "th" : "थ",
270
+ "tx" : "",
271
+ "txh" : "ठ",
272
+ "wv" : "W",
273
+ "zh" : "Z",
274
+ }
275
+
276
+ # Multilingual support for OOV characters
277
+ oov_map_json_file = 'multilingualcharmap.json'
278
+ with open(oov_map_json_file, 'r') as oov_file:
279
+ self.oov_map = json.load(oov_file)
280
+
281
+ def __is_float(self, word):
282
+ parts = word.split('.')
283
+ if len(parts) != 2:
284
+ return False
285
+ return parts[0].isdecimal() and parts[1].isdecimal()
286
+
287
+ def en_g2p(self, word):
288
+ phn_out = self.g2p(word)
289
+ # print(f"phn_out: {phn_out}")
290
+ # iterate over the string list and replace each word with the corresponding value from the dictionary
291
+ for i, phn in enumerate(phn_out):
292
+ if phn in self.cmu_2_cls_map.keys():
293
+ phn_out[i] = self.cmu_2_cls_map[phn]
294
+ # cls_out = self.cmu_2_cls_map[phn]
295
+ if phn_out[i] in self.cls_2_chr_map.keys():
296
+ phn_out[i] = self.cls_2_chr_map[phn_out[i]]
297
+ else:
298
+ pass
299
+ else:
300
+ pass # ignore words that are not in the dictionary
301
+ # print(f"i: {i}, phn: {phn}, cls_out: {cls_out}, phn_out: {phn_out[i]}")
302
+ return ("".join(phn_out)).strip().replace(" ", "")
303
+
304
+ def __post_phonify(self, text, language, gender):
305
+ language_gender_id = language+'_'+gender
306
+ if language_gender_id in self.oov_map.keys():
307
+ output_string = ''
308
+ for char in text:
309
+ if char in self.oov_map[language_gender_id].keys():
310
+ output_string += self.oov_map[language_gender_id][char]
311
+ else:
312
+ output_string += char
313
+ # output_string += self.oov_map['language_gender_id']['char']
314
+ return output_string
315
+ else:
316
+ return text
317
+
318
+ def __is_english_word(self, word):
319
+ maxchar = max(word)
320
+ if u'\u0000' <= maxchar <= u'\u007f':
321
+ return True
322
+ return False
323
+
324
+ def __normalize_phones(self, phone_str):
325
+ """
326
+ Apply phone-level replacements after phonification
327
+ """
328
+ replace_map = {
329
+ "P": "f",
330
+ "ष": "श",
331
+ "ग़": "ग",
332
+ "T": "ढ",
333
+ "ऱ": "r",
334
+ "jञ": "gy",
335
+ "क़":"क",
336
+ "":"ख",
337
+ "ड़":"ड",
338
+ "फ़":"फ",
339
+ "य़":"य",
340
+ "ऱ":"r",
341
+ "Y":"",
342
+ "G":"g",
343
+ "क":"k",
344
+ "ay":"E",
345
+ "kH":"ख",
346
+ "ऩ":"n"
347
+ }
348
+
349
+ for src, tgt in replace_map.items():
350
+ phone_str = phone_str.replace(src, tgt)
351
+
352
+ return phone_str
353
+
354
+ def __phonify(self, text, language, gender):
355
+ # text is expected to be a list of strings
356
+ words = set((" ".join(text)).split(" "))
357
+ #print(f"words test: {words}")
358
+ non_dict_words = []
359
+
360
+
361
+ if language in self.phone_dictionary:
362
+ for word in words:
363
+ # print(f"word: {word}")
364
+ if word not in self.phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
365
+ non_dict_words.append(word)
366
+ #print('INSIDE IF CONDITION OF ADDING WORDS')
367
+ else:
368
+ non_dict_words = words
369
+ print(f"word not in dict: {non_dict_words}")
370
+
371
+ if len(non_dict_words) > 0:
372
+ # unified parser has to be run for the non dictionary words
373
+ os.makedirs("tmp", exist_ok=True)
374
+ timestamp = str(time.time())
375
+ non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
376
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
377
+ with open(non_dict_words_file, "w") as f:
378
+ f.write("\n".join(non_dict_words))
379
+
380
+ if(language == 'tamil'):
381
+ current_directory = os.getcwd()
382
+
383
+ tamil_parser_cmd = f"{current_directory}/ssn_parser/Tamil_Parser/ssn_parser/tamil_parser.sh"
384
+ subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser/Tamil_Parser/ssn_parser"])
385
+ elif(language == 'english'):
386
+ phn_out_dict = {}
387
+ for i in range(0,len(non_dict_words)):
388
+ phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
389
+ # Create a string representation of the dictionary
390
+ data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
391
+ print(f"data_str: {data_str}")
392
+ with open(out_dict_file, "w") as f:
393
+ f.write(data_str)
394
+ else:
395
+ # unified_parser_cmd = "phonify_wrapper.sh"
396
+
397
+ # subprocess.run(["bash", unified_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "/speech/arun/tts/tts_api/text2phone/"])
398
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
399
+ from get_phone_mapped_python import TextReplacer
400
+
401
+ from indic_unified_parser.uparser import wordparse
402
+
403
+ text_replacer=TextReplacer()
404
+ # def write_output_to_file(output_text, file_path):
405
+ # with open(file_path, 'w') as f:
406
+ # f.write(output_text)
407
+ parsed_output_list = []
408
+ for word in non_dict_words:
409
+ parsed_word = wordparse(word, 0, 0, 1)
410
+ parsed_output_list.append(parsed_word)
411
+ replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
412
+ with open(out_dict_file, 'w', encoding='utf-8') as file:
413
+ for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
414
+ line = f"{original_word}\t{formatted_word}\n"
415
+ file.write(line)
416
+ print(line, end='')
417
+
418
+
419
+ try:
420
+
421
+ df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
422
+ #print('DATAFRAME OUTPUT FILE', df.head())
423
+ new_dict = df.dropna().set_index(0).to_dict('dict')[1]
424
+ #print("new dict",new_dict)
425
+ if language not in self.phone_dictionary:
426
+ self.phone_dictionary[language] = new_dict
427
+ else:
428
+ self.phone_dictionary[language].update(new_dict)
429
+ # run a non-blocking child process to update the dictionary file
430
+ #print("phone_dict", self.phone_dictionary)
431
+ p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
432
+ p.start()
433
+ except Exception as err:
434
+ print(f"Error: While loading {out_dict_file}")
435
+ #traceback.print_exc()
436
+
437
+ # phonify text with dictionary
438
+ text_phonified = []
439
+ for phrase in text:
440
+ phrase_phonified = []
441
+ for word in phrase.split(" "):
442
+ if self.__is_english_word(word):
443
+ if word in self.phone_dictionary["english"]:
444
+ phrase_phonified.append(str(self.phone_dictionary["english"][word]))
445
+ else:
446
+ phrase_phonified.append(str(self.en_g2p(word)))
447
+ elif word in self.phone_dictionary[language]:
448
+ # if a word could not be parsed, skip it
449
+ phrase_phonified.append(str(self.phone_dictionary[language][word]))
450
+ # text_phonified.append(self.__post_phonify(" ".join(phrase_phonified),language, gender))
451
+
452
+ phone_line = " ".join(phrase_phonified)
453
+ if language in self.devanagari_langs:
454
+ phone_line = self.__normalize_phones(phone_line)
455
+ text_phonified.append(phone_line)
456
+ return text_phonified
457
+
458
+ def __merge_lists(self, lists):
459
+ merged_string = ""
460
+ for list in lists:
461
+ for word in list:
462
+ merged_string += word + " "
463
+ return merged_string.strip()
464
+
465
+ def __phonify_list(self, text, language, gender):
466
+ # text is expected to be a list of list of strings
467
+ words = set(self.__merge_lists(text).split(" "))
468
+ non_dict_words = []
469
+ if language in self.phone_dictionary:
470
+ for word in words:
471
+ if word not in self.phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
472
+ non_dict_words.append(word)
473
+ else:
474
+ non_dict_words = words
475
+
476
+ if len(non_dict_words) > 0:
477
+ print(len(non_dict_words))
478
+ print(non_dict_words)
479
+ # unified parser has to be run for the non dictionary words
480
+ os.makedirs("tmp", exist_ok=True)
481
+ timestamp = str(time.time())
482
+ non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
483
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
484
+ with open(non_dict_words_file, "w") as f:
485
+ f.write("\n".join(non_dict_words))
486
+
487
+ if(language == 'tamil'):
488
+ tamil_parser_cmd = "tamil_parser.sh"
489
+ subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser/"])
490
+ elif(language == 'english'):
491
+ phn_out_dict = {}
492
+ for i in range(0,len(non_dict_words)):
493
+ phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
494
+ # Create a string representation of the dictionary
495
+ data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
496
+ print(f"data_str: {data_str}")
497
+ with open(out_dict_file, "w") as f:
498
+ f.write(data_str)
499
+ else:
500
+ unified_parser_cmd = "phonify_wrapper.sh"
501
+ subprocess.run(["bash", unified_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser/"])
502
+ # unified_parser_cmd = "phonify_wrapper.sh"
503
+ # subprocess.run(["bash", unified_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "/var/www/html/IITM_TTS/E2E_TTS_FS2/text_proc/text2phone/"])
504
+ try:
505
+ df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
506
+ new_dict = df.dropna().set_index(0).to_dict('dict')[1]
507
+ print(new_dict)
508
+ if language not in self.phone_dictionary:
509
+ self.phone_dictionary[language] = new_dict
510
+ else:
511
+ self.phone_dictionary[language].update(new_dict)
512
+ # run a non-blocking child process to update the dictionary file
513
+ p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
514
+ p.start()
515
+ except Exception as err:
516
+ traceback.print_exc()
517
+
518
+ # phonify text with dictionary
519
+ text_phonified = []
520
+ for line in text:
521
+ line_phonified = []
522
+ for phrase in line:
523
+ phrase_phonified = []
524
+ for word in phrase.split(" "):
525
+ if self.__is_english_word(word):
526
+ if word in self.phone_dictionary["english"]:
527
+ phrase_phonified.append(str(self.phone_dictionary["english"][word]))
528
+ else:
529
+ phrase_phonified.append(str(self.en_g2p(word)))
530
+ elif word in self.phone_dictionary[language]:
531
+ # if a word could not be parsed, skip it
532
+ phrase_phonified.append(str(self.phone_dictionary[language][word]))
533
+ # line_phonified.append(self.__post_phonify(" ".join(phrase_phonified), language, gender))
534
+ line_phonified.append(" ".join(phrase_phonified))
535
+ text_phonified.append(line_phonified)
536
+ return text_phonified
537
+
538
+ def phonify(self, text, language, gender):
539
+ if not isinstance(text, list):
540
+ out = self.__phonify([text], language, gender)
541
+ return out[0]
542
+ return self.__phonify(text, language, gender)
543
+
544
+ def phonify_list(self, text, language, gender):
545
+ if isinstance(text, list):
546
+ return self.__phonify_list(text, language, gender)
547
+ else:
548
+ print("Error!! Expected to have a list as input.")
549
+
550
+
551
class TextNormalizer:
    """Number expansion, spacing helpers and trailing-marker clean-up."""

    # a decimal number, or failing that a run of digits
    _NUMBER_PATTERN = r'\d+\.\d+|\d+'

    def __init__(self, char_map_location=None, phonifier=None):
        """
        Args:
            char_map_location: Accepted for interface compatibility; defaults
                to ``"charmap"`` and is not used by this class.
            phonifier: Optional Phonifier exposed as ``self.phonifier``. When
                omitted, one is created on first access of that attribute.
                A ``Phonifier()`` default argument would be built while the
                module is imported, whether or not it is ever used.
        """
        self._phonifier = phonifier
        if char_map_location is None:
            char_map_location = "charmap"

        # this is a static set of cleaning rules to be applied
        self.cleaning_rules = {
            " +" : " ",
            "^ +" : "",
            " +$" : "",
            "#$" : "",
            "# +$" : "",
        }

        # this is the list of languages supported by num_to_words
        self.keydict = {"english" : "en",
                        "hindi" : "hi",
                        "gujarati" : "gu",
                        "marathi" : "mr",
                        "bengali" : "bn",
                        "telugu" : "te",
                        "tamil" : "ta",
                        "kannada" : "kn",
                        "odia" : "or",
                        "punjabi" : "pa"
                        }

        self.g2p = G2p()

    @property
    def phonifier(self):
        """Phonifier given at construction, or one created on first access."""
        if self._phonifier is None:
            self._phonifier = Phonifier()
        return self._phonifier

    @phonifier.setter
    def phonifier(self, value):
        self._phonifier = value

    def __post_cleaning(self, text):
        """Apply every rule of ``cleaning_rules`` to *text*, in order."""
        for key, replacement in self.cleaning_rules.items():
            text = re.sub(key, replacement, text)
        return text

    def __post_cleaning_list(self, text):
        """Apply the cleaning rules to each string of the list *text*."""
        return [self.__post_cleaning(line) for line in text]

    def __check_char_type(self, str_c):
        """Classify one character.

        Returns one of "number", "punctuation", "whitespace", "ascii"
        (an ASCII letter) or "non-ascii" (anything else).
        """
        if str_c.isnumeric():
            return "number"
        if str_c in string.punctuation:
            return "punctuation"
        if str_c in string.whitespace:
            return "whitespace"
        if str_c.isalpha() and str_c.isascii():
            return "ascii"
        return "non-ascii"

    def insert_space(self, text):
        '''
        Insert a space wherever the character type changes (for example
        between a number and a letter), then collapse runs of spaces.

        No space is inserted before punctuation or whitespace.
        '''
        # NOTE(review): the earlier version also tested
        # `next_type != "punctuation" or next_type != "whitespace"`, which is
        # always true. If `and` was intended, some boundaries should not get
        # a space; the current output is kept unchanged here.
        pieces = []
        prev_char_type = None
        for c in text:
            char_type = self.__check_char_type(c)
            at_boundary = (
                prev_char_type is not None
                and char_type != prev_char_type
                and char_type not in ("punctuation", "whitespace")
            )
            pieces.append(" " + c if at_boundary else c)
            prev_char_type = char_type
        return re.sub(r' +', ' ', "".join(pieces))

    def insert_space_list(self, text):
        '''
        Apply insert_space to each string of the list *text*.
        '''
        return [self.insert_space(line) for line in text]

    def num2text(self, text, language):
        """Replace each number in *text* with a space plus its spoken form.

        The text is scanned once, so an integer can no longer be replaced
        inside a decimal that appears later ("5 and 2.5" used to become
        "... 2. five"). A callable replacement also keeps any backslash in
        the converted words from being read as a regex escape.
        """
        converter = NumberDictionary()

        def spoken(match):
            return ' ' + converter.num2text(match.group(0), language)

        return re.sub(self._NUMBER_PATTERN, spoken, text)

    def num2text_list(self, text, language):
        """Apply the num2text replacement to each string of the list *text*."""
        converter = NumberDictionary()

        def spoken(match):
            return ' ' + converter.num2text(match.group(0), language)

        return [re.sub(self._NUMBER_PATTERN, spoken, line) for line in text]

    def normalize(self, text, language):
        """Return *text* with the cleaning rules applied; *language* is unused."""
        return self.__post_cleaning(text)

    def normalize_list(self, text, language):
        """List variant of normalize; input is a list of strings."""
        return self.__post_cleaning_list(text)
769
+
770
+
771
class TextPhrasifier:
    """Split cleaned text into phrases at the ``#`` marker."""

    @classmethod
    def phrasify(cls, text):
        """Return the non-empty, whitespace-stripped pieces of *text*."""
        trimmed = (piece.strip() for piece in text.split("#"))
        return [piece for piece in trimmed if piece != ""]
780
+
781
class TextPhrasifier_List:
    """Split each line of a list into phrases at the ``#`` marker."""

    @classmethod
    def phrasify(cls, text):
        """Return one phrase list per input line.

        Input is a list of strings; output is a list of lists holding the
        non-empty, whitespace-stripped pieces of each line.
        """
        result = []
        for line in text:
            trimmed = [piece.strip() for piece in line.split("#")]
            result.append([piece for piece in trimmed if piece != ""])
        return result
795
+
796
class DurAlignTextProcessor:
    """Add start/end markers to phone strings for duration-aligned models."""

    def __init__(self):
        # this is a static set of cleaning rules to be applied
        self.cleaning_rules = {
            r"\s{2,}": " ",  # collapse multiple spaces into one
            r"^": "$",
            r"$": ".",
        }
        self.cleaning_rules_English = {
            " +" : "",
            "$" : ".",
        }

    @staticmethod
    def _rewrite_in_place(rules, items):
        """Apply each rule to every element of *items*, overwriting them."""
        for pattern, replacement in rules.items():
            for position, item in enumerate(items):
                items[position] = re.sub(pattern, replacement, item)
        return items

    def textProcesorForEnglish(self, text):
        """Apply the English rules to a list of strings, in place."""
        return self._rewrite_in_place(self.cleaning_rules_English, text)

    def textProcesor(self, text):
        """Apply the general rules to a list of strings, in place."""
        return self._rewrite_in_place(self.cleaning_rules, text)

    def textProcesor_list(self, text):
        """Apply the general rules to a list of lists of strings.

        The inner lists are modified in place and collected into a new outer
        list.
        """
        return [self._rewrite_in_place(self.cleaning_rules, line) for line in text]
832
+
833
+
834
class TTSDurAlignPreprocessor:
    """Clean, normalise, phrasify, phonify and mark up one input text."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`phonifier=Phonifier()`) were built while the module was imported,
    # so every dictionary and G2P model was loaded as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)
        self.phonifier = phonifier if phonifier is not None else self._shared(Phonifier)
        self.post_processor = post_processor if post_processor is not None else self._shared(DurAlignTextProcessor)

    def preprocess(self, text, language, gender):
        """Return ``(phonified_text, phrasified_text)`` for one input string.

        The raw text, the cleaned text and the final phone strings are
        printed along the way.
        """
        print(text)
        text = self.text_cleaner.clean(text)
        print("cleaned text", text)
        text = self.text_normalizer.num2text(text, language)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)
        phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
        phonified_text = self.post_processor.textProcesor(phonified_text)
        print(phonified_text)
        return phonified_text, phrasified_text
862
+
863
class TTSDurAlignPreprocessor_VTT:
    """List-of-lines variant of the duration-aligned text front end."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`phonifier=Phonifier()`) were built while the module was imported,
    # so every dictionary and G2P model was loaded as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)
        self.phonifier = phonifier if phonifier is not None else self._shared(Phonifier)
        self.post_processor = post_processor if post_processor is not None else self._shared(DurAlignTextProcessor)

    def preprocess(self, text, language, gender):
        """Return ``(phonified_text, phrasified_text)`` for a list of lines."""
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)
        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
        phonified_text = self.post_processor.textProcesor_list(phonified_text)
        return phonified_text, phrasified_text
884
+
885
+
886
class CharTextPreprocessor:
    """Text front end for character-based TTS models (no phonification)."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`text_normalizer=TextNormalizer()`) were built while the module was
    # imported, loading models as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)

    def preprocess(self, text, language, gender=None):
        """Return ``(phonified_text, phrasified_text)`` for one input string.

        Both values are the same phrase list; ``gender`` is not used.
        """
        text = text.strip()
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.num2text(text, language)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)
        phonified_text = phrasified_text  # No phonification for character TTS models
        return phonified_text, phrasified_text
902
+
903
class CharTextPreprocessor_VTT:
    """List-of-lines variant of the character-model text front end."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`text_normalizer=TextNormalizer()`) were built while the module was
    # imported, loading models as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)

    def preprocess(self, text, language, gender=None):
        """Return ``(phonified_text, phrasified_text)`` for a list of lines.

        Both values are the same nested phrase list; ``gender`` is not used.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)
        phonified_text = phrasified_text  # No phonification for character TTS models
        return phonified_text, phrasified_text
919
+
920
+
921
class TTSPreprocessor:
    """Text front end that produces phone strings for phone-based TTS models."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`phonifier=Phonifier()`) were built while the module was imported,
    # so every dictionary and G2P model was loaded as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(
        self,
        text_cleaner=None,
        text_normalizer=None,
        phonifier=None,
        text_phrasefier=None,
        post_processor=None,
    ):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)
        self.phonifier = phonifier if phonifier is not None else self._shared(Phonifier)
        self.text_phrasefier = text_phrasefier if text_phrasefier is not None else self._shared(TextPhrasifier)
        self.post_processor = post_processor if post_processor is not None else self._shared(DurAlignTextProcessor)

    def preprocess(self, text, language, gender):
        """Return ``(phonified_text, phrasified_text)`` for one input string."""
        text = text.strip()
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.num2text(text, language)
        text = self.text_normalizer.normalize(text, language)
        # Use the configured phrasefier; the class was previously called
        # directly, so an injected one was silently ignored.
        phrasified_text = self.text_phrasefier.phrasify(text)
        phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
        return phonified_text, phrasified_text
945
+
946
class TTSPreprocessor_VTT:
    """List-of-lines variant of the phone-based text front end."""

    # Default collaborators, built on first use and then shared by every
    # instance that does not inject its own. Instances in the signature
    # (`phonifier=Phonifier()`) were built while the module was imported,
    # so every dictionary and G2P model was loaded as an import side effect.
    _shared_defaults = {}

    @classmethod
    def _shared(cls, factory):
        """Return the shared default instance produced by *factory*."""
        if factory not in cls._shared_defaults:
            cls._shared_defaults[factory] = factory()
        return cls._shared_defaults[factory]

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 text_phrasefier=None):
        """Store the collaborators; any left as None uses the shared default."""
        self.text_cleaner = text_cleaner if text_cleaner is not None else self._shared(TextCleaner)
        self.text_normalizer = text_normalizer if text_normalizer is not None else self._shared(TextNormalizer)
        self.phonifier = phonifier if phonifier is not None else self._shared(Phonifier)
        self.text_phrasefier = text_phrasefier if text_phrasefier is not None else self._shared(TextPhrasifier_List)

    def preprocess(self, text, language, gender):
        """Return ``(phonified_text, phrasified_text)`` for a list of lines."""
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        # Use the configured phrasefier; the class was previously called
        # directly, so an injected one was silently ignored.
        phrasified_text = self.text_phrasefier.phrasify(text)
        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
        return phonified_text, phrasified_text
tmp/non_dict_words_1776242609.109296 ADDED
File without changes
tmp/non_dict_words_1776242740.8554978 ADDED
File without changes
utilities.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import random
3
+ from datetime import datetime
4
+ import base64
5
+ from enum import Enum
6
+ import os
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
# Sampling rate as an integer; DEFAULT_SAMPLING_RATE overrides the 48000 default.
SAMPLING_RATE = int(os.environ.get("DEFAULT_SAMPLING_RATE", "48000"))

# Output directory: OUTPUT_DIR when set, otherwise an "outputs" folder next to
# this module. It is created at import time if it does not exist yet.
output_audio_dir = os.environ.get(
    "OUTPUT_DIR",
    os.path.dirname(os.path.realpath(__file__)) + "/outputs",
)
os.makedirs(output_audio_dir, exist_ok=True)
15
+
16
+
17
# Tempo value per language name.
# NOTE(review): presumably a speed factor applied to synthesized audio for the
# listed languages -- confirm against the caller.
LANG_TEMPOS = dict(
    malayalam=1.2,
    telugu=1.3,
    hindi=0.9,
    punjabi=0.9,
)
23
+
24
# Output languages accepted by isOutputLangSupported, read from the
# comma-separated LANGUAGES environment variable. Each entry is stripped and
# blank entries are dropped, so "hindi, tamil" or a trailing comma no longer
# yields names such as " tamil" or "" that can never match a request.
SUPPORTED_OUTPUT_LANGS = [
    name.strip()
    for name in os.getenv("LANGUAGES", "hindi,bengali,tamil").split(",")
    if name.strip()
]


def isOutputLangSupported(lang: str) -> bool:
    """Validate that *lang* is listed in ``SUPPORTED_OUTPUT_LANGS``.

    Args:
        lang: Language name to check; compared as-is (no case folding).

    Returns:
        ``True`` when the language is supported, so the function can be used
        as a predicate as well as a bare validation statement.

    Raises:
        ValueError: If *lang* is not in ``SUPPORTED_OUTPUT_LANGS``.
    """
    if lang not in SUPPORTED_OUTPUT_LANGS:
        raise ValueError(f"Unsupported output language: {lang}. Supported languages are: {SUPPORTED_OUTPUT_LANGS}")
    return True
29
+
30
+
31
def truncate_string(s, max_length=100):
    """Return *s* cut to *max_length* characters, marking any cut with '...'.

    Args:
        s: Value to shorten; sliced and measured with ``len``.
        max_length: Number of leading characters to keep.

    Returns:
        The first *max_length* characters of *s*, followed by ``'...'`` only
        when *s* is longer than *max_length*.
    """
    head = s[:max_length]
    if len(s) > max_length:
        return head + '...'
    return head + ''
33
+
34
+
35
# One sample paragraph per language, keyed by lowercase language name. Each
# value is a single string built from adjacent literals.
# NOTE(review): the name suggests these are fed to the models as warm-up
# input -- confirm against the caller. "bengali" is in the default LANGUAGES
# list but has no paragraph here.
# The Punjabi paragraph previously contained U+FFFD replacement characters in
# the middle of the word "ਅਤੇ"; the word is restored below.
WARMUP_PARAGRAPHS = {
    "hindi": (
        "भारत एक विशाल देश है जहाँ विभिन्न भाषाएं, धर्म और संस्कृतियाँ एक साथ अस्तित्व में हैं। "
        "यहाँ की विविधता में एकता इसकी सबसे बड़ी विशेषता है। हर राज्य की अपनी एक परंपरा होती है और "
        "खानपान भी अलग होता है। भारतीय समाज में परिवार को अत्यंत महत्त्व दिया जाता है और लोग एक-दूसरे "
        "की मदद के लिए हमेशा तैयार रहते हैं। पर्व-त्योहार जैसे दीपावली, होली, ईद और क्रिसमस यहाँ बड़े "
        "हर्षोल्लास के साथ मनाए जाते हैं। यहाँ की कला, संगीत, और साहित्य ने दुनिया भर में अपनी पहचान बनाई है।"
    ),
    "kannada": (
        "ಭಾರತವು ವಿಭಿನ್ನ ಭಾಷೆಗಳು, ಧರ್ಮಗಳು ಮತ್ತು ಸಂಸ್ಕೃತಿಗಳ ನಾಡಾಗಿದೆ. ಇದು ಒಂದೇ ದೇಶದಲ್ಲಿ ಅನೇಕ ಭಿನ್ನತೆಗಳನ್ನು "
        "ಒಳಗೊಂಡಿದೆ. ಪ್ರತಿಯೊಂದು ರಾಜ್ಯಕ್ಕೂ ತನ್ನದೇ ಆದ ಪರಂಪರೆ, ಸಂಸ್ಕೃತಿ ಮತ್ತು ಆಹಾರದ ಶೈಲಿ ಇರುತ್ತದೆ. "
        "ಇಲ್ಲಿ ಕುಟುಂಬ ಮತ್ತು ಸಮುದಾಯಕ್ಕೆ ಹೆಚ್ಚಿನ ಮಹತ್ವ ನೀಡಲಾಗುತ್ತದೆ. ಹಬ್ಬಗಳು ಮತ್ತು ಆಚರಣೆಗಳು ಜನರ ನಡುವಿನ "
        "ಸಾಮರಸ್ಯವನ್ನು ಬಲಪಡಿಸುತ್ತವೆ. ಭಾರತದಲ್ಲಿ ಸಂಗೀತ, ನೃತ್ಯ ಮತ್ತು ಸಾಹಿತ್ಯ ಕ್ಷೇತ್ರಗಳಲ್ಲಿ ಸಮೃದ್ಧ ಪರಂಪರೆ ಇದೆ."
    ),
    "malayalam": (
        "ഇന്ത്യ ഒരു വൈവിധ്യങ്ങളാൽ സമ്പന്നമായ ദേശമാണ്. വിവിധ ഭാഷകളും മതങ്ങളും സംസ്‌കാരങ്ങളുമുള്ള ഈ രാജ്യത്ത് "
        "ഏറ്റവും വലിയ പ്രത്യേകത ഏകതയിലാണ്. ഓരോ സംസ്ഥാനത്തിനും തനതായ ചരിത്രവും സംസ്കാരവുമുണ്ട്. "
        "കുടുംബ ബന്ധങ്ങൾക്കും ഒത്തുചേരലിനും വലിയ വിലയാണ് ഇന്ത്യയിലെ ജനങ്ങൾ നൽകുന്നത്. "
        "ഓണവും വിഷുവും പോലുള്ള ഉത്സവങ്ങൾ ഒരേ സമയം ആത്മീയതയും സന്തോഷവും പങ്കുവെക്കാൻ സഹായിക്കുന്നു. "
        "ഇവിടെ സംഗീതം, നാടകം, ചലച്ചിത്രം തുടങ്ങിയ കലാരൂപങ്ങൾ ലോകപ്രശസ്തമാണ്."
    ),
    "punjabi": (
        "ਭਾਰਤ ਇੱਕ ਐਸਾ ਦੇਸ਼ ਹੈ ਜਿੱਥੇ ਬਹੁਤ ਸਾਰੀਆਂ ਭਾਸ਼ਾਵਾਂ, ਧਰਮਾਂ ਅਤੇ ਰਿਵਾਇਤਾਂ ਦਾ ਮਿਲਾਪ ਹੈ। "
        "ਇਸ ਦੀ ਸਭ ਤੋਂ ਵੱਡੀ ਖਾਸੀਅਤ ਇਹ ਹੈ ਕਿ ਇੱਥੇ ਵੱਖ-ਵੱਖਤਾ ਵਿੱਚ ਵੀ ਏਕਤਾ ਹੈ। ਹਰ ਰਾਜ ਦੀ ਆਪਣੀ ਪਛਾਣ ਹੁੰਦੀ ਹੈ "
        "ਅਤੇ ਇੱਥੇ ਦੇ ਲੋਕ ਆਪਣੇ ਪਰਿਵਾਰ ਅਤੇ ਸੰਸਕਾਰਾਂ ਨਾਲ ਜੁੜੇ ਰਹਿੰਦੇ ਹਨ। ਪੰਜਾਬੀ ਲੋਕਾਂ ਦੀ ਬੋਲੀ, ਭੰਗੜਾ, ਗਿੱਧਾ ਅਤੇ "
        "ਖਾਣ-ਪੀਣ ਦੀ ਰੀਤ ਸਾਰੀ ਦੁਨੀਆ ਵਿੱਚ ਮਸ਼ਹੂਰ ਹੈ। ਤਿਉਹਾਰ, ਖੇਡਾਂ ਅਤੇ ਲੋਕ ਸੱਭਿਆਚਾਰ ਇੱਥੇ ਦੀ ਰੋਹਕਤਾ ਵਧਾਉਂਦੇ ਹਨ।"
    ),
    "tamil": (
        "இந்தியா என்பது பல்வேறு கலாசாரங்களும் மொழிகளும் கொண்ட நாடாகும். ஒவ்வொரு மாநிலத்திற்கும் தனித்துவமான பாரம்பரியம் "
        "மற்றும் சுவையான உணவுகள் உள்ளன. தமிழ்நாடு இவற்றில் முக்கியமான ஒரு பகுதி. தமிழ் மொழியும் கலாசாரமும் உலகெங்கும் "
        "புகழ்பெற்றவை. குடும்பங்கள் ஒன்றிணைந்து வாழும் பாசத்திற்கும் எப்போதும் முக்கியத்துவம் அளிக்கப்படுகிறது. "
        "தீபாவளி, பொங்கல் போன்ற பண்டிகைகள் மகிழ்ச்சியுடனும் ஒருமைப்பாட்டுடனும் கொண்டாடப்படுகின்றன. "
        "இங்கே உள்ள பாரம்பரிய இசை, நடனம், இலக்கியம் ஆகியவை பாராட்டத்தக்கவை."
    ),
    "telugu": (
        "భారతదేశం అనేక భాషలు, సంస్కృతులు మరియు సంప్రదాయాలతో కూడిన దేశం. ప్రతి రాష్ట్రానికీ తనదైన ప్రత్యేకత ఉంది. "
        "తెలుగు రాష్ట్రాలు సంస్కృతిగా గొప్పదిగా పరిగణించబడతాయి. ఇక్కడ కుటుంబం, గౌరవం, సహాయం వంటి విలువలకు ఎక్కువ ప్రాధాన్యం "
        "ఇస్తారు. సంక్రాంతి, ఉగాది వంటి పండుగలు కుటుంబ సభ్యులందరినీ కలిపి ఉంచుతాయి. నాట్యం, సంగీతం మరియు సాహిత్యం వంటి కళలు "
        "తెలుగువారిలో భాగం. భోజన పరంపరలూ అద్భుతమైనవి. వీటన్నింటి సమ్మేళనమే భారతదేశాన్ని ప్రత్యేకంగా నిలబెడుతుంది."
    ),
}