Jobanpreet committed on
Commit
445eb21
·
1 Parent(s): cfd96d2

Upload 4 files

Browse files
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ # Replace this path with the location of your HiFi-GAN checkout so that Generator can be imported from its models.py
4
+ sys.path.append("hifigan")
5
+ import argparse
6
+ import torch
7
+ from espnet2.bin.tts_inference import Text2Speech
8
+ from models import Generator
9
+ from scipy.io.wavfile import write
10
+ from meldataset import MAX_WAV_VALUE
11
+ from env import AttrDict
12
+ import json
13
+ import yaml
14
+ import numpy as np
15
+ from text_preprocess_for_inference import TTSDurAlignPreprocessor, CharTextPreprocessor, TTSPreprocessor
16
+
17
+ SAMPLING_RATE = 22050
18
+
19
def load_hifigan_vocoder(language, gender, device):
    """Load and prepare the HiFi-GAN vocoder generator for inference.

    NOTE(review): ``language`` is currently unused -- the checkpoint path is
    hard-coded to the "aryan" vocoder family. Confirm whether per-language
    vocoders are intended before generalizing the path.

    Args:
        language: TTS language (unused, see note above).
        gender: "male" or "female"; selects the vocoder checkpoint directory.
        device: Torch device string ("cpu" or "cuda").

    Returns:
        A HiFi-GAN ``Generator`` in eval mode with weight norm removed,
        placed on ``device``.
    """
    vocoder_config = f"vocoder/{gender}/aryan/hifigan/config.json"
    vocoder_generator = f"vocoder/{gender}/aryan/hifigan/generator"
    # Parse the vocoder hyper-parameter config straight from the file.
    with open(vocoder_config, 'r') as f:
        h = AttrDict(json.load(f))
    torch.manual_seed(h.seed)
    # Build the generator on the requested device and restore trained weights.
    device = torch.device(device)
    generator = Generator(h).to(device)
    state_dict_g = torch.load(vocoder_generator, map_location=device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    # Weight norm is a training-time construct; fold it away for inference.
    generator.remove_weight_norm()

    return generator
39
+
40
+
41
def load_fastspeech2_model(language, gender, device):
    """Load the FastSpeech2 acoustic model for ``language``/``gender``.

    Rewrites the model's config.yaml in place so that the normalization
    statistics files (feats/pitch/energy) point at absolute paths under the
    current working directory, then constructs an ESPnet ``Text2Speech``
    instance from the patched config.

    Fix: the original hard-coded ``"punjabi"`` in the config/model paths while
    building the stats paths from the ``language`` parameter; the paths now use
    ``language`` consistently (identical behavior for this app, which only
    passes "punjabi").

    Args:
        language: Language directory name, e.g. "punjabi".
        gender: "male" or "female"; selects the model directory.
        device: Torch device string ("cpu" or "cuda").

    Returns:
        An ESPnet ``Text2Speech`` inference object.
    """
    # Update the config.yaml file based on language and gender.
    config_path = f"{language}/{gender}/model/config.yaml"
    with open(config_path, "r") as file:
        config = yaml.safe_load(file)

    current_working_directory = os.getcwd()
    # Absolute paths to the normalization statistics shipped with the model.
    feat_path = os.path.join(current_working_directory, language, gender, "model/feats_stats.npz")
    pitch_path = os.path.join(current_working_directory, language, gender, "model/pitch_stats.npz")
    energy_path = os.path.join(current_working_directory, language, gender, "model/energy_stats.npz")

    config["normalize_conf"]["stats_file"] = feat_path
    config["pitch_normalize_conf"]["stats_file"] = pitch_path
    config["energy_normalize_conf"]["stats_file"] = energy_path

    # Persist the patched config so Text2Speech reads the absolute paths.
    with open(config_path, "w") as file:
        yaml.dump(config, file)

    tts_model = f"{language}/{gender}/model/model.pth"

    return Text2Speech(train_config=config_path, model_file=tts_model, device=device)
69
+
70
def text_synthesis(language, gender, sample_text, vocoder, MAX_WAV_VALUE, device):
    """Run the full TTS pipeline: text -> mel-spectrogram -> int16 waveform.

    Args:
        language: Language directory name used to locate the acoustic model.
        gender: "male" or "female".
        sample_text: Preprocessed (phonemized) input text.
        vocoder: A loaded HiFi-GAN generator.
        MAX_WAV_VALUE: Scale factor mapping [-1, 1] audio to int16 range.
        device: Torch device string.

    Returns:
        A 1-D numpy int16 array containing the synthesized waveform.
    """
    # Inference only -- no gradients needed anywhere in the pipeline.
    with torch.no_grad():
        # Acoustic model (FastSpeech2) for the requested language/gender.
        acoustic_model = load_fastspeech2_model(language, gender, device)

        # Text -> mel-spectrogram.
        out = acoustic_model(sample_text, decode_conf={"alpha": 1})
        print("TTS Done")
        # Factor 2.3262 -- presumably compensates for feature normalization
        # scaling before vocoding; confirm against training statistics.
        mel = out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262
        mel = mel.to(device)

        # Mel-spectrogram -> waveform via the HiFi-GAN vocoder.
        waveform = vocoder(mel).squeeze()
        waveform = waveform * MAX_WAV_VALUE
        return waveform.cpu().numpy().astype('int16')
88
+
89
+
90
def perform_text_synthesis(text_input, language, gender):
    """Preprocess raw text and synthesize audio.

    Relies on the module-level ``preprocessor``, ``vocoder`` and ``device``
    set up by the Streamlit section below.

    Returns:
        The synthesized waveform as a numpy int16 array.
    """
    phrases, _ = preprocessor.preprocess(text_input, language, gender)
    return text_synthesis(language, gender, " ".join(phrases), vocoder, MAX_WAV_VALUE, device)
95
+
96
+
97
# --- Streamlit front-end -------------------------------------------------
# Fix: the original re-imported streamlit/torch/numpy/write/
# CharTextPreprocessor here (and streamlit a second time further down) even
# though the top-of-file imports already provide them; only the names
# genuinely new to this section are imported now.
import streamlit as st
from io import BytesIO

language = "punjabi"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Character-level preprocessor used for Punjabi text.
preprocessor = CharTextPreprocessor()

st.title("Text to Speech Punjabi Language")

text_input = st.text_area("Enter text")

# Gender selection drives which vocoder/model checkpoints are loaded.
gender = st.radio("Select Gender", ("male", "female"))
vocoder = load_hifigan_vocoder(language, gender, device)

if st.button("Convert to Speech"):
    audio = perform_text_synthesis(text_input, language, gender.lower())

    # Serialize the int16 numpy waveform into an in-memory WAV container.
    audio_bytes = BytesIO()
    write(audio_bytes, SAMPLING_RATE, audio)

    # Play the result directly in the browser.
    st.audio(audio_bytes, format="audio/wav")

# Streamlit footer (optional)
st.text("Powered by Sabudh Interns")
135
+
136
+
137
+
138
+
139
+
140
+
141
+
get_phone_mapped_python.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class TextReplacer:
    """Rewrites multi-character phone codes into single placeholder symbols.

    The mapping sends ASCII phone digraphs/trigraphs (e.g. ``'aa'``,
    ``'dxhq'``) to single characters (capital Latin letters or Indic script
    code points) so that downstream processing sees one symbol per phone.
    """

    def __init__(self):
        # Insertion order is significant: longer codes (e.g. 'axx', 'dxhq')
        # precede their prefixes ('ax', 'dxh', 'dx') so the sequential
        # replacement below consumes the longest match first.
        self.replacements = {
            'aa': 'A',
            'ae': 'ऍ',
            'ag': 'ऽ',
            'ai': 'ऐ',
            'au': 'औ',
            'axx': 'अ',
            'ax': 'ऑ',
            'bh': 'B',
            'ch': 'C',
            'dh': 'ध',
            'dxhq': 'T',
            'dxh': 'ढ',
            'dxq': 'D',
            'dx': 'ड',
            'ee': 'E',
            'ei': 'ऐ',
            'eu': 'உ',
            'gh': 'घ',
            'gq': 'G',
            'hq': 'H',
            'ii': 'I',
            'jh': 'J',
            'khq': 'K',
            'kh': 'ख',
            'kq': 'क',
            'ln': 'ൾ',
            'lw': 'ൽ',
            'lx': 'ള',
            'mq': 'M',
            'nd': 'ऩ',
            'ng': 'ङ',
            'nj': 'ञ',
            'nk': 'Y',
            'nn': 'N',
            'nw': 'ൺ',
            'nx': 'ण',
            'oo': 'O',
            'ou': 'औ',
            'ph': 'P',
            'rqw': 'ॠ',
            'rq': 'R',
            'rw': 'ർ',
            'rx': 'ऱ',
            'sh': 'श',
            'sx': 'ष',
            'txh': 'ठ',
            'th': 'थ',
            'tx': 'ट',
            'uu': 'U',
            'wv': 'W',
            'zh': 'Z',
            # ... Add more replacements as needed
        }

    def apply_replacements(self, text):
        """Apply every phone-code substitution to *text*, stripping spaces.

        Substitutions run sequentially in dictionary order; spaces are
        removed on each pass (preserved from the original implementation,
        where stripping interleaves with the code replacements).
        """
        for code, symbol in self.replacements.items():
            text = text.replace(code, symbol)
            text = text.replace(" ", "")
        return text
67
+
multilingualcharmap.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"assamese_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "assamese_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "bengali_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", 
"\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "b", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "bengali_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "b", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "bodo_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", 
"\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "y", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0921", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "\u0921", "f": "P", "\u0930": "r", "M": "n", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "gujarati_male": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "gujarati_female": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", 
"\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "hindi_male": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "\u0915", "K": "K", "G": "G", "z": "z", "D": "D", "T": "T", "f": "f", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "hindi_female": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", 
"J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "\u0915", "K": "K", "G": "G", "z": "z", "D": "D", "T": "T", "f": "f", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "kannada_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "r", "M": "n", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "kannada_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", 
"\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "r", "M": "n", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "malayalam_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "\u0930", "M": "n", "q": "q", "H": "H", "Z": "Z", "\u0928": "n", "N": "N", "\u0d7e": "\u0d7e", "\u0d7d": "\u0d7d", "\u0d7a": "\u0d7a", "\u0d7c": "\u0d7c", "\u0960": "R"}, "malayalam_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", 
"\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "\u0930", "M": "n", "q": "q", "H": "H", "Z": "Z", "\u0928": "n", "N": "N", "\u0d7e": "\u0d7e", "\u0d7d": "\u0d7d", "\u0d7a": "\u0d7a", "\u0d7c": "\u0d7c", "\u0960": "R"}, "manipuri_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "r", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "g", "\u0919": "\u0919", "c": "c", "C": "c", "j": "j", "J": "j", "\u091e": "y", "\u091f": "\u091f", "\u0920": "\u091f", "\u0921": "\u091f", "\u0922": "\u091f", "\u0923": "n", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "d", "n": "n", "p": "p", "P": "P", "b": "b", "B": "b", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "w", "\u0936": "\u0936", "\u0937": "\u0936", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u091f", "T": "\u091f", "f": "P", "\u0930": "r", "M": "n", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "n", "\u0d7c": "r", "\u0960": "r"}, "manipuri_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "r", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "g", "\u0919": "\u0919", "c": "c", "C": "c", "j": "j", "J": "j", "\u091e": "y", "\u091f": "\u091f", "\u0920": "\u091f", "\u0921": "\u091f", "\u0922": "\u091f", "\u0923": "n", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "d", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", 
"y": "y", "r": "r", "l": "l", "\u0d33": "l", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u091f", "T": "\u091f", "f": "P", "\u0930": "r", "M": "n", "q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "l", "\u0d7d": "l", "\u0d7a": "n", "\u0d7c": "r", "\u0960": "r"}, "marathi_male": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "f", "\u0930": "\u0930", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "marathi_female": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "\u090d", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", 
"\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "z", "D": "\u0921", "T": "\u0922", "f": "f", "\u0930": "\u0930", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "odia_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "D", "T": "T", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "odia_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "E", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "\u0919", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": 
"\u0916", "G": "g", "z": "j", "D": "D", "T": "T", "f": "P", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "rajasthani_male": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "z", "D": "D", "T": "T", "f": "f", "\u0930": "r", "M": "M", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "rajasthani_female": {"a": "a", "\u0911": "\u0911", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "E", "E": "E", "\u0910": "\u0910", "o": "o", "O": "o", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "J", "\u091e": "y", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "z", "D": "D", "T": "\u0922", "f": "f", "\u0930": "r", "M": "n", 
"q": "q", "H": "h", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "tamil_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "\u0b89", "U": "U", "R": "r", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "k", "g": "g", "\u0918": "g", "\u0919": "\u0919", "c": "c", "C": "c", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u091f", "\u0921": "\u0921", "\u0922": "\u0921", "\u0923": "\u0923", "t": "t", "\u0925": "t", "d": "d", "\u0927": "d", "n": "n", "p": "p", "P": "p", "b": "b", "B": "b", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0937", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "k", "G": "g", "z": "j", "D": "\u0921", "T": "\u0921", "f": "f", "\u0930": "\u0930", "M": "n", "q": "n", "H": "h", "Z": "Z", "\u0928": "\u0928", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "r"}, "tamil_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "\u0b89", "U": "U", "R": "r", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "k", "g": "g", "\u0918": "g", "\u0919": "\u0919", "c": "c", "C": "c", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u091f", "\u0921": "\u0921", "\u0922": "\u0921", "\u0923": "\u0923", "t": "t", "\u0925": "t", "d": "d", "\u0927": "d", "n": "n", "p": "p", "P": "p", "b": "b", "B": "b", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0937", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "k", "G": "g", "z": "j", "D": "\u0921", "T": "\u0921", "f": "f", "\u0930": "\u0930", "M": "n", "q": "n", "H": "h", "Z": "Z", "\u0928": "\u0928", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", 
"\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "r"}, "telugu_male": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "\u0930", "M": "n", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}, "telugu_female": {"a": "a", "\u0911": "A", "A": "A", "\u0905": "A", "i": "i", "I": "I", "u": "u", "\u0b89": "u", "U": "U", "R": "R", "e": "e", "E": "E", "\u0910": "\u0910", "o": "o", "O": "O", "\u090d": "E", "\u0914": "\u0914", "k": "k", "\u0916": "\u0916", "g": "g", "\u0918": "\u0918", "\u0919": "n", "c": "c", "C": "C", "j": "j", "J": "j", "\u091e": "\u091e", "\u091f": "\u091f", "\u0920": "\u0920", "\u0921": "\u0921", "\u0922": "\u0922", "\u0923": "\u0923", "t": "t", "\u0925": "\u0925", "d": "d", "\u0927": "\u0927", "n": "n", "p": "p", "P": "P", "b": "b", "B": "B", "m": "m", "y": "y", "r": "r", "l": "l", "\u0d33": "\u0d33", "w": "w", "\u0936": "\u0936", "\u0937": "\u0937", "s": "s", "h": "h", "\u0915": "k", "K": "\u0916", "G": "g", "z": "j", "D": "\u0921", "T": "\u0922", "f": "P", "\u0930": "\u0930", "M": "n", "q": "q", "H": "H", "Z": "y", "\u0928": "n", "N": "n", "\u0d7e": "\u0d33", "\u0d7d": "l", "\u0d7a": "\u0923", "\u0d7c": "r", "\u0960": "R"}}
text_preprocess_for_inference.py ADDED
@@ -0,0 +1,887 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ TTS Preprocessing
3
+ Developed by Arun Kumar A(CS20S013) - November 2022
4
+ Code Changes by Utkarsh - 2023
5
+ '''
6
+ import os
7
+ import re
8
+ import json
9
+ import pandas as pd
10
+ import string
11
+ from collections import defaultdict
12
+ import time
13
+ import subprocess
14
+ import shutil
15
+ from multiprocessing import Process
16
+ import traceback
17
+
18
+ #imports of dependencies from environment.yml
19
+ from num_to_words import num_to_word
20
+ from g2p_en import G2p
21
+
22
+ def add_to_dictionary(dict_to_add, dict_file):
23
+ append_string = ""
24
+ for key, value in dict_to_add.items():
25
+ append_string += (str(key) + " " + str(value) + "\n")
26
+
27
+ if os.path.isfile(dict_file):
28
+ # make a copy of the dictionary
29
+ source_dir = os.path.dirname(dict_file)
30
+ dict_file_name = os.path.basename(dict_file)
31
+ temp_file_name = "." + dict_file_name + ".temp"
32
+ temp_dict_file = os.path.join(source_dir, temp_file_name)
33
+ shutil.copy(dict_file, temp_dict_file)
34
+ # append the new words in the dictionary to the temp file
35
+ with open(temp_dict_file, "a") as f:
36
+ f.write(append_string)
37
+ # check if the write is successful and then replace the temp file as the dict file
38
+ try:
39
+ df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
40
+ df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
41
+ if len(df_temp) > len(df_orig):
42
+ os.rename(temp_dict_file, dict_file)
43
+ print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
44
+ except:
45
+ print(traceback.format_exc())
46
+ else:
47
+ # create a new dictionary
48
+ with open(dict_file, "a") as f:
49
+ f.write(append_string)
50
+ print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
51
+
52
+
53
+ class TextCleaner:
54
+ def __init__(self):
55
+ # this is a static set of cleaning rules to be applied
56
+ self.cleaning_rules = {
57
+ " +" : " ",
58
+ "^ +" : "",
59
+ " +$" : "",
60
+ "#" : "",
61
+ "[.,;।!](\r\n)*" : "# ",
62
+ "[.,;।!](\n)*" : "# ",
63
+ "(\r\n)+" : "# ",
64
+ "(\n)+" : "# ",
65
+ "(\r)+" : "# ",
66
+ """[?;:)(!|&’‘,।\."]""": "",
67
+ "[/']" : "",
68
+ "[-–]" : " ",
69
+ }
70
+
71
+ def clean(self, text):
72
+ for key, replacement in self.cleaning_rules.items():
73
+ text = re.sub(key, replacement, text)
74
+ return text
75
+
76
+ def clean_list(self, text):
77
+ # input is supposed to be a list of strings
78
+ output_text = []
79
+ for line in text:
80
+ line = line.strip()
81
+ for key, replacement in self.cleaning_rules.items():
82
+ line = re.sub(key, replacement, line)
83
+ output_text.append(line)
84
+ return output_text
85
+
86
+
87
+ class Phonifier:
88
+ def __init__(self, dict_location=None):
89
+ if dict_location is None:
90
+ dict_location = "phone_dict"
91
+ self.dict_location = dict_location
92
+
93
+ self.phone_dictionary = {}
94
+ # load dictionary for all the available languages
95
+ for dict_file in os.listdir(dict_location):
96
+ try:
97
+ if dict_file.startswith("."):
98
+ # ignore hidden files
99
+ continue
100
+ language = dict_file
101
+ dict_file_path = os.path.join(dict_location, dict_file)
102
+ df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
103
+ self.phone_dictionary[language] = df.set_index(0).to_dict('dict')[1]
104
+ except Exception as e:
105
+ print(traceback.format_exc())
106
+
107
+ print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))
108
+
109
+ self.g2p = G2p()
110
+ print('Loading G2P model... Done!')
111
+ # Mapping between the cmu phones and the iitm cls
112
+ self.cmu_2_cls_map = {
113
+ "AA" : "aa",
114
+ "AA0" : "aa",
115
+ "AA1" : "aa",
116
+ "AA2" : "aa",
117
+ "AE" : "axx",
118
+ "AE0" : "axx",
119
+ "AE1" : "axx",
120
+ "AE2" : "axx",
121
+ "AH" : "a",
122
+ "AH0" : "a",
123
+ "AH1" : "a",
124
+ "AH2" : "a",
125
+ "AO" : "ax",
126
+ "AO0" : "ax",
127
+ "AO1" : "ax",
128
+ "AO2" : "ax",
129
+ "AW" : "ou",
130
+ "AW0" : "ou",
131
+ "AW1" : "ou",
132
+ "AW2" : "ou",
133
+ "AX" : "a",
134
+ "AY" : "ei",
135
+ "AY0" : "ei",
136
+ "AY1" : "ei",
137
+ "AY2" : "ei",
138
+ "B" : "b",
139
+ "CH" : "c",
140
+ "D" : "dx",
141
+ "DH" : "d",
142
+ "EH" : "ee",
143
+ "EH0" : "ee",
144
+ "EH1" : "ee",
145
+ "EH2" : "ee",
146
+ "ER" : "a r",
147
+ "ER0" : "a r",
148
+ "ER1" : "a r",
149
+ "ER2" : "a r",
150
+ "EY" : "ee",
151
+ "EY0" : "ee",
152
+ "EY1" : "ee",
153
+ "EY2" : "ee",
154
+ "F" : "f",
155
+ "G" : "g",
156
+ "HH" : "h",
157
+ "IH" : "i",
158
+ "IH0" : "i",
159
+ "IH1" : "i",
160
+ "IH2" : "i",
161
+ "IY" : "ii",
162
+ "IY0" : "ii",
163
+ "IY1" : "ii",
164
+ "IY2" : "ii",
165
+ "JH" : "j",
166
+ "K" : "k",
167
+ "L" : "l",
168
+ "M" : "m",
169
+ "N" : "n",
170
+ "NG" : "ng",
171
+ "OW" : "o",
172
+ "OW0" : "o",
173
+ "OW1" : "o",
174
+ "OW2" : "o",
175
+ "OY" : "ei",
176
+ "OY0" : "ei",
177
+ "OY1" : "ei",
178
+ "OY2" : "ei",
179
+ "P" : "p",
180
+ "R" : "r",
181
+ "S" : "s",
182
+ "SH" : "sh",
183
+ "T" : "tx",
184
+ "TH" : "t",
185
+ "UH" : "u",
186
+ "UH0" : "u",
187
+ "UH1" : "u",
188
+ "UH2" : "u",
189
+ "UW" : "uu",
190
+ "UW0" : "uu",
191
+ "UW1" : "uu",
192
+ "UW2" : "uu",
193
+ "V" : "w",
194
+ "W" : "w",
195
+ "Y" : "y",
196
+ "Z" : "z",
197
+ "ZH" : "sh",
198
+ }
199
+
200
+ # Mapping between the iitm cls and iitm char
201
+ self.cls_2_chr_map = {
202
+ "aa" : "A",
203
+ "ii" : "I",
204
+ "uu" : "U",
205
+ "ee" : "E",
206
+ "oo" : "O",
207
+ "nn" : "N",
208
+ "ae" : "ऍ",
209
+ "ag" : "ऽ",
210
+ "au" : "औ",
211
+ "axx" : "अ",
212
+ "ax" : "ऑ",
213
+ "bh" : "B",
214
+ "ch" : "C",
215
+ "dh" : "ध",
216
+ "dx" : "ड",
217
+ "dxh" : "ढ",
218
+ "dxhq" : "T",
219
+ "dxq" : "D",
220
+ "ei" : "ऐ",
221
+ "ai" : "ऐ",
222
+ "eu" : "உ",
223
+ "gh" : "घ",
224
+ "gq" : "G",
225
+ "hq" : "H",
226
+ "jh" : "J",
227
+ "kh" : "ख",
228
+ "khq" : "K",
229
+ "kq" : "क",
230
+ "ln" : "ൾ",
231
+ "lw" : "ൽ",
232
+ "lx" : "ള",
233
+ "mq" : "M",
234
+ "nd" : "न",
235
+ "ng" : "ङ",
236
+ "nj" : "ञ",
237
+ "nk" : "Y",
238
+ "nw" : "ൺ",
239
+ "nx" : "ण",
240
+ "ou" : "औ",
241
+ "ph" : "P",
242
+ "rq" : "R",
243
+ "rqw" : "ॠ",
244
+ "rw" : "ർ",
245
+ "rx" : "र",
246
+ "sh" : "श",
247
+ "sx" : "ष",
248
+ "th" : "थ",
249
+ "tx" : "ट",
250
+ "txh" : "ठ",
251
+ "wv" : "W",
252
+ "zh" : "Z",
253
+ }
254
+
255
+ # Multilingual support for OOV characters
256
+ oov_map_json_file = 'multilingualcharmap.json'
257
+ with open(oov_map_json_file, 'r') as oov_file:
258
+ self.oov_map = json.load(oov_file)
259
+
260
+ def __is_float(self, word):
261
+ parts = word.split('.')
262
+ if len(parts) != 2:
263
+ return False
264
+ return parts[0].isdecimal() and parts[1].isdecimal()
265
+
266
+ def en_g2p(self, word):
267
+ phn_out = self.g2p(word)
268
+ # print(f"phn_out: {phn_out}")
269
+ # iterate over the string list and replace each word with the corresponding value from the dictionary
270
+ for i, phn in enumerate(phn_out):
271
+ if phn in self.cmu_2_cls_map.keys():
272
+ phn_out[i] = self.cmu_2_cls_map[phn]
273
+ # cls_out = self.cmu_2_cls_map[phn]
274
+ if phn_out[i] in self.cls_2_chr_map.keys():
275
+ phn_out[i] = self.cls_2_chr_map[phn_out[i]]
276
+ else:
277
+ pass
278
+ else:
279
+ pass # ignore words that are not in the dictionary
280
+ # print(f"i: {i}, phn: {phn}, cls_out: {cls_out}, phn_out: {phn_out[i]}")
281
+ return ("".join(phn_out)).strip().replace(" ", "")
282
+
283
+ def __post_phonify(self, text, language, gender):
284
+ language_gender_id = language+'_'+gender
285
+ if language_gender_id in self.oov_map.keys():
286
+ output_string = ''
287
+ for char in text:
288
+ if char in self.oov_map[language_gender_id].keys():
289
+ output_string += self.oov_map[language_gender_id][char]
290
+ else:
291
+ output_string += char
292
+ # output_string += self.oov_map['language_gender_id']['char']
293
+ return output_string
294
+ else:
295
+ return text
296
+
297
+ def __is_english_word(self, word):
298
+ maxchar = max(word)
299
+ if u'\u0000' <= maxchar <= u'\u007f':
300
+ return True
301
+ return False
302
+
303
+ def __phonify(self, text, language, gender):
304
+ # text is expected to be a list of strings
305
+ words = set((" ".join(text)).split(" "))
306
+ #print(f"words test: {words}")
307
+ non_dict_words = []
308
+
309
+
310
+ if language in self.phone_dictionary:
311
+ for word in words:
312
+ # print(f"word: {word}")
313
+ if word not in self.phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
314
+ non_dict_words.append(word)
315
+ #print('INSIDE IF CONDITION OF ADDING WORDS')
316
+ else:
317
+ non_dict_words = words
318
+ print(f"word not in dict: {non_dict_words}")
319
+
320
+ if len(non_dict_words) > 0:
321
+ # unified parser has to be run for the non dictionary words
322
+ os.makedirs("tmp", exist_ok=True)
323
+ timestamp = str(time.time())
324
+ non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
325
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
326
+ with open(non_dict_words_file, "w") as f:
327
+ f.write("\n".join(non_dict_words))
328
+
329
+ if(language == 'tamil'):
330
+ tamil_parser_cmd = "tamil_parser.sh"
331
+ subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser"])
332
+ elif(language == 'english'):
333
+ phn_out_dict = {}
334
+ for i in range(0,len(non_dict_words)):
335
+ phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
336
+ # Create a string representation of the dictionary
337
+ data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
338
+ print(f"data_str: {data_str}")
339
+ with open(out_dict_file, "w") as f:
340
+ f.write(data_str)
341
+ else:
342
+
343
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
344
+ from get_phone_mapped_python import TextReplacer
345
+
346
+ from indic_unified_parser.uparser import wordparse
347
+
348
+ text_replacer=TextReplacer()
349
+ # def write_output_to_file(output_text, file_path):
350
+ # with open(file_path, 'w') as f:
351
+ # f.write(output_text)
352
+ parsed_output_list = []
353
+ for word in non_dict_words:
354
+ parsed_word = wordparse(word, 0, 0, 1)
355
+ parsed_output_list.append(parsed_word)
356
+ replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
357
+ with open(out_dict_file, 'w', encoding='utf-8') as file:
358
+ for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
359
+ line = f"{original_word}\t{formatted_word}\n"
360
+ file.write(line)
361
+ print(line, end='')
362
+
363
+
364
+ try:
365
+
366
+ df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
367
+ #print('DATAFRAME OUTPUT FILE', df.head())
368
+ new_dict = df.dropna().set_index(0).to_dict('dict')[1]
369
+ #print("new dict",new_dict)
370
+ if language not in self.phone_dictionary:
371
+ self.phone_dictionary[language] = new_dict
372
+ else:
373
+ self.phone_dictionary[language].update(new_dict)
374
+ # run a non-blocking child process to update the dictionary file
375
+ #print("phone_dict", self.phone_dictionary)
376
+ p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
377
+ p.start()
378
+ except Exception as err:
379
+ print(f"Error: While loading {out_dict_file}")
380
+ traceback.print_exc()
381
+
382
+ # phonify text with dictionary
383
+ text_phonified = []
384
+ for phrase in text:
385
+ phrase_phonified = []
386
+ for word in phrase.split(" "):
387
+ if self.__is_english_word(word):
388
+ if word in self.phone_dictionary["english"]:
389
+ phrase_phonified.append(str(self.phone_dictionary["english"][word]))
390
+ else:
391
+ phrase_phonified.append(str(self.en_g2p(word)))
392
+ elif word in self.phone_dictionary[language]:
393
+ # if a word could not be parsed, skip it
394
+ phrase_phonified.append(str(self.phone_dictionary[language][word]))
395
+ # text_phonified.append(self.__post_phonify(" ".join(phrase_phonified),language, gender))
396
+ text_phonified.append(" ".join(phrase_phonified))
397
+ return text_phonified
398
+
399
+ def __merge_lists(self, lists):
400
+ merged_string = ""
401
+ for list in lists:
402
+ for word in list:
403
+ merged_string += word + " "
404
+ return merged_string.strip()
405
+
406
+ def __phonify_list(self, text, language, gender):
407
+ # text is expected to be a list of list of strings
408
+ words = set(self.__merge_lists(text).split(" "))
409
+ non_dict_words = []
410
+ if language in self.phone_dictionary:
411
+ for word in words:
412
+ if word not in self.phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
413
+ non_dict_words.append(word)
414
+ else:
415
+ non_dict_words = words
416
+
417
+ if len(non_dict_words) > 0:
418
+ print(len(non_dict_words))
419
+ print(non_dict_words)
420
+ # unified parser has to be run for the non dictionary words
421
+ os.makedirs("tmp", exist_ok=True)
422
+ timestamp = str(time.time())
423
+ non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
424
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
425
+ with open(non_dict_words_file, "w") as f:
426
+ f.write("\n".join(non_dict_words))
427
+
428
+ if(language == 'tamil'):
429
+ tamil_parser_cmd = "tamil_parser.sh"
430
+ subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser"])
431
+ elif(language == 'english'):
432
+ phn_out_dict = {}
433
+ for i in range(0,len(non_dict_words)):
434
+ phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
435
+ # Create a string representation of the dictionary
436
+ data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
437
+ print(f"data_str: {data_str}")
438
+ with open(out_dict_file, "w") as f:
439
+ f.write(data_str)
440
+ else:
441
+ out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
442
+ from get_phone_mapped_python import TextReplacer
443
+
444
+ from indic_unified_parser.uparser import wordparse
445
+
446
+ text_replacer=TextReplacer()
447
+
448
+ parsed_output_list = []
449
+ for word in non_dict_words:
450
+ parsed_word = wordparse(word, 0, 0, 1)
451
+ parsed_output_list.append(parsed_word)
452
+ replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
453
+ with open(out_dict_file, 'w', encoding='utf-8') as file:
454
+ for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
455
+ line = f"{original_word}\t{formatted_word}\n"
456
+ file.write(line)
457
+ print(line, end='')
458
+
459
+ try:
460
+ df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
461
+ new_dict = df.dropna().set_index(0).to_dict('dict')[1]
462
+ print(new_dict)
463
+ if language not in self.phone_dictionary:
464
+ self.phone_dictionary[language] = new_dict
465
+ else:
466
+ self.phone_dictionary[language].update(new_dict)
467
+ # run a non-blocking child process to update the dictionary file
468
+ p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
469
+ p.start()
470
+ except Exception as err:
471
+ traceback.print_exc()
472
+
473
+ # phonify text with dictionary
474
+ text_phonified = []
475
+ for line in text:
476
+ line_phonified = []
477
+ for phrase in line:
478
+ phrase_phonified = []
479
+ for word in phrase.split(" "):
480
+ if self.__is_english_word(word):
481
+ if word in self.phone_dictionary["english"]:
482
+ phrase_phonified.append(str(self.phone_dictionary["english"][word]))
483
+ else:
484
+ phrase_phonified.append(str(self.en_g2p(word)))
485
+ elif word in self.phone_dictionary[language]:
486
+ # if a word could not be parsed, skip it
487
+ phrase_phonified.append(str(self.phone_dictionary[language][word]))
488
+ # line_phonified.append(self.__post_phonify(" ".join(phrase_phonified), language, gender))
489
+ line_phonified.append(" ".join(phrase_phonified))
490
+ text_phonified.append(line_phonified)
491
+ return text_phonified
492
+
493
+ def phonify(self, text, language, gender):
494
+ if not isinstance(text, list):
495
+ out = self.__phonify([text], language, gender)
496
+ return out[0]
497
+ return self.__phonify(text, language, gender)
498
+
499
+ def phonify_list(self, text, language, gender):
500
+ if isinstance(text, list):
501
+ return self.__phonify_list(text, language, gender)
502
+ else:
503
+ print("Error!! Expected to have a list as input.")
504
+
505
+
506
+ class TextNormalizer:
507
+ def __init__(self, char_map_location=None, phonifier = Phonifier()):
508
+ self.phonifier = phonifier
509
+ if char_map_location is None:
510
+ char_map_location = "charmap"
511
+
512
+ # this is a static set of cleaning rules to be applied
513
+ self.cleaning_rules = {
514
+ " +" : " ",
515
+ "^ +" : "",
516
+ " +$" : "",
517
+ "#$" : "",
518
+ "# +$" : "",
519
+ }
520
+
521
+ # this is the list of languages supported by num_to_words
522
+ self.keydict = {"english" : "en",
523
+ "hindi" : "hi",
524
+ "gujarati" : "gu",
525
+ "marathi" : "mr",
526
+ "bengali" : "bn",
527
+ "telugu" : "te",
528
+ "tamil" : "ta",
529
+ "kannada" : "kn",
530
+ "odia" : "or",
531
+ "punjabi" : "pa"
532
+ }
533
+
534
+ self.g2p = G2p()
535
+ print('Loading G2P model... Done!')
536
+
537
+ def __post_cleaning(self, text):
538
+ for key, replacement in self.cleaning_rules.items():
539
+ text = re.sub(key, replacement, text)
540
+ return text
541
+
542
+ def __post_cleaning_list(self, text):
543
+ # input is supposed to be a list of strings
544
+ output_text = []
545
+ for line in text:
546
+ for key, replacement in self.cleaning_rules.items():
547
+ line = re.sub(key, replacement, line)
548
+ output_text.append(line)
549
+ return output_text
550
+
551
+ def __check_char_type(self, str_c):
552
+ # Determine the type of the character
553
+ if str_c.isnumeric():
554
+ char_type = "number"
555
+ elif str_c in string.punctuation:
556
+ char_type = "punctuation"
557
+ elif str_c in string.whitespace:
558
+ char_type = "whitespace"
559
+ elif str_c.isalpha() and str_c.isascii():
560
+ char_type = "ascii"
561
+ else:
562
+ char_type = "non-ascii"
563
+ return char_type
564
+
565
+ def insert_space(self, text):
566
+ '''
567
+ Check if the text contains numbers and English words and if they are without space inserts space between them.
568
+ '''
569
+ # Initialize variables to track the previous character type and whether a space should be inserted
570
+ prev_char_type = None
571
+ next_char_type = None
572
+ insert_space = False
573
+
574
+ # Output string
575
+ output_string = ""
576
+
577
+ # Iterate through each character in the text
578
+ for i, c in enumerate(text):
579
+ # Determine the type of the character
580
+ char_type = self.__check_char_type(c)
581
+ if i == (len(text) - 1):
582
+ next_char_type = None
583
+ else:
584
+ next_char_type = self.__check_char_type(text[i+1])
585
+ # print(f"{i}: {c} is a {char_type} character and next character is a {next_char_type}")
586
+
587
+ # If the character type has changed from the previous character, check if a space should be inserted
588
+ if (char_type != prev_char_type and prev_char_type != None and char_type != "punctuation" and char_type != "whitespace"):
589
+ if next_char_type != "punctuation" or next_char_type != "whitespace":
590
+ insert_space = True
591
+
592
+ # Insert a space if needed
593
+ if insert_space:
594
+ output_string += " "+c
595
+ insert_space = False
596
+ else:
597
+ output_string += c
598
+
599
+ # Update the previous character type
600
+ prev_char_type = char_type
601
+
602
+ # Print the modified text
603
+ output_string = re.sub(r' +', ' ', output_string)
604
+ return output_string
605
+
606
+ def insert_space_list(self, text):
607
+ '''
608
+ Expect the input to be in form of list of string.
609
+ Check if the text contains numbers and English words and if they are without space inserts space between them.
610
+ '''
611
+ # Output string list
612
+ output_list = []
613
+
614
+ for line in text:
615
+ # Initialize variables to track the previous character type and whether a space should be inserted
616
+ prev_char_type = None
617
+ next_char_type = None
618
+ insert_space = False
619
+ # Output string
620
+ output_string = ""
621
+ # Iterate through each character in the line
622
+ for i, c in enumerate(line):
623
+ # Determine the type of the character
624
+ char_type = self.__check_char_type(c)
625
+ if i == (len(line) - 1):
626
+ next_char_type = None
627
+ else:
628
+ next_char_type = self.__check_char_type(line[i+1])
629
+ # print(f"{i}: {c} is a {char_type} character and next character is a {next_char_type}")
630
+
631
+ # If the character type has changed from the previous character, check if a space should be inserted
632
+ if (char_type != prev_char_type and prev_char_type != None and char_type != "punctuation" and char_type != "whitespace"):
633
+ if next_char_type != "punctuation" or next_char_type != "whitespace":
634
+ insert_space = True
635
+
636
+ # Insert a space if needed
637
+ if insert_space:
638
+ output_string += " "+c
639
+ insert_space = False
640
+ else:
641
+ output_string += c
642
+
643
+ # Update the previous character type
644
+ prev_char_type = char_type
645
+
646
+ # Print the modified line
647
+ output_string = re.sub(r' +', ' ', output_string)
648
+ output_list.append(output_string)
649
+ return output_list
650
+
651
+ def num2text(self, text, language):
652
+ if language in self.keydict.keys():
653
+ digits = sorted(list(map(int, re.findall(r'\d+', text))),reverse=True)
654
+ if digits:
655
+ for digit in digits:
656
+ text = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language])+' ', text)
657
+ return self.__post_cleaning(text)
658
+ else:
659
+ print(f"No num-to-char for the given language {language}.")
660
+ return self.__post_cleaning(text)
661
+
662
+ def num2text_list(self, text, language):
663
+ # input is supposed to be a list of strings
664
+ if language in self.keydict.keys():
665
+ output_text = []
666
+ for line in text:
667
+ digits = sorted(list(map(int, re.findall(r'\d+', line))),reverse=True)
668
+ if digits:
669
+ for digit in digits:
670
+ line = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language])+' ', line)
671
+ output_text.append(line)
672
+ return self.__post_cleaning_list(output_text)
673
+ else:
674
+ print(f"No num-to-char for the given language {language}.")
675
+ return self.__post_cleaning_list(text)
676
+
677
+ def normalize(self, text, language):
678
+ return self.__post_cleaning(text)
679
+
680
+ def normalize_list(self, text, language):
681
+ # input is supposed to be a list of strings
682
+ return self.__post_cleaning_list(text)
683
+
684
+
685
+ class TextPhrasifier:
686
+ @classmethod
687
+ def phrasify(cls, text):
688
+ phrase_list = []
689
+ for phrase in text.split("#"):
690
+ phrase = phrase.strip()
691
+ if phrase != "":
692
+ phrase_list.append(phrase)
693
+ return phrase_list
694
+
695
+ class TextPhrasifier_List:
696
+ @classmethod
697
+ def phrasify(cls, text):
698
+ # input is supposed to be a list of strings
699
+ # output is list of list of strings
700
+ output_list = []
701
+ for line in text:
702
+ phrase_list = []
703
+ for phrase in line.split("#"):
704
+ phrase = phrase.strip()
705
+ if phrase != "":
706
+ phrase_list.append(phrase)
707
+ output_list.append(phrase_list)
708
+ return output_list
709
+
710
+ class DurAlignTextProcessor:
711
+ def __init__(self):
712
+ # this is a static set of cleaning rules to be applied
713
+ self.cleaning_rules = {
714
+ " +" : "",
715
+ "^" : "$",
716
+ "$" : ".",
717
+ }
718
+ self.cleaning_rules_English = {
719
+ " +" : "",
720
+ "$" : ".",
721
+ }
722
+ def textProcesor(self, text):
723
+ for key, replacement in self.cleaning_rules.items():
724
+ for idx in range(0,len(text)):
725
+ text[idx] = re.sub(key, replacement, text[idx])
726
+
727
+ return text
728
+
729
+ def textProcesorForEnglish(self, text):
730
+ for key, replacement in self.cleaning_rules_English.items():
731
+ for idx in range(0,len(text)):
732
+ text[idx] = re.sub(key, replacement, text[idx])
733
+
734
+ return text
735
+
736
+ def textProcesor_list(self, text):
737
+ # input expected in 'list of list of string' format
738
+ output_text = []
739
+ for line in text:
740
+ for key, replacement in self.cleaning_rules.items():
741
+ for idx in range(0,len(line)):
742
+ line[idx] = re.sub(key, replacement, line[idx])
743
+ output_text.append(line)
744
+
745
+ return output_text
746
+
747
+
748
+ class TTSDurAlignPreprocessor:
749
+ def __init__(self,
750
+ text_cleaner = TextCleaner(),
751
+ text_normalizer=TextNormalizer(),
752
+ phonifier = Phonifier(),
753
+ post_processor = DurAlignTextProcessor()):
754
+ self.text_cleaner = text_cleaner
755
+ self.text_normalizer = text_normalizer
756
+ self.phonifier = phonifier
757
+ self.post_processor = post_processor
758
+
759
+ def preprocess(self, text, language, gender):
760
+ # text = text.strip()
761
+ print(text)
762
+ text = self.text_cleaner.clean(text)
763
+ print("cleaned text", text)
764
+ # text = self.text_normalizer.insert_space(text)
765
+ text = self.text_normalizer.num2text(text, language)
766
+ # print(text)
767
+ text = self.text_normalizer.normalize(text, language)
768
+ # print(text)
769
+ phrasified_text = TextPhrasifier.phrasify(text)
770
+ #print("phrased",phrasified_text)
771
+ phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
772
+ print("phonetext",phonified_text)
773
+ phonified_text = self.post_processor.textProcesor(phonified_text)
774
+ print(phonified_text)
775
+ return phonified_text, phrasified_text
776
+
777
+ class TTSDurAlignPreprocessor_VTT:
778
+ def __init__(self,
779
+ text_cleaner = TextCleaner(),
780
+ text_normalizer=TextNormalizer(),
781
+ phonifier = Phonifier(),
782
+ post_processor = DurAlignTextProcessor()):
783
+ self.text_cleaner = text_cleaner
784
+ self.text_normalizer = text_normalizer
785
+ self.phonifier = phonifier
786
+ self.post_processor = post_processor
787
+
788
+ def preprocess(self, text, language, gender):
789
+ # text = text.strip()
790
+ text = self.text_cleaner.clean_list(text)
791
+ # text = self.text_normalizer.insert_space_list(text)
792
+ text = self.text_normalizer.num2text_list(text, language)
793
+ text = self.text_normalizer.normalize_list(text, language)
794
+ phrasified_text = TextPhrasifier_List.phrasify(text)
795
+ phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
796
+ phonified_text = self.post_processor.textProcesor_list(phonified_text)
797
+ return phonified_text, phrasified_text
798
+
799
+
800
+ class CharTextPreprocessor:
801
+ def __init__(self,
802
+ text_cleaner = TextCleaner(),
803
+ text_normalizer=TextNormalizer()):
804
+ self.text_cleaner = text_cleaner
805
+ self.text_normalizer = text_normalizer
806
+
807
+ def preprocess(self, text, language, gender=None):
808
+ text = text.strip()
809
+ text = self.text_cleaner.clean(text)
810
+ # text = self.text_normalizer.insert_space(text)
811
+ text = self.text_normalizer.num2text(text, language)
812
+ text = self.text_normalizer.normalize(text, language)
813
+ phrasified_text = TextPhrasifier.phrasify(text)
814
+ phonified_text = phrasified_text # No phonification for character TTS models
815
+ return phonified_text, phrasified_text
816
+
817
+ class CharTextPreprocessor_VTT:
818
+ def __init__(self,
819
+ text_cleaner = TextCleaner(),
820
+ text_normalizer=TextNormalizer()
821
+ ):
822
+ self.text_cleaner = text_cleaner
823
+ self.text_normalizer = text_normalizer
824
+
825
+ def preprocess(self, text, language, gender=None):
826
+ # text = text.strip()
827
+ text = self.text_cleaner.clean_list(text)
828
+ # text = self.text_normalizer.insert_space_list(text)
829
+ text = self.text_normalizer.num2text_list(text, language)
830
+ text = self.text_normalizer.normalize_list(text, language)
831
+ phrasified_text = TextPhrasifier_List.phrasify(text)
832
+ phonified_text = phrasified_text # No phonification for character TTS models
833
+ return phonified_text, phrasified_text
834
+
835
+
836
+ class TTSPreprocessor:
837
+ def __init__(self,
838
+ text_cleaner = TextCleaner(),
839
+ text_normalizer=TextNormalizer(),
840
+ phonifier = Phonifier(),
841
+ text_phrasefier = TextPhrasifier(),
842
+ post_processor = DurAlignTextProcessor()):
843
+ self.text_cleaner = text_cleaner
844
+ self.text_normalizer = text_normalizer
845
+ self.phonifier = phonifier
846
+ self.text_phrasefier = text_phrasefier
847
+ self.post_processor = post_processor
848
+
849
+ def preprocess(self, text, language, gender):
850
+ text = text.strip()
851
+ text = self.text_cleaner.clean(text)
852
+ # text = self.text_normalizer.insert_space(text)
853
+ text = self.text_normalizer.num2text(text, language)
854
+ text = self.text_normalizer.normalize(text, language)
855
+ phrasified_text = TextPhrasifier.phrasify(text)
856
+ phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
857
+ print(phonified_text)
858
+ phonified_text = self.post_processor.textProcesorForEnglish(phonified_text)
859
+ print(phonified_text)
860
+ return phonified_text, phrasified_text
861
+
862
+ class TTSPreprocessor_VTT:
863
+ def __init__(self,
864
+ text_cleaner = TextCleaner(),
865
+ text_normalizer=TextNormalizer(),
866
+ phonifier = Phonifier(),
867
+ text_phrasefier = TextPhrasifier_List()):
868
+ self.text_cleaner = text_cleaner
869
+ self.text_normalizer = text_normalizer
870
+ self.phonifier = phonifier
871
+ self.text_phrasefier = text_phrasefier
872
+
873
+ def preprocess(self, text, language, gender):
874
+ # print(f"Original text: {text}")
875
+ text = self.text_cleaner.clean_list(text)
876
+ # print(f"After text cleaner: {text}")
877
+ # text = self.text_normalizer.insert_space_list(text)
878
+ # print(f"After insert space: {text}")
879
+ text = self.text_normalizer.num2text_list(text, language)
880
+ # print(f"After num2text: {text}")
881
+ text = self.text_normalizer.normalize_list(text, language)
882
+ # print(f"After text normalizer: {text}")
883
+ phrasified_text = TextPhrasifier_List.phrasify(text)
884
+ # print(f"phrasified_text: {phrasified_text}")
885
+ phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
886
+ # print(f"phonified_text: {phonified_text}")
887
+ return phonified_text, phrasified_text