Upload folder using huggingface_hub
Browse files
- __pycache__/NumberToText.cpython-310.pyc +0 -0
- __pycache__/text_preprocess_for_inference.cpython-310.pyc +0 -0
- __pycache__/utilities.cpython-310.pyc +0 -0
- hifigan/__pycache__/__init__.cpython-310.pyc +0 -0
- hifigan/__pycache__/env.cpython-310.pyc +0 -0
- hifigan/__pycache__/meldataset.cpython-310.pyc +0 -0
- hifigan/__pycache__/models.cpython-310.pyc +0 -0
- hifigan/__pycache__/utils.cpython-310.pyc +0 -0
- hindi_latest/female/model/config.yaml +3 -3
- hindi_latest/male/model/model.pth +3 -0
- main_ov.py +346 -0
- server.py +272 -0
- start_server.bat +6 -0
- test_tts.py +36 -0
- text_preprocess_for_inference.py +971 -949
- tmp/non_dict_words_1776242609.109296 +0 -0
- tmp/non_dict_words_1776242740.8554978 +0 -0
- utilities.py +75 -0
__pycache__/NumberToText.cpython-310.pyc
ADDED
|
Binary file (3.04 kB). View file
|
|
|
__pycache__/text_preprocess_for_inference.cpython-310.pyc
ADDED
|
Binary file (22.3 kB). View file
|
|
|
__pycache__/utilities.cpython-310.pyc
ADDED
|
Binary file (8.17 kB). View file
|
|
|
hifigan/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (199 Bytes). View file
|
|
|
hifigan/__pycache__/env.cpython-310.pyc
ADDED
|
Binary file (864 Bytes). View file
|
|
|
hifigan/__pycache__/meldataset.cpython-310.pyc
ADDED
|
Binary file (5.43 kB). View file
|
|
|
hifigan/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (8.71 kB). View file
|
|
|
hifigan/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (1.99 kB). View file
|
|
|
hindi_latest/female/model/config.yaml
CHANGED
|
@@ -51,7 +51,7 @@ energy_extract_conf:
|
|
| 51 |
win_length: 4096
|
| 52 |
energy_normalize: global_mvn
|
| 53 |
energy_normalize_conf:
|
| 54 |
-
stats_file:
|
| 55 |
exclude_weight_decay: false
|
| 56 |
exclude_weight_decay_conf: {}
|
| 57 |
feats_extract: fbank
|
|
@@ -90,7 +90,7 @@ no_forward_run: false
|
|
| 90 |
non_linguistic_symbols: null
|
| 91 |
normalize: global_mvn
|
| 92 |
normalize_conf:
|
| 93 |
-
stats_file:
|
| 94 |
num_att_plot: 3
|
| 95 |
num_cache_chunks: 1024
|
| 96 |
num_iters_per_epoch: 800
|
|
@@ -111,7 +111,7 @@ pitch_extract_conf:
|
|
| 111 |
reduction_factor: 1
|
| 112 |
pitch_normalize: global_mvn
|
| 113 |
pitch_normalize_conf:
|
| 114 |
-
stats_file:
|
| 115 |
pretrain_path: null
|
| 116 |
print_config: false
|
| 117 |
required:
|
|
|
|
| 51 |
win_length: 4096
|
| 52 |
energy_normalize: global_mvn
|
| 53 |
energy_normalize_conf:
|
| 54 |
+
stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/energy_stats.npz
|
| 55 |
exclude_weight_decay: false
|
| 56 |
exclude_weight_decay_conf: {}
|
| 57 |
feats_extract: fbank
|
|
|
|
| 90 |
non_linguistic_symbols: null
|
| 91 |
normalize: global_mvn
|
| 92 |
normalize_conf:
|
| 93 |
+
stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/feats_stats.npz
|
| 94 |
num_att_plot: 3
|
| 95 |
num_cache_chunks: 1024
|
| 96 |
num_iters_per_epoch: 800
|
|
|
|
| 111 |
reduction_factor: 1
|
| 112 |
pitch_normalize: global_mvn
|
| 113 |
pitch_normalize_conf:
|
| 114 |
+
stats_file: C:\Users\PEGPVINUser01\Desktop\Realtime-Translation-AIPC\backend\TTS\FastSpeech2_HS_HF\hindi_latest\female\model/pitch_stats.npz
|
| 115 |
pretrain_path: null
|
| 116 |
print_config: false
|
| 117 |
required:
|
hindi_latest/male/model/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81790566fd4660ca46c4692f6fbdb30d8e5c9cd657084a0eb40c804e6ec2b9ab
|
| 3 |
+
size 152128410
|
main_ov.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from text_preprocess_for_inference import TTSDurAlignPreprocessor, CharTextPreprocessor, TTSPreprocessor
|
| 2 |
+
from espnet2.bin.tts_inference import Text2Speech
|
| 3 |
+
from scipy.io.wavfile import write
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
import yaml
|
| 7 |
+
import sys
|
| 8 |
+
from utilities import SAMPLING_RATE, WARMUP_PARAGRAPHS
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
import numpy as np
|
| 13 |
+
import openvino as ov
|
| 14 |
+
|
| 15 |
+
sys.path.append(os.getenv("HIFIGAN_PATH", f"hifigan"))
|
| 16 |
+
from hifigan.env import AttrDict
|
| 17 |
+
from hifigan.models import Generator
|
| 18 |
+
from hifigan.meldataset import MAX_WAV_VALUE
|
| 19 |
+
import torch.nn.functional as F
|
| 20 |
+
|
| 21 |
+
import nltk
|
| 22 |
+
nltk.download('averaged_perceptron_tagger_eng')
|
| 23 |
+
|
| 24 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 25 |
+
MAX_DEFAULT_VALUE = 600
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_hifigan_vocoder(language: str, gender: str, device: str, dtype: str = "float32"):
    """
    Build the HiFi-GAN generator for one language/gender pair.

    Reads ``vocoder/{gender}/{language}/config.json`` and the ``generator``
    checkpoint next to it (both relative to the current working directory),
    seeds torch from the config, loads the weights, switches the network to
    eval mode and strips weight norm. When ``dtype`` is ``"bfloat16"`` the
    generator is cast to ``torch.bfloat16``.

    Raises:
        FileNotFoundError: if the config or the checkpoint file is missing.
    """
    vocoder_dir = f"vocoder/{gender}/{language}"
    vocoder_config = f"{vocoder_dir}/config.json"
    vocoder_generator = f"{vocoder_dir}/generator"

    if not (os.path.exists(vocoder_config) and os.path.exists(vocoder_generator)):
        raise FileNotFoundError(
            f"Vocoder files not found. Expected config: {vocoder_config}, generator: {vocoder_generator}")

    with open(vocoder_config, 'r') as config_file:
        hparams = AttrDict(json.loads(config_file.read()))

    torch.manual_seed(hparams.seed)
    target = torch.device(device)

    generator = Generator(hparams).to(target)
    checkpoint = torch.load(vocoder_generator, map_location=target)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()

    return generator.to(torch.bfloat16) if dtype == "bfloat16" else generator
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def load_fastspeech2_model(language: str, gender: str, device: str, dtype: str = "float32"):
    """
    Loads the FastSpeech2 model with its stats files resolved to absolute paths.

    The training config at ``{language}/{gender}/model/config.yaml`` is read,
    its three ``stats_file`` entries are pointed at the ``.npz`` files in the
    same model directory (resolved against the current working directory),
    and the patched config is handed to ``Text2Speech`` through a temporary
    file. The config on disk is never modified, so machine-specific absolute
    paths do not leak into the tracked file.

    Args:
        language: Top-level model directory name (e.g. ``"hindi_latest"``).
        gender: Speaker sub-directory (``"male"`` / ``"female"``).
        device: Device string forwarded to ``Text2Speech``.
        dtype: ``"bfloat16"`` casts the acoustic model to ``torch.bfloat16``;
            any other value leaves the model as loaded.

    Returns:
        The ``Text2Speech`` instance with its vocoder disabled.

    Raises:
        FileNotFoundError: if the config or the model checkpoint is missing.
    """
    import tempfile  # local import: only this function needs it

    config_path = f"{language}/{gender}/model/config.yaml"
    tts_model_path = f"{language}/{gender}/model/model.pth"

    if not os.path.exists(config_path) or not os.path.exists(tts_model_path):
        raise FileNotFoundError(
            f"FastSpeech2 model files not found. Expected config: {config_path}, model: {tts_model_path}")

    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    model_dir = os.path.join(os.getcwd(), language, gender, "model")
    config["normalize_conf"]["stats_file"] = os.path.join(model_dir, "feats_stats.npz")
    config["pitch_normalize_conf"]["stats_file"] = os.path.join(model_dir, "pitch_stats.npz")
    config["energy_normalize_conf"]["stats_file"] = os.path.join(model_dir, "energy_stats.npz")

    # delete=False so the file can be reopened by name (required on Windows);
    # it is removed explicitly once the model has been constructed.
    # NOTE(review): assumes Text2Speech only needs the config while it is
    # being constructed -- confirm against the installed ESPnet version.
    tmp_config = tempfile.NamedTemporaryFile(
        mode="w", suffix=".yaml", delete=False, encoding="utf-8")
    try:
        with tmp_config:
            yaml.dump(config, tmp_config)
        model = Text2Speech(train_config=tmp_config.name, model_file=tts_model_path,
                            device=device, vocoder_config=None, vocoder_file=None)
    finally:
        os.remove(tmp_config.name)

    # Only the mel-spectrogram output is wanted here.
    model.vocoder = None

    if dtype == "bfloat16":
        model.model = model.model.to(torch.bfloat16)

    return model
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def split_into_chunks(text: str, words_per_chunk: int = 100):
    """Return ``text`` as a list of strings of at most ``words_per_chunk`` whitespace-separated words each."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), words_per_chunk):
        pieces.append(' '.join(tokens[start:start + words_per_chunk]))
    return pieces
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class Text2SpeechApp:
    """
    FastSpeech2 + HiFi-GAN text-to-speech pipeline for a single language.

    On construction the male and female models are loaded where available
    (the vocoder is converted to an OpenVINO model compiled for CPU) and a
    warmup pass is run. Genders whose models fail to load are skipped and are
    absent from ``supported_genders``.
    """

    # Product of the factors "[8,8,8,2]" noted in the original code;
    # presumably the vocoder's upsample rates, i.e. audio samples produced per
    # mel frame -- TODO confirm against the vocoder config.
    _SAMPLES_PER_FRAME = 8 * 8 * 8 * 2

    def __init__(self, language: str, batch_size: int = 1, alpha: float = 1, dtype: str = "bfloat16"):
        """
        Load models for ``language`` and warm them up.

        Args:
            language: Language name; models are looked up under
                ``{language}_latest``.
            batch_size: Stored on the instance; not used by the methods here.
            alpha: Passed to FastSpeech2 as ``decode_conf["alpha"]``.
            dtype: Forwarded to the model loaders (``"bfloat16"`` casts).
        """
        self.alpha = alpha
        self.lang = language
        self.batch_size = batch_size
        self.dtype = dtype
        self.vocoder_model = {}
        self.fastspeech2_model = {}
        self.supported_genders = []

        self.preprocessor = TTSDurAlignPreprocessor()

        genders = ["male", "female"]
        for gender in genders:
            try:
                self.vocoder_model[gender] = load_hifigan_vocoder(
                    f"{language}_latest", gender, device, self.dtype)
                with torch.no_grad():
                    self.vocoder_model[gender] = ov.convert_model(
                        self.vocoder_model[gender],
                        example_input=torch.ones([1, 160, MAX_DEFAULT_VALUE]))

                self.vocoder_model[gender] = ov.compile_model(
                    self.vocoder_model[gender], device_name="CPU")
                print(
                    f"Loaded HiFi-GAN vocoder for {language}-{gender}")

                self.fastspeech2_model[gender] = load_fastspeech2_model(
                    f"{language}_latest", gender, device, self.dtype)

                print(
                    f"Loaded FastSpeech2 model for {language}-{gender}")
                self.supported_genders.append(gender)
            except FileNotFoundError as e:
                print(
                    f"Error loading model for {language}-{gender}: {e}. This model key will not be available.")
            except Exception as e:
                # Best effort: one broken gender must not prevent the other
                # from being served.
                print(
                    f"An unexpected error occurred while loading model for {language}-{gender}: {e}. This model key will not be available.")
        self.warmup()

    def pre_print(self, print_str: str):
        """Print ``print_str`` framed by two separator lines."""
        print("=================================================")
        print(print_str)
        print("=================================================")

    def warmup(self):
        """
        Synthesize the language's warmup paragraph twice per loaded gender.

        Output files go to ``./warmup_outputs``. Failures are reported and
        swallowed so that a warmup problem never blocks start-up.
        """
        self.pre_print("TTS Warming up!")

        lang = self.lang.lower()
        text = WARMUP_PARAGRAPHS.get(lang)

        if not text:
            print(f"No warmup paragraph available for language: {lang}")
            return

        # Ensure warmup output directory exists
        output_dir = "./warmup_outputs"
        os.makedirs(output_dir, exist_ok=True)

        print(f"Running warmup for language: {lang}")
        print(f"Warmup text length: {len(text.split())} words")

        total_start_time = time.time()

        for gender in ["male", "female"]:
            if gender not in self.fastspeech2_model:
                print(f"Skipping warmup for {gender} - model not loaded.")
                continue

            print(f"Starting warmup for {lang}-{gender}")
            try:
                gender_start_time = time.time()
                for i in range(2):  # Run twice; adjust as needed
                    print(f"Warmup iteration {i + 1} for {gender}")
                    time_taken, _ = self.convert_and_save(
                        text=text,
                        speaker_gender=gender,
                        output_file_dir=output_dir
                    )
                    print(f"Iteration {i + 1} for {gender} completed in {time_taken:.2f} seconds")
                gender_total_time = time.time() - gender_start_time
                print(f"Total warmup time for {gender}: {gender_total_time:.2f} seconds")
            except Exception as e:
                print(f"Warmup failed for {lang}-{gender}: {e}")

        total_time = time.time() - total_start_time
        print(f"Total TTS warmup completed in {total_time:.2f} seconds")
        self.pre_print("TTS Warming finished!")

    @staticmethod
    def _to_int16(audio):
        """Convert samples to int16, clipping to the int16 range so peaks saturate instead of wrapping."""
        if hasattr(audio, "detach"):
            # torch tensor: bfloat16 has no numpy equivalent, so widen first.
            audio = audio.detach().to(torch.float32).cpu().numpy()
        return np.clip(np.asarray(audio), -32768, 32767).astype('int16')

    def _synthesize_mel(self, text: str, speaker_gender: str):
        """Preprocess ``text`` and run FastSpeech2; returns the scaled mel tensor with a leading batch axis."""
        preprocessed_text, _ = self.preprocessor.preprocess(
            text, self.lang, speaker_gender)
        preprocessed_text = " ".join(preprocessed_text)

        with torch.no_grad():
            out = self.fastspeech2_model[speaker_gender](
                preprocessed_text, decode_conf={"alpha": self.alpha})
            # NOTE(review): 2.3262 is an unexplained scale factor carried over
            # unchanged from the original code.
            return out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262

    def _vocode(self, mel, speaker_gender: str):
        """Run the compiled vocoder on ``mel`` and return its first output with the two leading axes dropped."""
        result = self.vocoder_model[speaker_gender](mel)
        # The compiled model returns a container of outputs, not a tensor;
        # index it the same way for every caller.
        return result[0][0][0]

    def save_to_file(self, audio_arr, file_path):
        """Write ``audio_arr`` as a WAV file at ``SAMPLING_RATE``."""
        write(file_path, SAMPLING_RATE, audio_arr)
        print(f"Audio saved to {file_path}")

    def convert_and_save(self, text: str, speaker_gender="male", output_file_dir: str = "./outputs"):
        """
        Synthesize ``text`` in 100-word chunks and save one concatenated WAV.

        Returns:
            ``(seconds_taken, output_file_path)``.

        Raises:
            ValueError: if ``text`` contains no words.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        output_file = f"{output_file_dir}/{self.lang}_{speaker_gender}_{timestamp}.wav"
        os.makedirs(output_file_dir, exist_ok=True)

        start = time.time()
        audio_arr = []
        for chunk_text in split_into_chunks(text):
            mel = self._synthesize_mel(chunk_text, speaker_gender)
            # Convert mel-spectrograms to raw audio waveforms
            waveform = self._vocode(mel, speaker_gender) * MAX_WAV_VALUE
            audio_arr.append(self._to_int16(waveform))

        if not audio_arr:
            raise ValueError("text contains no words to synthesize")

        result_array = np.concatenate(audio_arr, axis=0)
        self.save_to_file(audio_arr=result_array, file_path=output_file)
        time_taken = time.time() - start

        return time_taken, output_file

    def generate_audio_bytes(self, text: str, speaker_gender="male", save_file: bool = False):
        """
        Synthesize ``text`` and return the waveform scaled by ``MAX_WAV_VALUE``.

        The mel-spectrogram is padded (with -12) to ``MAX_DEFAULT_VALUE``
        frames before vocoding and the padding is trimmed from the audio
        afterwards. Input longer than ``MAX_DEFAULT_VALUE`` frames is cut to
        that length, and a warning is printed when that happens.

        ``save_file`` is accepted for interface compatibility and is unused.
        """
        mel = self._synthesize_mel(text, speaker_gender)

        n_frames = mel.shape[-1]
        if n_frames > MAX_DEFAULT_VALUE:
            print(
                f"Warning: input produced {n_frames} mel frames; "
                f"truncating to {MAX_DEFAULT_VALUE}.")
            mel = mel[..., :MAX_DEFAULT_VALUE]
            n_frames = MAX_DEFAULT_VALUE
        else:
            mel = F.pad(mel, (0, MAX_DEFAULT_VALUE - n_frames), value=-12)

        # Convert mel-spectrograms to raw audio waveforms and drop the samples
        # that correspond to padding frames.
        trim_length = int(self._SAMPLES_PER_FRAME * n_frames)
        audio = self._vocode(mel, speaker_gender)[:trim_length]

        return audio * MAX_WAV_VALUE

    def evaluate_performance(self, input_sentences: list, save_file: bool = False):
        """
        Time ``generate_audio_bytes`` on each sentence (male voice).

        When ``save_file`` is true each result is also written under
        ``audios_{dtype}/`` as ``.npy`` and ``.wav``.

        Returns:
            Sum of the per-sentence synthesis times in seconds (file writing
            excluded); ``0.0`` for an empty list.
        """
        total_sentences = len(input_sentences)
        print(f"\nTotal T2S to be done: {total_sentences}\n")

        total_time = 0.0
        for i, sentence in enumerate(input_sentences):
            start_time = time.perf_counter()
            audio = self.generate_audio_bytes(text=sentence)
            time_taken = time.perf_counter() - start_time
            total_time += time_taken
            print("=================================================")
            print(f"Sentence {i + 1}/{total_sentences}:{sentence} processed in {time_taken:.2f} seconds")

            if save_file:
                os.makedirs(f"audios_{self.dtype}/numpy_files", exist_ok=True)
                os.makedirs(f"audios_{self.dtype}/audio_files", exist_ok=True)

                output_file = f"audios_{self.dtype}/numpy_files/file_{i}.npy"
                audio = self._to_int16(audio)
                np.save(output_file, audio)

                audio_file_path = f"audios_{self.dtype}/audio_files/file_{i}.wav"
                with open(audio_file_path, "wb") as f:
                    write(f, SAMPLING_RATE, audio)
                print(f"Audio saved to {audio_file_path}")

        return total_time

    def save_to_files(self, byte_ios, file_prefix: str) -> list[str]:
        """Write each readable object in ``byte_ios`` to ``{file_prefix}_{timestamp}_{n}.wav`` and return the paths."""
        file_paths = []
        for index, byte_io in enumerate(byte_ios, start=1):
            timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            file_path = f"{file_prefix}_{timestamp}_{index}.wav"
            file_paths.append(file_path)
            with open(file_path, "wb") as f:
                f.write(byte_io.read())
            print(f"Audio saved to {file_path}")
        return file_paths

    def batch_convert_and_save(self, input_sentences: list[str], speaker_gender="male", output_file_dir: str = "./outputs"):
        """
        Synthesize all sentences as one paragraph and save a single WAV.

        Returns:
            ``(seconds_taken, [output_file_path])``.
        """
        start_time = time.time()
        output_file_paths = []
        total_sentences = len(input_sentences)
        os.makedirs(output_file_dir, exist_ok=True)

        print(f"Total T2S to be done: {total_sentences}\n")
        # Join with a space so the last word of one sentence is not fused with
        # the first word of the next.
        combined_para = ' '.join(input_sentences)
        paragraph_time, output_path = self.convert_and_save(
            combined_para, speaker_gender=speaker_gender, output_file_dir=output_file_dir)
        print(f"Paragraph Time: {paragraph_time}\n")
        output_file_paths.append(output_path)

        time_taken = time.time() - start_time
        return time_taken, output_file_paths
|
| 320 |
+
|
| 321 |
+
if __name__ == "__main__":
    # Benchmark entry point: load the models for one language and time the
    # synthesis of a few fixed Hindi sentences, saving the results to disk.
    import argparse

    parser = argparse.ArgumentParser(description="Text to Speech benchmarking")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size for TTS inference")
    parser.add_argument("--language", type=str, default="hindi", help="Language for TTS")
    parser.add_argument("--alpha", type=float, default=1.0, help="Alpha value for FastSpeech2 decoding")
    parser.add_argument("--dtype", type=str, default="float32", help="Data type for model inference")
    args = parser.parse_args()

    # Honor every command-line option; the argument defaults equal the values
    # that used to be hard-coded here, so a bare invocation behaves as before.
    tts = Text2SpeechApp(batch_size=args.batch_size, alpha=args.alpha,
                         language=args.language, dtype=args.dtype)
    st = time.perf_counter()
    texts = [
        "जीवन में सफलता पाने के लिए केवल सपने देखना ही नहीं, बल्कि उन्हें पूरा करने के लिए निरंतर प्रयास और आत्मविश्वास भी ज़रूरी होता है।",
        "कठिन परिस्थितियाँ हमें तोड़ने नहीं आतीं, बल्कि हमें मज़बूत बनाकर जीवन के असली अर्थ से परिचित कराती हैं।",
        "सकारात्मक सोच और सही दृष्टिकोण के साथ किया गया हर छोटा प्रयास भी एक दिन बड़ी उपलब्धि में बदल जाता है।",
        "जब हम निस्वार्थ भाव से दूसरों की मदद करते हैं, तब हमारे अपने जीवन में भी शांति और संतुलन अपने आप आ जाता है।"
    ]

    tts.evaluate_performance(texts, save_file=True)
    et = time.perf_counter()
    print(f"Total time for evaluating {len(texts)} sentences: {et - st:.2f} seconds")
    print(f"Average time per sentence: {(et - st)/len(texts):.2f} seconds")
|
server.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import base64
|
| 3 |
+
import io
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
from fastapi import FastAPI, HTTPException
|
| 10 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
from scipy.io.wavfile import write as wav_write
|
| 13 |
+
|
| 14 |
+
from main_ov import Text2SpeechApp
|
| 15 |
+
from utilities import SAMPLING_RATE, SUPPORTED_OUTPUT_LANGS
|
| 16 |
+
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# --- Language code mapping (Bhashini 2-letter <-> full name) ---
|
| 21 |
+
|
| 22 |
+
# Two-letter language code -> full language name, in declaration order.
LANG_CODE_TO_NAME = dict(
    hi="hindi",
    ta="tamil",
    te="telugu",
    kn="kannada",
    ml="malayalam",
    pa="punjabi",
    bn="bengali",
)
# Reverse lookup: full language name -> two-letter code.
LANG_NAME_TO_CODE = dict((name, code) for code, name in LANG_CODE_TO_NAME.items())
|
| 32 |
+
|
| 33 |
+
# --- Pydantic models for Bhashini pipeline request/response ---
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class LanguageConfig(BaseModel):
    # Language selection of a pipeline task; also echoed back in the response.
    # sourceLanguage is the code the pipeline endpoint resolves through
    # LANG_CODE_TO_NAME.
    sourceLanguage: str
    # Optional; the pipeline endpoint sets it to "" in its response.
    sourceScriptCode: str | None = None
    # Optional; not read by the code in this file.
    targetLanguage: str | None = None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class TaskConfig(BaseModel):
    # Per-task settings of a pipeline request.
    language: LanguageConfig
    # Optional; not read by the code in this file.
    serviceId: str | None = None
    # Speaker gender; lower-cased and checked against the loaded genders.
    gender: str = "female"
    # Sampling rate requested for the returned audio.
    samplingRate: int = 48000
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class PipelineTask(BaseModel):
    # One task of a pipeline request.
    # The pipeline endpoint rejects any taskType other than "tts".
    taskType: str
    config: TaskConfig
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class InputItem(BaseModel):
    # One input of a pipeline request.
    # source is the text handed to the synthesizer.
    source: str
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class InputData(BaseModel):
    # Input payload of a pipeline request.
    # Optional in the schema, but the pipeline endpoint answers 400 when it is
    # missing or empty.
    input: list[InputItem] | None = None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class PipelineRequest(BaseModel):
    # Request body of POST /services/inference/pipeline.
    # Only the first entry of pipelineTasks is processed by the endpoint.
    pipelineTasks: list[PipelineTask]
    inputData: InputData
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class AudioItem(BaseModel):
    # One synthesized audio result.
    # audioContent carries the base64-encoded WAV produced by _synthesize.
    audioContent: str | None = None
    # Always set to None by the pipeline endpoint.
    audioUri: str | None = None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ResponseConfig(BaseModel):
    # Describes the audio returned by the pipeline endpoint.
    audioFormat: str = "wav"
    language: LanguageConfig
    encoding: str = "base64"
    # The pipeline endpoint fills this with the requested sampling rate.
    samplingRate: int = 48000
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class PipelineResponseItem(BaseModel):
    # Result of one pipeline task.
    taskType: str
    config: ResponseConfig
    # Set to None by the pipeline endpoint.
    output: list | None = None
    # One AudioItem per input text, in input order.
    audio: list[AudioItem] | None = None
    # The endpoint fills in latency_ms, audio_duration_s and rtf.
    metrics: dict | None = None
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class PipelineResponse(BaseModel):
    # Response body of POST /services/inference/pipeline.
    pipelineResponse: list[PipelineResponseItem]
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class SimpleTtsRequest(BaseModel):
    # Request body of POST /tts.
    # Text to synthesize; the endpoint strips it and rejects an empty result.
    text: str
    # Language code resolved through LANG_CODE_TO_NAME.
    language: str = "hi"
    gender: str = "female"
    # Sampling rate requested for the returned audio.
    samplingRate: int = 48000
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# --- App setup ---
|
| 99 |
+
|
| 100 |
+
app = FastAPI(title="FastSpeech2 TTS API (Bhashini-compatible)")
# NOTE(review): wildcard origins are combined with allow_credentials=True.
# Confirm this is intended; listing the expected origins explicitly is the
# safer configuration if credentials are really needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Dict of language_name -> Text2SpeechApp instance
# Filled by the startup hook; read by the request handlers.
tts_engines: dict[str, Text2SpeechApp] = {}
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@app.on_event("startup")
def load_models():
    """Populate ``tts_engines`` with one engine per known language in ``SUPPORTED_OUTPUT_LANGS``.

    Unknown names are skipped with a warning; a language whose engine fails to
    build is logged and left out, so start-up always completes.
    """
    logger.info(f"SUPPORTED_OUTPUT_LANGS: {SUPPORTED_OUTPUT_LANGS}")
    logger.info(f"LANG_CODE_TO_NAME: {LANG_CODE_TO_NAME}")

    for raw_name in SUPPORTED_OUTPUT_LANGS:
        lang_name = raw_name.strip().lower()

        if lang_name in LANG_NAME_TO_CODE:
            logger.info(f"Loading TTS models for '{lang_name}'...")
            try:
                tts_engines[lang_name] = Text2SpeechApp(language=lang_name, dtype=os.getenv("TTS_DTYPE", "float32"))
                logger.info(f"✓ Successfully loaded '{lang_name}' with genders: {tts_engines[lang_name].supported_genders}")
            except Exception as e:
                logger.error(f"✗ Failed to load models for '{lang_name}': {str(e)}")
                logger.exception(f"Exception details for '{lang_name}':")
        else:
            logger.warning(f"Unknown language '{lang_name}' in LANGUAGES env var, skipping.")

    logger.info(f"Final loaded languages: {list(tts_engines.keys())}")
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _synthesize(tts_app: Text2SpeechApp, text: str, gender: str, requested_sr: int) -> tuple[str, float]:
    """Run TTS inference and return base64-encoded WAV string and audio duration in seconds.

    Args:
        tts_app: Engine used for synthesis.
        text: Text to synthesize.
        gender: Speaker gender forwarded to the engine.
        requested_sr: Sampling rate of the returned WAV; the audio is
            resampled with librosa when it differs from ``SAMPLING_RATE``.

    Returns:
        ``(base64_wav, duration_seconds)``.
    """
    int16_min, int16_max = -32768, 32767

    audio_tensor = tts_app.generate_audio_bytes(text=text, speaker_gender=gender)

    # Convert to int16 numpy. Clip first: a plain cast wraps values outside
    # the int16 range around, turning loud peaks into full-scale clicks.
    if hasattr(audio_tensor, "numpy"):
        samples = audio_tensor.numpy()
    else:
        samples = np.asarray(audio_tensor)
    audio_np = np.clip(samples, int16_min, int16_max).astype(np.int16)

    # Resample if requested rate differs from native rate
    output_sr = SAMPLING_RATE
    if requested_sr != SAMPLING_RATE:
        import librosa
        audio_float = audio_np.astype(np.float32) / 32768.0
        audio_float = librosa.resample(audio_float, orig_sr=SAMPLING_RATE, target_sr=requested_sr)
        # Resampling can overshoot full scale, so clip again before the cast.
        audio_np = np.clip(audio_float * 32768.0, int16_min, int16_max).astype(np.int16)
        output_sr = requested_sr

    # Write WAV to in-memory buffer
    buf = io.BytesIO()
    wav_write(buf, output_sr, audio_np)
    wav_bytes = buf.getvalue()
    audio_duration_s = float(len(audio_np) / output_sr) if output_sr > 0 else 0.0

    return base64.b64encode(wav_bytes).decode("ascii"), audio_duration_s
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _resolve_tts_engine(lang_code: str, gender: str) -> tuple[str, Text2SpeechApp, str]:
    """Map a language code and gender to ``(language_name, engine, gender)``.

    Both inputs are lower-cased first. Raises ``HTTPException`` (400) when the
    code is unknown, the language has no loaded engine, or the engine does not
    offer the requested gender.
    """
    normalized_code = lang_code.lower()
    lang_name = LANG_CODE_TO_NAME.get(normalized_code)
    if not lang_name:
        raise HTTPException(status_code=400, detail=f"Unsupported language code: '{normalized_code}'")

    if lang_name not in tts_engines:
        raise HTTPException(status_code=400, detail=f"Language '{lang_name}' not loaded. Available: {list(tts_engines.keys())}")
    tts_app = tts_engines[lang_name]

    resolved_gender = gender.lower()
    if resolved_gender in tts_app.supported_genders:
        return lang_name, tts_app, resolved_gender

    raise HTTPException(
        status_code=400,
        detail=f"Gender '{resolved_gender}' not available for '{lang_name}'. Available: {tts_app.supported_genders}"
    )
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@app.post("/services/inference/pipeline", response_model=PipelineResponse)
async def inference_pipeline(request: PipelineRequest):
    """Synthesize every input text for the first pipeline task.

    Only a ``tts`` task is accepted. Each input is synthesized in a worker
    thread, and the response carries one base64 WAV per input plus latency,
    total audio duration and real-time factor.
    """
    t_start = time.perf_counter()

    tasks = request.pipelineTasks
    if not tasks:
        raise HTTPException(status_code=400, detail="pipelineTasks is empty")

    task = tasks[0]
    if task.taskType != "tts":
        raise HTTPException(status_code=400, detail=f"Unsupported taskType: '{task.taskType}'. Only 'tts' is supported.")

    # Resolve language
    task_config = task.config
    lang_code = task_config.language.sourceLanguage
    _, tts_app, gender = _resolve_tts_engine(lang_code, task_config.gender)

    requested_sr = task_config.samplingRate

    # Validate input
    inputs = request.inputData.input
    if not inputs:
        raise HTTPException(status_code=400, detail="inputData.input is empty")

    # Process all input texts and collect audio
    audio_items = []
    total_audio_duration_s = 0.0
    for item in inputs:
        b64_audio, audio_duration_s = await asyncio.to_thread(_synthesize, tts_app, item.source, gender, requested_sr)
        total_audio_duration_s += audio_duration_s
        audio_items.append(AudioItem(audioContent=b64_audio, audioUri=None))

    latency_ms = round((time.perf_counter() - t_start) * 1000, 2)
    if total_audio_duration_s > 0:
        rtf = round((latency_ms / 1000) / total_audio_duration_s, 4)
    else:
        rtf = 0.0

    response_config = ResponseConfig(
        audioFormat="wav",
        language=LanguageConfig(sourceLanguage=lang_code, sourceScriptCode=""),
        encoding="base64",
        samplingRate=requested_sr,
    )
    metrics = {
        "latency_ms": latency_ms,
        "audio_duration_s": round(total_audio_duration_s, 3),
        "rtf": rtf,
    }
    result_item = PipelineResponseItem(
        taskType="tts",
        config=response_config,
        output=None,
        audio=audio_items,
        metrics=metrics,
    )
    return PipelineResponse(pipelineResponse=[result_item])
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
@app.post("/tts")
async def tts_compat(request: SimpleTtsRequest):
    """Compatibility endpoint for clients calling /tts on port 5000.

    Synthesizes the single text carried by the request and returns the
    base64-encoded WAV together with latency / duration / real-time-factor
    metrics. Responds with HTTP 400 when the text is empty after stripping.
    """
    text = request.text.strip()
    if len(text) == 0:
        raise HTTPException(status_code=400, detail="text is empty")

    _, engine, voice_gender = _resolve_tts_engine(request.language, request.gender)

    # Run the blocking synthesis in a worker thread so the event loop stays free.
    started = time.perf_counter()
    encoded_audio, duration_s = await asyncio.to_thread(
        _synthesize, engine, text, voice_gender, request.samplingRate
    )
    latency_ms = round((time.perf_counter() - started) * 1000, 2)

    # Real-time factor: processing time divided by produced audio length.
    if duration_s > 0:
        rtf = round((latency_ms / 1000) / duration_s, 4)
    else:
        rtf = 0.0

    metrics = {
        "latency_ms": latency_ms,
        "audio_duration_s": round(duration_s, 3),
        "rtf": rtf,
    }
    return {
        "audioContent": encoded_audio,
        "audioFormat": "wav",
        "encoding": "base64",
        "samplingRate": request.samplingRate,
        "metrics": metrics,
    }
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
@app.get("/health")
def health():
    """Report service liveness plus the loaded and available languages."""
    # Map every loaded language to the genders its engine reports.
    genders_by_language = {}
    for lang, engine in tts_engines.items():
        genders_by_language[lang] = engine.supported_genders

    available = list(LANG_CODE_TO_NAME.values())
    return {
        "status": "ok",
        "loadedLanguages": genders_by_language,
        "availableLanguages": available,
    }
|
start_server.bat
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
REM Launches the TTS HTTP API (server.py) under uvicorn on port 5000.
REM PYTHONUTF8=1 turns on Python's UTF-8 mode so text I/O defaults to UTF-8.
set PYTHONUTF8=1
REM NOTE(review): TTS_DTYPE and LANGUAGES are presumably read by server.py at startup - confirm.
set TTS_DTYPE=float32
set LANGUAGES=hindi,punjabi,tamil
REM Listen on every interface, port 5000.
uvicorn server:app --host 0.0.0.0 --port 5000
REM Keep the console window open after the server exits so its output stays readable.
pause
|
test_tts.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manual smoke test: request Hindi TTS from the pipeline endpoint and save the WAV."""
import base64
import os

import requests

# Pipeline endpoint. The default is unchanged; set TTS_PIPELINE_URL when the
# server listens elsewhere (start_server.bat, for instance, binds port 5000).
url = os.environ.get(
    "TTS_PIPELINE_URL",
    "http://localhost:8000/services/inference/pipeline",
)

# Upper bound, in seconds, on how long to wait for the server; without a
# timeout requests would block forever on an unresponsive host.
REQUEST_TIMEOUT_S = 120

OUTPUT_FILE = "test_output.wav"

payload = {
    "pipelineTasks": [
        {
            "taskType": "tts",
            "config": {
                "language": {"sourceLanguage": "hi"},
                "gender": "female",
                "samplingRate": 48000,
            },
        }
    ],
    "inputData": {
        "input": [
            {"source": "मेरा नाम कशा है"}
        ]
    },
}


def main():
    """Send the request, then write the decoded audio or print the error body."""
    response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT_S)
    print(f"Status: {response.status_code}")

    if response.status_code == 200:
        data = response.json()
        audio_b64 = data["pipelineResponse"][0]["audio"][0]["audioContent"]
        audio_bytes = base64.b64decode(audio_b64)
        with open(OUTPUT_FILE, "wb") as f:
            f.write(audio_bytes)
        print(f"Audio saved to {OUTPUT_FILE} ({len(audio_bytes)} bytes)")
    else:
        print(f"Error: {response.text}")


if __name__ == "__main__":
    main()
|
text_preprocess_for_inference.py
CHANGED
|
@@ -1,949 +1,971 @@
|
|
| 1 |
-
'''
|
| 2 |
-
TTS Preprocessing
|
| 3 |
-
Developed by Arun Kumar A(CS20S013) - November 2022
|
| 4 |
-
|
| 5 |
-
'''
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import
|
| 9 |
-
import
|
| 10 |
-
import
|
| 11 |
-
|
| 12 |
-
import
|
| 13 |
-
import
|
| 14 |
-
import
|
| 15 |
-
|
| 16 |
-
import
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
"
|
| 66 |
-
""
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
"
|
| 135 |
-
"
|
| 136 |
-
"
|
| 137 |
-
"
|
| 138 |
-
"
|
| 139 |
-
"
|
| 140 |
-
"
|
| 141 |
-
"
|
| 142 |
-
"
|
| 143 |
-
"
|
| 144 |
-
"
|
| 145 |
-
"
|
| 146 |
-
"
|
| 147 |
-
"
|
| 148 |
-
"
|
| 149 |
-
"
|
| 150 |
-
"
|
| 151 |
-
"
|
| 152 |
-
"
|
| 153 |
-
"
|
| 154 |
-
"
|
| 155 |
-
"
|
| 156 |
-
"
|
| 157 |
-
"
|
| 158 |
-
"
|
| 159 |
-
"
|
| 160 |
-
"
|
| 161 |
-
"
|
| 162 |
-
"
|
| 163 |
-
"
|
| 164 |
-
"
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"
|
| 172 |
-
"
|
| 173 |
-
"
|
| 174 |
-
"
|
| 175 |
-
"
|
| 176 |
-
"
|
| 177 |
-
"
|
| 178 |
-
"
|
| 179 |
-
"
|
| 180 |
-
"
|
| 181 |
-
"
|
| 182 |
-
"
|
| 183 |
-
"
|
| 184 |
-
"
|
| 185 |
-
"
|
| 186 |
-
"
|
| 187 |
-
"
|
| 188 |
-
"
|
| 189 |
-
"
|
| 190 |
-
"
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
-
"
|
| 197 |
-
"
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
-
"
|
| 205 |
-
"
|
| 206 |
-
"
|
| 207 |
-
"
|
| 208 |
-
"
|
| 209 |
-
"
|
| 210 |
-
"
|
| 211 |
-
"
|
| 212 |
-
"
|
| 213 |
-
"
|
| 214 |
-
"
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
"
|
| 224 |
-
"
|
| 225 |
-
"
|
| 226 |
-
"
|
| 227 |
-
"
|
| 228 |
-
"
|
| 229 |
-
"
|
| 230 |
-
"
|
| 231 |
-
"
|
| 232 |
-
"
|
| 233 |
-
"
|
| 234 |
-
"
|
| 235 |
-
"
|
| 236 |
-
"
|
| 237 |
-
"
|
| 238 |
-
"
|
| 239 |
-
"
|
| 240 |
-
"
|
| 241 |
-
"
|
| 242 |
-
"
|
| 243 |
-
"
|
| 244 |
-
"
|
| 245 |
-
"
|
| 246 |
-
"
|
| 247 |
-
"
|
| 248 |
-
"
|
| 249 |
-
"
|
| 250 |
-
"
|
| 251 |
-
"
|
| 252 |
-
"
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
phn_out[i]
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
output_string +=
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
return
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
#
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
if
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
"
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
"
|
| 560 |
-
"
|
| 561 |
-
"
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
self.
|
| 800 |
-
"
|
| 801 |
-
"
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
self.
|
| 841 |
-
self.
|
| 842 |
-
self.
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
#
|
| 851 |
-
text = self.text_normalizer.
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
#
|
| 855 |
-
|
| 856 |
-
#
|
| 857 |
-
|
| 858 |
-
#
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
text = self.
|
| 878 |
-
|
| 879 |
-
text = self.text_normalizer.
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
phonified_text = self.
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
text = self.
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
text =
|
| 938 |
-
|
| 939 |
-
# text = self.text_normalizer.
|
| 940 |
-
|
| 941 |
-
text = self.text_normalizer.
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
TTS Preprocessing
|
| 3 |
+
Developed by Arun Kumar A(CS20S013) - November 2022
|
| 4 |
+
Updated by Utkarsh Pathak (DA24S011) - September 2023
|
| 5 |
+
'''
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import sys
|
| 9 |
+
import json
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import string
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
import time
|
| 14 |
+
import subprocess
|
| 15 |
+
import shutil
|
| 16 |
+
from multiprocessing import Process
|
| 17 |
+
import traceback
|
| 18 |
+
|
| 19 |
+
#imports of dependencies from environment.yml
|
| 20 |
+
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__))))
|
| 21 |
+
|
| 22 |
+
from num_to_words import num_to_word
|
| 23 |
+
from g2p_en import G2p
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
from NumberToText import NumberDictionary
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def add_to_dictionary(dict_to_add, dict_file):
    """Append word/phone pairs to the dictionary file at ``dict_file``.

    Each entry is written as ``"<word> <phones>\\n"``. When the file already
    exists the new entries are appended to a hidden temporary copy, the copy
    is parsed back, and it replaces the original only if it parses and holds
    more rows than before. When the file does not exist it is created.

    Args:
        dict_to_add: mapping of word -> phone string to append.
        dict_file: path of the space-delimited dictionary file.
    """
    append_string = "".join(f"{key} {value}\n" for key, value in dict_to_add.items())

    if not os.path.isfile(dict_file):
        # create a new dictionary
        with open(dict_file, "a", encoding="utf-8") as f:
            f.write(append_string)
        print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
        return

    # Work on a hidden copy next to the original so a failed write can never
    # corrupt the live dictionary.
    source_dir = os.path.dirname(dict_file)
    temp_dict_file = os.path.join(source_dir, "." + os.path.basename(dict_file) + ".temp")
    try:
        shutil.copy(dict_file, temp_dict_file)

        # If the existing file does not end with a newline, the first appended
        # entry would be glued onto the last line and break parsing.
        needs_newline = False
        if os.path.getsize(temp_dict_file) > 0:
            with open(temp_dict_file, "rb") as f:
                f.seek(-1, os.SEEK_END)
                needs_newline = f.read(1) != b"\n"

        # Explicit UTF-8: entries contain Indic script and pandas reads UTF-8.
        with open(temp_dict_file, "a", encoding="utf-8") as f:
            if needs_newline:
                f.write("\n")
            f.write(append_string)

        # check if the write is successful and then replace the temp file as the dict file
        try:
            df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
            df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
            if len(df_temp) > len(df_orig):
                os.replace(temp_dict_file, dict_file)
                print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
        except Exception:
            # Best effort: report the problem and keep the original dictionary.
            print(traceback.format_exc())
    finally:
        # The temp copy still exists whenever the swap did not happen; remove
        # it so stale hidden files do not pile up beside the dictionaries.
        try:
            os.remove(temp_dict_file)
        except OSError:
            pass
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class TextCleaner:
    """Applies a fixed, ordered list of regex substitutions to raw text."""

    def __init__(self):
        # this is a static set of cleaning rules to be applied
        # Order matters: rules run top to bottom. Existing '#' characters are
        # removed first because '#' is then inserted as the break marker that
        # replaces sentence punctuation and line breaks.
        self.cleaning_rules = {
            " +" : " ",
            "^ +" : "",
            " +$" : "",
            "#" : "",
            "[.,;।!](\r\n)*" : "# ",
            "[.,;।!](\n)*" : "# ",
            "(\r\n)+" : "# ",
            "(\n)+" : "# ",
            "(\r)+" : "# ",
            # Raw string: '\.' is not a valid escape in a normal string literal.
            r"""[?;:)(!|&’‘,।\."]""": "",
            "[/']" : "",
            "[-–]" : " ",
        }

    def clean(self, text):
        """Return ``text`` after applying every cleaning rule in order."""
        for key, replacement in self.cleaning_rules.items():
            text = re.sub(key, replacement, text)
        return text

    def clean_list(self, text):
        """Strip and clean each string of ``text``; returns a new list."""
        # input is supposed to be a list of strings
        return [self.clean(line.strip()) for line in text]
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class Phonifier:
|
| 96 |
+
def __init__(self, dict_location=None):
    """Load the phone dictionaries, the English G2P model and the lookup tables.

    Args:
        dict_location: directory holding one space-delimited dictionary file
            per language (the file name is used as the language name).
            Defaults to "phone_dict".
    """
    if dict_location is None:
        dict_location = "phone_dict"
    self.dict_location = dict_location
    self.devanagari_langs = {
        "bodo",
        "hindi",
        "kashmiri",
        "marathi",
        "nepali",
        "santali",
        "rajasthani",
        "sindhi",
        "dogri",
        "sanskrit",
        "maithili",
        "konkani",
    }
    self.phone_dictionary = {}
    # load dictionary for all the available languages
    for dict_file in os.listdir(dict_location):
        if dict_file.startswith("."):
            # ignore hidden files
            continue
        dict_file_path = os.path.join(dict_location, dict_file)
        try:
            df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
            self.phone_dictionary[dict_file] = df.set_index(0).to_dict('dict')[1]
        except Exception as err:
            # Keep loading the other languages, but say which file failed
            # instead of printing an empty line.
            print(f"Warning: could not load phone dictionary {dict_file_path}: {err}")
    print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))

    self.g2p = G2p()
    print('Loading G2P model... Done!')

    # Mapping between the cmu phones and the iitm cls
    # Vowels appear bare and with a stress digit (0/1/2); every variant maps
    # to the same label, so the table is expanded instead of typed out.
    cmu_vowels = {
        "AA": "aa",
        "AE": "axx",
        "AH": "a",
        "AO": "ax",
        "AW": "ou",
        "AY": "ei",
        "EH": "ee",
        "ER": "a r",
        "EY": "ee",
        "IH": "i",
        "IY": "ii",
        "OW": "o",
        "OY": "ei",
        "UH": "u",
        "UW": "uu",
    }
    self.cmu_2_cls_map = {
        vowel + stress: label
        for vowel, label in cmu_vowels.items()
        for stress in ("", "0", "1", "2")
    }
    self.cmu_2_cls_map.update({
        "AX": "a",
        "B": "b",
        "CH": "c",
        "D": "dx",
        "DH": "d",
        "F": "f",
        "G": "g",
        "HH": "h",
        "JH": "j",
        "K": "k",
        "L": "l",
        "M": "m",
        "N": "n",
        "NG": "ng",
        "P": "p",
        "R": "r",
        "S": "s",
        "SH": "sh",
        "T": "tx",
        "TH": "t",
        "V": "w",
        "W": "w",
        "Y": "y",
        "Z": "z",
        "ZH": "sh",
    })

    # Mapping between the iitm cls and iitm char
    self.cls_2_chr_map = {
        "aa" : "A",
        "ii" : "I",
        "uu" : "U",
        "ee" : "E",
        "oo" : "O",
        "nn" : "N",
        "ae" : "ऍ",
        "ag" : "ऽ",
        "au" : "औ",
        "axx" : "अ",
        "ax" : "ऑ",
        "bh" : "B",
        "ch" : "C",
        "dh" : "ध",
        "dx" : "ड",
        "dxh" : "ढ",
        "dxhq" : "T",
        "dxq" : "D",
        "ei" : "ऐ",
        "ai" : "ऐ",
        "eu" : "உ",
        "gh" : "घ",
        "gq" : "G",
        "hq" : "H",
        "jh" : "J",
        "kh" : "ख",
        "khq" : "K",
        "kq" : "क",
        "ln" : "ൾ",
        "lw" : "ൽ",
        "lx" : "ള",
        "mq" : "M",
        "nd" : "न",
        "ng" : "ङ",
        "nj" : "ञ",
        "nk" : "Y",
        "nw" : "ൺ",
        "nx" : "ण",
        "ou" : "औ",
        "ph" : "P",
        "rq" : "R",
        "rqw" : "ॠ",
        "rw" : "ർ",
        "rx" : "र",
        "sh" : "श",
        "sx" : "ष",
        "th" : "थ",
        "tx" : "ट",
        "txh" : "ठ",
        "wv" : "W",
        "zh" : "Z",
    }

    # Multilingual support for OOV characters
    oov_map_json_file = 'multilingualcharmap.json'
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform default
    # encoding (e.g. on Windows) cannot be relied on for Indic characters.
    with open(oov_map_json_file, 'r', encoding='utf-8') as oov_file:
        self.oov_map = json.load(oov_file)
|
| 280 |
+
|
| 281 |
+
def __is_float(self, word):
    """Return True when ``word`` is ``<digits>.<digits>`` with exactly one dot."""
    if word.count('.') != 1:
        return False
    whole, _, fraction = word.partition('.')
    return whole.isdecimal() and fraction.isdecimal()
|
| 286 |
+
|
| 287 |
+
def en_g2p(self, word):
    """Phonify an English word and return its symbols as one space-free string.

    Every phone produced by ``self.g2p`` is translated through
    ``cmu_2_cls_map`` and, where an entry exists, further through
    ``cls_2_chr_map``. Phones missing from the first map are kept unchanged.
    """
    symbols = []
    for phone in self.g2p(word):
        if phone not in self.cmu_2_cls_map:
            # ignore words that are not in the dictionary
            symbols.append(phone)
            continue
        label = self.cmu_2_cls_map[phone]
        if label in self.cls_2_chr_map:
            label = self.cls_2_chr_map[label]
        symbols.append(label)
    # Multi-phone labels such as "a r" contain spaces; drop them all.
    return "".join(symbols).strip().replace(" ", "")
|
| 303 |
+
|
| 304 |
+
def __post_phonify(self, text, language, gender):
    """Replace characters of ``text`` using the ``<language>_<gender>`` OOV table.

    Returns ``text`` unchanged when ``self.oov_map`` has no table for that
    language/gender pair; characters absent from the table are kept as they are.
    """
    voice_key = '_'.join((language, gender))
    if voice_key not in self.oov_map:
        return text
    replacements = self.oov_map[voice_key]
    return ''.join(
        replacements[char] if char in replacements else char
        for char in text
    )
|
| 317 |
+
|
| 318 |
+
def __is_english_word(self, word):
    """Return True when every character of ``word`` is ASCII (U+0000-U+007F).

    An empty string counts as ASCII, so empty tokens produced by splitting on
    consecutive spaces no longer raise ``ValueError`` from ``max()``.
    """
    return word.isascii()
|
| 323 |
+
|
| 324 |
+
def __normalize_phones(self, phone_str):
    """
    Apply phone-level replacements after phonification
    """
    # Applied strictly top to bottom: later rules see the output of earlier
    # ones (e.g. "क़" becomes "क", which a later rule turns into "k").
    ordered_rules = (
        ("P", "f"),
        ("ष", "श"),
        ("ग़", "ग"),
        ("T", "ढ"),
        ("ऱ", "r"),
        ("jञ", "gy"),
        ("क़", "क"),
        ("ख़", "ख"),
        ("ड़", "ड"),
        ("फ़", "फ"),
        ("य़", "य"),
        ("ऱ", "r"),
        ("Y", ""),
        ("G", "g"),
        ("क", "k"),
        ("ay", "E"),
        ("kH", "ख"),
        ("ऩ", "n"),
    )

    normalized = phone_str
    for source, target in ordered_rules:
        normalized = normalized.replace(source, target)
    return normalized
|
| 353 |
+
|
| 354 |
+
def __phonify(self, text, language, gender):
    """Convert phrases to phone strings using the per-language dictionaries.

    Words missing from the dictionary of ``language`` are first phonified (by
    the Tamil parser script, the English G2P model, or the unified parser),
    merged into ``self.phone_dictionary`` and handed to a child process that
    persists them to the dictionary file.

    Args:
        text: list of phrases (strings) whose words are separated by spaces.
        language: language name, used as the key of ``self.phone_dictionary``.
        gender: unused here; kept for interface compatibility.

    Returns:
        A list with one phone string per input phrase. Words that could not
        be phonified are left out of their phrase.
    """
    # text is expected to be a list of strings
    words = set((" ".join(text)).split(" "))
    # Consecutive or leading/trailing spaces yield empty tokens; they carry no
    # content and used to crash the ASCII check below.
    words.discard("")

    if language in self.phone_dictionary:
        known_words = self.phone_dictionary[language]
        non_dict_words = []
        for word in words:
            if word not in known_words and (language == "english" or (not self.__is_english_word(word))):
                non_dict_words.append(word)
    else:
        # A list, not the set itself: the English branch indexes into it.
        non_dict_words = list(words)
    print(f"word not in dict: {non_dict_words}")

    if len(non_dict_words) > 0:
        # unified parser has to be run for the non dictionary words
        os.makedirs("tmp", exist_ok=True)
        timestamp = str(time.time())
        non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
        out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
        try:
            # Explicit UTF-8: the words are Indic text and the platform
            # default encoding may not be able to represent them.
            with open(non_dict_words_file, "w", encoding="utf-8") as f:
                f.write("\n".join(non_dict_words))

            if language == 'tamil':
                current_directory = os.getcwd()
                tamil_parser_cmd = f"{current_directory}/ssn_parser/Tamil_Parser/ssn_parser/tamil_parser.sh"
                subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser/Tamil_Parser/ssn_parser"])
            elif language == 'english':
                phn_out_dict = {word: self.en_g2p(word) for word in non_dict_words}
                # Create a string representation of the dictionary
                data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
                print(f"data_str: {data_str}")
                with open(out_dict_file, "w", encoding="utf-8") as f:
                    f.write(data_str)
            else:
                # Imported lazily so the parser packages are only needed when
                # a non-Tamil, non-English word actually has to be parsed.
                from get_phone_mapped_python import TextReplacer
                from indic_unified_parser.uparser import wordparse

                text_replacer = TextReplacer()
                parsed_output_list = [wordparse(word, 0, 0, 1) for word in non_dict_words]
                replaced_output_list = [text_replacer.apply_replacements(parsed_word) for parsed_word in parsed_output_list]
                with open(out_dict_file, 'w', encoding='utf-8') as file:
                    for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
                        line = f"{original_word}\t{formatted_word}\n"
                        file.write(line)
                        print(line, end='')

            try:
                df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
                new_dict = df.dropna().set_index(0).to_dict('dict')[1]
                if language not in self.phone_dictionary:
                    self.phone_dictionary[language] = new_dict
                else:
                    self.phone_dictionary[language].update(new_dict)
                # run a non-blocking child process to update the dictionary file
                p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
                p.start()
            except Exception as err:
                # Best effort: unparsed words are simply skipped below.
                print(f"Error: While loading {out_dict_file}: {err}")
        finally:
            # Both files are scratch space for this call only (the child
            # process receives the parsed dict, not the file); remove them so
            # tmp/ does not grow with every request.
            for tmp_path in (non_dict_words_file, out_dict_file):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    # phonify text with dictionary
    # .get(): either dictionary may be absent (no english file loaded, or
    # parsing failed for a new language); treat that as "no known words".
    english_dict = self.phone_dictionary.get("english", {})
    language_dict = self.phone_dictionary.get(language, {})
    text_phonified = []
    for phrase in text:
        phrase_phonified = []
        for word in phrase.split(" "):
            if not word:
                continue
            if self.__is_english_word(word):
                if word in english_dict:
                    phrase_phonified.append(str(english_dict[word]))
                else:
                    phrase_phonified.append(str(self.en_g2p(word)))
            elif word in language_dict:
                # if a word could not be parsed, skip it
                phrase_phonified.append(str(language_dict[word]))

        phone_line = " ".join(phrase_phonified)
        if language in self.devanagari_langs:
            phone_line = self.__normalize_phones(phone_line)
        text_phonified.append(phone_line)
    return text_phonified
|
| 457 |
+
|
| 458 |
+
def __merge_lists(self, lists):
    """Flatten a list of word lists into one space-separated, stripped string."""
    flattened = (word for word_list in lists for word in word_list)
    return " ".join(flattened).strip()
|
| 464 |
+
|
| 465 |
+
def __phonify_list(self, text, language, gender):
|
| 466 |
+
# text is expected to be a list of list of strings
|
| 467 |
+
words = set(self.__merge_lists(text).split(" "))
|
| 468 |
+
non_dict_words = []
|
| 469 |
+
if language in self.phone_dictionary:
|
| 470 |
+
for word in words:
|
| 471 |
+
if word not in self.phone_dictionary[language] and (language == "english" or (not self.__is_english_word(word))):
|
| 472 |
+
non_dict_words.append(word)
|
| 473 |
+
else:
|
| 474 |
+
non_dict_words = words
|
| 475 |
+
|
| 476 |
+
if len(non_dict_words) > 0:
|
| 477 |
+
print(len(non_dict_words))
|
| 478 |
+
print(non_dict_words)
|
| 479 |
+
# unified parser has to be run for the non dictionary words
|
| 480 |
+
os.makedirs("tmp", exist_ok=True)
|
| 481 |
+
timestamp = str(time.time())
|
| 482 |
+
non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
|
| 483 |
+
out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
|
| 484 |
+
with open(non_dict_words_file, "w") as f:
|
| 485 |
+
f.write("\n".join(non_dict_words))
|
| 486 |
+
|
| 487 |
+
if(language == 'tamil'):
|
| 488 |
+
tamil_parser_cmd = "tamil_parser.sh"
|
| 489 |
+
subprocess.run(["bash", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser/"])
|
| 490 |
+
elif(language == 'english'):
|
| 491 |
+
phn_out_dict = {}
|
| 492 |
+
for i in range(0,len(non_dict_words)):
|
| 493 |
+
phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
|
| 494 |
+
# Create a string representation of the dictionary
|
| 495 |
+
data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
|
| 496 |
+
print(f"data_str: {data_str}")
|
| 497 |
+
with open(out_dict_file, "w") as f:
|
| 498 |
+
f.write(data_str)
|
| 499 |
+
else:
|
| 500 |
+
unified_parser_cmd = "phonify_wrapper.sh"
|
| 501 |
+
subprocess.run(["bash", unified_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "ssn_parser/"])
|
| 502 |
+
# unified_parser_cmd = "phonify_wrapper.sh"
|
| 503 |
+
# subprocess.run(["bash", unified_parser_cmd, non_dict_words_file, out_dict_file, timestamp, "/var/www/html/IITM_TTS/E2E_TTS_FS2/text_proc/text2phone/"])
|
| 504 |
+
try:
|
| 505 |
+
df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
|
| 506 |
+
new_dict = df.dropna().set_index(0).to_dict('dict')[1]
|
| 507 |
+
print(new_dict)
|
| 508 |
+
if language not in self.phone_dictionary:
|
| 509 |
+
self.phone_dictionary[language] = new_dict
|
| 510 |
+
else:
|
| 511 |
+
self.phone_dictionary[language].update(new_dict)
|
| 512 |
+
# run a non-blocking child process to update the dictionary file
|
| 513 |
+
p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
|
| 514 |
+
p.start()
|
| 515 |
+
except Exception as err:
|
| 516 |
+
traceback.print_exc()
|
| 517 |
+
|
| 518 |
+
# phonify text with dictionary
|
| 519 |
+
text_phonified = []
|
| 520 |
+
for line in text:
|
| 521 |
+
line_phonified = []
|
| 522 |
+
for phrase in line:
|
| 523 |
+
phrase_phonified = []
|
| 524 |
+
for word in phrase.split(" "):
|
| 525 |
+
if self.__is_english_word(word):
|
| 526 |
+
if word in self.phone_dictionary["english"]:
|
| 527 |
+
phrase_phonified.append(str(self.phone_dictionary["english"][word]))
|
| 528 |
+
else:
|
| 529 |
+
phrase_phonified.append(str(self.en_g2p(word)))
|
| 530 |
+
elif word in self.phone_dictionary[language]:
|
| 531 |
+
# if a word could not be parsed, skip it
|
| 532 |
+
phrase_phonified.append(str(self.phone_dictionary[language][word]))
|
| 533 |
+
# line_phonified.append(self.__post_phonify(" ".join(phrase_phonified), language, gender))
|
| 534 |
+
line_phonified.append(" ".join(phrase_phonified))
|
| 535 |
+
text_phonified.append(line_phonified)
|
| 536 |
+
return text_phonified
|
| 537 |
+
|
| 538 |
+
def phonify(self, text, language, gender):
    """Phonify a single phrase or a list of phrases.

    Args:
        text: One phrase, or a list of phrases.
        language: Language name forwarded to the worker.
        gender: Forwarded to the worker unchanged.

    Returns:
        The worker's list result when ``text`` is a list; otherwise the
        first element of the result for the one-element list ``[text]``.
    """
    if isinstance(text, list):
        return self.__phonify(text, language, gender)
    # Wrap the lone value so the list-based worker can be reused, then unwrap.
    wrapped_result = self.__phonify([text], language, gender)
    return wrapped_result[0]
|
| 543 |
+
|
| 544 |
+
def phonify_list(self, text, language, gender):
    """Phonify a list of lines, each of which is a list of phrases.

    Args:
        text: Expected to be a list.
        language: Language name forwarded to the worker.
        gender: Forwarded to the worker unchanged.

    Returns:
        The worker's result for list input. For any other input an error
        message is printed and ``None`` is returned.
    """
    if not isinstance(text, list):
        print("Error!! Expected to have a list as input.")
        return None
    return self.__phonify_list(text, language, gender)
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
class TextNormalizer:
    """Whitespace clean-up, character-class spacing and number verbalisation.

    The ``*_list`` methods apply the same transformation as their scalar
    counterpart to every string of a list.
    """

    # The decimal alternative comes first so "2.5" is matched as one number
    # instead of as "2" and "5".
    _NUMBER_PATTERN = re.compile(r'\d+\.\d+|\d+')

    def __init__(self, char_map_location=None, phonifier = Phonifier()):
        """Set up the cleaning rules, language codes and the G2P model.

        Args:
            char_map_location: Defaults to ``"charmap"``. The value is not
                used anywhere in this class.
            phonifier: Phonifier stored on the instance. The default object
                is created once, when the class is defined, and is therefore
                shared by every instance that does not pass its own.
        """
        self.phonifier = phonifier
        if char_map_location is None:
            char_map_location = "charmap"

        # Static cleaning rules (regex -> replacement), applied in this order.
        self.cleaning_rules = {
            " +" : " ",
            "^ +" : "",
            " +$" : "",
            "#$" : "",
            "# +$" : "",
        }

        # Languages supported by num_to_words, mapped to two-letter codes.
        self.keydict = {"english" : "en",
                        "hindi" : "hi",
                        "gujarati" : "gu",
                        "marathi" : "mr",
                        "bengali" : "bn",
                        "telugu" : "te",
                        "tamil" : "ta",
                        "kannada" : "kn",
                        "odia" : "or",
                        "punjabi" : "pa"
                        }

        self.g2p = G2p()

    def __post_cleaning(self, text):
        """Apply every cleaning rule, in order, to one string."""
        for pattern, replacement in self.cleaning_rules.items():
            text = re.sub(pattern, replacement, text)
        return text

    def __post_cleaning_list(self, text):
        """Apply the cleaning rules to each string of a list."""
        return [self.__post_cleaning(line) for line in text]

    def __check_char_type(self, str_c):
        """Classify one character.

        Returns:
            One of ``"number"``, ``"punctuation"``, ``"whitespace"``,
            ``"ascii"`` (ASCII letter) or ``"non-ascii"`` (everything else).
        """
        if str_c.isnumeric():
            return "number"
        if str_c in string.punctuation:
            return "punctuation"
        if str_c in string.whitespace:
            return "whitespace"
        if str_c.isalpha() and str_c.isascii():
            return "ascii"
        return "non-ascii"

    def insert_space(self, text):
        '''
        Insert a space wherever the character class changes (for example
        between digits and letters), then collapse runs of spaces.

        No space is inserted in front of punctuation or whitespace.
        '''
        # NOTE: the earlier implementation also tested the next character
        # with ``next != "punctuation" or next != "whitespace"``. That test
        # is always true, so it never affected the result and is omitted.
        pieces = []
        prev_char_type = None
        for c in text:
            char_type = self.__check_char_type(c)
            at_boundary = (
                prev_char_type is not None
                and char_type != prev_char_type
                and char_type not in ("punctuation", "whitespace")
            )
            pieces.append(" " + c if at_boundary else c)
            prev_char_type = char_type
        return re.sub(r' +', ' ', "".join(pieces))

    def insert_space_list(self, text):
        '''
        Apply insert_space to every string of a list and return the new list.
        '''
        return [self.insert_space(line) for line in text]

    def __verbalize_numbers(self, text, language, number_dictionary):
        """Replace every number in ``text`` by a space plus its spoken form.

        The substitution runs in a single pass over the original text. The
        earlier per-number loop could rewrite a piece of another number: in
        "5 and 2.5" the pass for "5" also replaced the fractional part of
        "2.5", after which "2.5" was no longer found.
        """
        def spell_out(match):
            return ' ' + number_dictionary.num2text(match.group(0), language)

        return self._NUMBER_PATTERN.sub(spell_out, text)

    def num2text(self, text, language):
        """Return ``text`` with integers and decimals spelled out.

        Args:
            text: Input string.
            language: Language name passed to ``NumberDictionary.num2text``.
        """
        return self.__verbalize_numbers(text, language, NumberDictionary())

    def num2text_list(self, text, language):
        """Spell out the numbers in every string of a list.

        Args:
            text: List of strings.
            language: Language name passed to ``NumberDictionary.num2text``.

        Returns:
            A new list with one converted string per input string.
        """
        number_dictionary = NumberDictionary()
        return [
            self.__verbalize_numbers(line, language, number_dictionary)
            for line in text
        ]

    def normalize(self, text, language):
        """Apply the cleaning rules to one string; ``language`` is unused."""
        return self.__post_cleaning(text)

    def normalize_list(self, text, language):
        """Apply the cleaning rules to a list of strings; ``language`` is unused."""
        return self.__post_cleaning_list(text)
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
class TextPhrasifier:
    """Splits a string into phrases at ``#`` markers."""

    @classmethod
    def phrasify(cls, text):
        """Return the non-empty, whitespace-stripped pieces of ``text``.

        Args:
            text: String in which ``#`` separates phrases.

        Returns:
            List of phrases in their original order; pieces that are empty
            after stripping are left out.
        """
        stripped_pieces = (piece.strip() for piece in text.split("#"))
        return [piece for piece in stripped_pieces if piece != ""]
|
| 780 |
+
|
| 781 |
+
class TextPhrasifier_List:
    """List variant of the phrase splitter: one phrase list per input line."""

    @classmethod
    def phrasify(cls, text):
        """Split every line at ``#`` into stripped, non-empty phrases.

        Args:
            text: List of strings.

        Returns:
            List of lists of strings; entry ``i`` holds the phrases of
            ``text[i]`` (possibly an empty list).
        """
        return [
            [piece for piece in map(str.strip, line.split("#")) if piece != ""]
            for line in text
        ]
|
| 795 |
+
|
| 796 |
+
class DurAlignTextProcessor:
    """Applies regex clean-up rules to phone strings.

    All three public methods rewrite the list elements in place and return
    the list(s) they were given.
    """

    def __init__(self):
        # Static rules (regex -> replacement), applied in this order.
        self.cleaning_rules = {
            r"\s{2,}": " ",  # collapse whitespace runs into one space
            r"^": "$",       # put a "$" in front of the string
            r"$": ".",       # put a "." at the end of the string
        }
        self.cleaning_rules_English = {
            " +" : "",       # remove spaces
            "$" : ".",       # put a "." at the end of the string
        }

    @staticmethod
    def _rewrite_in_place(items, rules):
        """Run every rule over every element of ``items``, storing results back."""
        for position, value in enumerate(items):
            for pattern, replacement in rules.items():
                value = re.sub(pattern, replacement, value)
            items[position] = value
        return items

    def textProcesorForEnglish(self, text):
        """Apply the English rules to a list of strings (in place)."""
        return self._rewrite_in_place(text, self.cleaning_rules_English)

    def textProcesor(self, text):
        """Apply the default rules to a list of strings (in place)."""
        return self._rewrite_in_place(text, self.cleaning_rules)

    def textProcesor_list(self, text):
        """Apply the default rules to a list of lists of strings.

        The inner lists are modified in place; a new outer list holding the
        same inner list objects is returned.
        """
        return [self._rewrite_in_place(line, self.cleaning_rules) for line in text]
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
class TTSDurAlignPreprocessor:
    """Text pipeline for one string: clean, verbalise numbers, normalise,
    phrasify, phonify and post-process.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    """

    def __init__(self,
                 text_cleaner = TextCleaner(),
                 text_normalizer=TextNormalizer(),
                 phonifier = Phonifier(),
                 post_processor = DurAlignTextProcessor()):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer
        self.phonifier = phonifier
        self.post_processor = post_processor

    def preprocess(self, text, language, gender):
        """Run the pipeline on ``text``.

        Prints the raw text, the cleaned text and the final phone strings.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``.
        """
        print(text)
        cleaned = self.text_cleaner.clean(text)
        print("cleaned text", cleaned)

        normalizer = self.text_normalizer
        with_numbers_spelled = normalizer.num2text(cleaned, language)
        normalized = normalizer.normalize(with_numbers_spelled, language)

        phrasified_text = TextPhrasifier.phrasify(normalized)
        phones = self.phonifier.phonify(phrasified_text, language, gender)
        phonified_text = self.post_processor.textProcesor(phones)
        print(phonified_text)
        return phonified_text, phrasified_text
|
| 862 |
+
|
| 863 |
+
class TTSDurAlignPreprocessor_VTT:
    """List-input variant of the clean / normalise / phonify / post-process
    pipeline.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    """

    def __init__(self,
                 text_cleaner = TextCleaner(),
                 text_normalizer=TextNormalizer(),
                 phonifier = Phonifier(),
                 post_processor = DurAlignTextProcessor()):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer
        self.phonifier = phonifier
        self.post_processor = post_processor

    def preprocess(self, text, language, gender):
        """Run the pipeline on a list of strings.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``.
        """
        normalizer = self.text_normalizer
        cleaned_lines = self.text_cleaner.clean_list(text)
        spelled_lines = normalizer.num2text_list(cleaned_lines, language)
        normalized_lines = normalizer.normalize_list(spelled_lines, language)

        phrasified_text = TextPhrasifier_List.phrasify(normalized_lines)
        phones = self.phonifier.phonify_list(phrasified_text, language, gender)
        phonified_text = self.post_processor.textProcesor_list(phones)
        return phonified_text, phrasified_text
|
| 884 |
+
|
| 885 |
+
|
| 886 |
+
class CharTextPreprocessor:
    """Pipeline for character-based models: clean, verbalise numbers,
    normalise and phrasify, without phonification.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    """

    def __init__(self,
                 text_cleaner = TextCleaner(),
                 text_normalizer=TextNormalizer()):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer

    def preprocess(self, text, language, gender=None):
        """Run the pipeline on one string; ``gender`` is unused.

        Returns:
            Tuple ``(phonified_text, phrasified_text)`` in which both
            entries are the same phrase list.
        """
        normalizer = self.text_normalizer
        cleaned = self.text_cleaner.clean(text.strip())
        normalized = normalizer.normalize(normalizer.num2text(cleaned, language), language)
        phrasified_text = TextPhrasifier.phrasify(normalized)
        # No phonification for character TTS models.
        phonified_text = phrasified_text
        return phonified_text, phrasified_text
|
| 902 |
+
|
| 903 |
+
class CharTextPreprocessor_VTT:
    """List-input pipeline for character-based models, without phonification.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    """

    def __init__(self,
                 text_cleaner = TextCleaner(),
                 text_normalizer=TextNormalizer()):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer

    def preprocess(self, text, language, gender=None):
        """Run the pipeline on a list of strings; ``gender`` is unused.

        Returns:
            Tuple ``(phonified_text, phrasified_text)`` in which both
            entries are the same list of phrase lists.
        """
        normalizer = self.text_normalizer
        cleaned_lines = self.text_cleaner.clean_list(text)
        spelled_lines = normalizer.num2text_list(cleaned_lines, language)
        normalized_lines = normalizer.normalize_list(spelled_lines, language)
        phrasified_text = TextPhrasifier_List.phrasify(normalized_lines)
        # No phonification for character TTS models.
        phonified_text = phrasified_text
        return phonified_text, phrasified_text
|
| 919 |
+
|
| 920 |
+
|
| 921 |
+
class TTSPreprocessor:
    """Pipeline for one string: clean, verbalise numbers, normalise,
    phrasify and phonify.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    ``text_phrasefier`` and ``post_processor`` are stored but not used by
    ``preprocess``.
    """

    def __init__(
        self,
        text_cleaner = TextCleaner(),
        text_normalizer=TextNormalizer(),
        phonifier = Phonifier(),
        text_phrasefier = TextPhrasifier(),
        post_processor = DurAlignTextProcessor()
    ):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer
        self.phonifier = phonifier
        self.text_phrasefier = text_phrasefier
        self.post_processor = post_processor

    def preprocess(self, text, language, gender):
        """Run the pipeline on ``text``.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``.
        """
        normalizer = self.text_normalizer
        cleaned = self.text_cleaner.clean(text.strip())
        normalized = normalizer.normalize(normalizer.num2text(cleaned, language), language)
        phrasified_text = TextPhrasifier.phrasify(normalized)
        phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
        return phonified_text, phrasified_text
|
| 945 |
+
|
| 946 |
+
class TTSPreprocessor_VTT:
    """List-input pipeline: clean, verbalise numbers, normalise, phrasify
    and phonify.

    The default collaborators are created once, when the class is defined,
    and are shared by every instance that does not pass its own.
    ``text_phrasefier`` is stored but not used by ``preprocess``.
    """

    def __init__(self,
                 text_cleaner = TextCleaner(),
                 text_normalizer=TextNormalizer(),
                 phonifier = Phonifier(),
                 text_phrasefier = TextPhrasifier_List()):
        self.text_cleaner = text_cleaner
        self.text_normalizer = text_normalizer
        self.phonifier = phonifier
        self.text_phrasefier = text_phrasefier

    def preprocess(self, text, language, gender):
        """Run the pipeline on a list of strings.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``.
        """
        normalizer = self.text_normalizer
        cleaned_lines = self.text_cleaner.clean_list(text)
        spelled_lines = normalizer.num2text_list(cleaned_lines, language)
        normalized_lines = normalizer.normalize_list(spelled_lines, language)
        phrasified_text = TextPhrasifier_List.phrasify(normalized_lines)
        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
        return phonified_text, phrasified_text
|
tmp/non_dict_words_1776242609.109296
ADDED
|
File without changes
|
tmp/non_dict_words_1776242740.8554978
ADDED
|
File without changes
|
utilities.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import random
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
import base64
|
| 5 |
+
from enum import Enum
|
| 6 |
+
import os
|
| 7 |
+
import librosa
|
| 8 |
+
import numpy as np
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
|
| 11 |
+
# Sampling rate as an integer; read from DEFAULT_SAMPLING_RATE, else 48000.
SAMPLING_RATE = int(os.environ.get("DEFAULT_SAMPLING_RATE", "48000"))

# Output directory: OUTPUT_DIR if set, otherwise "outputs" next to this
# module. The directory is created at import time.
_module_dir = os.path.dirname(os.path.realpath(__file__))
output_audio_dir = os.environ.get("OUTPUT_DIR", f"{_module_dir}/outputs")
os.makedirs(output_audio_dir, exist_ok=True)


# Per-language tempo values.
# NOTE(review): presumably speed factors applied to generated audio; confirm
# against the code that consumes this table.
LANG_TEMPOS = dict(malayalam=1.2, telugu=1.3, hindi=0.9, punjabi=0.9)

# Comma-separated language list from LANGUAGES, split as-is (no trimming).
SUPPORTED_OUTPUT_LANGS = os.environ.get("LANGUAGES", "hindi,bengali,tamil").split(",")
|
| 25 |
+
|
| 26 |
+
def isOutputLangSupported(lang: str):
    """Validate an output language name.

    Args:
        lang: Language name to look up in ``SUPPORTED_OUTPUT_LANGS``.

    Returns:
        ``None`` when the language is listed.

    Raises:
        ValueError: If ``lang`` is not in ``SUPPORTED_OUTPUT_LANGS``.
    """
    if lang in SUPPORTED_OUTPUT_LANGS:
        return
    raise ValueError(f"Unsupported output language: {lang}. Supported languages are: {SUPPORTED_OUTPUT_LANGS}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def truncate_string(s, max_length=100):
    """Return the first ``max_length`` characters of ``s``.

    ``'...'`` is appended when ``s`` is longer than ``max_length``; shorter
    or equally long strings are returned without a suffix.
    """
    suffix = '...' if len(s) > max_length else ''
    head = s[:max_length]
    return head + suffix
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# One sample paragraph per language, keyed by language name.
# NOTE(review): presumably synthesised once at start-up to warm the models;
# confirm against the code that reads this table.
WARMUP_PARAGRAPHS = {
    "hindi": (
        "भारत एक विशाल देश है जहाँ विभिन्न भाषाएं, धर्म और संस्कृतियाँ एक साथ अस्तित्व में हैं। "
        "यहाँ की विविधता में एकता इसकी सबसे बड़ी विशेषता है। हर राज्य की अपनी एक परंपरा होती है और "
        "खानपान भी अलग होता है। भारतीय समाज में परिवार को अत्यंत महत्त्व दिया जाता है और लोग एक-दूसरे "
        "की मदद के लिए हमेशा तैयार रहते हैं। पर्व-त्योहार जैसे दीपावली, होली, ईद और क्रिसमस यहाँ बड़े "
        "हर्षोल्लास के साथ मनाए जाते हैं। यहाँ की कला, संगीत, और साहित्य ने दुनिया भर में अपनी पहचान बनाई है।"
    ),
    "kannada": (
        "ಭಾರತವು ವಿಭಿನ್ನ ಭಾಷೆಗಳು, ಧರ್ಮಗಳು ಮತ್ತು ಸಂಸ್ಕೃತಿಗಳ ನಾಡಾಗಿದೆ. ಇದು ಒಂದೇ ದೇಶದಲ್ಲಿ ಅನೇಕ ಭಿನ್ನತೆಗಳನ್ನು "
        "ಒಳಗೊಂಡಿದೆ. ಪ್ರತಿಯೊಂದು ರಾಜ್ಯಕ್ಕೂ ತನ್ನದೇ ಆದ ಪರಂಪರೆ, ಸಂಸ್ಕೃತಿ ಮತ್ತು ಆಹಾರದ ಶೈಲಿ ಇರುತ್ತದೆ. "
        "ಇಲ್ಲಿ ಕುಟುಂಬ ಮತ್ತು ಸಮುದಾಯಕ್ಕೆ ಹೆಚ್ಚಿನ ಮಹತ್ವ ನೀಡಲಾಗುತ್ತದೆ. ಹಬ್ಬಗಳು ಮತ್ತು ಆಚರಣೆಗಳು ಜನರ ನಡುವಿನ "
        "ಸಾಮರಸ್ಯವನ್ನು ಬಲಪಡಿಸುತ್ತವೆ. ಭಾರತದಲ್ಲಿ ಸಂಗೀತ, ನೃತ್ಯ ಮತ್ತು ಸಾಹಿತ್ಯ ಕ್ಷೇತ್ರಗಳಲ್ಲಿ ಸಮೃದ್ಧ ಪರಂಪರೆ ಇದೆ."
    ),
    "malayalam": (
        "ഇന്ത്യ ഒരു വൈവിധ്യങ്ങളാൽ സമ്പന്നമായ ദേശമാണ്. വിവിധ ഭാഷകളും മതങ്ങളും സംസ്കാരങ്ങളുമുള്ള ഈ രാജ്യത്ത് "
        "ഏറ്റവും വലിയ പ്രത്യേകത ഏകതയിലാണ്. ഓരോ സംസ്ഥാനത്തിനും തനതായ ചരിത്രവും സംസ്കാരവുമുണ്ട്. "
        "കുടുംബ ബന്ധങ്ങൾക്കും ഒത്തുചേരലിനും വലിയ വിലയാണ് ഇന്ത്യയിലെ ജനങ്ങൾ നൽകുന്നത്. "
        "ഓണവും വിഷുവും പോലുള്ള ഉത്സവങ്ങൾ ഒരേ സമയം ആത്മീയതയും സന്തോഷവും പങ്കുവെക്കാൻ സഹായിക്കുന്നു. "
        "ഇവിടെ സംഗീതം, നാടകം, ചലച്ചിത്രം തുടങ്ങിയ കലാരൂപങ്ങൾ ലോകപ്രശസ്തമാണ്."
    ),
    "punjabi": (
        "ਭਾਰਤ ਇੱਕ ਐਸਾ ਦੇਸ਼ ਹੈ ਜਿੱਥੇ ਬਹੁਤ ਸਾਰੀਆਂ ਭਾਸ਼ਾਵਾਂ, ਧਰਮਾਂ ਅਤੇ ਰਿਵਾਇਤਾਂ ਦਾ ਮਿਲਾਪ ਹੈ। "
        "ਇਸ ਦੀ ਸਭ ਤੋਂ ਵੱਡੀ ਖਾਸੀਅਤ ਇਹ ਹੈ ਕਿ ਇੱਥੇ ਵੱਖ-ਵੱਖਤਾ ਵਿੱਚ ਵੀ ਏਕਤਾ ਹੈ। ਹਰ ਰਾਜ ਦੀ ਆਪਣੀ ਪਛਾਣ ਹੁੰਦੀ ਹੈ "
        # The conjunction "ਅਤੇ" below replaces a mis-encoded sequence that
        # contained U+FFFD replacement characters.
        "ਅਤੇ ਇੱਥੇ ਦੇ ਲੋਕ ਆਪਣੇ ਪਰਿਵਾਰ ਅਤੇ ਸੰਸਕਾਰਾਂ ਨਾਲ ਜੁੜੇ ਰਹਿੰਦੇ ਹਨ। ਪੰਜਾਬੀ ਲੋਕਾਂ ਦੀ ਬੋਲੀ, ਭੰਗੜਾ, ਗਿੱਧਾ ਅਤੇ "
        "ਖਾਣ-ਪੀਣ ਦੀ ਰੀਤ ਸਾਰੀ ਦੁਨੀਆ ਵਿੱਚ ਮਸ਼ਹੂਰ ਹੈ। ਤਿਉਹਾਰ, ਖੇਡਾਂ ਅਤੇ ਲੋਕ ਸੱਭਿਆਚਾਰ ਇੱਥੇ ਦੀ ਰੋਹਕਤਾ ਵਧਾਉਂਦੇ ਹਨ।"
    ),
    "tamil": (
        "இந்தியா என்பது பல்வேறு கலாசாரங்களும் மொழிகளும் கொண்ட நாடாகும். ஒவ்வொரு மாநிலத்திற்கும் தனித்துவமான பாரம்பரியம் "
        "மற்றும் சுவையான உணவுகள் உள்ளன. தமிழ்நாடு இவற்றில் முக்கியமான ஒரு பகுதி. தமிழ் மொழியும் கலாசாரமும் உலகெங்கும் "
        "புகழ்பெற்றவை. குடும்பங்கள் ஒன்றிணைந்து வாழும் பாசத்திற்கும் எப்போதும் முக்கியத்துவம் அளிக்கப்படுகிறது. "
        "தீபாவளி, பொங்கல் போன்ற பண்டிகைகள் மகிழ்ச்சியுடனும் ஒருமைப்பாட்டுடனும் கொண்டாடப்படுகின்றன. "
        "இங்கே உள்ள பாரம்பரிய இசை, நடனம், இலக்கியம் ஆகியவை பாராட்டத்தக்கவை."
    ),
    "telugu": (
        "భారతదేశం అనేక భాషలు, సంస్కృతులు మరియు సంప్రదాయాలతో కూడిన దేశం. ప్రతి రాష్ట్రానికీ తనదైన ప్రత్యేకత ఉంది. "
        "తెలుగు రాష్ట్రాలు సంస్కృతిగా గొప్పదిగా పరిగణించబడతాయి. ఇక్కడ కుటుంబం, గౌరవం, సహాయం వంటి విలువలకు ఎక్కువ ప్రాధాన్యం "
        "ఇస్తారు. సంక్రాంతి, ఉగాది వంటి పండుగలు కుటుంబ సభ్యులందరినీ కలిపి ఉంచుతాయి. నాట్యం, సంగీతం మరియు సాహిత్యం వంటి కళలు "
        "తెలుగువారిలో భాగం. భోజన పరంపరలూ అద్భుతమైనవి. వీటన్నింటి సమ్మేళనమే భారతదేశాన్ని ప్రత్యేకంగా నిలబెడుతుంది."
    )
}
|