Files changed (2)
  1. README.md +2 -27
  2. inference_w_sil_alpha.py +0 -209
README.md CHANGED
@@ -1,30 +1,5 @@
  ---
  license: cc-by-4.0
- language:
- - hi
- - bn
- - ur
- - te
- - ta
- - pa
- - or
- - mr
- - ml
- - kn
- - gu
- - en
- - bo
- - as
- tags:
- - tts
- - indictts
- - fs2
- - fastspeech2
- - HS
- - hybrid_segmentation
- - mfa
- - indian
- - tts_model
  ---
  # Fastspeech2 Model using Hybrid Segmentation (HS)

@@ -90,7 +65,7 @@ If you use this Fastspeech2 Model in your research or work, please consider citi


  COPYRIGHT
- 2025, Speech Technology Consortium,
+ 2023, Speech Technology Consortium,

  Bhashini, MeiTY and by Hema A Murthy & S Umesh,

@@ -111,4 +86,4 @@ This work is licensed under a

  [cc-by]: http://creativecommons.org/licenses/by/4.0/
  [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
- [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
inference_w_sil_alpha.py DELETED
@@ -1,209 +0,0 @@
- import sys
- import os
- # Replace this path with your HiFi-GAN path so that Generator can be imported from models.py
- sys.path.append("hifigan")
- import argparse
- import torch
- from espnet2.bin.tts_inference import Text2Speech
- from models import Generator
- from scipy.io.wavfile import write
- from meldataset import MAX_WAV_VALUE
- from env import AttrDict
- import json
- import yaml
- import concurrent.futures
- import numpy as np
- import time
- import re
-
- from text_preprocess_for_inference import TTSDurAlignPreprocessor, CharTextPreprocessor, TTSPreprocessor
-
- SAMPLING_RATE = 22050
-
- def load_hifigan_vocoder(language, gender, device):
-     # Load the HiFi-GAN vocoder configuration and generator weights for the specified language and gender
-     vocoder_config = f"vocoder/{gender}/{language}/config.json"
-     vocoder_generator = f"vocoder/{gender}/{language}/generator"
-     # Read the contents of the vocoder configuration file
-     with open(vocoder_config, 'r') as f:
-         data = f.read()
-     json_config = json.loads(data)
-     h = AttrDict(json_config)
-     torch.manual_seed(h.seed)
-     # Move the generator model to the specified device (CPU or GPU)
-     device = torch.device(device)
-     generator = Generator(h).to(device)
-     state_dict_g = torch.load(vocoder_generator, device)
-     generator.load_state_dict(state_dict_g['generator'])
-     generator.eval()
-     generator.remove_weight_norm()
-
-     # Return the loaded and prepared HiFi-GAN generator model
-     return generator
-
-
- def load_fastspeech2_model(language, gender, device):
-     # Update config.yaml based on language and gender so the stats files resolve to absolute paths
-     with open(f"{language}/{gender}/model/config.yaml", "r") as file:
-         config = yaml.safe_load(file)
-
-     current_working_directory = os.getcwd()
-     feat = "model/feats_stats.npz"
-     pitch = "model/pitch_stats.npz"
-     energy = "model/energy_stats.npz"
-
-     feat_path = os.path.join(current_working_directory, language, gender, feat)
-     pitch_path = os.path.join(current_working_directory, language, gender, pitch)
-     energy_path = os.path.join(current_working_directory, language, gender, energy)
-
-     config["normalize_conf"]["stats_file"] = feat_path
-     config["pitch_normalize_conf"]["stats_file"] = pitch_path
-     config["energy_normalize_conf"]["stats_file"] = energy_path
-
-     with open(f"{language}/{gender}/model/config.yaml", "w") as file:
-         yaml.dump(config, file)
-
-     tts_model = f"{language}/{gender}/model/model.pth"
-     tts_config = f"{language}/{gender}/model/config.yaml"
-
-     return Text2Speech(train_config=tts_config, model_file=tts_model, device=device)
-
- def text_synthesis(language, gender, sample_text, vocoder, model, MAX_WAV_VALUE, device, alpha):
-     # Perform text-to-speech synthesis
-     with torch.no_grad():
-         # The FastSpeech2 model for the specified language and gender is loaded once in __main__ and passed in
-         # model = load_fastspeech2_model(language, gender, device)
-
-         # Generate mel-spectrograms from the input text using the FastSpeech2 model
-         out = model(sample_text, decode_conf={"alpha": alpha})
-         print("TTS Done")
-         x = out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262
-         x = x.to(device)
-
-         # Use the HiFi-GAN vocoder to convert mel-spectrograms to raw audio waveforms
-         y_g_hat = vocoder(x)
-         audio = y_g_hat.squeeze()
-         audio = audio * MAX_WAV_VALUE
-         audio = audio.cpu().numpy().astype('int16')
-
-         # Return the synthesized audio
-         return audio
-
- def split_into_chunks(text, words_per_chunk=100):
-     words = text.split()
-     chunks = [words[i:i + words_per_chunk] for i in range(0, len(words), words_per_chunk)]
-     return [' '.join(chunk) for chunk in chunks]
-
-
- def extract_text_alpha_chunks(text, default_alpha=1.0):
-     # Split tagged input into (text, alpha, is_silence, duration) chunks
-     alpha_pattern = r"<alpha=([0-9.]+)>"
-     sil_pattern = r"<sil=([0-9.]+)(ms|s)>"
-
-     chunks = []
-     alpha = default_alpha
-
-     alpha_blocks = re.split(alpha_pattern, text)
-     i = 0
-     while i < len(alpha_blocks):
-         if i == 0:
-             current_block = alpha_blocks[i]
-             i += 1
-         else:
-             alpha = float(alpha_blocks[i])
-             i += 1
-             current_block = alpha_blocks[i] if i < len(alpha_blocks) else ""
-             i += 1
-
-         # Replace each <sil=...> tag with a placeholder token and remember its duration in seconds
-         sil_matches = list(re.finditer(sil_pattern, current_block))
-         sil_placeholders = {}
-         for j, match in enumerate(sil_matches):
-             tag = match.group(0)
-             value = float(match.group(1))
-             unit = match.group(2)
-             duration = value / 1000.0 if unit == "ms" else value
-             placeholder = f"__SIL_{j}__"
-             sil_placeholders[placeholder] = duration
-             current_block = current_block.replace(tag, f" {placeholder} ")
-
-         sentences = [s.strip() for s in current_block.split('.') if s.strip()]
-         for sentence in sentences:
-             words = sentence.split()
-             buffer = []
-             for word in words:
-                 if word in sil_placeholders:
-                     if buffer:
-                         chunks.append((" ".join(buffer), alpha, False, None))
-                         buffer = []
-                     chunks.append(("", alpha, True, sil_placeholders[word]))
-                 else:
-                     buffer.append(word)
-             if buffer:
-                 chunks.append((" ".join(buffer), alpha, False, None))
-     return chunks
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="Text-to-Speech Inference")
-     parser.add_argument("--language", type=str, required=True, help="Language (e.g., hindi)")
-     parser.add_argument("--gender", type=str, required=True, help="Gender (e.g., female)")
-     parser.add_argument("--sample_text", type=str, required=True, help="Text to be synthesized")
-     parser.add_argument("--output_file", type=str, default="", help="Output WAV file path")
-     parser.add_argument("--alpha", type=float, default=1.0, help="Alpha parameter for speed control (e.g., 1.1 (slow) or 0.8 (fast))")
-
-     args = parser.parse_args()
-
-     phone_dictionary = {}
-     # Set the device
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     # Load the HiFi-GAN vocoder and FastSpeech2 model for the requested language and gender
-     vocoder = load_hifigan_vocoder(args.language, args.gender, device)
-     model = load_fastspeech2_model(args.language, args.gender, device)
-     if args.language == "urdu" or args.language == "punjabi":
-         preprocessor = CharTextPreprocessor()
-     elif args.language == "english":
-         preprocessor = TTSPreprocessor()
-     else:
-         preprocessor = TTSDurAlignPreprocessor()
-
-     start_time = time.time()
-     audio_arr = []
-     result = split_into_chunks(args.sample_text)
-     text_alpha_chunks = extract_text_alpha_chunks(args.sample_text, args.alpha)
-
-     with concurrent.futures.ThreadPoolExecutor() as executor:
-         futures = []
-         for chunk_text, alpha_val, is_silence, sil_duration in text_alpha_chunks:
-             if is_silence:
-                 # Render silence directly as zero samples instead of synthesizing it
-                 silence_samples = int(sil_duration * SAMPLING_RATE)
-                 silence_audio = np.zeros(silence_samples, dtype=np.int16)
-                 futures.append(silence_audio)
-             else:
-                 preprocessed_text, _ = preprocessor.preprocess(chunk_text, args.language, args.gender, phone_dictionary)
-                 preprocessed_text = " ".join(preprocessed_text)
-                 future = executor.submit(
-                     text_synthesis, args.language, args.gender, preprocessed_text,
-                     vocoder, model, MAX_WAV_VALUE, device, alpha_val
-                 )
-                 futures.append(future)
-
-         for item in futures:
-             if isinstance(item, np.ndarray):
-                 audio_arr.append(item)
-             else:
-                 audio_arr.append(item.result())
-
-     result_array = np.concatenate(audio_arr, axis=0)
-     output_file = args.output_file if args.output_file else f"{args.language}_{args.gender}_output.wav"
-     write(output_file, SAMPLING_RATE, result_array)
-     print(f"Synthesis completed in {time.time()-start_time:.2f} sec → {output_file}")