import io
import os
import re
import sys
import tempfile
from pathlib import Path
from typing import Optional

import numpy as np
import soundfile as sf
from cached_path import cached_path

from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
)
from f5_tts.model import DiT, UNetT
|
|
# ---------------------------------------------------------------------------
# Inference configuration (edit these values before running the script).
# ---------------------------------------------------------------------------

# Display name used in the start-up log message.
model = "F5-TTS-small"

# Reference recording and its transcript; both are passed to
# preprocess_ref_audio_text() before inference.
ref_audio = "/home/cmi_10101/Documents/voice/Hindi/my_dataset/data/ref_audio_1.wav"
ref_text = '''हाँ भैया अभी दसेहरी आम में दसेहरी रखें हैं ओर केले भी हैं हमारे पास अभी मतलब बताइये आपको कित्ते के चाहिए
'''

# Text to synthesise. "[name]" tags, if present, split it into per-voice chunks.
gen_text = '''
योजना अभी निकाली है बच्चों के लिए जिसका नाम है सीखो कमाओ योजना जिसके अंतर्गत हम ट्रैनिंग के साथ साथ पैसे भी कमा सकते हैं और ट्रैनिंग के बाद अच्छी जॉब भी
'''

# Destination of the generated waveform.
output_dir = "gen_audios"
output_file = "generated_audio.wav"
wav_path = Path(output_dir) / output_file

# Keep your checkpoints in the ckpts directory
# and the vocab.txt file in the data directory.
ckpt_file = "/home/cmi_10101/Documents/voice/F5-TTS/ckpts/model_indic_voices_hindi.pt"
vocab_file = "/home/cmi_10101/Documents/voice/F5-TTS/data/vocab.txt"

# Options forwarded to infer_process() and to the silence-removal step.
remove_silence = True
speed = 1.0
nfe_step = 32
vocoder_name = "vocos"
indic = True
# The mel-spectrogram type is taken to be the same as the vocoder name.
mel_spec_type = vocoder_name
|
|
| |
# Pick the vocoder checkpoint location for the configured backend.
if vocoder_name == "vocos":
    vocoder_local_path = "../ckpts/vocos-mel-24khz"
elif vocoder_name == "bigvgan":
    print("No default F5-TTS-small ckpt available for bigvgan yet")
    # sys.exit instead of the site-injected exit() helper, which is meant for
    # the interactive shell and is absent under `python -S` / frozen builds.
    sys.exit(1)
else:
    # Any other name used to leave vocoder_local_path unbound, so the
    # load_vocoder call below failed with an unrelated-looking NameError.
    print(f"Unsupported vocoder_name: {vocoder_name!r} (expected 'vocos' or 'bigvgan')")
    sys.exit(1)

# NOTE(review): is_local=False is passed together with local_path; presumably
# local_path is ignored in that case -- confirm against load_vocoder.
vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=False, local_path=vocoder_local_path)
|
|
|
|
| |
# Backbone class and the keyword configuration handed to load_model().
# NOTE(review): these hyperparameters presumably have to match the checkpoint
# in ckpt_file -- confirm before changing them.
model_cls = DiT
model_cfg = dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)

print(f"Using model: {model} ...")
# Loaded once at module level; main() passes this object to main_process().
ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=mel_spec_type, vocab_file=vocab_file)
|
|
|
|
| |
| |
| |
def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove_silence, speed):
    """Synthesise ``text_gen`` with the reference voice and write one WAV file.

    ``text_gen`` is split in front of every ``[name]`` tag. Each chunk is
    generated with the voice named by its leading tag; untagged chunks and
    unknown names use the ``"main"`` voice, which is the only voice registered
    here. The generated segments are concatenated and written to the
    module-level ``wav_path``.

    Besides its parameters, the function reads the module-level ``vocoder``,
    ``nfe_step``, ``indic``, ``output_dir`` and ``wav_path``.

    Args:
        ref_audio: Reference audio, passed to ``preprocess_ref_audio_text``.
        ref_text: Transcript of the reference audio.
        text_gen: Text to synthesise, optionally containing ``[name]`` tags.
        model_obj: Loaded model passed to ``infer_process``.
        mel_spec_type: Forwarded to ``infer_process`` as ``mel_spec_type``.
        remove_silence: When true, run ``remove_silence_for_generated_wav``
            on the written file.
        speed: Forwarded to ``infer_process`` as ``speed``.

    Returns:
        None. The result is the file written to ``wav_path``.
    """
    voices = {"main": {"ref_audio": ref_audio, "ref_text": ref_text}}

    # Replace every voice's raw reference with its preprocessed form.
    for name, voice_cfg in voices.items():
        voice_cfg["ref_audio"], voice_cfg["ref_text"] = preprocess_ref_audio_text(
            voice_cfg["ref_audio"], voice_cfg["ref_text"]
        )
        print("Voice:", name)
        print("Processed Ref Audio:", voice_cfg["ref_audio"])
        print("Processed Ref Text:", voice_cfg["ref_text"])

    # The zero-width lookahead splits *before* each "[name]" tag, so the tag
    # stays at the start of its chunk where voice_tag.match() can find it.
    chunk_boundary = re.compile(r"(?=\[\w+\])")
    voice_tag = re.compile(r"\[(\w+)\]")

    generated_audio_segments = []
    final_sample_rate = None
    for chunk in chunk_boundary.split(text_gen):
        if not chunk.strip():
            continue

        tag = voice_tag.match(chunk)
        voice = tag.group(1) if tag else "main"
        if voice not in voices:
            print(f"Voice '{voice}' not found, using 'main'.")
            voice = "main"

        text_clean = voice_tag.sub("", chunk).strip()
        if not text_clean:
            # A chunk made only of tags has nothing to synthesise; do not
            # send empty text to the model.
            continue

        print(f"Generating audio for voice: {voice} with text: {text_clean}")
        audio, final_sample_rate, _ = infer_process(
            voices[voice]["ref_audio"],
            voices[voice]["ref_text"],
            text_clean,
            model_obj,
            vocoder,
            mel_spec_type=mel_spec_type,
            speed=speed,
            nfe_step=nfe_step,
            indic=indic,
        )
        generated_audio_segments.append(audio)

    if not generated_audio_segments:
        print("No text to synthesise; no audio file was written.")
        return

    final_wave = np.concatenate(generated_audio_segments)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    sf.write(str(wav_path), final_wave, final_sample_rate)

    if remove_silence:
        remove_silence_for_generated_wav(str(wav_path))
    print(f"Generated audio written to {wav_path}")
|
|
def main():
    """Run one synthesis pass using the module-level configuration and model."""
    main_process(ref_audio, ref_text, gen_text, ema_model, mel_spec_type, remove_silence, speed)


if __name__ == "__main__":
    main()