| | import argparse |
| | import codecs |
| | import re |
| | from pathlib import Path |
| |
|
| | import numpy as np |
| | import soundfile as sf |
| | import tomli |
| | from cached_path import cached_path |
| |
|
| | from model import DiT, UNetT |
| | from model.utils_infer import ( |
| | load_vocoder, |
| | load_model, |
| | preprocess_ref_audio_text, |
| | infer_process, |
| | remove_silence_for_generated_wav, |
| | ) |
| |
|
| |
|
def _parse_bool_flag(value):
    """Interpret the optional value given to a boolean command-line switch.

    Only the usual "off" spellings (empty string, ``0``, ``false``, ``no``,
    ``n``, ``off``; case-insensitive) yield ``False``.  Every other value
    yields ``True``, so any value that enabled the switch before this
    conversion existed still enables it.
    """
    return str(value).strip().lower() not in {"", "0", "false", "no", "n", "off"}


# Command-line interface.  Options left unset are expected to be filled in
# from the TOML configuration file named by --config.
parser = argparse.ArgumentParser(
    prog="python3 inference-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    # The help text must name the same file as ``default`` below.
    help="Configuration file. Default=inference-cli.toml",
    default="inference-cli.toml",
)
parser.add_argument(
    "-m",
    "--model",
    help="F5-TTS | E2-TTS",
)
parser.add_argument(
    "-p",
    "--ckpt_file",
    help="The Checkpoint .pt",
)
parser.add_argument(
    "-v",
    "--vocab_file",
    help="The vocab .txt",
)
parser.add_argument("-r", "--ref_audio", type=str, help="Reference audio file < 15 seconds.")
# "666" is a sentinel default meaning "--ref_text was not given"; it lets an
# explicitly empty --ref_text be told apart from an omitted one.
parser.add_argument("-s", "--ref_text", type=str, default="666", help="Subtitle for the reference audio.")
parser.add_argument(
    "-t",
    "--gen_text",
    type=str,
    help="Text to generate.",
)
parser.add_argument(
    "-f",
    "--gen_file",
    type=str,
    help="File with text to generate. Ignores --gen_text",
)
parser.add_argument(
    "-o",
    "--output_dir",
    type=str,
    help="Path to output folder..",
)
parser.add_argument(
    "--remove_silence",
    # The value is optional: a bare ``--remove_silence`` means True, while
    # ``--remove_silence false`` is parsed to False instead of the truthy
    # string "false".  Omitting the option leaves the attribute as None.
    nargs="?",
    const=True,
    default=None,
    type=_parse_bool_flag,
    help="Remove silence.",
)
parser.add_argument(
    "--load_vocoder_from_local",
    action="store_true",
    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
)
# Parse the command line, then load the TOML configuration it points to.
args = parser.parse_args()

# Use a context manager so the config file handle is closed deterministically.
with open(args.config, "rb") as config_file:
    config = tomli.load(config_file)

# Resolve the effective settings: a value given on the command line wins,
# otherwise the entry from the configuration file is used.
ref_audio = args.ref_audio or config["ref_audio"]
# "666" is the parser's default for --ref_text and acts as the "not given"
# marker, so an explicitly empty --ref_text is passed through unchanged.
ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
gen_text = args.gen_text or config["gen_text"]
gen_file = args.gen_file or config["gen_file"]
if gen_file:
    # A text file, when configured, replaces any inline text to generate.
    with codecs.open(gen_file, "r", "utf-8") as text_file:
        gen_text = text_file.read()
output_dir = args.output_dir or config["output_dir"]
model = args.model or config["model"]
# Checkpoint and vocab paths come from the command line only; an empty
# string means "not specified".
ckpt_file = args.ckpt_file or ""
vocab_file = args.vocab_file or ""
# Fall back to the config only when the option was not supplied at all, so an
# explicit falsy command-line value is not silently replaced.
remove_silence = args.remove_silence if args.remove_silence is not None else config["remove_silence"]
wave_path = Path(output_dir) / "out.wav"
spectrogram_path = Path(output_dir) / "out.png"
vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
| |
|
# NOTE(review): `vocos` is not referenced again in the visible part of this
# file - confirm whether the inference call is supposed to receive it.
vocos = load_vocoder(is_local=args.load_vocoder_from_local, local_path=vocos_local_path)

# For each supported model name: the model class, its constructor
# configuration, and the repository / experiment names used to build the
# default checkpoint location.
_SUPPORTED_MODELS = {
    "F5-TTS": (
        DiT,
        dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
        "F5-TTS",
        "F5TTS_Base",
    ),
    "E2-TTS": (
        UNetT,
        dict(dim=1024, depth=24, heads=16, ff_mult=4),
        "E2-TTS",
        "E2TTS_Base",
    ),
}

# Fail with a clear message instead of a NameError on `model_cls` further down
# when the requested model name is not one of the supported ones.
if model not in _SUPPORTED_MODELS:
    raise ValueError(f"Unsupported model {model!r}; expected one of: {', '.join(_SUPPORTED_MODELS)}")

model_cls, model_cfg, _default_repo, _default_exp = _SUPPORTED_MODELS[model]
if ckpt_file == "":
    # No checkpoint was given: resolve the default one through cached_path.
    repo_name = _default_repo
    exp_name = _default_exp
    ckpt_step = 1200000
    ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))

print(f"Using {model}...")
ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file)
| |
|
| |
|
def main_process(ref_audio, ref_text, text_gen, model_obj, remove_silence):
    """Synthesize ``text_gen`` and write the result to the module-level ``wave_path``.

    ``text_gen`` may contain voice tags of the form ``[name]``.  The text is
    split in front of every tag, and each piece is generated with the
    reference audio/text of the voice it names.  Pieces without a tag, or
    naming a voice that is not defined, use the "main" voice.

    Args:
        ref_audio: Reference audio for the "main" voice.
        ref_text: Reference text for the "main" voice.
        text_gen: Text to synthesize, optionally containing ``[name]`` tags.
        model_obj: Model object handed to ``infer_process``.
        remove_silence: When truthy, ``remove_silence_for_generated_wav`` is
            run on the written file.

    Additional voices are read from ``config["voices"]`` when that key
    exists.  Nothing is written when no piece contains text to generate.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice

    # Run every voice's reference pair through the preprocessing helper once.
    for name, spec in voices.items():
        spec["ref_audio"], spec["ref_text"] = preprocess_ref_audio_text(spec["ref_audio"], spec["ref_text"])
        print("Voice:", name)
        print("Ref_audio:", spec["ref_audio"])
        print("Ref_text:", spec["ref_text"])

    voice_tag = re.compile(r"\[(\w+)\]")
    generated_audio_segments = []
    final_sample_rate = None
    # The lookahead splits in front of each tag, so the tag stays attached to
    # the text that follows it.
    for chunk in re.split(r"(?=\[\w+\])", text_gen):
        chunk_text = voice_tag.sub("", chunk).strip()
        if not chunk_text:
            # Splitting yields an empty first piece when the text starts with
            # a tag (and a tag may have no text after it); there is nothing to
            # synthesize for those pieces.
            continue

        tag = voice_tag.match(chunk)
        if tag:
            voice = tag[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"

        print(f"Voice: {voice}")
        audio, final_sample_rate, _ = infer_process(
            voices[voice]["ref_audio"], voices[voice]["ref_text"], chunk_text, model_obj
        )
        generated_audio_segments.append(audio)

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)
        output_path = Path(wave_path)
        # Create the output folder if needed so the write does not fail.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Write by path only; no second handle on the file is kept open while
        # the writer and the silence remover access it.
        sf.write(str(output_path), final_wave, final_sample_rate)
        if remove_silence:
            remove_silence_for_generated_wav(str(output_path))
        print(str(output_path))
| |
|
| |
|
# Script entry: run the generation with the settings resolved above.
main_process(
    ref_audio=ref_audio,
    ref_text=ref_text,
    text_gen=gen_text,
    model_obj=ema_model,
    remove_silence=remove_silence,
)
| |
|