ashishkblink committed on
Commit
591aecb
·
verified ·
1 Parent(s): e53ebe7

Upload f5_tts/infer/infer_cli.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. f5_tts/infer/infer_cli.py +226 -0
f5_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from importlib.resources import files
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import tomli
11
+ from cached_path import cached_path
12
+
13
+ from f5_tts.infer.utils_infer import (
14
+ infer_process,
15
+ load_model,
16
+ load_vocoder,
17
+ preprocess_ref_audio_text,
18
+ remove_silence_for_generated_wav,
19
+ )
20
+ from f5_tts.model import DiT, UNetT
21
+
22
# ---- Command-line interface ----------------------------------------------
# Every CLI option overrides the corresponding key in the TOML config file.
parser = argparse.ArgumentParser(
    prog="python3 infer-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    help="Configuration file. Default=infer/examples/basic/basic.toml",
    default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
)
parser.add_argument(
    "-m",
    "--model",
    help="F5-TTS | E2-TTS",
)
parser.add_argument(
    "-p",
    "--ckpt_file",
    help="The Checkpoint .pt",
)
parser.add_argument(
    "-v",
    "--vocab_file",
    help="The vocab .txt",
)
parser.add_argument("-r", "--ref_audio", type=str, help="Reference audio file < 15 seconds.")
# "666" is a sentinel meaning "not supplied on the CLI"; it lets an *empty*
# --ref_text still override the config (empty string triggers auto-transcription
# downstream, so plain falsiness can't be used here).
parser.add_argument("-s", "--ref_text", type=str, default="666", help="Subtitle for the reference audio.")
parser.add_argument(
    "-t",
    "--gen_text",
    type=str,
    help="Text to generate.",
)
parser.add_argument(
    "-f",
    "--gen_file",
    type=str,
    help="File with text to generate. Ignores --gen_text",
)
parser.add_argument(
    "-o",
    "--output_dir",
    type=str,
    help="Path to output folder..",
)
parser.add_argument(
    "-w",
    "--output_file",
    type=str,
    help="Filename of output file..",
)
# Fix: was a value-taking option, so "--remove_silence false" evaluated truthy
# downstream; a boolean flag matches how the value is actually consumed.
parser.add_argument(
    "--remove_silence",
    action="store_true",
    help="Remove silence.",
)
parser.add_argument("--vocoder_name", type=str, default="vocos", choices=["vocos", "bigvgan"], help="vocoder name")
parser.add_argument(
    "--load_vocoder_from_local",
    action="store_true",
    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
)
parser.add_argument(
    "--speed",
    type=float,
    default=1.0,
    help="Adjust the speed of the audio generation (default: 1.0)",
)
args = parser.parse_args()
91
+
92
# ---- Resolve effective settings: CLI args take precedence over the config ----
# Fix: open the config inside a `with` block instead of leaking the handle.
with open(args.config, "rb") as config_fp:
    config = tomli.load(config_fp)

ref_audio = args.ref_audio if args.ref_audio else config["ref_audio"]
# "666" is the argparse sentinel for "not given on the CLI" (see parser setup).
ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
gen_text = args.gen_text if args.gen_text else config["gen_text"]
gen_file = args.gen_file if args.gen_file else config["gen_file"]

# Patches for pip pkg user: example assets ship inside the installed f5_tts
# package, so rewrite package-relative example paths to absolute ones.
if "infer/examples/" in ref_audio:
    ref_audio = str(files("f5_tts").joinpath(ref_audio))
if "infer/examples/" in gen_file:
    gen_file = str(files("f5_tts").joinpath(gen_file))
if "voices" in config:
    for voice in config["voices"]:
        voice_ref_audio = config["voices"][voice]["ref_audio"]
        if "infer/examples/" in voice_ref_audio:
            config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(voice_ref_audio))

# A text file given via --gen_file wins over --gen_text.
# Fix: close the file deterministically instead of relying on GC.
if gen_file:
    with codecs.open(gen_file, "r", "utf-8") as gen_fp:
        gen_text = gen_fp.read()

output_dir = args.output_dir if args.output_dir else config["output_dir"]
output_file = args.output_file if args.output_file else config["output_file"]
model = args.model if args.model else config["model"]
ckpt_file = args.ckpt_file if args.ckpt_file else ""
vocab_file = args.vocab_file if args.vocab_file else ""
remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
speed = args.speed

wave_path = Path(output_dir) / output_file
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
122
+
123
# ---- Vocoder --------------------------------------------------------------
# The vocoder choice also fixes the mel-spectrogram flavour the TTS model must
# produce, so the same value is kept under both names used later in the file.
vocoder_name = args.vocoder_name
mel_spec_type = args.vocoder_name

# Local checkpoint locations used when --load_vocoder_from_local is given;
# argparse `choices` guarantees the name is one of these two.
_VOCODER_LOCAL_PATHS = {
    "vocos": "../checkpoints/vocos-mel-24khz",
    "bigvgan": "../checkpoints/bigvgan_v2_24khz_100band_256x",
}
vocoder_local_path = _VOCODER_LOCAL_PATHS[vocoder_name]

vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=args.load_vocoder_from_local, local_path=vocoder_local_path)
131
+
132
+
133
# load models
# Select architecture + config for the requested model family, resolve a
# checkpoint (explicit --ckpt_file wins; otherwise a default is chosen), then
# load the EMA model used for inference.
if model == "F5-TTS":
    model_cls = DiT
    # F5-TTS base architecture hyperparameters.
    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    if ckpt_file == "":
        if vocoder_name == "vocos":
            repo_name = "F5-TTS"
            exp_name = "F5TTS_Base"
            ckpt_step = 1200000
            # NOTE(review): hard-coded machine-local checkpoint path replaces the
            # usual HuggingFace download; repo_name/exp_name/ckpt_step above are
            # left unused on this branch — confirm this override is intentional.
            ckpt_file = "/home/tts/ttsteam/repos/en_f5/F5-TTS/ckpts/expresso/model_356000.pt"
            # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors; local path
        elif vocoder_name == "bigvgan":
            repo_name = "F5-TTS"
            exp_name = "F5TTS_Base_bigvgan"
            ckpt_step = 1250000
            # Download (and cache) the default checkpoint from the HF Hub.
            ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))

elif model == "E2-TTS":
    assert vocoder_name == "vocos", "E2-TTS only supports vocoder vocos"
    model_cls = UNetT
    # E2-TTS base architecture hyperparameters.
    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
    if ckpt_file == "":
        repo_name = "E2-TTS"
        exp_name = "E2TTS_Base"
        ckpt_step = 1200000
        ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
        # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors; local path


print(f"Using {model}...")
ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=mel_spec_type, vocab_file=vocab_file)
164
+
165
+
166
def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove_silence, speed):
    """Synthesize speech for *text_gen* and write the result to ``wave_path``.

    The text may contain ``[voice_name]`` tags that switch between the voices
    declared in the config's ``voices`` table; untagged chunks use the main
    voice built from *ref_audio* / *ref_text*.

    Args:
        ref_audio: path to the main reference audio clip.
        ref_text: transcript of the main reference audio.
        text_gen: text to synthesize, optionally containing ``[voice]`` tags.
        model_obj: loaded TTS model.
        mel_spec_type: vocoder/mel flavour name ("vocos" or "bigvgan").
        remove_silence: truthy to strip long silences from the generated wav.
        speed: speech-rate multiplier passed through to inference.

    Side effects: reads module globals ``config``, ``vocoder``, ``output_dir``
    and ``wave_path``; creates the output directory and writes the wav file.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice
    for voice in voices:
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("Voice:", voice)
        print("Ref_audio:", voices[voice]["ref_audio"])
        print("Ref_text:", voices[voice]["ref_text"])

    generated_audio_segments = []
    # Split *before* each [tag] (lookahead keeps the tag attached to its chunk).
    reg1 = r"(?=\[\w+\])"
    chunks = re.split(reg1, text_gen)
    reg2 = r"\[(\w+)\]"
    for text in chunks:
        if not text.strip():
            continue
        match = re.match(reg2, text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        # Drop the tag before synthesis; renamed locals avoid shadowing the
        # function parameters as the original code did.
        chunk_text = re.sub(reg2, "", text).strip()
        chunk_ref_audio = voices[voice]["ref_audio"]
        chunk_ref_text = voices[voice]["ref_text"]
        print(f"Voice: {voice}")
        audio_segment, final_sample_rate, _spectrogram = infer_process(
            chunk_ref_audio, chunk_ref_text, chunk_text, model_obj, vocoder, mel_spec_type=mel_spec_type, speed=speed
        )
        generated_audio_segments.append(audio_segment)

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)

        # Fix: exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(output_dir, exist_ok=True)

        # Fix: write directly via soundfile; the original opened the file with
        # open(..., "wb") only to pass f.name to sf.write, opening it twice.
        sf.write(str(wave_path), final_wave, final_sample_rate)
        # Remove silence
        if remove_silence:
            remove_silence_for_generated_wav(str(wave_path))
        print(wave_path)
219
+
220
+
221
def main():
    """CLI entry point: run one batch-inference pass with the resolved settings."""
    main_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        mel_spec_type,
        remove_silence,
        speed,
    )


if __name__ == "__main__":
    main()