import argparse
import os

import numpy as np
import soundfile as sf
from voxcpm import VoxCPM

# Sample rate (Hz) handed to soundfile when writing generated audio.
_OUTPUT_SAMPLE_RATE = 16000


def _read_text_file(path):
    """Parse a batch file into ``(wav_id, text)`` pairs.

    Each non-blank line is split on ``"||"``: the first field is the output
    id, and any remaining fields are joined with single spaces to form the
    text to synthesize.

    Args:
        path: Path to the UTF-8 encoded batch file.

    Returns:
        A list of ``(wav_id, text)`` tuples in file order.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # A blank line would otherwise yield an empty id and empty
                # text, i.e. a wasted generation written to "<dir>/.wav".
                continue
            wav_id, *text_parts = stripped.split("||")
            entries.append((wav_id, " ".join(text_parts)))
    return entries


def _synthesize(model, text, args):
    """Run ``model.generate`` for *text* using the options parsed into *args*."""
    return model.generate(
        text=text,
        prompt_wav_path=args.prompt_wav_path,  # optional: path to a prompt speech for voice cloning
        prompt_text=args.prompt_text,  # optional: reference text
        cfg_value=args.cfg_value,  # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
        inference_timesteps=args.inference_timesteps,  # LocDiT inference timesteps, higher for better result, lower for fast speed
        normalize=True,  # enable external TN tool
        denoise=False,  # enable external Denoise tool
        retry_badcase=True,  # enable retrying mode for some bad cases (unstoppable)
        retry_badcase_max_times=3,  # maximum retrying times
        retry_badcase_ratio_threshold=6.0,  # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow pace speech
    )


def _save_wav(wav, output_dir, name):
    """Write *wav* to ``<output_dir>/<name>.wav`` and report the saved path."""
    out_path = f"{output_dir}/{name}.wav"
    sf.write(out_path, wav, _OUTPUT_SAMPLE_RATE)
    print(f"saved: {out_path}")


def main():
    """Command-line entry point: synthesize speech for one text or a batch file.

    With ``--text`` a single ``output.wav`` is written to ``--output_dir``.
    Otherwise every ``wav_id||text`` line of ``--text_file`` is synthesized
    to ``<wav_id>.wav``. ``--prompt_wav_path`` and ``--prompt_text`` are
    optional but must be supplied together.

    Invalid argument combinations are reported through ``parser.error``,
    which prints the usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--text", type=str)
    parser.add_argument("--text_file", type=str)
    parser.add_argument("--output_dir", type=str, default="outputs")
    parser.add_argument("--cfg_value", type=float, default=2.0)
    parser.add_argument("--inference_timesteps", type=int, default=10)
    parser.add_argument("--prompt_wav_path", type=str)
    parser.add_argument("--prompt_text", type=str)
    args = parser.parse_args()

    # Validate with parser.error rather than assert: asserts are removed
    # under ``python -O`` and would surface to the user as a traceback.
    if not (args.text or args.text_file):
        parser.error("Please provide either text or text_file")

    # prompt_wav_path and prompt_text must be provided together.
    if bool(args.prompt_wav_path) != bool(args.prompt_text):
        parser.error("Please provide both prompt_wav_path and prompt_text")

    model = VoxCPM.from_pretrained(args.model_path, load_denoiser=False)

    # Create the output directory once; exist_ok avoids the check-then-create race.
    os.makedirs(args.output_dir, exist_ok=True)

    if args.text:
        # --text takes precedence over --text_file when both are given.
        _save_wav(_synthesize(model, args.text, args), args.output_dir, "output")
        return

    for wav_id, text in _read_text_file(args.text_file):
        _save_wav(_synthesize(model, text, args), args.output_dir, wav_id)


if __name__ == "__main__":
    main()