"""Command-line LoRA inference script for VoxCPM.

Loads the base model named in a YAML config, applies LoRA weights from a
checkpoint, and synthesizes either a single ``--text`` or every entry of a
``--text_file`` (one ``wav_id||text`` entry per line) into WAV files.
"""

import argparse
import os
import re
from pathlib import Path
from typing import List, Tuple

import numpy as np
import soundfile as sf
import torch

from voxcpm.model import VoxCPMModel
from voxcpm.model.voxcpm import LoRAConfig
from voxcpm.training.config import load_yaml_config

# Sample rate handed to soundfile when writing the generated audio.
# NOTE(review): hard-coded in the original script; confirm that it matches the
# sample rate the loaded model actually produces.
OUTPUT_SAMPLE_RATE = 16000

# Compiled once: collapses any run of whitespace (newlines included).
_WHITESPACE_RE = re.compile(r"\s+")


def _read_text_file(path: str) -> List[Tuple[str, str]]:
    """Parse a ``wav_id||text`` file into ``(wav_id, text)`` pairs.

    The first ``||``-separated field of a line is the output file stem; any
    remaining fields are joined with single spaces to form the text. Blank
    lines are ignored, and lines without an id or without text are skipped
    with a warning instead of being synthesized as empty audio.

    Args:
        path: Path of the UTF-8 encoded text file.

    Returns:
        The parsed entries in file order.
    """
    entries = []
    # Explicit UTF-8: the platform default encoding is not reliable for
    # non-ASCII (e.g. Chinese) input text.
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            wav_id, *text_fields = line.split("||")
            wav_id = wav_id.strip()  # avoid file names like "id .wav"
            text = " ".join(text_fields)
            if not wav_id or not text.strip():
                print(f"[WARNING] skipping malformed line {line_no} in {path}: {line!r}")
                continue
            entries.append((wav_id, text))
    return entries


def _synthesize_to_file(model, text_normalizer, text, out_path, cfg_value, inference_timesteps):
    """Normalize *text*, generate audio with *model*, and write it to *out_path*.

    Args:
        model: Loaded model exposing ``generate(...)``.
        text_normalizer: Object exposing ``normalize(str)``.
        text: Raw input text; whitespace runs are collapsed to single spaces.
        out_path: Destination WAV path (its directory must already exist).
        cfg_value: Guidance value forwarded to ``model.generate``.
        inference_timesteps: Timestep count forwarded to ``model.generate``.
    """
    with torch.inference_mode():
        target_text = _WHITESPACE_RE.sub(" ", text)
        target_text = text_normalizer.normalize(target_text)
        wav = model.generate(
            target_text=target_text,
            cfg_value=cfg_value,  # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
            inference_timesteps=inference_timesteps,  # LocDiT inference timesteps, higher for better result, lower for fast speed
            retry_badcase=True,  # enable retrying mode for some bad cases (unstoppable)
            retry_badcase_max_times=3,  # maximum retrying times
            retry_badcase_ratio_threshold=6.0,  # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow pace speech
        )
        # Drop the leading dimension when the output has more than one.
        audio_np = wav.squeeze(0).cpu().numpy() if wav.dim() > 1 else wav.cpu().numpy()
    sf.write(out_path, audio_np, OUTPUT_SAMPLE_RATE)
    print(f"saved: {out_path}")


def main():
    """Parse CLI arguments, load the model with LoRA weights, and run inference.

    Exactly one input source is used: ``--text`` takes precedence over
    ``--text_file`` when both are given. Results are written to
    ``<output_dir>/output_lora.wav`` (single text) or
    ``<output_dir>/<wav_id>.wav`` (text file).

    Raises:
        FileNotFoundError: If the LoRA checkpoint path does not exist.
        SystemExit: Via argparse when neither ``--text`` nor ``--text_file``
            is provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lora_ckpt", type=str, required=True)
    parser.add_argument("--lora_config_path", type=str, required=True)
    parser.add_argument("--text", type=str)
    parser.add_argument("--text_file", type=str)
    parser.add_argument("--output_dir", type=str, default="outputs")
    parser.add_argument("--cfg_value", type=float, default=2.0)
    parser.add_argument("--inference_timesteps", type=int, default=10)
    args = parser.parse_args()

    # Not an assert: asserts are stripped under ``python -O``.
    if not (args.text or args.text_file):
        parser.error("Please provide either text or text_file")

    # 1. Read the YAML config.
    cfg = load_yaml_config(args.lora_config_path)
    pretrained_path = cfg["pretrained_path"]
    lora_cfg_dict = cfg.get("lora", {}) or {}
    lora_cfg = LoRAConfig(**lora_cfg_dict) if lora_cfg_dict else None

    # 2. Load the base model (including the LoRA structure, and run torch.compile).
    print(f"[1/3] 加载基础模型:{pretrained_path}")
    model = VoxCPMModel.from_local(
        pretrained_path,
        optimize=True,  # compile first; load_lora_weights uses named_parameters for compatibility
        training=False,
        lora_config=lora_cfg,
    )

    # NOTE(review): this import goes through ``src.voxcpm`` while the
    # module-level imports use ``voxcpm``; confirm which package path is intended.
    from src.voxcpm.utils.text_normalize import TextNormalizer

    text_normalizer = TextNormalizer()

    # 3. Load the LoRA weights (also works after compile).
    ckpt_dir = Path(args.lora_ckpt)
    if not ckpt_dir.exists():
        raise FileNotFoundError(f"找不到 LoRA checkpoint: {ckpt_dir}")
    print(f"[2/3] 加载 LoRA 权重:{ckpt_dir}")
    loaded, skipped = model.load_lora_weights(str(ckpt_dir))
    print(f" 已加载 {len(loaded)} 个参数")
    if skipped:
        print(f"[WARNING] 跳过 {len(skipped)} 个参数")
        print(f" 跳过的 key (前5个): {skipped[:5]}")

    print(f"\n[3/3] 开始推理...")
    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(args.output_dir, exist_ok=True)

    if args.text:
        jobs = [("output_lora", args.text)]
    else:
        jobs = _read_text_file(args.text_file)

    for wav_id, text in jobs:
        _synthesize_to_file(
            model,
            text_normalizer,
            text,
            f"{args.output_dir}/{wav_id}.wav",
            args.cfg_value,
            args.inference_timesteps,
        )


if __name__ == "__main__":
    main()