File size: 4,995 Bytes
6766eda
 
 
 
 
 
 
 
 
6d32e7f
6766eda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d32e7f
 
 
 
6766eda
 
 
 
 
 
 
 
 
 
 
 
 
 
6d32e7f
 
 
6766eda
6d32e7f
6766eda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d32e7f
 
 
6766eda
6d32e7f
6766eda
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import soundfile as sf
import numpy as np
from pathlib import Path
from voxcpm.model import VoxCPMModel
from voxcpm.model.voxcpm import LoRAConfig
from voxcpm.training.config import load_yaml_config
import argparse
import torch
import os
import re
def main():
    """CLI entry point: synthesize speech with a LoRA-finetuned VoxCPM model.

    Reads the base-model path and LoRA settings from a YAML config, loads
    the LoRA checkpoint on top of the (compiled) base model, then writes one
    16 kHz wav per input text — either a single ``--text`` string or a
    ``--text_file`` of ``wav_id||text`` lines — into ``--output_dir``.

    Raises:
        FileNotFoundError: if the LoRA checkpoint directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lora_ckpt", type=str, required=True)
    parser.add_argument("--lora_config_path", type=str, required=True)
    parser.add_argument("--text", type=str)
    parser.add_argument("--text_file", type=str)
    parser.add_argument("--output_dir", type=str, default="outputs")
    parser.add_argument("--cfg_value", type=float, default=2.0)
    parser.add_argument("--inference_timesteps", type=int, default=10)
    args = parser.parse_args()
    # parser.error (not assert): assertions are stripped under `python -O`,
    # and parser.error gives the standard usage message + exit code 2.
    if not (args.text or args.text_file):
        parser.error("Please provide either text or text_file")

    # 1. Read the YAML config.
    cfg = load_yaml_config(args.lora_config_path)
    pretrained_path = cfg["pretrained_path"]
    lora_cfg_dict = cfg.get("lora", {}) or {}
    lora_cfg = LoRAConfig(**lora_cfg_dict) if lora_cfg_dict else None

    # 2. Load the base model with the LoRA structure (runs torch.compile).
    print(f"[1/3] 加载基础模型:{pretrained_path}")
    model = VoxCPMModel.from_local(
        pretrained_path,
        optimize=True,  # compile first; load_lora_weights uses named_parameters, which stays compatible
        training=False,
        lora_config=lora_cfg,
    )

    from src.voxcpm.utils.text_normalize import TextNormalizer
    text_normalizer = TextNormalizer()

    # 3. Load the LoRA weights (still works after torch.compile).
    ckpt_dir = Path(args.lora_ckpt)
    if not ckpt_dir.exists():
        raise FileNotFoundError(f"找不到 LoRA checkpoint: {ckpt_dir}")

    print(f"[2/3] 加载 LoRA 权重:{ckpt_dir}")
    loaded, skipped = model.load_lora_weights(str(ckpt_dir))
    print(f"       已加载 {len(loaded)} 个参数")
    if skipped:
        print(f"[WARNING] 跳过 {len(skipped)} 个参数")
        print(f"       跳过的 key (前5个): {skipped[:5]}")
    print(f"\n[3/3] 开始推理...")

    # Create the output directory once, instead of re-checking per item.
    os.makedirs(args.output_dir, exist_ok=True)

    if args.text:
        audio_np = _synthesize(model, text_normalizer, args.text, args)
        out_path = f"{args.output_dir}/output_lora.wav"
        sf.write(out_path, audio_np, 16000)
        print(f"saved: {out_path}")
    elif args.text_file:
        for wav_id, text in _read_text_file(args.text_file):
            audio_np = _synthesize(model, text_normalizer, text, args)
            out_path = f"{args.output_dir}/{wav_id}.wav"
            sf.write(out_path, audio_np, 16000)
            print(f"saved: {out_path}")


def _read_text_file(path):
    """Parse a manifest of ``wav_id||text`` lines into (wav_id, text) pairs.

    Blank lines are skipped — previously an empty line produced an empty
    wav_id and an output file literally named ".wav".
    """
    pairs = []
    with open(path, "r") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            parts = line.split("||")
            pairs.append((parts[0], " ".join(parts[1:])))
    return pairs


def _synthesize(model, text_normalizer, text, args):
    """Normalize `text`, run one generation pass, return a 1-D numpy waveform."""
    with torch.inference_mode():
        # Collapse newlines/whitespace before TN so normalization sees one line.
        target_text = re.sub(r'\s+', ' ', text.replace("\n", " "))
        target_text = text_normalizer.normalize(target_text)
        wav = model.generate(
            target_text=target_text,
            cfg_value=args.cfg_value,             # LM guidance on LocDiT; higher = closer prompt adherence, possibly worse audio
            inference_timesteps=args.inference_timesteps,  # LocDiT steps; higher = better quality, lower = faster
            retry_badcase=True,                   # retry detected bad cases (unstoppable generation)
            retry_badcase_max_times=3,            # maximum retry attempts
            retry_badcase_ratio_threshold=6.0,    # length-ratio cutoff for bad-case detection; raise for slow-paced speech
        )
        # Drop a possible leading batch dimension so sf.write gets 1-D audio.
        return wav.squeeze(0).cpu().numpy() if wav.dim() > 1 else wav.cpu().numpy()

# Script entry point: run the LoRA inference CLI only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()