|
|
|
|
|
""" |
|
|
VoxCPM Command Line Interface |
|
|
|
|
|
Unified CLI for voice cloning, direct TTS synthesis, and batch processing. |
|
|
|
|
|
Usage examples: |
|
|
# Direct synthesis (single sample) |
|
|
voxcpm --text "Hello world" --output output.wav |
|
|
|
|
|
# Voice cloning (with reference audio and text) |
|
|
voxcpm --text "Hello world" --prompt-audio voice.wav --prompt-text "reference text" --output output.wav --denoise |
|
|
|
|
|
# Batch processing (each line in the file is one sample) |
|
|
voxcpm --input texts.txt --output-dir ./outputs/ |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from typing import Optional, List |
|
|
import soundfile as sf |
|
|
|
|
|
from voxcpm.core import VoxCPM |
|
|
|
|
|
|
|
|
def validate_file_exists(file_path: str, file_type: str = "file") -> Path:
    """Validate that ``file_path`` refers to an existing, non-directory entry.

    Args:
        file_path: Path to check, as supplied by the user.
        file_type: Human-readable label used at the start of the error message.

    Returns:
        ``file_path`` wrapped in a ``pathlib.Path``.

    Raises:
        FileNotFoundError: If the path does not exist, or if it exists but is
            a directory.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"{file_type} '{file_path}' does not exist")
    # A directory "exists" too, but it can never be read as a file; reject it
    # here instead of letting a later open()/audio load fail obscurely.
    if path.is_dir():
        raise FileNotFoundError(f"{file_type} '{file_path}' is a directory, not a file")
    return path
|
|
|
|
|
|
|
|
def validate_output_path(output_path: str) -> Path:
    """Return ``output_path`` as a ``Path`` after ensuring its parent exists.

    Any missing parent directories are created; directories that already
    exist are left as they are. The output file itself is not created.
    """
    target = Path(output_path)
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return target
|
|
|
|
|
|
|
|
def load_model(args) -> VoxCPM:
    """Build the VoxCPM model described by the parsed CLI arguments.

    A truthy ``args.model_path`` selects the local constructor; otherwise the
    model is obtained through ``VoxCPM.from_pretrained`` (Hub). The enhancer
    location comes from ``args.zipenhancer_path`` and falls back to the
    ``ZIPENHANCER_MODEL_PATH`` environment variable.

    Any exception raised while loading is reported on stdout and the process
    exits with status 1.
    """
    print("Loading VoxCPM model...")

    enhancer_location = getattr(args, "zipenhancer_path", None) or os.environ.get(
        "ZIPENHANCER_MODEL_PATH", None
    )
    denoiser_enabled = not getattr(args, "no_denoiser", False)
    use_local = bool(getattr(args, "model_path", None))

    if use_local:
        origin = "local"
        build_kwargs = {
            "voxcpm_model_path": args.model_path,
            "zipenhancer_model_path": enhancer_location,
            "enable_denoiser": denoiser_enabled,
        }
    else:
        origin = "from_pretrained"
        build_kwargs = {
            "hf_model_id": getattr(args, "hf_model_id", "openbmb/VoxCPM-0.5B"),
            "load_denoiser": denoiser_enabled,
            "zipenhancer_model_id": enhancer_location,
            "cache_dir": getattr(args, "cache_dir", None),
            "local_files_only": getattr(args, "local_files_only", False),
        }

    try:
        # Resolve the constructor inside the try so every loading failure is
        # reported through the same message-and-exit path.
        builder = VoxCPM if use_local else VoxCPM.from_pretrained
        model = builder(**build_kwargs)
        print(f"Model loaded ({origin}).")
        return model
    except Exception as e:
        print(f"Failed to load model ({origin}): {e}")
        sys.exit(1)
|
|
|
|
|
|
|
|
def cmd_clone(args):
    """Run voice cloning: speak ``args.text`` using a reference recording.

    Requires ``args.text``, ``args.prompt_audio`` and ``args.prompt_text``;
    if any is missing, an error is printed and the process exits with
    status 1. The generated audio is written to ``args.output`` with a
    sample rate of 16000.
    """
    # Checked lazily and in order, so the first missing option is the one
    # that gets reported.
    required_options = (
        ("text", "Error: Please provide text to synthesize (--text)"),
        ("prompt_audio", "Error: Voice cloning requires a reference audio (--prompt-audio)"),
        ("prompt_text", "Error: Voice cloning requires a reference text (--prompt-text)"),
    )
    for option_name, error_message in required_options:
        if not getattr(args, option_name):
            print(error_message)
            sys.exit(1)

    # Validate paths before the (slow) model load.
    reference_wav = validate_file_exists(args.prompt_audio, "reference audio file")
    destination = validate_output_path(args.output)

    model = load_model(args)

    print(f"Synthesizing text: {args.text}")
    print(f"Reference audio: {reference_wav}")
    print(f"Reference text: {args.prompt_text}")

    waveform = model.generate(
        text=args.text,
        prompt_wav_path=str(reference_wav),
        prompt_text=args.prompt_text,
        cfg_value=args.cfg_value,
        inference_timesteps=args.inference_timesteps,
        normalize=args.normalize,
        denoise=args.denoise,
    )

    sample_rate = 16000
    sf.write(str(destination), waveform, sample_rate)
    print(f"Saved audio to: {destination}")

    seconds = len(waveform) / sample_rate
    print(f"Duration: {seconds:.2f}s")
|
|
|
|
|
|
|
|
def cmd_synthesize(args):
    """Run plain text-to-speech for ``args.text`` with no reference voice.

    Prints an error and exits with status 1 when ``args.text`` is empty.
    The generated audio is written to ``args.output`` with a sample rate of
    16000; prompt inputs are passed as ``None`` and denoising is disabled.
    """
    if not args.text:
        print("Error: Please provide text to synthesize (--text)")
        sys.exit(1)

    destination = validate_output_path(args.output)
    model = load_model(args)

    print(f"Synthesizing text: {args.text}")

    generation_options = {
        "text": args.text,
        "prompt_wav_path": None,
        "prompt_text": None,
        "cfg_value": args.cfg_value,
        "inference_timesteps": args.inference_timesteps,
        "normalize": args.normalize,
        "denoise": False,
    }
    waveform = model.generate(**generation_options)

    sample_rate = 16000
    sf.write(str(destination), waveform, sample_rate)
    print(f"Saved audio to: {destination}")

    seconds = len(waveform) / sample_rate
    print(f"Duration: {seconds:.2f}s")
|
|
|
|
|
|
|
|
def cmd_batch(args):
    """Synthesize every non-empty line of ``args.input`` into its own file.

    Each line is stripped and blank lines are skipped. Outputs are written to
    ``args.output_dir`` as ``output_001.wav``, ``output_002.wav``, ... with a
    sample rate of 16000. If ``args.prompt_audio`` is given, it is used as
    the reference recording for every line, together with
    ``args.prompt_text``.

    A failure on one line is reported and the remaining lines are still
    processed; a summary of successes is printed at the end.

    Raises:
        FileNotFoundError: Propagated from ``validate_file_exists`` when the
            input file or the reference audio does not exist.

    Exits with status 1 if the input file cannot be read or has no usable
    lines.
    """
    input_file = validate_file_exists(args.input, "input file")
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            texts = [line for line in stripped_lines if line]
    except Exception as e:
        print(f"Failed to read input file: {e}")
        sys.exit(1)
    if not texts:
        print("Error: Input file is empty or contains no valid lines")
        sys.exit(1)
    print(f"Found {len(texts)} lines to process")

    # Check the reference audio BEFORE loading the model, as cmd_clone does:
    # a mistyped path should fail immediately, not after a slow model load.
    prompt_audio_path = None
    if args.prompt_audio:
        prompt_audio_path = str(validate_file_exists(args.prompt_audio, "reference audio file"))

    model = load_model(args)

    # Denoising acts on the prompt audio, so it is only requested when a
    # prompt is actually present.
    denoise = args.denoise and prompt_audio_path is not None

    success_count = 0
    for i, text in enumerate(texts, 1):
        print(f"\nProcessing {i}/{len(texts)}: {text[:50]}...")

        # Deliberately best-effort: one bad line must not abort the batch.
        try:
            audio_array = model.generate(
                text=text,
                prompt_wav_path=prompt_audio_path,
                prompt_text=args.prompt_text,
                cfg_value=args.cfg_value,
                inference_timesteps=args.inference_timesteps,
                normalize=args.normalize,
                denoise=denoise,
            )
            output_file = output_dir / f"output_{i:03d}.wav"
            sf.write(str(output_file), audio_array, 16000)

            duration = len(audio_array) / 16000
            print(f"  Saved: {output_file} ({duration:.2f}s)")
            success_count += 1
        except Exception as e:
            print(f"  Failed: {e}")

    print(f"\nBatch finished: {success_count}/{len(texts)} succeeded")
|
|
|
|
|
def _build_unified_parser():
    """Build the unified argument parser (no subcommands, route by args).

    The mode is chosen later from which options were supplied: ``--input``
    selects batch mode, otherwise ``--text``/``--output`` select single-sample
    mode.

    The default for ``--zipenhancer-path`` is resolved when the parser is
    built: the ``ZIPENHANCER_MODEL_PATH`` environment variable if it is set
    and non-empty, otherwise the built-in model id. An explicit command-line
    value always wins.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="VoxCPM CLI (single parser) - voice cloning, direct TTS, and batch processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Direct synthesis (single sample)
  voxcpm --text "Hello world" --output out.wav

  # Voice cloning (reference audio + text)
  voxcpm --text "Hello world" --prompt-audio voice.wav --prompt-text "reference text" --output out.wav --denoise

  # Batch processing
  voxcpm --input texts.txt --output-dir ./outs

  # Select model (from Hub)
  voxcpm --text "Hello" --output out.wav --hf-model-id openbmb/VoxCPM-0.5B
        """
    )

    # Mode selection / input-output options
    parser.add_argument("--input", "-i", help="Input text file (one line per sample)")
    parser.add_argument("--output-dir", "-od", help="Output directory (for batch mode)")
    parser.add_argument("--text", "-t", help="Text to synthesize (single-sample mode)")
    parser.add_argument("--output", "-o", help="Output audio file path (single-sample mode)")

    # Reference (prompt) options
    parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path")
    parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio")
    parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio")
    parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)")

    # Generation options
    parser.add_argument("--cfg-value", type=float, default=2.0, help="CFG guidance scale (default: 2.0)")
    parser.add_argument("--inference-timesteps", type=int, default=10, help="Inference steps (default: 10)")
    parser.add_argument("--normalize", action="store_true", help="Enable text normalization")

    # Model loading options
    parser.add_argument("--model-path", type=str, help="Local VoxCPM model path (overrides Hub download)")
    parser.add_argument("--hf-model-id", type=str, default="openbmb/VoxCPM-0.5B", help="Hugging Face repo id (e.g., openbmb/VoxCPM-0.5B)")
    parser.add_argument("--cache-dir", type=str, help="Cache directory for Hub downloads")
    parser.add_argument("--local-files-only", action="store_true", help="Use only local files (no network)")
    parser.add_argument("--no-denoiser", action="store_true", help="Disable denoiser model loading")

    # With a hard-coded default this option was always truthy, so the
    # environment variable could never take effect. Resolve it here instead;
    # `or` also treats an empty variable as unset.
    builtin_zipenhancer = "iic/speech_zipenhancer_ans_multiloss_16k_base"
    zipenhancer_default = os.environ.get("ZIPENHANCER_MODEL_PATH") or builtin_zipenhancer
    parser.add_argument(
        "--zipenhancer-path",
        type=str,
        default=zipenhancer_default,
        help=(
            "ZipEnhancer model id or local path "
            "(default: $ZIPENHANCER_MODEL_PATH if set, otherwise " + builtin_zipenhancer + ")"
        ),
    )

    return parser
|
|
|
|
|
|
|
|
def _load_prompt_text_from_file(args):
    """Fill ``args.prompt_text`` from ``args.prompt_file`` when applicable.

    The file is consulted only when a reference audio was given, no inline
    ``--prompt-text`` was supplied, and ``--prompt-file`` is set. Prints an
    error and exits with status 1 if the file is missing.
    """
    if not args.prompt_audio or args.prompt_text or not args.prompt_file:
        return

    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, and user input errors deserve a clean message and exit code.
    if not os.path.isfile(args.prompt_file):
        print(f"Error: Prompt file does not exist or is not accessible: {args.prompt_file}")
        sys.exit(1)

    with open(args.prompt_file, 'r', encoding='utf-8') as f:
        # Strip so the trailing newline of a text file is not treated as part
        # of the reference text, and a whitespace-only file counts as empty.
        args.prompt_text = f.read().strip()


def main():
    """Unified CLI entrypoint: route by provided arguments.

    Routing:
        * ``--input`` given            -> batch mode (requires ``--output-dir``).
        * prompt audio or text given   -> voice cloning (requires both; the
          text may come from ``--prompt-file``).
        * otherwise                    -> direct synthesis.

    Single-sample modes require ``--text`` and ``--output``. Argument errors
    are printed and the process exits with status 1.
    """
    parser = _build_unified_parser()
    args = parser.parse_args()

    # Batch mode
    if args.input:
        if not args.output_dir:
            print("Error: Batch mode requires --output-dir")
            parser.print_help()
            sys.exit(1)
        # Honor --prompt-file in batch mode too; previously it was only
        # resolved on the single-sample path.
        _load_prompt_text_from_file(args)
        return cmd_batch(args)

    # Single-sample modes
    if not args.text or not args.output:
        print("Error: Single-sample mode requires --text and --output")
        parser.print_help()
        sys.exit(1)

    # Voice cloning when any prompt input is present
    if args.prompt_audio or args.prompt_text:
        _load_prompt_text_from_file(args)

        if not args.prompt_audio or not args.prompt_text:
            print("Error: Voice cloning requires both --prompt-audio and --prompt-text")
            sys.exit(1)
        return cmd_clone(args)

    # Direct synthesis
    return cmd_synthesize(args)
|
|
|
|
|
|
|
|
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|