| | |
| | |
| | |
| |
|
| | use clap::{Parser, Subcommand}; |
| | use indextts::{ |
| | pipeline::{IndexTTS, SynthesisOptions}, |
| | Config, Result, |
| | }; |
| | use std::path::PathBuf; |
| |
|
| | #[derive(Parser)] |
| | #[command( |
| | name = "indextts", |
| | about = "High-performance Text-to-Speech engine in Rust", |
| | version, |
| | author |
| | )] |
| | struct Cli { |
| | #[command(subcommand)] |
| | command: Commands, |
| | } |
| |
|
| | #[derive(Subcommand)] |
| | enum Commands { |
| | |
| | Synthesize { |
| | |
| | #[arg(short, long)] |
| | text: String, |
| |
|
| | |
| | #[arg(short = 'v', long)] |
| | voice: PathBuf, |
| |
|
| | |
| | #[arg(short, long, default_value = "output.wav")] |
| | output: PathBuf, |
| |
|
| | |
| | #[arg(short, long)] |
| | config: Option<PathBuf>, |
| |
|
| | |
| | #[arg(short, long, default_value = "models")] |
| | model_dir: PathBuf, |
| |
|
| | |
| | #[arg(long)] |
| | emotion: Option<String>, |
| |
|
| | |
| | #[arg(long, default_value = "1.0")] |
| | emotion_alpha: f32, |
| |
|
| | |
| | #[arg(long, default_value = "50")] |
| | top_k: usize, |
| |
|
| | |
| | #[arg(long, default_value = "0.95")] |
| | top_p: f32, |
| |
|
| | |
| | #[arg(long, default_value = "1.1")] |
| | repetition_penalty: f32, |
| |
|
| | |
| | #[arg(long)] |
| | fp16: bool, |
| |
|
| | |
| | #[arg(short, long, default_value = "cpu")] |
| | device: String, |
| | }, |
| |
|
| | |
| | SynthesizeFile { |
| | |
| | #[arg(short, long)] |
| | input: PathBuf, |
| |
|
| | |
| | #[arg(short = 'v', long)] |
| | voice: PathBuf, |
| |
|
| | |
| | #[arg(short, long, default_value = "output.wav")] |
| | output: PathBuf, |
| |
|
| | |
| | #[arg(short, long)] |
| | config: Option<PathBuf>, |
| |
|
| | |
| | #[arg(short, long, default_value = "models")] |
| | model_dir: PathBuf, |
| |
|
| | |
| | #[arg(long, default_value = "200")] |
| | silence_ms: u32, |
| | }, |
| |
|
| | |
| | InitConfig { |
| | |
| | #[arg(short, long, default_value = "config.yaml")] |
| | output: PathBuf, |
| | }, |
| |
|
| | |
| | Info, |
| |
|
| | |
| | Benchmark { |
| | |
| | #[arg(short, long, default_value = "10")] |
| | iterations: usize, |
| | }, |
| | } |
| |
|
| | fn main() -> Result<()> { |
| | |
| | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); |
| |
|
| | let cli = Cli::parse(); |
| |
|
| | match cli.command { |
| | Commands::Synthesize { |
| | text, |
| | voice, |
| | output, |
| | config, |
| | model_dir, |
| | emotion, |
| | emotion_alpha, |
| | top_k, |
| | top_p, |
| | repetition_penalty, |
| | fp16: _, |
| | device: _, |
| | } => { |
| | log::info!("IndexTTS Synthesizer"); |
| | log::info!("===================="); |
| |
|
| | |
| | let cfg = if let Some(config_path) = config { |
| | Config::load(config_path)? |
| | } else { |
| | let mut cfg = Config::default(); |
| | cfg.model_dir = model_dir; |
| | cfg |
| | }; |
| |
|
| | |
| | let tts = IndexTTS::new(cfg)?; |
| |
|
| | |
| | let emotion_vec = emotion.map(|s| { |
| | s.split(',') |
| | .filter_map(|v| v.trim().parse::<f32>().ok()) |
| | .collect::<Vec<f32>>() |
| | }); |
| |
|
| | |
| | let options = SynthesisOptions { |
| | emotion_vector: emotion_vec, |
| | emotion_alpha, |
| | sampling: indextts::model::SamplingStrategy::TopKP { k: top_k, p: top_p }, |
| | repetition_penalty, |
| | ..Default::default() |
| | }; |
| |
|
| | |
| | log::info!("Text: {}", &text[..text.len().min(100)]); |
| | log::info!("Voice: {}", voice.display()); |
| | log::info!("Output: {}", output.display()); |
| |
|
| | let result = tts.synthesize_to_file( |
| | &text, |
| | voice.to_str().unwrap(), |
| | output.to_str().unwrap(), |
| | &options, |
| | )?; |
| |
|
| | log::info!("Duration: {}", result.duration_formatted()); |
| | log::info!("Processing time: {:.2}s", result.processing_time); |
| | log::info!("Real-time factor: {:.3}x", result.rtf); |
| |
|
| | println!("✓ Synthesis complete: {}", output.display()); |
| | } |
| |
|
| | Commands::SynthesizeFile { |
| | input, |
| | voice, |
| | output, |
| | config, |
| | model_dir, |
| | silence_ms, |
| | } => { |
| | log::info!("IndexTTS File Synthesizer"); |
| | log::info!("=========================="); |
| |
|
| | |
| | let text = std::fs::read_to_string(&input)?; |
| |
|
| | |
| | let cfg = if let Some(config_path) = config { |
| | Config::load(config_path)? |
| | } else { |
| | let mut cfg = Config::default(); |
| | cfg.model_dir = model_dir; |
| | cfg |
| | }; |
| |
|
| | |
| | let tts = IndexTTS::new(cfg)?; |
| |
|
| | |
| | let options = SynthesisOptions { |
| | segment_silence_ms: silence_ms, |
| | ..Default::default() |
| | }; |
| |
|
| | |
| | log::info!("Input file: {}", input.display()); |
| | log::info!("Text length: {} characters", text.len()); |
| |
|
| | let result = tts.synthesize_long( |
| | &text, |
| | voice.to_str().unwrap(), |
| | &options, |
| | )?; |
| |
|
| | result.save(&output)?; |
| |
|
| | log::info!("Duration: {}", result.duration_formatted()); |
| | log::info!("Processing time: {:.2}s", result.processing_time); |
| | log::info!("Real-time factor: {:.3}x", result.rtf); |
| |
|
| | println!("✓ Synthesis complete: {}", output.display()); |
| | } |
| |
|
| | Commands::InitConfig { output } => { |
| | log::info!("Creating default configuration..."); |
| |
|
| | let config = Config::default(); |
| | config.save(&output)?; |
| |
|
| | println!("✓ Configuration saved to: {}", output.display()); |
| | } |
| |
|
| | Commands::Info => { |
| | println!("IndexTTS - High-performance Text-to-Speech Engine"); |
| | println!("=================================================="); |
| | println!("Version: {}", indextts::VERSION); |
| | println!("Platform: {}", std::env::consts::OS); |
| | println!("Architecture: {}", std::env::consts::ARCH); |
| | println!(); |
| | println!("Features:"); |
| | println!(" - Multi-language support (Chinese, English, mixed)"); |
| | println!(" - Zero-shot voice cloning"); |
| | println!(" - 8-dimensional emotion control"); |
| | println!(" - High-quality neural vocoding (BigVGAN)"); |
| | println!(" - SIMD-optimized audio processing"); |
| | println!(" - Parallel processing with Rayon"); |
| | println!(); |
| | println!("Sample Rate: {} Hz", indextts::SAMPLE_RATE); |
| | println!("Mel Bands: {}", indextts::N_MELS); |
| | println!("FFT Size: {}", indextts::N_FFT); |
| | println!("Hop Length: {}", indextts::HOP_LENGTH); |
| | println!(); |
| | println!("CPU Cores: {}", num_cpus::get()); |
| | println!("Physical Cores: {}", num_cpus::get_physical()); |
| | } |
| |
|
| | Commands::Benchmark { iterations } => { |
| | log::info!("Running benchmarks ({} iterations)...", iterations); |
| |
|
| | |
| | benchmark_mel_spectrogram(iterations); |
| |
|
| | |
| | benchmark_tokenization(iterations); |
| |
|
| | |
| | benchmark_vocoder(iterations); |
| |
|
| | println!("✓ Benchmarks complete"); |
| | } |
| | } |
| |
|
| | Ok(()) |
| | } |
| |
|
| | fn benchmark_mel_spectrogram(iterations: usize) { |
| | use indextts::audio::{mel_spectrogram, AudioConfig}; |
| | use std::time::Instant; |
| |
|
| | println!("\nMel-Spectrogram Benchmark"); |
| | println!("-------------------------"); |
| |
|
| | let config = AudioConfig::default(); |
| | let num_samples = config.sample_rate as usize; |
| | let signal: Vec<f32> = (0..num_samples) |
| | .map(|i| (i as f32 * 0.01).sin()) |
| | .collect(); |
| |
|
| | let start = Instant::now(); |
| | for _ in 0..iterations { |
| | let _ = mel_spectrogram(&signal, &config); |
| | } |
| | let elapsed = start.elapsed(); |
| |
|
| | let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| | println!(" Signal length: {} samples ({:.2}s)", num_samples, num_samples as f32 / config.sample_rate as f32); |
| | println!(" Iterations: {}", iterations); |
| | println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| | println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| | println!(" Throughput: {:.1}x real-time", 1.0 / per_iter); |
| | } |
| |
|
| | fn benchmark_tokenization(iterations: usize) { |
| | use indextts::text::{TextNormalizer, TextTokenizer, TokenizerConfig}; |
| | use std::time::Instant; |
| |
|
| | println!("\nTokenization Benchmark"); |
| | println!("----------------------"); |
| |
|
| | let normalizer = TextNormalizer::new(); |
| | let tokenizer = TextTokenizer::new(TokenizerConfig::default()).unwrap(); |
| |
|
| | let test_texts = vec![ |
| | "Hello world, this is a test of the text-to-speech system.", |
| | "The quick brown fox jumps over the lazy dog.", |
| | "你好世界,这是一个测试。", |
| | "Mixed language: Hello 世界 and 你好 world.", |
| | ]; |
| |
|
| | let start = Instant::now(); |
| | for _ in 0..iterations { |
| | for text in &test_texts { |
| | let normalized = normalizer.normalize(text).unwrap(); |
| | let _tokens = tokenizer.encode(&normalized).unwrap(); |
| | } |
| | } |
| | let elapsed = start.elapsed(); |
| |
|
| | let total_chars: usize = test_texts.iter().map(|t| t.len()).sum(); |
| | let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| | println!(" Texts: {}", test_texts.len()); |
| | println!(" Total characters: {}", total_chars); |
| | println!(" Iterations: {}", iterations); |
| | println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| | println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| | println!( |
| | " Throughput: {:.0} chars/sec", |
| | (total_chars * iterations) as f32 / elapsed.as_secs_f32() |
| | ); |
| | } |
| |
|
| | fn benchmark_vocoder(iterations: usize) { |
| | use indextts::vocoder::{create_bigvgan_22k, Vocoder}; |
| | use ndarray::Array2; |
| | use std::time::Instant; |
| |
|
| | println!("\nVocoder Benchmark"); |
| | println!("-----------------"); |
| |
|
| | let vocoder = create_bigvgan_22k(); |
| | let num_frames = 100; |
| | let mel = Array2::zeros((80, num_frames)); |
| |
|
| | let start = Instant::now(); |
| | for _ in 0..iterations { |
| | let _ = vocoder.synthesize(&mel); |
| | } |
| | let elapsed = start.elapsed(); |
| |
|
| | let audio_duration = num_frames as f32 * vocoder.hop_length() as f32 / vocoder.sample_rate() as f32; |
| | let per_iter = elapsed.as_secs_f32() / iterations as f32; |
| | println!(" Mel frames: {}", num_frames); |
| | println!(" Audio duration: {:.2}s", audio_duration); |
| | println!(" Iterations: {}", iterations); |
| | println!(" Total time: {:.3}s", elapsed.as_secs_f32()); |
| | println!(" Per iteration: {:.3}ms", per_iter * 1000.0); |
| | println!(" RTF: {:.3}x", per_iter / audio_duration); |
| | } |
| |
|