| | |
| | |
| | |
| |
|
| | mod synthesis; |
| |
|
| | pub use synthesis::{IndexTTS, SynthesisOptions, SynthesisResult}; |
| |
|
| | use crate::{Error, Result}; |
| | use std::path::{Path, PathBuf}; |
| |
|
| | |
/// Identifies one stage of the synthesis pipeline.
///
/// The variants are declared in the order listed below. Whether that is also
/// the order in which the pipeline executes them is not established here
/// (presumably it is — confirm against the synthesis code).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PipelineStage {
    /// Text normalization stage.
    TextNormalization,
    /// Tokenization stage.
    Tokenization,
    /// Semantic encoding stage.
    SemanticEncoding,
    /// Speaker conditioning stage.
    SpeakerConditioning,
    /// GPT generation stage.
    GptGeneration,
    /// Acoustic expansion stage.
    AcousticExpansion,
    /// Vocoding stage.
    Vocoding,
    /// Post-processing stage.
    PostProcessing,
}
| |
|
| | impl PipelineStage { |
| | |
| | pub fn name(&self) -> &'static str { |
| | match self { |
| | PipelineStage::TextNormalization => "Text Normalization", |
| | PipelineStage::Tokenization => "Tokenization", |
| | PipelineStage::SemanticEncoding => "Semantic Encoding", |
| | PipelineStage::SpeakerConditioning => "Speaker Conditioning", |
| | PipelineStage::GptGeneration => "GPT Generation", |
| | PipelineStage::AcousticExpansion => "Acoustic Expansion", |
| | PipelineStage::Vocoding => "Vocoding", |
| | PipelineStage::PostProcessing => "Post Processing", |
| | } |
| | } |
| |
|
| | |
| | pub fn all() -> Vec<PipelineStage> { |
| | vec![ |
| | PipelineStage::TextNormalization, |
| | PipelineStage::Tokenization, |
| | PipelineStage::SemanticEncoding, |
| | PipelineStage::SpeakerConditioning, |
| | PipelineStage::GptGeneration, |
| | PipelineStage::AcousticExpansion, |
| | PipelineStage::Vocoding, |
| | PipelineStage::PostProcessing, |
| | ] |
| | } |
| | } |
| |
|
| | |
| | pub type ProgressCallback = Box<dyn Fn(PipelineStage, f32) + Send + Sync>; |
| |
|
| | |
/// Configuration values for the synthesis pipeline.
///
/// Use [`Default`] for baseline values and the `with_*` builder methods to
/// override individual fields.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Directory the models are expected in. `validate` only logs a warning
    /// when this path does not exist.
    pub model_dir: PathBuf,
    /// Whether FP16 should be used (presumably half-precision inference —
    /// confirm against the model-loading code).
    pub use_fp16: bool,
    /// Name of the compute device, e.g. `"cpu"`. The set of accepted values is
    /// not established here.
    pub device: String,
    /// Whether caching is enabled. What is cached is not established here.
    pub enable_cache: bool,
    /// Upper bound on input text length; `validate` requires it to be > 0.
    /// NOTE(review): unit (bytes vs. characters) is not established here.
    pub max_text_length: usize,
    /// Upper bound on generated audio duration; `validate` requires it to be
    /// > 0. Presumably expressed in seconds — confirm.
    pub max_audio_duration: f32,
}
| |
|
| | impl Default for PipelineConfig { |
| | fn default() -> Self { |
| | Self { |
| | model_dir: PathBuf::from("models"), |
| | use_fp16: false, |
| | device: "cpu".to_string(), |
| | enable_cache: true, |
| | max_text_length: 500, |
| | max_audio_duration: 30.0, |
| | } |
| | } |
| | } |
| |
|
| | impl PipelineConfig { |
| | |
| | pub fn with_model_dir<P: AsRef<Path>>(mut self, path: P) -> Self { |
| | self.model_dir = path.as_ref().to_path_buf(); |
| | self |
| | } |
| |
|
| | |
| | pub fn with_fp16(mut self, enable: bool) -> Self { |
| | self.use_fp16 = enable; |
| | self |
| | } |
| |
|
| | |
| | pub fn with_device(mut self, device: &str) -> Self { |
| | self.device = device.to_string(); |
| | self |
| | } |
| |
|
| | |
| | pub fn validate(&self) -> Result<()> { |
| | if !self.model_dir.exists() { |
| | log::warn!( |
| | "Model directory does not exist: {}", |
| | self.model_dir.display() |
| | ); |
| | } |
| |
|
| | if self.max_text_length == 0 { |
| | return Err(Error::Config("max_text_length must be > 0".into())); |
| | } |
| |
|
| | if self.max_audio_duration <= 0.0 { |
| | return Err(Error::Config("max_audio_duration must be > 0".into())); |
| | } |
| |
|
| | Ok(()) |
| | } |
| | } |
| |
|
| | |
| | pub fn segment_text(text: &str, max_segment_len: usize) -> Vec<String> { |
| | use crate::text::TextNormalizer; |
| |
|
| | let normalizer = TextNormalizer::new(); |
| | let sentences = normalizer.split_sentences(text); |
| |
|
| | let mut segments = Vec::new(); |
| | let mut current_segment = String::new(); |
| |
|
| | for sentence in sentences { |
| | if current_segment.len() + sentence.len() > max_segment_len && !current_segment.is_empty() |
| | { |
| | segments.push(current_segment.trim().to_string()); |
| | current_segment = sentence; |
| | } else { |
| | if !current_segment.is_empty() { |
| | current_segment.push(' '); |
| | } |
| | current_segment.push_str(&sentence); |
| | } |
| | } |
| |
|
| | if !current_segment.trim().is_empty() { |
| | segments.push(current_segment.trim().to_string()); |
| | } |
| |
|
| | segments |
| | } |
| |
|
| | |
/// Joins audio segments into one buffer, inserting silence between neighbours.
///
/// `silence_duration_ms * sample_rate / 1000` zero-valued samples (integer
/// division) are placed between each pair of consecutive segments. No silence
/// is added before the first or after the last segment. An empty `segments`
/// slice yields an empty buffer.
///
/// # Panics
///
/// Panics if the number of silence samples does not fit in `usize`, which can
/// only happen on targets where `usize` is narrower than 64 bits.
pub fn concatenate_audio(
    segments: &[Vec<f32>],
    silence_duration_ms: u32,
    sample_rate: u32,
) -> Vec<f32> {
    // Multiply in u64: the product of two u32 values always fits, whereas a
    // `usize` multiplication can overflow on 32-bit targets.
    let silence_wide = u64::from(silence_duration_ms) * u64::from(sample_rate) / 1000;
    let silence_samples =
        usize::try_from(silence_wide).expect("silence length exceeds addressable memory");

    // Size the output once up front instead of growing it repeatedly.
    let gap_count = segments.len().saturating_sub(1);
    let audio_total: usize = segments.iter().map(Vec::len).sum();
    let gap_total = silence_samples.saturating_mul(gap_count);
    let mut result = Vec::with_capacity(audio_total.saturating_add(gap_total));

    for (i, segment) in segments.iter().enumerate() {
        if i > 0 {
            // Pad with zeros in place; no separate silence buffer is needed.
            result.resize(result.len() + silence_samples, 0.0);
        }
        result.extend_from_slice(segment);
    }

    result
}
| |
|
| | |
/// Estimates how long `text` takes to speak, in seconds.
///
/// The estimate is the number of Unicode scalar values (`char`s) in `text`
/// divided by `chars_per_second`.
///
/// A rate that is NaN, zero or negative cannot produce a meaningful duration,
/// so `0.0` is returned for it rather than NaN, infinity or a negative value.
pub fn estimate_duration(text: &str, chars_per_second: f32) -> f32 {
    if chars_per_second.is_nan() || chars_per_second <= 0.0 {
        return 0.0;
    }
    text.chars().count() as f32 / chars_per_second
}
| |
|