| |
| |
| """ |
| Long-Form VibeVoice Handler for HuggingFace Inference Endpoints |
| |
| === WHAT THIS HANDLER DOES === |
| This handler is specifically optimized for generating 10-30 minutes of high-quality |
| AI speech using Microsoft's VibeVoice model. It's designed to run on HuggingFace |
| Inference Endpoints with NVIDIA GPUs. |
| |
| === KEY OPTIMIZATIONS EXPLAINED === |
| 1. Flash Attention 2: Reduces memory usage from O(N²) to O(N) for sequence length |
| 2. Memory Management: Aggressive VRAM optimization for long sequences |
| 3. Parameter Tuning: Balanced speed vs quality for long-form content |
| 4. No Fallbacks: Fails fast if requirements aren't met (production reliability) |
| |
| === TWEAKING GUIDE FOR DIFFERENT OBJECTIVES === |
| |
| FOR FASTER GENERATION (Lower Latency): |
| - Reduce ddpm_steps: 4-6 steps (current: 6) |
| - Lower cfg_scale: 1.0-1.1 (current: 1.2) |
| - Reduce max_new_tokens: 4096 (current: 8192) |
| |
| FOR BETTER QUALITY (Higher Latency): |
| - Increase ddpm_steps: 8-12 steps |
| - Higher cfg_scale: 1.3-1.5 |
| - Increase max_new_tokens: 12288+ |
| |
| FOR SHORTER CONTENT (< 5 minutes): |
| - Increase ddpm_steps to 8-10 |
| - Increase cfg_scale to 1.3-1.4 |
| - Can use smaller GPU (16GB VRAM) |
| |
| FOR VERY LONG CONTENT (30+ minutes): |
| - Keep ddpm_steps low: 4-6 |
| - Lower cfg_scale: 1.0-1.1 |
| - Increase memory allocation |
| - Consider content chunking |
| """ |
|
|
| import os |
| import re |
| import io |
| import base64 |
| import tempfile |
| import time |
| from typing import Dict, List, Any, Optional, Tuple |
| import torch |
| import torchaudio |
| import numpy as np |
|
|
| |
| from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference |
| from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor |
| from transformers.utils import logging |
|
|
| |
# `logging` here is `transformers.utils.logging` (imported above), not the
# standard-library module. Restrict library log output to errors, then create
# this module's logger.
logging.set_verbosity_error()
logger = logging.get_logger(__name__)
|
|
|
|
class LongFormVibeVoiceHandler:
    """
    Inference-endpoint handler for long-form (10-30 minute) VibeVoice speech.

    === LIFECYCLE ===
    * ``__init__`` configures CUDA settings, validates the GPU, loads the model
      with Flash Attention 2 and applies a per-process GPU memory cap.
    * ``__call__`` parses a speaker-labelled script, prepares one reference
      voice per speaker, runs ``model.generate`` and returns base64-encoded
      WAV audio together with timing metadata.

    The handler deliberately has no fallbacks: missing CUDA, too little VRAM,
    an unsupported GPU generation or a missing ``flash_attn`` package raise
    ``RuntimeError`` during construction.

    === TUNING NOTES (author's guidance, not measured in this file) ===
    * Faster: ddpm_steps 4-6, cfg_scale 1.0-1.1, max_new_tokens 4096.
    * Higher quality: ddpm_steps 8-12, cfg_scale 1.3-1.5, max_new_tokens 12288+.
    * Very long content (30+ min): keep ddpm_steps at 4-6 and consider
      splitting the content into several requests.
    """

    # Defaults used when a request does not override them.
    DEFAULT_MODEL_ID = "microsoft/VibeVoice-1.5B"
    DEFAULT_DDPM_STEPS = 6
    DEFAULT_CFG_SCALE = 1.2
    DEFAULT_MAX_NEW_TOKENS = 8192

    # Hardware gates checked in _setup_cuda_device().
    MIN_VRAM_GB = 20
    # major * 10 + minor. FlashAttention-2's documentation lists Ampere (8.0),
    # Ada and Hopper as supported; Turing (7.5) is not.
    MIN_COMPUTE_CAPABILITY = 80

    # Fraction of device memory this process may allocate.
    MEMORY_FRACTION = 0.85

    # A speaker label must open a line: "Speaker 3: text" or "[3]: text".
    _SPEAKER_LABEL = re.compile(r'^Speaker\s+(\d+):\s*(.*)$', re.IGNORECASE)
    _BRACKET_LABEL = re.compile(r'^\[(\d+)\]:\s*(.*)$')

    def __init__(self, path: str = ""):
        """
        Build a ready-to-serve handler.

        Steps, in order:
        1. Apply CUDA/allocator settings.
        2. Validate the GPU (CUDA present, VRAM, compute capability).
        3. Load processor and model with Flash Attention 2.
        4. Apply the GPU memory cap and run a garbage collection.

        Args:
            path: Model path or hub id. Falls back to
                ``microsoft/VibeVoice-1.5B`` when empty.

        Raises:
            RuntimeError: If any hardware or dependency requirement is unmet.
        """
        print("🚀 Initializing Long-Form VibeVoice Handler (10-30 min optimized)")

        self.model_path = path or self.DEFAULT_MODEL_ID
        # Used to convert generated sample counts into seconds and to write WAV.
        self.sample_rate = 24000
        # Upper bound on distinct speakers accepted by the script parser.
        self.max_speakers = 4

        # Environment-based allocator settings are applied before the first
        # CUDA query. NOTE(review): PyTorch is expected to read
        # PYTORCH_CUDA_ALLOC_CONF when its caching allocator initialises, so it
        # is set before _setup_cuda_device() touches the device - confirm.
        self._configure_cuda_optimizations()
        self.device = self._setup_cuda_device()
        self._load_model_optimized()
        self._setup_memory_management()

        print("✅ Long-form handler ready for 10-30 minute audio generation")

    def _setup_cuda_device(self) -> str:
        """
        Validate that device 0 can run this handler and return the device name.

        Requirements enforced:
        * CUDA must be available.
        * At least ``MIN_VRAM_GB`` GiB of total device memory.
        * Compute capability of at least ``MIN_COMPUTE_CAPABILITY`` (8.0),
          matching the GPU generations FlashAttention-2 documents as supported.

        Returns:
            The string ``"cuda"``.

        Raises:
            RuntimeError: If any requirement is not met.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA not available! This handler requires NVIDIA GPU.")

        props = torch.cuda.get_device_properties(0)
        device_name = torch.cuda.get_device_name(0)
        memory_gb = props.total_memory / 1024**3
        compute_cap = props.major * 10 + props.minor

        print(f"🔥 GPU: {device_name} ({memory_gb:.1f}GB, Compute {compute_cap/10:.1f})")

        if memory_gb < self.MIN_VRAM_GB:
            raise RuntimeError(
                f"Insufficient VRAM ({memory_gb:.1f}GB). "
                f"Need {self.MIN_VRAM_GB}GB+ for 10-30 min audio."
            )

        if compute_cap < self.MIN_COMPUTE_CAPABILITY:
            raise RuntimeError(
                f"GPU compute capability {compute_cap/10:.1f} too old. "
                f"Need {self.MIN_COMPUTE_CAPABILITY/10:.1f}+ (Ampere or newer) "
                "for Flash Attention 2."
            )

        return "cuda"

    def _configure_cuda_optimizations(self):
        """
        Apply process-wide CUDA settings used for long sequences.

        Settings written here:
        1. ``PYTORCH_CUDA_ALLOC_CONF``: ``max_split_size_mb:2048`` and
           ``expandable_segments:True`` (allocator fragmentation controls).
        2. ``FLASH_ATTENTION_SKIP_CUDA_CHECK=1``.
           NOTE(review): intended to skip a flash-attn compatibility check;
           confirm that the installed flash-attn reads this variable.
        3. TF32 enabled for cuDNN and for CUDA matmul (faster, slightly less
           precise float32 math). Set both to False for maximum precision.
        4. Scaled-dot-product attention restricted to the flash backend; the
           memory-efficient and math backends are disabled, so there is no
           fallback if the flash kernel cannot be used.
        """
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:2048,expandable_segments:True'
        os.environ['FLASH_ATTENTION_SKIP_CUDA_CHECK'] = '1'

        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True

        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_mem_efficient_sdp(False)
        torch.backends.cuda.enable_math_sdp(False)

        print("✅ CUDA optimizations configured for long-form generation")

    def _load_model_optimized(self):
        """
        Load the VibeVoice processor and model with Flash Attention 2.

        Loading choices:
        * ``torch_dtype=torch.float16``: half-precision weights.
        * ``attn_implementation='flash_attention_2'``: no other attention
          implementation is requested.
        * ``device_map=None`` followed by an explicit ``.to(self.device)``.
        * ``use_cache=False`` both as a load argument and on ``model.config``.
        * DDPM inference steps initialised to ``DEFAULT_DDPM_STEPS`` (6).
          Author's guidance: 4 = fastest/draft, 6 = balanced, 8 = higher
          quality, 12+ = highest quality and slowest.

        Raises:
            RuntimeError: If the ``flash_attn`` package cannot be imported.
        """
        print("🧠 Loading model with Flash Attention 2...")

        try:
            import flash_attn
        except ImportError as exc:
            raise RuntimeError(
                "Flash Attention 2 not installed! Install with: pip install flash-attn --no-build-isolation"
            ) from exc
        print(f"✅ Flash Attention 2 version: {flash_attn.__version__}")

        self.processor = VibeVoiceProcessor.from_pretrained(
            self.model_path,
            cache_dir="/tmp/model_cache",
        )

        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            self.model_path,
            torch_dtype=torch.float16,
            attn_implementation='flash_attention_2',
            device_map=None,
            cache_dir="/tmp/model_cache",
            low_cpu_mem_usage=True,
            use_cache=False,
        )

        self.model = self.model.to(self.device)
        self.model.eval()

        self.model.set_ddpm_inference_steps(num_steps=self.DEFAULT_DDPM_STEPS)
        self.model.config.use_cache = False

        print("✅ Model loaded with Flash Attention 2 optimization")
        print("📊 DDPM Steps: 6 (balanced quality/speed for long-form)")
        print("💡 To tweak: Increase to 8-12 for quality, decrease to 4 for speed")

    def _setup_memory_management(self):
        """
        Release cached GPU memory and cap this process's GPU allocation.

        Steps:
        1. ``torch.cuda.empty_cache()`` releases unused cached blocks.
        2. ``torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION)``
           limits how much device memory this process may allocate. The
           previous code probed for ``torch.cuda.set_memory_fraction``, which
           is not a PyTorch API, so the cap was never applied.
        3. ``gc.collect()`` frees unreachable Python objects.

        Author's guidance: lower the fraction (e.g. 0.80) for 30+ minute
        content, raise it (e.g. 0.90) for short content.
        """
        torch.cuda.empty_cache()

        apply_cap = getattr(torch.cuda, 'set_per_process_memory_fraction', None)
        if apply_cap is not None:
            apply_cap(self.MEMORY_FRACTION)

        import gc
        gc.collect()

        print("✅ Memory management configured")
        if apply_cap is not None:
            print("📊 GPU Memory: Using 85% of VRAM (leaving 15% for processing)")
        else:
            # Only reached on PyTorch builds without the per-process cap API.
            print("⚠️ GPU memory cap not available in this PyTorch build - no limit applied")

    def _parse_long_form_script(self, text: str) -> Tuple[List[str], List[str]]:
        """
        Turn raw script text into per-speaker dialogue segments.

        Accepted input:
        * A speaker label must be the first thing on its line, in either form
          ``Speaker 1: Hello there!`` or ``[1]: Hello there!``.
        * Following unlabelled lines (including those after a blank line) are
          appended to the current speaker's turn.
        * Text appearing before the first label has no speaker and is ignored
          (a warning is printed).
        * Labels embedded mid-line are NOT recognised; they stay part of the
          surrounding text.

        Each turn is passed through ``_chunk_long_text`` so very long turns
        become several consecutive segments for the same speaker.

        Args:
            text: The raw script.

        Returns:
            ``(scripts, speaker_numbers)`` - parallel lists where
            ``scripts[i]`` is ``"Speaker N: <chunk>"`` and
            ``speaker_numbers[i]`` is ``"N"`` as a string.

        Raises:
            ValueError: If the text is empty, contains no labelled dialogue,
                or uses more than ``self.max_speakers`` distinct speakers.
        """
        if not text.strip():
            raise ValueError("Empty text input")

        scripts: List[str] = []
        speaker_numbers: List[str] = []

        def flush(speaker: Optional[str], turn: str) -> None:
            # Emit the finished turn (if any) as one or more segments.
            if speaker and turn:
                for chunk in self._chunk_long_text(turn.strip()):
                    scripts.append(f"Speaker {speaker}: {chunk}")
                    speaker_numbers.append(speaker)

        current_speaker: Optional[str] = None
        current_text = ""
        unlabelled_lines = 0

        # Paragraph breaks carry no meaning beyond separating lines, so the
        # paragraphs are walked line by line.
        for paragraph in re.split(r'\n\s*\n', text.strip()):
            for raw_line in paragraph.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                match = self._SPEAKER_LABEL.match(line) or self._BRACKET_LABEL.match(line)
                if match:
                    flush(current_speaker, current_text)
                    current_speaker, current_text = match.group(1), match.group(2)
                elif current_speaker is None:
                    unlabelled_lines += 1
                else:
                    current_text = f"{current_text} {line}" if current_text else line

        flush(current_speaker, current_text)

        if unlabelled_lines:
            print(f"⚠️ Ignored {unlabelled_lines} line(s) before the first speaker label")

        if not scripts:
            raise ValueError(
                "No speaker-labelled dialogue found. Start each turn on its own line "
                "with 'Speaker N:' or '[N]:'."
            )

        total_chars = sum(len(s) for s in scripts)
        unique_speakers = len(set(speaker_numbers))

        print(f"📊 Parsed long-form content:")
        print(f"   Total characters: {total_chars:,}")
        print(f"   Dialogue segments: {len(scripts)}")
        print(f"   Unique speakers: {unique_speakers}")

        if total_chars < 5000:
            print("⚠️ Warning: Content seems short for 10-30 min target")
            print("💡 Tip: 10 min ≈ 8,000-12,000 characters, 30 min ≈ 25,000-35,000 characters")

        if unique_speakers > self.max_speakers:
            raise ValueError(f"Too many speakers ({unique_speakers}). Maximum: {self.max_speakers}")

        return scripts, speaker_numbers

    def _chunk_long_text(self, text: str, max_chunk_size: int = 500) -> List[str]:
        """
        Split a long turn into chunks at sentence boundaries.

        Sentences are detected as text ending in ``.``, ``!`` or ``?`` followed
        by whitespace. The original terminator is preserved (earlier versions
        replaced every terminator with ``.`` and doubled the final period).
        Sentences are packed greedily; a single sentence longer than
        ``max_chunk_size`` is kept whole as its own oversized chunk.

        Author's guidance on chunk size: ~500 for conversation, 300-400 for
        narrative or dense content, 600-800 when fewer segments are preferred.

        Args:
            text: One speaker turn (already stripped).
            max_chunk_size: Target maximum chunk length in characters.

        Returns:
            A list of chunks; ``[text]`` when the text already fits.
        """
        if len(text) <= max_chunk_size:
            return [text]

        # Lookbehind keeps the punctuation attached to its sentence.
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s]

        chunks: List[str] = []
        current = ""
        for sentence in sentences:
            candidate = f"{current} {sentence}" if current else sentence
            if not current or len(candidate) <= max_chunk_size:
                current = candidate
            else:
                chunks.append(current)
                current = sentence
        if current:
            chunks.append(current)

        if len(chunks) > 1:
            print(f"   📝 Split long dialogue into {len(chunks)} chunks for better processing")

        return chunks

    def _prepare_voice_samples_longform(self, speaker_numbers: List[str],
                                        voice_samples: Optional[List[str]] = None,
                                        speaker_names: Optional[List[str]] = None) -> List[str]:
        """
        Produce one reference-audio file path per distinct speaker.

        Speakers are taken in order of first appearance. The i-th distinct
        speaker uses ``voice_samples[i]`` (base64-encoded audio) when present;
        otherwise a synthetic placeholder voice is generated.

        Author's guidance: 30+ second, clean, natural-speech samples work best
        for long-form content; 10-15 seconds is acceptable for quick tests.

        Args:
            speaker_numbers: Speaker id per segment, as returned by
                ``_parse_long_form_script``.
            voice_samples: Optional base64 audio strings, one per speaker.
            speaker_names: Accepted for interface compatibility; not used.

        Returns:
            File paths, one per distinct speaker, in first-appearance order.

        Raises:
            ValueError: If a supplied sample cannot be decoded or loaded.
        """
        unique_speakers = list(dict.fromkeys(speaker_numbers))
        voice_paths: List[str] = []

        print(f"🎭 Preparing voices for {len(unique_speakers)} speakers")

        try:
            for i, speaker_num in enumerate(unique_speakers):
                if voice_samples and i < len(voice_samples):
                    voice_path, duration = self._decode_voice_sample(
                        voice_samples[i], speaker_num, i + 1
                    )
                    print(f"✅ Speaker {speaker_num}: Custom voice ({duration:.1f}s)")
                else:
                    voice_path = self._create_synthetic_voice(speaker_num)
                    print(f"✅ Speaker {speaker_num}: Synthetic voice (generated)")
                # Previously omitted, which made this method always return [].
                voice_paths.append(voice_path)
        except Exception:
            # Do not leave files from earlier speakers behind on failure.
            self._cleanup_temp_files(voice_paths)
            raise

        return voice_paths

    def _decode_voice_sample(self, encoded: str, speaker_num: str, ordinal: int) -> Tuple[str, float]:
        """Decode one base64 voice sample to a temp file; return (path, seconds)."""
        temp_path = f"/tmp/longform_voice_{speaker_num}.wav"
        try:
            with open(temp_path, 'wb') as f:
                f.write(base64.b64decode(encoded))
            # Loading validates the file and gives the duration.
            waveform, sr = torchaudio.load(temp_path)
            duration = waveform.shape[1] / sr
        except Exception as e:
            self._cleanup_temp_files([temp_path])
            raise ValueError(f"Invalid voice sample {ordinal}: {e}") from e

        if duration < 10:
            print(f"⚠️ Voice sample {ordinal} is very short ({duration:.1f}s) - may affect quality")
        elif duration < 30:
            print(f"⚠️ Voice sample {ordinal} is {duration:.1f}s (recommend 30s+ for long-form)")
        else:
            print(f"✅ Voice sample {ordinal} duration: {duration:.1f}s (good for long-form)")

        return temp_path, duration

    def _create_synthetic_voice(self, speaker_num: str, duration: float = 30.0) -> str:
        """
        Write a synthetic placeholder reference "voice" and return its path.

        The signal is a sum of four harmonics of a per-speaker base frequency
        (``120 + 30 * speaker_num`` Hz, amplitudes 1/h), multiplied by a slowly
        decaying envelope with a 2 Hz modulation, peak-normalised to 0.7 and
        saved as mono 24 kHz audio.

        This is only a stand-in so each speaker has a distinct reference; it is
        not speech, and real voice samples should be preferred.

        Args:
            speaker_num: Numeric speaker id as a string.
            duration: Length of the generated clip in seconds.

        Returns:
            Path of the written WAV file under ``/tmp``.
        """
        sample_rate = 24000
        base_freq = 120 + int(speaker_num) * 30

        t = torch.linspace(0, duration, int(sample_rate * duration))

        waveform = torch.zeros_like(t)
        for harmonic in (1, 2, 3, 4):
            amplitude = 1.0 / harmonic
            waveform += amplitude * torch.sin(2 * torch.pi * base_freq * harmonic * t)

        envelope = torch.exp(-t / 20) * (1 + 0.3 * torch.sin(2 * torch.pi * 2 * t))
        waveform = waveform * envelope

        waveform = waveform / waveform.abs().max() * 0.7
        waveform = waveform.unsqueeze(0)  # (channels=1, samples) for torchaudio.save

        temp_path = f"/tmp/synthetic_longform_{speaker_num}.wav"
        torchaudio.save(temp_path, waveform, sample_rate)

        return temp_path

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate speech for one request.

        Stages:
        1. Parse the script into speaker-labelled segments.
        2. Prepare one reference voice file per speaker.
        3. Build model inputs with the processor and move tensors to the GPU.
        4. Run ``model.generate`` under ``no_grad`` and CUDA autocast.
        5. Encode the audio as base64 WAV and assemble the response.

        Request format:
            ``data["inputs"]``: the script text (required).
            ``data["parameters"]`` (optional dict):
                * ``voice_samples``: list of base64 audio strings.
                * ``speaker_names``: accepted, currently unused.
                * ``cfg_scale`` (default 1.2): guidance strength. Author's
                  guidance: 1.0-1.1 faster, 1.3-1.6 closer voice matching.
                * ``ddpm_steps`` (default 6): diffusion steps, applied to the
                  model on every request so a previous override cannot leak.
                * ``max_new_tokens`` (default 8192).
                * ``temperature`` (default 1.0).
                  NOTE(review): generation is called with ``do_sample=False``,
                  so temperature probably has no effect - confirm against
                  VibeVoice's ``generate``.
                * ``output_format``: only ``"wav"`` is produced.

        Returns:
            A dict with the base64 audio, sample rate, durations, the actual
            output format, segment/speaker counts, timings and settings. Both
            ``"ddpm_steps"`` and the legacy misspelled ``"ddmp_steps"`` key
            are included.

        Raises:
            ValueError: For missing input, invalid parameters, an unparsable
                script or invalid voice samples.
            RuntimeError: If generation yields no audio.
        """
        start_total = time.time()

        text_input = data.get("inputs", "")
        if not text_input:
            raise ValueError("No 'inputs' provided")

        # Tolerate an explicit null "parameters" field.
        params = data.get("parameters") or {}

        voice_samples = params.get("voice_samples", [])
        speaker_names = params.get("speaker_names", [])

        cfg_scale = params.get("cfg_scale", self.DEFAULT_CFG_SCALE)
        max_new_tokens = params.get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS)
        temperature = params.get("temperature", 1.0)
        output_format = params.get("output_format", "wav")

        try:
            ddpm_steps = int(params.get("ddpm_steps", self.DEFAULT_DDPM_STEPS))
        except (TypeError, ValueError) as exc:
            raise ValueError("'ddpm_steps' must be a positive integer") from exc
        if ddpm_steps < 1:
            raise ValueError("'ddpm_steps' must be a positive integer")

        print(f"🎯 Long-form generation config:")
        print(f"   CFG scale: {cfg_scale} (voice matching strength)")
        print(f"   DDPM steps: {ddpm_steps} (quality vs speed)")
        print(f"   Max tokens: {max_new_tokens} (content length limit)")
        print(f"   Temperature: {temperature} (generation randomness)")

        # Always set the step count: the model keeps the last value, so only
        # setting it for non-default requests would leak into later requests.
        self.model.set_ddpm_inference_steps(num_steps=ddpm_steps)
        if ddpm_steps != self.DEFAULT_DDPM_STEPS:
            print(f"   Updated DDPM steps from {self.DEFAULT_DDPM_STEPS} to {ddpm_steps}")

        print("📝 Stage 1: Parsing long-form script...")
        parsing_start = time.time()
        scripts, speaker_numbers = self._parse_long_form_script(text_input)
        parsing_time = time.time() - parsing_start

        voice_paths: List[str] = []
        try:
            print("🎭 Stage 2: Preparing voice samples...")
            voice_prep_start = time.time()
            voice_paths = self._prepare_voice_samples_longform(
                speaker_numbers, voice_samples, speaker_names
            )
            voice_prep_time = time.time() - voice_prep_start

            full_script = '\n'.join(scripts)

            print("🔧 Stage 3: Preparing model inputs for long sequence...")
            input_prep_start = time.time()

            inputs = self.processor(
                text=[full_script],
                voice_samples=[voice_paths],
                padding=True,
                return_tensors="pt",
                return_attention_mask=True,
            )

            # Only tensors are moved; any non-tensor entries pass through.
            inputs = {
                k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
                for k, v in inputs.items()
            }

            input_prep_time = time.time() - input_prep_start
            input_tokens = inputs['input_ids'].shape[1]

            print(f"✅ Input preparation complete ({input_tokens:,} tokens, {input_prep_time:.2f}s)")

            print("🎙️ Stage 4: Starting long-form AI generation...")
            generation_start = time.time()

            torch.cuda.empty_cache()

            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.no_grad(), torch.autocast(device_type="cuda"):
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    cfg_scale=cfg_scale,
                    tokenizer=self.processor.tokenizer,
                    generation_config={
                        'do_sample': False,
                        'temperature': temperature,
                    },
                    verbose=True,
                )

            generation_time = time.time() - generation_start

            if not outputs.speech_outputs or outputs.speech_outputs[0] is None:
                raise RuntimeError("Long-form generation failed - no audio output produced")

            audio_tensor = outputs.speech_outputs[0]
            audio_duration = audio_tensor.shape[-1] / self.sample_rate

            print(f"🎵 Generated {audio_duration/60:.1f} minutes of audio in {generation_time:.2f}s")

            torch.cuda.empty_cache()

            print("🔄 Stage 5: Encoding audio output...")
            encoding_start = time.time()
            audio_b64 = self._encode_audio_longform(audio_tensor, output_format)
            encoding_time = time.time() - encoding_start
        finally:
            # Runs on success and on failure so temp files never accumulate.
            self._cleanup_temp_files(voice_paths)
            torch.cuda.empty_cache()

        total_time = time.time() - start_total
        rtf = generation_time / audio_duration if audio_duration > 0 else 0

        # Guard the throughput ratios against a zero-length timing interval.
        if generation_time > 0:
            tokens_per_second = round(input_tokens / generation_time, 1)
            audio_minutes_per_minute = round((audio_duration / 60) / (generation_time / 60), 2)
        else:
            tokens_per_second = 0.0
            audio_minutes_per_minute = 0.0

        response = {
            # Audio payload. The encoder only writes WAV, so that is reported
            # regardless of the requested output_format.
            "audio": audio_b64,
            "sample_rate": self.sample_rate,
            "duration": round(audio_duration, 2),
            "duration_minutes": round(audio_duration / 60, 2),
            "format": "wav",

            # Content statistics.
            "speakers_detected": len(set(speaker_numbers)),
            "segments": len(scripts),
            "input_tokens": input_tokens,

            # Timing.
            "generation_time": round(generation_time, 2),
            "total_processing_time": round(total_time, 2),
            "real_time_factor": round(rtf, 3),

            # Settings used. "ddmp_steps" is a legacy misspelling kept so
            # existing clients keep working.
            "cfg_scale": cfg_scale,
            "ddpm_steps": ddpm_steps,
            "ddmp_steps": ddpm_steps,

            "processing_breakdown": {
                "parsing_time": round(parsing_time, 2),
                "voice_prep_time": round(voice_prep_time, 2),
                "input_prep_time": round(input_prep_time, 2),
                "generation_time": round(generation_time, 2),
                "encoding_time": round(encoding_time, 2),
            },

            "performance_metrics": {
                # Input tokens divided by generation time.
                "tokens_per_second": tokens_per_second,
                "audio_minutes_per_minute": audio_minutes_per_minute,
                "memory_efficient": True,
                "flash_attention_2": True,
            },

            "warning": "This audio was generated by AI using VibeVoice - Microsoft Research",
        }

        print(f"✅ Long-form generation complete:")
        print(f"   📊 Audio: {audio_duration/60:.1f} minutes")
        print(f"   ⚡ RTF: {rtf:.3f}x (generation speed vs audio duration)")
        print(f"   ⏱️ Total time: {total_time:.2f}s")
        print(f"   💡 For faster: reduce cfg_scale/ddpm_steps. For quality: increase them.")

        return response

    def _encode_audio_longform(self, audio_tensor: torch.Tensor, format: str = "wav") -> str:
        """
        Encode generated audio as a base64 WAV string.

        The tensor is detached, moved to the CPU and converted to float32 (the
        model is loaded in float16, which the WAV writer may not accept), then
        reshaped to ``(channels, samples)`` and written to an in-memory buffer.

        Only WAV is implemented. Any other ``format`` value triggers a warning
        and WAV is produced anyway.

        Args:
            audio_tensor: Generated waveform with 1, 2 or 3 dimensions.
            format: Requested container; only ``"wav"`` is honoured.

        Returns:
            Base64 text of the encoded WAV file.
        """
        if str(format).lower() != "wav":
            print(f"   ⚠️ Output format '{format}' is not supported - encoding as WAV")

        audio_cpu = audio_tensor.detach().to("cpu", dtype=torch.float32)

        # torchaudio.save expects a 2-D (channels, samples) tensor.
        if audio_cpu.dim() == 1:
            audio_cpu = audio_cpu.unsqueeze(0)
        elif audio_cpu.dim() == 3:
            audio_cpu = audio_cpu.squeeze(0)

        buffer = io.BytesIO()
        torchaudio.save(buffer, audio_cpu, self.sample_rate, format="wav")
        audio_bytes = buffer.getvalue()

        size_mb = len(audio_bytes) / (1024 * 1024)
        print(f"   📁 Audio file size: {size_mb:.1f} MB")

        if size_mb > 100:
            print(f"   ⚠️ Large file size - consider MP3 encoding for production")

        return base64.b64encode(audio_bytes).decode('utf-8')

    def _cleanup_temp_files(self, voice_paths: List[str]):
        """
        Best-effort removal of temporary voice files.

        Only paths containing ``/tmp/`` are touched, so a path supplied from
        elsewhere is never deleted. Missing files are ignored; other OS errors
        are reported but do not interrupt the request.

        Args:
            voice_paths: Paths created during voice preparation.
        """
        cleaned_count = 0
        for path in voice_paths:
            if "/tmp/" not in path:
                continue
            try:
                os.unlink(path)
            except FileNotFoundError:
                continue
            except OSError as exc:
                print(f"   ⚠️ Could not remove temporary file {path}: {exc}")
            else:
                cleaned_count += 1

        if cleaned_count > 0:
            print(f"   🧹 Cleaned up {cleaned_count} temporary voice files")
|
|