#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VibeVoice Development Handler for Mac/Local Testing

This is a development version of the handler that can run on Mac for testing
the logic without requiring Flash Attention 2 or NVIDIA GPUs.

USE handler.py for production deployment on HuggingFace Inference Endpoints.

Key differences from production handler:
- Uses SDPA instead of Flash Attention 2
- Works on CPU/MPS (Apple Silicon)
- Includes development-friendly error messages
- Lower memory requirements for local testing
"""

import os
import re
import io
import base64
import tempfile
import time
import wave
from typing import Dict, List, Any, Optional, Tuple

import torch
import numpy as np

# torchaudio is optional here: when it is missing (or its save backend is
# unusable) the handler falls back to a stdlib WAV encoder so the API
# structure can still be exercised locally.
try:
    import torchaudio
    TORCHAUDIO_AVAILABLE = True
except ImportError:
    torchaudio = None
    TORCHAUDIO_AVAILABLE = False

# Mock VibeVoice imports for development (replace with actual imports when available)
try:
    from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
    from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
    VIBEVOICE_AVAILABLE = True
except ImportError:
    print("⚠️ VibeVoice not available - this is a development handler for testing logic only")
    VIBEVOICE_AVAILABLE = False

# Rough speech-rate estimate used to size the mock audio: 150 chars per second.
MOCK_CHARS_PER_SECOND = 150

# Speaker-label patterns, compiled once instead of on every input line.
_SPEAKER_LINE = re.compile(r'^Speaker\s+(\d+):\s*(.*)$', re.IGNORECASE)
_BRACKET_LINE = re.compile(r'^\[(\d+)\]:\s*(.*)$', re.IGNORECASE)


class DevelopmentVibeVoiceHandler:
    """
    Development version of VibeVoice handler for Mac/local testing.

    This handler can run without Flash Attention 2 and provides a way to test
    the API structure and text processing logic locally before deploying to
    production HuggingFace Endpoints.
    """

    def __init__(self, path: str = ""):
        """Initialize development handler with Mac-compatible settings.

        Args:
            path: Model path or hub id. Falls back to
                "microsoft/VibeVoice-1.5B" when empty.
        """
        print("🧪 Initializing Development VibeVoice Handler (Mac-compatible)")
        print("⚠️ This is for development only - use handler.py for production")

        self.model_path = path or "microsoft/VibeVoice-1.5B"
        self.device = self._setup_development_device()
        self.sample_rate = 24000
        self.max_speakers = 4

        if VIBEVOICE_AVAILABLE:
            self._load_model_development()
        else:
            print("📝 VibeVoice not available - running in mock mode for API testing")
            self.model = None
            self.processor = None

        print("✅ Development handler ready (text processing and API structure only)")

    def _setup_development_device(self) -> str:
        """Setup device for development (Mac-compatible).

        Returns:
            "cuda" when CUDA is available, else "mps" when the MPS backend is
            available, else "cpu".
        """
        if torch.cuda.is_available():
            device = "cuda"
            print(f"🔥 Using CUDA: {torch.cuda.get_device_name()}")
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            device = "mps"
            print("🍎 Using Apple Silicon MPS")
        else:
            device = "cpu"
            print("💻 Using CPU")
        return device

    def _load_model_development(self):
        """Load model with development-friendly settings (no Flash Attention).

        Any failure is reported and leaves ``self.model`` and
        ``self.processor`` set to ``None`` so the handler keeps working in
        mock mode.
        """
        print("🧠 Loading model with development settings...")

        try:
            # Load processor
            self.processor = VibeVoiceProcessor.from_pretrained(
                self.model_path,
                cache_dir="./model_cache"  # Local cache for development
            )

            # Load model with SDPA (Mac-compatible) instead of Flash Attention 2
            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                self.model_path,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                attn_implementation='sdpa',  # Mac-compatible attention
                device_map=None,
                cache_dir="./model_cache",
                low_cpu_mem_usage=True,
            )

            self.model = self.model.to(self.device)
            self.model.eval()
            self.model.set_ddpm_inference_steps(num_steps=6)

            print("✅ Model loaded with SDPA (development mode)")

        except Exception as e:
            # Deliberate best-effort: a development machine may lack weights,
            # memory or network access, and mock mode is still useful.
            print(f"⚠️ Model loading failed: {e}")
            print("📝 Continuing in mock mode for API structure testing")
            self.model = None
            self.processor = None

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Development inference method - tests API structure without full generation.

        This method validates input format, processes text, and returns a mock
        response that matches the production API structure.

        Args:
            data: Request payload. ``data["inputs"]`` is the script text;
                ``data["parameters"]`` is an optional dict that may carry
                ``voice_samples``, ``speaker_names``, ``cfg_scale``,
                ``ddpm_steps`` and ``output_format``. ``max_new_tokens`` is
                accepted but has no effect on the mock output.

        Returns:
            A response dict containing base64-encoded mock WAV audio plus
            timing and configuration metadata.

        Raises:
            ValueError: If ``inputs`` is missing, empty, not a string, or
                names more speakers than ``self.max_speakers``.
        """
        start_time = time.time()

        # Validate input format (same as production)
        text_input = data.get("inputs", "")
        if not text_input:
            raise ValueError("No 'inputs' provided")
        if not isinstance(text_input, str):
            raise ValueError("'inputs' must be a string")

        # `or {}` also covers an explicit `"parameters": null` in the payload.
        params = data.get("parameters") or {}
        voice_samples = params.get("voice_samples", [])
        speaker_names = params.get("speaker_names", [])
        cfg_scale = params.get("cfg_scale", 1.2)
        ddpm_steps = params.get("ddpm_steps", 6)
        output_format = params.get("output_format", "wav")

        print("🎯 Development inference config:")
        print(f" Text length: {len(text_input)} chars")
        print(f" Voice samples: {len(voice_samples)}")
        print(f" CFG scale: {cfg_scale}")
        print(f" DDPM steps: {ddpm_steps}")

        # Test text parsing (same logic as production)
        scripts, speaker_numbers = self._parse_long_form_script(text_input)

        # Simulate voice preparation (result is not needed for mock audio)
        self._mock_voice_preparation(speaker_numbers, voice_samples, speaker_names)

        mock_duration = len(text_input) / MOCK_CHARS_PER_SECOND

        model_ready = bool(getattr(self, 'model', None)) and bool(getattr(self, 'processor', None))
        if model_ready:
            # NOTE: real SDPA inference is not wired in yet; even with a loaded
            # model this branch only simulates generation time.
            print("🎙️ Running actual model inference (development mode)...")
            time.sleep(min(2.0, mock_duration * 0.1))  # Simulate generation time
        else:
            print("🎭 Generating mock audio response...")
            time.sleep(0.5)  # Simulate processing time

        audio_tensor = self._generate_mock_audio(mock_duration)
        generation_time = time.time() - start_time

        # Create mock audio data
        audio_b64 = self._encode_mock_audio(audio_tensor)

        # Return production-compatible response
        response = {
            "audio": audio_b64,
            "sample_rate": self.sample_rate,
            "duration": round(mock_duration, 2),
            "duration_minutes": round(mock_duration / 60, 2),
            "format": output_format,
            "speakers_detected": len(set(speaker_numbers)),
            "segments": len(scripts),
            "input_tokens": len(text_input) // 4,  # Rough token estimate
            "generation_time": round(generation_time, 2),
            "total_processing_time": round(generation_time, 2),
            "real_time_factor": round(generation_time / mock_duration if mock_duration > 0 else 0, 3),
            "cfg_scale": cfg_scale,
            "ddpm_steps": ddpm_steps,
            "processing_breakdown": {
                "parsing_time": 0.1,
                "voice_prep_time": 0.1,
                "input_prep_time": 0.1,
                # Clamp: the fixed 0.3s of mock stages can exceed the real
                # elapsed time for short inputs, which would go negative.
                "generation_time": max(0.0, generation_time - 0.3),
                "encoding_time": 0.1
            },
            "performance_metrics": {
                "tokens_per_second": 100.0,
                "audio_minutes_per_minute": 2.0,
                "memory_efficient": True,
                "flash_attention_2": False  # SDPA in development
            },
            "development_mode": True,
            "warning": "This audio was generated by AI using VibeVoice - Microsoft Research (Development Mode)"
        }

        print("✅ Development inference complete:")
        print(f" 📊 Mock audio: {mock_duration/60:.1f} minutes")
        print(f" ⚡ Processing: {generation_time:.2f}s")
        print(f" 🎭 Speakers: {len(set(speaker_numbers))}")

        return response

    def _parse_long_form_script(self, text: str) -> Tuple[List[str], List[str]]:
        """Parse text script (same logic as production handler).

        Lines of the form ``Speaker N: ...`` or ``[N]: ...`` (case-insensitive)
        start a new segment; other non-blank lines are appended to the current
        segment with a single space. Text that appears before the first
        speaker label is discarded when at least one labelled segment exists.
        Input without any labelled segment is treated as one "Speaker 1"
        segment.

        Args:
            text: Raw script text.

        Returns:
            ``(scripts, speaker_numbers)``: normalized
            ``"Speaker N: text"`` segments and the matching speaker ids
            (as strings), in order of appearance.

        Raises:
            ValueError: If ``text`` is blank or uses more than
                ``self.max_speakers`` distinct speakers.
        """
        if not text.strip():
            raise ValueError("Empty text input")

        scripts: List[str] = []
        speaker_numbers: List[str] = []
        current_speaker: Optional[str] = None
        current_text = ""

        def flush() -> None:
            # Emit the segment collected so far, if it has a speaker and text.
            if current_speaker and current_text:
                scripts.append(f"Speaker {current_speaker}: {current_text.strip()}")
                speaker_numbers.append(current_speaker)

        # Blank lines only separate paragraphs, so walking every line and
        # skipping blanks is equivalent to splitting into paragraphs first.
        for raw_line in text.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            match = _SPEAKER_LINE.match(line) or _BRACKET_LINE.match(line)
            if match:
                flush()
                current_speaker = match.group(1)
                current_text = match.group(2)
            elif current_text:
                current_text += " " + line
            else:
                current_text = line

        flush()

        if not scripts:
            # Handle plain text (single speaker)
            scripts.append(f"Speaker 1: {text.strip()}")
            speaker_numbers.append("1")

        unique_speakers = len(set(speaker_numbers))
        print("📊 Parsed text:")
        print(f" Segments: {len(scripts)}")
        print(f" Speakers: {unique_speakers}")

        if unique_speakers > self.max_speakers:
            raise ValueError(f"Too many speakers ({unique_speakers}). Max: {self.max_speakers}")

        return scripts, speaker_numbers

    def _mock_voice_preparation(self, speaker_numbers: List[str],
                                voice_samples: Optional[List[str]] = None,
                                speaker_names: Optional[List[str]] = None) -> List[str]:
        """Mock voice sample preparation for development.

        Args:
            speaker_numbers: Speaker ids in order of appearance.
            voice_samples: Accepted for API parity; not used by the mock.
            speaker_names: Accepted for API parity; not used by the mock.

        Returns:
            One placeholder file name per distinct speaker, in first-seen
            order.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_speakers = list(dict.fromkeys(speaker_numbers))
        print(f"🎭 Mock voice preparation for {len(unique_speakers)} speakers")
        return [f"mock_voice_{i}.wav" for i in unique_speakers]

    def _generate_mock_audio(self, duration: float) -> torch.Tensor:
        """Generate mock audio tensor for development testing.

        Args:
            duration: Length of the audio in seconds.

        Returns:
            A ``(1, int(duration * self.sample_rate))`` tensor holding a
            220 Hz sine plus a quieter 330 Hz component.
        """
        # Create simple sine wave audio for testing
        samples = int(duration * self.sample_rate)
        t = torch.linspace(0, duration, samples)

        # Create pleasant-sounding mock audio (not just noise)
        frequency = 220  # A3 note
        audio = 0.3 * torch.sin(2 * torch.pi * frequency * t)

        # Add slight complexity
        audio += 0.1 * torch.sin(2 * torch.pi * frequency * 1.5 * t)

        return audio.unsqueeze(0)  # Add channel dimension

    @staticmethod
    def _encode_pcm16_wav(audio_tensor: torch.Tensor, sample_rate: int) -> bytes:
        """Encode audio as 16-bit PCM WAV bytes using only the stdlib.

        Args:
            audio_tensor: ``(channels, frames)`` or ``(frames,)`` tensor with
                samples nominally in [-1, 1]; values outside are clipped.
            sample_rate: Sampling rate in Hz written to the WAV header.

        Returns:
            The complete WAV file contents.
        """
        samples = audio_tensor.detach().to("cpu", torch.float32)
        if samples.dim() == 1:
            samples = samples.unsqueeze(0)

        pcm = (samples.clamp(-1.0, 1.0) * 32767.0).round().to(torch.int16)
        # WAV stores frames interleaved by channel and little-endian.
        frames = pcm.t().contiguous().numpy().astype("<i2").tobytes()

        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wav_file:
            wav_file.setnchannels(pcm.shape[0])
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(frames)
        return buffer.getvalue()

    def _encode_mock_audio(self, audio_tensor: torch.Tensor) -> str:
        """Encode mock audio to base64.

        Uses ``torchaudio.save`` when it is usable and otherwise falls back to
        the stdlib 16-bit PCM encoder.

        Args:
            audio_tensor: ``(channels, frames)`` audio tensor.

        Returns:
            Base64 text of the WAV file bytes.
        """
        audio_bytes: Optional[bytes] = None

        if TORCHAUDIO_AVAILABLE:
            try:
                buffer = io.BytesIO()
                torchaudio.save(buffer, audio_tensor, self.sample_rate, format="wav")
                audio_bytes = buffer.getvalue()
            except (ImportError, RuntimeError) as e:
                # Some torchaudio builds ship without a usable save backend.
                print(f"⚠️ torchaudio.save failed ({e}) - using built-in WAV encoder")

        if audio_bytes is None:
            audio_bytes = self._encode_pcm16_wav(audio_tensor, self.sample_rate)

        return base64.b64encode(audio_bytes).decode('utf-8')


# Development testing function
def test_development_handler():
    """Test the development handler locally.

    Runs one multi-speaker request through the handler and writes the
    returned audio to ``dev_test_output.wav``.

    Returns:
        True when the request succeeds, False otherwise.
    """
    print("🧪 Testing Development Handler")
    print("=" * 40)

    handler = DevelopmentVibeVoiceHandler()

    # Test data
    test_data = {
        "inputs": """Speaker 1: Hello! This is a test of the development handler on Mac.

Speaker 2: Great! This should work without Flash Attention 2.

Speaker 1: Perfect for testing the API structure and text processing logic.""",
        "parameters": {
            "cfg_scale": 1.2,
            "ddpm_steps": 6,
            "max_new_tokens": 4096,
            "output_format": "wav"
        }
    }

    try:
        result = handler(test_data)

        print("✅ Development test successful!")
        print(f"📊 Response keys: {list(result.keys())}")
        print(f"🎵 Mock audio duration: {result['duration']:.1f}s")
        print(f"⚡ Processing time: {result['generation_time']:.2f}s")

        # Save mock audio for testing
        if "audio" in result:
            audio_data = base64.b64decode(result["audio"])
            with open("dev_test_output.wav", "wb") as f:
                f.write(audio_data)
            print("💾 Mock audio saved as dev_test_output.wav")

        return True

    except Exception as e:
        # Top-level boundary of a manual test script: report and signal failure.
        print(f"❌ Development test failed: {e}")
        return False


if __name__ == "__main__":
    # Run development test
    test_development_handler()