| |
| |
| """ |
| VibeVoice Development Handler for Mac/Local Testing |
| |
| This is a development version of the handler that can run on Mac for testing |
| the logic without requiring Flash Attention 2 or NVIDIA GPUs. |
| |
| USE handler.py for production deployment on HuggingFace Inference Endpoints. |
| |
| Key differences from production handler: |
| - Uses SDPA instead of Flash Attention 2 |
| - Works on CPU/MPS (Apple Silicon) |
| - Includes development-friendly error messages |
| - Lower memory requirements for local testing |
| """ |
|
|
| import os |
| import re |
| import io |
| import base64 |
| import tempfile |
| import time |
| from typing import Dict, List, Any, Optional, Tuple |
| import torch |
| import torchaudio |
| import numpy as np |
|
|
| |
# VibeVoice is an optional dependency for this development handler: when the
# package cannot be imported, VIBEVOICE_AVAILABLE is set to False so the rest
# of the module can still be used to test request parsing and response shape.
try:
    from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
    from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
    VIBEVOICE_AVAILABLE = True
except ImportError:
    print("⚠️ VibeVoice not available - this is a development handler for testing logic only")
    VIBEVOICE_AVAILABLE = False
|
|
|
|
class DevelopmentVibeVoiceHandler:
    """
    Development version of the VibeVoice handler for Mac/local testing.

    The handler runs without Flash Attention 2 so the request/response
    structure and the script-parsing logic can be exercised locally before
    deploying to production HuggingFace Endpoints.  The audio returned by
    ``__call__`` is always a synthetic test tone, even when the real model
    was loaded successfully.
    """

    # Directory handed to ``from_pretrained`` as ``cache_dir``.
    MODEL_CACHE_DIR = "./model_cache"
    # Characters of input text per second of mock audio.
    MOCK_CHARS_PER_SECOND = 150

    # Speaker tags recognised at the start of a line: "Speaker 1: ..." / "[1]: ...".
    _SPEAKER_PATTERN = re.compile(r'^Speaker\s+(\d+):\s*(.*)$', re.IGNORECASE)
    _BRACKET_PATTERN = re.compile(r'^\[(\d+)\]:\s*(.*)$', re.IGNORECASE)

    def __init__(self, path: str = ""):
        """Initialize the development handler with Mac-compatible settings.

        Args:
            path: Model identifier or local path passed to ``from_pretrained``.
                An empty value falls back to ``microsoft/VibeVoice-1.5B``.
        """
        print("🧪 Initializing Development VibeVoice Handler (Mac-compatible)")
        print("⚠️ This is for development only - use handler.py for production")

        self.model_path = path or "microsoft/VibeVoice-1.5B"
        self.device = self._setup_development_device()
        self.sample_rate = 24000
        self.max_speakers = 4
        # Defined up front so both attributes exist whichever branch runs.
        self.model = None
        self.processor = None

        if VIBEVOICE_AVAILABLE:
            self._load_model_development()
        else:
            print("VibeVoice not available - running in mock mode for API testing")

        print("✅ Development handler ready (text processing and API structure only)")

    def _setup_development_device(self) -> str:
        """Pick the torch device to use.

        Returns:
            ``"cuda"`` when CUDA is available, otherwise ``"mps"`` when the
            MPS backend is available, otherwise ``"cpu"``.
        """
        if torch.cuda.is_available():
            print(f"🔥 Using CUDA: {torch.cuda.get_device_name()}")
            return "cuda"

        # The ``mps`` attribute may be absent on some torch builds, so look it
        # up defensively instead of accessing it directly.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            print("🍎 Using Apple Silicon MPS")
            return "mps"

        print("💻 Using CPU")
        return "cpu"

    def _load_model_development(self):
        """Load the processor and model with SDPA attention (no Flash Attention).

        Any failure is reported on stdout and leaves ``self.model`` and
        ``self.processor`` set to ``None`` so the handler keeps working in
        mock mode.
        """
        print("🔧 Loading model with development settings...")

        try:
            self.processor = VibeVoiceProcessor.from_pretrained(
                self.model_path,
                cache_dir=self.MODEL_CACHE_DIR,
            )

            # Half precision only on CUDA; every other device uses float32.
            dtype = torch.float16 if self.device == "cuda" else torch.float32
            model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                self.model_path,
                torch_dtype=dtype,
                attn_implementation='sdpa',
                device_map=None,
                cache_dir=self.MODEL_CACHE_DIR,
                low_cpu_mem_usage=True,
            )

            model = model.to(self.device)
            model.eval()
            model.set_ddpm_inference_steps(num_steps=6)
            self.model = model

            print("✅ Model loaded with SDPA (development mode)")

        except Exception as e:
            # Deliberate best-effort: a model that cannot be loaded must not
            # prevent local testing of the API structure.
            print(f"⚠️ Model loading failed: {e}")
            print("Continuing in mock mode for API structure testing")
            self.model = None
            self.processor = None

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Development inference method - tests API structure without full generation.

        Validates the request, parses the script and returns a mock response
        shaped like the production API response.

        Args:
            data: Request payload. ``data["inputs"]`` is the script text.
                ``data["parameters"]`` is optional and may carry
                ``voice_samples``, ``speaker_names``, ``cfg_scale``,
                ``ddpm_steps`` and ``output_format``; other keys are ignored.

        Returns:
            A dict containing the base64-encoded WAV test tone plus timing and
            parsing metadata. ``"format"`` echoes the requested
            ``output_format``; the audio itself is always encoded as WAV.

        Raises:
            ValueError: If ``inputs`` is missing, empty, not a string, or
                names more speakers than ``self.max_speakers``.
        """
        start_time = time.time()

        text_input = data.get("inputs", "")
        if not text_input:
            raise ValueError("No 'inputs' provided")
        if not isinstance(text_input, str):
            raise ValueError("'inputs' must be a string")

        # "parameters" may be missing or explicitly null in the request.
        params = data.get("parameters") or {}
        voice_samples = params.get("voice_samples") or []
        speaker_names = params.get("speaker_names") or []
        cfg_scale = params.get("cfg_scale", 1.2)
        ddpm_steps = params.get("ddpm_steps", 6)
        output_format = params.get("output_format", "wav")

        print("🎯 Development inference config:")
        print(f" Text length: {len(text_input)} chars")
        print(f" Voice samples: {len(voice_samples)}")
        print(f" CFG scale: {cfg_scale}")
        print(f" DDPM steps: {ddpm_steps}")

        scripts, speaker_numbers = self._parse_long_form_script(text_input)
        speakers_detected = len(set(speaker_numbers))

        # Called for its log output only; the mock paths are not used further.
        self._mock_voice_preparation(speaker_numbers, voice_samples, speaker_names)

        mock_duration = len(text_input) / self.MOCK_CHARS_PER_SECOND
        model_loaded = (
            getattr(self, "model", None) is not None
            and getattr(self, "processor", None) is not None
        )

        if model_loaded:
            print("🎙️ Running actual model inference (development mode)...")
            # Real generation is not wired into this handler: simulate latency
            # proportional to the text length, capped at two seconds.
            time.sleep(min(2.0, mock_duration * 0.1))
        else:
            print("🎭 Generating mock audio response...")
            time.sleep(0.5)
        audio_tensor = self._generate_mock_audio(mock_duration)

        generation_time = time.time() - start_time

        audio_b64 = self._encode_mock_audio(audio_tensor)

        response = {
            "audio": audio_b64,
            "sample_rate": self.sample_rate,
            "duration": round(mock_duration, 2),
            "duration_minutes": round(mock_duration / 60, 2),
            "format": output_format,
            "speakers_detected": speakers_detected,
            "segments": len(scripts),
            "input_tokens": len(text_input) // 4,
            "generation_time": round(generation_time, 2),
            "total_processing_time": round(generation_time, 2),
            "real_time_factor": round(generation_time / mock_duration if mock_duration > 0 else 0, 3),
            "cfg_scale": cfg_scale,
            "ddpm_steps": ddpm_steps,
            "processing_breakdown": {
                "parsing_time": 0.1,
                "voice_prep_time": 0.1,
                "input_prep_time": 0.1,
                # The fixed 0.3s of placeholder stages is subtracted; clamp so
                # a very fast run never reports a negative duration.
                "generation_time": max(0.0, generation_time - 0.3),
                "encoding_time": 0.1
            },
            "performance_metrics": {
                "tokens_per_second": 100.0,
                "audio_minutes_per_minute": 2.0,
                "memory_efficient": True,
                "flash_attention_2": False
            },
            "development_mode": True,
            "warning": "This audio was generated by AI using VibeVoice - Microsoft Research (Development Mode)"
        }

        print("✅ Development inference complete:")
        print(f" Mock audio: {mock_duration/60:.1f} minutes")
        print(f" ⚡ Processing: {generation_time:.2f}s")
        print(f" 🎭 Speakers: {speakers_detected}")

        return response

    def _parse_long_form_script(self, text: str) -> Tuple[List[str], List[str]]:
        """Split a script into per-speaker segments.

        A line starting with ``Speaker N:`` or ``[N]:`` (case-insensitive)
        opens a new segment; following untagged lines are appended to it,
        joined by single spaces. Blank lines are skipped. Untagged text that
        precedes the first tag is discarded once a tag is found, and a tag
        with no text produces no segment. If no tagged segment is produced,
        the whole text becomes one ``Speaker 1`` segment.

        Args:
            text: Raw script text.

        Returns:
            ``(scripts, speaker_numbers)``: normalized ``"Speaker N: text"``
            strings and the matching speaker ids (as strings), in order.

        Raises:
            ValueError: If ``text`` is not a string, is blank, or uses more
                than ``self.max_speakers`` distinct speakers.
        """
        if not isinstance(text, str) or not text.strip():
            raise ValueError("Empty text input")

        scripts: List[str] = []
        speaker_numbers: List[str] = []
        current_speaker: Optional[str] = None
        current_text = ""

        def flush_segment() -> None:
            # Emit the segment collected so far, if it has a speaker and text.
            if current_speaker and current_text:
                scripts.append(f"Speaker {current_speaker}: {current_text.strip()}")
                speaker_numbers.append(current_speaker)

        for raw_line in text.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            match = (self._SPEAKER_PATTERN.match(line)
                     or self._BRACKET_PATTERN.match(line))
            if match:
                flush_segment()
                current_speaker = match.group(1)
                current_text = match.group(2)
            elif current_text:
                current_text += " " + line
            else:
                current_text = line

        flush_segment()

        if not scripts:
            # No usable speaker tags: treat the whole input as one speaker.
            scripts.append(f"Speaker 1: {text.strip()}")
            speaker_numbers.append("1")

        unique_speakers = len(set(speaker_numbers))
        print("Parsed text:")
        print(f" Segments: {len(scripts)}")
        print(f" Speakers: {unique_speakers}")

        if unique_speakers > self.max_speakers:
            raise ValueError(f"Too many speakers ({unique_speakers}). Max: {self.max_speakers}")

        return scripts, speaker_numbers

    def _mock_voice_preparation(self, speaker_numbers: List[str],
                                voice_samples: Optional[List[str]] = None,
                                speaker_names: Optional[List[str]] = None) -> List[str]:
        """Return placeholder voice file names, one per distinct speaker.

        Args:
            speaker_numbers: Speaker ids in script order (duplicates allowed).
            voice_samples: Accepted for signature parity; not used here.
            speaker_names: Accepted for signature parity; not used here.

        Returns:
            ``["mock_voice_<id>.wav", ...]`` in order of first appearance.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_speakers = list(dict.fromkeys(speaker_numbers))
        print(f"🎭 Mock voice preparation for {len(unique_speakers)} speakers")

        return [f"mock_voice_{i}.wav" for i in unique_speakers]

    def _generate_mock_audio(self, duration: float) -> torch.Tensor:
        """Generate a two-tone test signal for development testing.

        Args:
            duration: Length of the signal in seconds. Values that yield no
                samples (zero or negative) produce an empty signal.

        Returns:
            A float32 tensor of shape ``(1, samples)`` where
            ``samples = int(duration * self.sample_rate)``. The signal is a
            220 Hz sine at amplitude 0.3 plus a 330 Hz sine at amplitude 0.1.
        """
        samples = max(0, int(duration * self.sample_rate))
        # Sample k sits at exactly k / sample_rate seconds, so the tones land
        # on their nominal frequencies. The phase is computed in float64 to
        # keep long signals accurate, then stored as float32.
        t = torch.arange(samples, dtype=torch.float64) / self.sample_rate

        frequency = 220
        audio = 0.3 * torch.sin(2 * torch.pi * frequency * t)
        audio += 0.1 * torch.sin(2 * torch.pi * frequency * 1.5 * t)

        return audio.to(torch.float32).unsqueeze(0)

    def _encode_mock_audio(self, audio_tensor: torch.Tensor) -> str:
        """Encode an audio tensor as a base64 WAV string.

        Args:
            audio_tensor: Audio to encode, saved at ``self.sample_rate``.
                # assumes (channels, samples) layout as produced by
                # _generate_mock_audio - confirm for other callers

        Returns:
            The WAV file bytes, base64-encoded and decoded to ``str``.
        """
        buffer = io.BytesIO()
        torchaudio.save(buffer, audio_tensor, self.sample_rate, format="wav")

        return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
|
|
|
| |
def test_development_handler():
    """Run a local smoke test of the development handler.

    Builds a handler, sends a three-line, two-speaker script through it,
    prints a summary of the response and writes the returned audio to
    ``dev_test_output.wav`` in the current working directory.

    Returns:
        ``True`` when the request completed, ``False`` when any exception
        was raised while handling it (the error is printed).
    """
    print("🧪 Testing Development Handler")
    print("=" * 40)

    handler = DevelopmentVibeVoiceHandler()

    test_data = {
        "inputs": (
            "Speaker 1: Hello! This is a test of the development handler on Mac.\n"
            "Speaker 2: Great! This should work without Flash Attention 2.\n"
            "Speaker 1: Perfect for testing the API structure and text processing logic."
        ),
        "parameters": {
            "cfg_scale": 1.2,
            # Key must be spelled "ddpm_steps": that is the name the handler reads.
            "ddpm_steps": 6,
            "max_new_tokens": 4096,
            "output_format": "wav"
        }
    }

    try:
        result = handler(test_data)

        print("✅ Development test successful!")
        print(f"Response keys: {list(result.keys())}")
        print(f"🎵 Mock audio duration: {result['duration']:.1f}s")
        print(f"⚡ Processing time: {result['generation_time']:.2f}s")

        if "audio" in result:
            audio_data = base64.b64decode(result["audio"])
            with open("dev_test_output.wav", "wb") as f:
                f.write(audio_data)
            print("💾 Mock audio saved as dev_test_output.wav")

        return True

    except Exception as e:
        # Top-level boundary of the smoke test: report and signal failure.
        print(f"❌ Development test failed: {e}")
        return False
|
|
|
|
if __name__ == "__main__":
    # Surface the smoke-test outcome as the process exit status
    # (0 = success, 1 = failure) so shells and CI can react to it.
    raise SystemExit(0 if test_development_handler() else 1)
|
|