File size: 9,087 Bytes
a4e1d96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env python3
"""
VibeVoice CoreML Inference Script

This script provides inference utilities for the converted VibeVoice models.
Note: This must be run on macOS to use CoreML models.

Usage:
    python inference.py --models-dir ./models --text "Hello world"
"""

import argparse
import json
from pathlib import Path
from typing import Optional, Tuple

import numpy as np

# CoreML is only available on macOS
try:
    import coremltools as ct
    COREML_AVAILABLE = True
except ImportError:
    COREML_AVAILABLE = False
    print("Warning: coremltools not available. Running in mock mode.")


class DPMSolverScheduler:
    """DPM-Solver style scheduler for diffusion inference.

    Precomputes a beta/alpha noise schedule and a descending set of
    inference timesteps, and provides `add_noise` (forward process) and
    `step` (single deterministic denoising update, DDIM-like with eta=0).
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        """Build the noise schedule.

        Args:
            num_train_timesteps: Number of timesteps the model was trained with.
            num_inference_steps: Number of denoising steps to run at inference.
            beta_schedule: "cosine" for the squared-cosine alpha-bar schedule;
                any other value falls back to a linear beta schedule.
        """
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps

        # Compute beta schedule.
        if beta_schedule == "cosine":
            # Squared-cosine alpha-bar schedule; betas derived from the ratio
            # of consecutive alpha-bar values, clipped to keep them valid.
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)

        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)

        # Evenly spaced timesteps, descending from num_train_timesteps - 1.
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Add noise to sample at given timestep (forward diffusion q(x_t | x_0))."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Single denoising step.

        Args:
            model_output: Network prediction (v or epsilon, per prediction_type).
            timestep: Current (training-scale) timestep index.
            sample: Current noisy sample x_t.
            prediction_type: "v_prediction" to interpret the output as v;
                any other value interprets it as epsilon (noise).

        Returns:
            The denoised sample at the previous timestep.
        """
        alpha = self.alphas_cumprod[timestep]
        # NOTE(review): this uses timestep - 1 rather than the previous entry
        # of self.timesteps; with spaced inference steps that is a coarse
        # approximation of the previous scheduled alpha — confirm intended.
        alpha_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else 1.0

        # BUG FIX: these two roots were previously computed only inside the
        # v_prediction branch, so the epsilon branch raised NameError.
        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)

        if prediction_type == "v_prediction":
            # Convert v to x0 and epsilon.
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # Epsilon prediction: recover x0 from x_t and the predicted noise.
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha

        # Deterministic (eta=0) update to the previous timestep.
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)

        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon

        return pred_sample_prev


class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline.

    Wraps the five converted CoreML sub-models (acoustic encoder/decoder,
    semantic encoder, LLM, diffusion head) plus a DPM-Solver scheduler, and
    exposes per-component prediction helpers and a diffusion-based
    `generate_speech` loop.
    """

    def __init__(self, models_dir: Path):
        self.models_dir = Path(models_dir)
        self.models = {}

        # Read the pipeline config from disk when present, else use defaults.
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        if config_path.exists():
            self.config = json.loads(config_path.read_text())
        else:
            self.config = self._default_config()

        # Scheduler step count comes from the (possibly default) config.
        self.scheduler = DPMSolverScheduler(
            num_inference_steps=self.config["inference"]["diffusion"]["num_steps"]
        )

        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        """Fallback configuration used when no config file is on disk."""
        audio_cfg = {"sample_rate": 24000, "downsample_factor": 3200}
        diffusion_cfg = {"num_steps": 20, "prediction_type": "v_prediction"}
        return {"inference": {"audio": audio_cfg, "diffusion": diffusion_cfg}}

    def _load_models(self):
        """Load whichever CoreML model packages exist under models_dir."""
        manifest = (
            ("acoustic_encoder", "vibevoice_acoustic_encoder.mlpackage"),
            ("acoustic_decoder", "vibevoice_acoustic_decoder.mlpackage"),
            ("semantic_encoder", "vibevoice_semantic_encoder.mlpackage"),
            ("llm", "vibevoice_llm.mlpackage"),
            ("diffusion_head", "vibevoice_diffusion_head.mlpackage"),
        )

        for name, filename in manifest:
            path = self.models_dir / filename
            if not path.exists():
                # Missing packages are skipped silently, matching partial installs.
                continue
            try:
                self.models[name] = ct.models.MLModel(str(path))
                print(f"Loaded {name}")
            except Exception as e:
                print(f"Failed to load {name}: {e}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to acoustic latent."""
        encoder = self.models.get("acoustic_encoder")
        if encoder is None:
            raise RuntimeError("Acoustic encoder not loaded")
        return encoder.predict({"audio": audio})["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode acoustic latent to audio."""
        decoder = self.models.get("acoustic_decoder")
        if decoder is None:
            raise RuntimeError("Acoustic decoder not loaded")
        return decoder.predict({"acoustic_latent": latent})["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run LLM forward pass; returns (hidden_states, logits)."""
        llm = self.models.get("llm")
        if llm is None:
            raise RuntimeError("LLM not loaded")
        result = llm.predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return result["hidden_states"], result["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step through the diffusion head model."""
        head = self.models.get("diffusion_head")
        if head is None:
            raise RuntimeError("Diffusion head not loaded")
        result = head.predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return result["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim]
            num_tokens: Number of speech tokens to generate
        Returns:
            audio: Generated audio waveform
        """
        batch = hidden_states.shape[0]
        latent_dim = 64  # fixed acoustic-latent width expected by the decoder

        # Start each token's latent from Gaussian noise.
        latents = np.random.randn(batch, num_tokens, latent_dim).astype(np.float32)

        # Condition each token on the trailing LLM hidden states.
        condition = hidden_states[:, -num_tokens:, :]  # [batch, num_tokens, hidden_dim]

        pred_type = self.config["inference"]["diffusion"]["prediction_type"]

        # Denoise every token at each scheduled timestep.
        for t in self.scheduler.timesteps:
            for tok in range(num_tokens):
                current = latents[:, tok, :]       # [batch, latent_dim]
                cond_tok = condition[:, tok, :]    # [batch, hidden_dim]

                model_pred = self.diffusion_step(current, float(t), cond_tok)
                latents[:, tok, :] = self.scheduler.step(
                    model_pred, int(t), current, pred_type
                )

        # Decode the denoised latents to a waveform.
        return self.decode_acoustic(latents)


def main():
    """CLI entry point: parse arguments, build the pipeline, report status."""
    arg_parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    arg_parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    arg_parser.add_argument("--text", help="Text to synthesize")
    arg_parser.add_argument("--output", default="output.wav", help="Output audio file")
    opts = arg_parser.parse_args()

    # Bail out early on non-macOS hosts where coremltools is unavailable.
    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return

    pipeline = VibeVoicePipeline(opts.models_dir)
    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")

    if opts.text:
        print(f"Note: Full text-to-speech requires tokenizer and complete inference pipeline.")
        print("This script demonstrates individual component usage.")


if __name__ == "__main__":
    main()