#!/usr/bin/env python3
"""
VibeVoice CoreML Inference Script

This script provides inference utilities for the converted VibeVoice models.
Note: This must be run on macOS to use CoreML models.

Usage:
    python inference.py --models-dir ./models --text "Hello world"
"""

import argparse
import json
from pathlib import Path
from typing import Tuple

import numpy as np

# CoreML is only available on macOS
try:
    import coremltools as ct
    COREML_AVAILABLE = True
except ImportError:
    COREML_AVAILABLE = False
    print("Warning: coremltools not available. Running in mock mode.")


class DPMSolverScheduler:
    """DPM-Solver-style scheduler for diffusion inference.

    Precomputes the noise schedule (betas / cumulative alphas) and the
    descending list of timesteps visited during inference.
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps

        # Compute beta schedule.
        if beta_schedule == "cosine":
            # Nichol & Dhariwal cosine schedule (offset s = 0.008);
            # betas are clipped to 0.999 to avoid singularities near t = T.
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            # Linear fallback schedule.
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)

        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)

        # Evenly spaced inference timesteps, descending from the final
        # training step (e.g. 999, 949, ..., 49 for 1000/20).
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (
            num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio
        ).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Add noise to sample at given timestep (forward diffusion q(x_t | x_0))."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Single deterministic (DDIM-style) denoising step.

        Args:
            model_output: Network prediction at `timestep` (v or epsilon,
                per `prediction_type`).
            timestep: Current (training-scale) timestep.
            sample: Current noisy sample x_t.
            prediction_type: "v_prediction" or epsilon parameterization.

        Returns:
            The sample advanced to the previous scheduled timestep.
        """
        # FIX: step back one *inference* stride, not one training step.
        # self.timesteps are spaced num_train_timesteps / num_inference_steps
        # apart, so using alphas_cumprod[timestep - 1] barely denoised at all.
        stride = self.num_train_timesteps // self.num_inference_steps
        prev_timestep = timestep - stride

        alpha = self.alphas_cumprod[timestep]
        alpha_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else 1.0

        # FIX: hoisted out of the v_prediction branch — the epsilon branch
        # below previously referenced these names before they were defined
        # (NameError on any prediction_type other than "v_prediction").
        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)

        if prediction_type == "v_prediction":
            # Convert v-parameterization to x0 and epsilon estimates.
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # Epsilon parameterization: recover x0 from the noise estimate.
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha

        # Deterministic DDIM update to the previous sample.
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)
        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon

        return pred_sample_prev


class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline.

    Wraps the five converted CoreML components (acoustic encoder/decoder,
    semantic encoder, LLM, diffusion head) plus the diffusion scheduler.
    When coremltools is unavailable, runs in mock mode with no models loaded.
    """

    def __init__(self, models_dir: Path):
        self.models_dir = Path(models_dir)
        self.models = {}

        # Load configuration, falling back to built-in defaults.
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        if config_path.exists():
            with open(config_path) as f:
                self.config = json.load(f)
        else:
            self.config = self._default_config()

        # Initialize scheduler
        self.scheduler = DPMSolverScheduler(
            num_inference_steps=self.config["inference"]["diffusion"]["num_steps"]
        )

        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        """Fallback pipeline configuration when no JSON config is present."""
        return {
            "inference": {
                "audio": {"sample_rate": 24000, "downsample_factor": 3200},
                "diffusion": {"num_steps": 20, "prediction_type": "v_prediction"}
            }
        }

    def _load_models(self):
        """Load CoreML models (best-effort: missing/broken models are skipped)."""
        model_files = {
            "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
            "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
            "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
            "llm": "vibevoice_llm.mlpackage",
            "diffusion_head": "vibevoice_diffusion_head.mlpackage"
        }
        for name, filename in model_files.items():
            path = self.models_dir / filename
            if path.exists():
                try:
                    self.models[name] = ct.models.MLModel(str(path))
                    print(f"Loaded {name}")
                except Exception as e:
                    # Deliberate best-effort: keep loading remaining models.
                    print(f"Failed to load {name}: {e}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to acoustic latent.

        NOTE(review): I/O key names ("audio", "acoustic_latent") are assumed
        to match the converter's declared feature names — confirm there.
        """
        if "acoustic_encoder" not in self.models:
            raise RuntimeError("Acoustic encoder not loaded")
        output = self.models["acoustic_encoder"].predict({"audio": audio})
        return output["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode acoustic latent to audio."""
        if "acoustic_decoder" not in self.models:
            raise RuntimeError("Acoustic decoder not loaded")
        output = self.models["acoustic_decoder"].predict({"acoustic_latent": latent})
        return output["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run LLM forward pass; returns (hidden_states, logits)."""
        if "llm" not in self.models:
            raise RuntimeError("LLM not loaded")
        output = self.models["llm"].predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return output["hidden_states"], output["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step (raw model prediction)."""
        if "diffusion_head" not in self.models:
            raise RuntimeError("Diffusion head not loaded")
        output = self.models["diffusion_head"].predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return output["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim]
            num_tokens: Number of speech tokens to generate

        Returns:
            audio: Generated audio waveform
        """
        batch_size = hidden_states.shape[0]
        latent_dim = 64  # NOTE(review): assumed fixed by the converted model — confirm

        # Initialize with noise
        latents = np.random.randn(batch_size, num_tokens, latent_dim).astype(np.float32)

        # Condition each speech token on the trailing hidden states.
        condition = hidden_states[:, -num_tokens:, :]  # [batch, num_tokens, hidden_dim]

        # Diffusion loop: denoise every token at each scheduled timestep.
        for t in self.scheduler.timesteps:
            for i in range(num_tokens):
                noisy = latents[:, i, :]      # [batch, latent_dim]
                cond = condition[:, i, :]     # [batch, hidden_dim]

                # Model prediction
                pred = self.diffusion_step(noisy, float(t), cond)

                # Scheduler step
                latents[:, i, :] = self.scheduler.step(
                    pred, int(t), noisy,
                    self.config["inference"]["diffusion"]["prediction_type"]
                )

        # Decode to audio
        audio = self.decode_acoustic(latents)
        return audio


def main():
    parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", default="output.wav", help="Output audio file")
    args = parser.parse_args()

    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return

    pipeline = VibeVoicePipeline(args.models_dir)
    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")

    if args.text:
        print("Note: Full text-to-speech requires tokenizer and complete inference pipeline.")
        print("This script demonstrates individual component usage.")


if __name__ == "__main__":
    main()