"""
VibeVoice CoreML Inference Script

This script provides inference utilities for the converted VibeVoice models.
Note: This must be run on macOS to use CoreML models.

Usage:
    python inference.py --models-dir ./models --text "Hello world"
"""
| |
|
| | import argparse |
| | import json |
| | from pathlib import Path |
| | from typing import Optional, Tuple |
| |
|
| | import numpy as np |
| |
|
| | |
# coremltools is only importable on macOS. Degrade gracefully elsewhere so
# the module can still be imported (e.g. for config inspection); callers
# check COREML_AVAILABLE before touching any CoreML model.
try:
    import coremltools as ct
    COREML_AVAILABLE = True
except ImportError:
    COREML_AVAILABLE = False
    print("Warning: coremltools not available. Running in mock mode.")
| |
|
| |
|
class DPMSolverScheduler:
    """DPM-Solver style scheduler for diffusion inference.

    Precomputes the beta/alpha noise schedules at construction time and the
    descending subset of training timesteps visited during inference.
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        """
        Args:
            num_train_timesteps: Length of the training noise schedule.
            num_inference_steps: Number of denoising steps used at inference.
            beta_schedule: "cosine" for the squared-cosine alpha-bar schedule;
                any other value falls back to a linear beta ramp.
        """
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps

        if beta_schedule == "cosine":
            # Betas derived from the squared-cosine cumulative-alpha curve,
            # clipped to keep each step's beta well below 1.
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            # Linear fallback schedule.
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)

        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)

        # Evenly spaced timesteps, descending from the end of the training
        # schedule (first entry is num_train_timesteps - 1).
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (
            num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio
        ).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Mix `original` with `noise` per the forward process at `timestep`."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Single denoising step from `timestep` toward the previous one.

        Args:
            model_output: Network prediction (v or epsilon, per prediction_type).
            timestep: Current training-schedule index.
            sample: Current noisy sample.
            prediction_type: "v_prediction" or epsilon-style prediction.
        Returns:
            The denoised sample for the previous timestep.

        Bug fix: sqrt_alpha / sqrt_one_minus_alpha were previously assigned
        only inside the v-prediction branch, so the epsilon branch raised
        NameError. They are now computed unconditionally.
        """
        alpha = self.alphas_cumprod[timestep]
        alpha_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else 1.0

        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)

        if prediction_type == "v_prediction":
            # Recover x0 and epsilon from the v-parameterization.
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # Epsilon prediction: invert the forward process to get x0.
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha

        # Re-noise x0 to the previous timestep's noise level (DDIM-style update).
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)

        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon

        return pred_sample_prev
| |
|
| |
|
class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline.

    Loads the converted CoreML components (acoustic encoder/decoder, semantic
    encoder, LLM, diffusion head) from a models directory and exposes
    per-component inference helpers plus a diffusion-based speech generator.
    """

    def __init__(self, models_dir: Path):
        """
        Args:
            models_dir: Directory containing the .mlpackage files and,
                optionally, vibevoice_pipeline_config.json.
        """
        self.models_dir = Path(models_dir)
        self.models = {}

        # Prefer the on-disk pipeline config; fall back to built-in defaults.
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        if config_path.exists():
            with open(config_path) as f:
                self.config = json.load(f)
        else:
            self.config = self._default_config()

        self.scheduler = DPMSolverScheduler(
            num_inference_steps=self.config["inference"]["diffusion"]["num_steps"]
        )

        # Models can only be loaded where coremltools imports (macOS).
        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        """Fallback pipeline configuration used when no JSON config is found."""
        return {
            "inference": {
                "audio": {"sample_rate": 24000, "downsample_factor": 3200},
                "diffusion": {"num_steps": 20, "prediction_type": "v_prediction"}
            }
        }

    def _load_models(self):
        """Load every CoreML .mlpackage present under models_dir.

        Missing packages are skipped; load failures are reported but do not
        abort, so a partial pipeline can still be exercised.
        """
        model_files = {
            "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
            "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
            "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
            "llm": "vibevoice_llm.mlpackage",
            "diffusion_head": "vibevoice_diffusion_head.mlpackage"
        }

        for name, filename in model_files.items():
            path = self.models_dir / filename
            if not path.exists():
                continue
            try:
                self.models[name] = ct.models.MLModel(str(path))
                print(f"Loaded {name}")
            except Exception as e:
                print(f"Failed to load {name}: {e}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to acoustic latent.

        Raises:
            RuntimeError: If the acoustic encoder model is not loaded.
        """
        if "acoustic_encoder" not in self.models:
            raise RuntimeError("Acoustic encoder not loaded")

        output = self.models["acoustic_encoder"].predict({"audio": audio})
        return output["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode acoustic latent to audio.

        Raises:
            RuntimeError: If the acoustic decoder model is not loaded.
        """
        if "acoustic_decoder" not in self.models:
            raise RuntimeError("Acoustic decoder not loaded")

        output = self.models["acoustic_decoder"].predict({"acoustic_latent": latent})
        return output["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run LLM forward pass; returns (hidden_states, logits).

        Raises:
            RuntimeError: If the LLM model is not loaded.
        """
        if "llm" not in self.models:
            raise RuntimeError("LLM not loaded")

        # CoreML inputs expect fixed dtypes: int32 ids, float32 mask.
        output = self.models["llm"].predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return output["hidden_states"], output["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step through the diffusion head model.

        Raises:
            RuntimeError: If the diffusion head model is not loaded.
        """
        if "diffusion_head" not in self.models:
            raise RuntimeError("Diffusion head not loaded")

        output = self.models["diffusion_head"].predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return output["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8,
        latent_dim: int = 64
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion, then decode them to audio.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim];
                seq must be >= num_tokens.
            num_tokens: Number of speech tokens to generate.
            latent_dim: Acoustic latent dimensionality (default matches the
                converted VibeVoice models).
        Returns:
            audio: Generated audio waveform.
        Raises:
            ValueError: If hidden_states has fewer than num_tokens positions.
        """
        if hidden_states.shape[1] < num_tokens:
            raise ValueError(
                f"hidden_states has {hidden_states.shape[1]} positions but "
                f"{num_tokens} speech tokens were requested"
            )

        batch_size = hidden_states.shape[0]

        # Start from pure Gaussian noise in the acoustic-latent space.
        latents = np.random.randn(batch_size, num_tokens, latent_dim).astype(np.float32)

        # Condition each speech token on the trailing LLM hidden states.
        condition = hidden_states[:, -num_tokens:, :]

        # Hoisted: the prediction type is loop-invariant.
        prediction_type = self.config["inference"]["diffusion"]["prediction_type"]

        for t in self.scheduler.timesteps:
            for i in range(num_tokens):
                noisy = latents[:, i, :]
                cond = condition[:, i, :]

                pred = self.diffusion_step(noisy, float(t), cond)

                latents[:, i, :] = self.scheduler.step(
                    pred, int(t), noisy, prediction_type
                )

        audio = self.decode_acoustic(latents)

        return audio
| |
|
| |
|
def main():
    """CLI entry point: parse arguments, build the pipeline, report its status."""
    parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", default="output.wav", help="Output audio file")

    args = parser.parse_args()

    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return

    pipeline = VibeVoicePipeline(args.models_dir)

    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")

    if args.text:
        # Plain string: the original used an f-string with no placeholders.
        print("Note: Full text-to-speech requires tokenizer and complete inference pipeline.")
        print("This script demonstrates individual component usage.")


if __name__ == "__main__":
    main()
| |
|