Upload inference.py with huggingface_hub
inference.py +256 -0
inference.py
ADDED
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
VibeVoice CoreML Inference Script

This script provides inference utilities for the converted VibeVoice models.
Note: This must be run on macOS to use CoreML models.

Usage:
    python inference.py --models-dir ./models --text "Hello world"
"""

import argparse
import json
from pathlib import Path
from typing import Tuple

import numpy as np

# CoreML is only available on macOS
try:
    import coremltools as ct
    COREML_AVAILABLE = True
except ImportError:
    COREML_AVAILABLE = False
    print("Warning: coremltools not available. Running in mock mode.")

class DPMSolverScheduler:
    """DPM-Solver-style scheduler for diffusion inference."""

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps

        # Compute beta schedule
        if beta_schedule == "cosine":
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)

        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)

        # Evenly spaced inference timesteps, descending from the noisiest step
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Add noise to a sample at the given timestep."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Single denoising step (DDIM-style update)."""
        # Step back by the inference stride, not by 1: the schedule skips
        # step_ratio training timesteps per inference step.
        prev_timestep = timestep - self.num_train_timesteps // self.num_inference_steps
        alpha = self.alphas_cumprod[timestep]
        alpha_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else 1.0

        # Shared factors, needed by both prediction types
        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)

        if prediction_type == "v_prediction":
            # Convert v to x0 and epsilon
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # epsilon prediction
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha

        # Compute previous sample
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)

        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon

        return pred_sample_prev

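
# Illustrative sketch (not part of the original script): a minimal round
# trip through the scheduler above. The (1, 64) latent shape is an
# assumption matching generate_speech() below, not a scheduler requirement.
def _demo_scheduler_roundtrip() -> None:
    sched = DPMSolverScheduler(num_inference_steps=20)
    rng = np.random.default_rng(0)
    clean = rng.standard_normal((1, 64)).astype(np.float32)
    noise = rng.standard_normal((1, 64)).astype(np.float32)
    t = int(sched.timesteps[0])  # noisiest timestep in the schedule
    noisy = sched.add_noise(clean, noise, t)
    # With epsilon prediction, feeding the true noise back as the "model
    # output" steps the sample toward the clean latent.
    prev = sched.step(noise, t, noisy, prediction_type="epsilon")
    print(f"mean |noisy| = {np.abs(noisy).mean():.3f}, mean |prev| = {np.abs(prev).mean():.3f}")
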
class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline."""

    def __init__(self, models_dir: Path):
        self.models_dir = Path(models_dir)
        self.models = {}

        # Load configuration
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        if config_path.exists():
            with open(config_path) as f:
                self.config = json.load(f)
        else:
            self.config = self._default_config()

        # Initialize scheduler
        self.scheduler = DPMSolverScheduler(
            num_inference_steps=self.config["inference"]["diffusion"]["num_steps"]
        )

        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        return {
            "inference": {
                "audio": {"sample_rate": 24000, "downsample_factor": 3200},
                "diffusion": {"num_steps": 20, "prediction_type": "v_prediction"}
            }
        }

    def _load_models(self):
        """Load CoreML models."""
        model_files = {
            "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
            "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
            "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
            "llm": "vibevoice_llm.mlpackage",
            "diffusion_head": "vibevoice_diffusion_head.mlpackage"
        }

        for name, filename in model_files.items():
            path = self.models_dir / filename
            if path.exists():
                try:
                    self.models[name] = ct.models.MLModel(str(path))
                    print(f"Loaded {name}")
                except Exception as e:
                    print(f"Failed to load {name}: {e}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to an acoustic latent."""
        if "acoustic_encoder" not in self.models:
            raise RuntimeError("Acoustic encoder not loaded")

        output = self.models["acoustic_encoder"].predict({"audio": audio})
        return output["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode an acoustic latent to audio."""
        if "acoustic_decoder" not in self.models:
            raise RuntimeError("Acoustic decoder not loaded")

        output = self.models["acoustic_decoder"].predict({"acoustic_latent": latent})
        return output["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run the LLM forward pass."""
        if "llm" not in self.models:
            raise RuntimeError("LLM not loaded")

        output = self.models["llm"].predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return output["hidden_states"], output["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step."""
        if "diffusion_head" not in self.models:
            raise RuntimeError("Diffusion head not loaded")

        output = self.models["diffusion_head"].predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return output["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion, then decode them to audio.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim]
            num_tokens: Number of speech tokens to generate

        Returns:
            audio: Generated audio waveform
        """
        batch_size = hidden_states.shape[0]
        latent_dim = 64  # acoustic latent dimensionality

        # Initialize with noise
        latents = np.random.randn(batch_size, num_tokens, latent_dim).astype(np.float32)

        # Get condition from the last hidden states
        condition = hidden_states[:, -num_tokens:, :]  # [batch, num_tokens, hidden_dim]

        # Diffusion loop
        for t in self.scheduler.timesteps:
            for i in range(num_tokens):
                noisy = latents[:, i, :]    # [batch, latent_dim]
                cond = condition[:, i, :]   # [batch, hidden_dim]

                # Model prediction
                pred = self.diffusion_step(noisy, float(t), cond)

                # Scheduler step
                latents[:, i, :] = self.scheduler.step(
                    pred, int(t), noisy,
                    self.config["inference"]["diffusion"]["prediction_type"]
                )

        # Decode to audio
        audio = self.decode_acoustic(latents)

        return audio

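
# Illustrative sketch (not called by main): wiring the pipeline components
# end to end. The dummy input_ids / attention_mask are placeholders -- a
# real run needs the model's matching tokenizer, and the (1, 16) sequence
# shape is an assumption, not a requirement of the converted models.
def _demo_pipeline(models_dir: str) -> None:
    pipeline = VibeVoicePipeline(Path(models_dir))
    input_ids = np.ones((1, 16), dtype=np.int32)         # stand-in token ids
    attention_mask = np.ones((1, 16), dtype=np.float32)
    hidden_states, _logits = pipeline.run_llm(input_ids, attention_mask)
    audio = pipeline.generate_speech(hidden_states, num_tokens=8)
    print(f"Generated audio with shape {audio.shape}")
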
def main():
    parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", default="output.wav", help="Output audio file (not yet written by this demo)")

    args = parser.parse_args()

    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return

    pipeline = VibeVoicePipeline(args.models_dir)

    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")

    if args.text:
        print("Note: Full text-to-speech requires a tokenizer and the complete inference pipeline.")
        print("This script demonstrates individual component usage.")

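
# Illustrative helper (not called by main): writing generated audio to the
# --output path as 16-bit PCM with the standard-library wave module. The
# mono layout and the roughly [-1, 1] float range of the decoder output are
# assumptions; the 24 kHz rate matches the default pipeline config.
def _save_wav(audio: np.ndarray, path: str, sample_rate: int = 24000) -> None:
    import wave

    pcm = (np.clip(audio.reshape(-1), -1.0, 1.0) * 32767.0).astype(np.int16)
    with wave.open(path, "wb") as f:
        f.setnchannels(1)            # mono
        f.setsampwidth(2)            # 16-bit samples
        f.setframerate(sample_rate)
        f.writeframes(pcm.tobytes())
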
if __name__ == "__main__":
    main()