# Source: vibevoice-1.5-coreml / inference.py
# Uploaded by aoiandroid via huggingface_hub (commit a4e1d96, verified)
#!/usr/bin/env python3
"""
VibeVoice CoreML Inference Script
This script provides inference utilities for the converted VibeVoice models.
Note: This must be run on macOS to use CoreML models.
Usage:
python inference.py --models-dir ./models --text "Hello world"
"""
import argparse
import json
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
# CoreML is only available on macOS
try:
import coremltools as ct
COREML_AVAILABLE = True
except ImportError:
COREML_AVAILABLE = False
print("Warning: coremltools not available. Running in mock mode.")
class DPMSolverScheduler:
    """DPM-Solver-style scheduler for diffusion inference.

    Precomputes the beta/alpha schedules at construction time and exposes
    ``add_noise`` (forward process) and ``step`` (a single reverse/denoising
    step).

    Args:
        num_train_timesteps: Number of timesteps the diffusion model was
            trained with (length of the beta schedule).
        num_inference_steps: Number of denoising steps to run at inference;
            ``self.timesteps`` holds the descending training timesteps used.
        beta_schedule: ``"cosine"`` for the squared-cosine alpha-bar schedule;
            any other value falls back to a linear beta ramp.
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps
        # Compute beta schedule.
        if beta_schedule == "cosine":
            # Squared-cosine alpha-bar schedule; betas are derived from
            # consecutive alpha-bar ratios and clipped so the final steps
            # never fully destroy the signal.
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)
        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)
        # Evenly spaced training timesteps, descending from the noisiest step.
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (
            num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio
        ).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Add noise to ``original`` at ``timestep`` (forward diffusion)."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Perform a single denoising step.

        Args:
            model_output: Raw model prediction (v or epsilon, depending on
                ``prediction_type``).
            timestep: Current training timestep (index into the schedules).
            sample: Current noisy sample x_t.
            prediction_type: ``"v_prediction"``; any other value is treated
                as epsilon (noise) prediction.

        Returns:
            The estimated sample at the previous timestep.
        """
        alpha = self.alphas_cumprod[timestep]
        # NOTE(review): this steps to ``timestep - 1`` in *training* time, not
        # to the previous entry of ``self.timesteps`` — confirm this matches
        # the reference sampler.
        alpha_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else 1.0
        # Hoisted out of the v_prediction branch: the epsilon branch below
        # also needs these (previously it raised NameError — bug fix).
        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)
        if prediction_type == "v_prediction":
            # Convert v-parameterization to x0 and epsilon estimates.
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # Epsilon parameterization: the model predicts the noise directly.
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha
        # Re-noise the x0 estimate to the previous timestep's noise level.
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)
        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon
        return pred_sample_prev
class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline.

    Loads the converted CoreML model packages from ``models_dir`` (when
    coremltools is available, i.e. on macOS) and exposes the individual
    pipeline stages: acoustic encode/decode, LLM forward pass, and
    diffusion-based speech-latent generation.
    """

    def __init__(self, models_dir: Path):
        """
        Args:
            models_dir: Directory containing the ``.mlpackage`` files and,
                optionally, ``vibevoice_pipeline_config.json``.
        """
        self.models_dir = Path(models_dir)
        self.models = {}
        # Load configuration, falling back to built-in defaults.
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        if config_path.exists():
            # Explicit encoding: JSON on disk should be read as UTF-8
            # regardless of the platform's locale default.
            with open(config_path, encoding="utf-8") as f:
                self.config = json.load(f)
        else:
            self.config = self._default_config()
        # Initialize scheduler from the configured number of diffusion steps.
        self.scheduler = DPMSolverScheduler(
            num_inference_steps=self.config["inference"]["diffusion"]["num_steps"]
        )
        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        """Return the default pipeline configuration dictionary."""
        return {
            "inference": {
                "audio": {"sample_rate": 24000, "downsample_factor": 3200},
                "diffusion": {"num_steps": 20, "prediction_type": "v_prediction"}
            }
        }

    def _load_models(self):
        """Load CoreML models into ``self.models`` (best-effort).

        Missing packages are skipped silently; packages that fail to load
        are reported and skipped, so a partially-converted model directory
        still yields a usable pipeline for the stages that did load.
        """
        model_files = {
            "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
            "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
            "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
            "llm": "vibevoice_llm.mlpackage",
            "diffusion_head": "vibevoice_diffusion_head.mlpackage"
        }
        for name, filename in model_files.items():
            path = self.models_dir / filename
            if path.exists():
                try:
                    self.models[name] = ct.models.MLModel(str(path))
                    print(f"Loaded {name}")
                except Exception as e:
                    # Deliberate best-effort: report and continue with the
                    # remaining models.
                    print(f"Failed to load {name}: {e}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to acoustic latent.

        Raises:
            RuntimeError: If the acoustic encoder model is not loaded.
        """
        if "acoustic_encoder" not in self.models:
            raise RuntimeError("Acoustic encoder not loaded")
        output = self.models["acoustic_encoder"].predict({"audio": audio})
        return output["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode acoustic latent to audio.

        Raises:
            RuntimeError: If the acoustic decoder model is not loaded.
        """
        if "acoustic_decoder" not in self.models:
            raise RuntimeError("Acoustic decoder not loaded")
        output = self.models["acoustic_decoder"].predict({"acoustic_latent": latent})
        return output["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run LLM forward pass.

        Args:
            input_ids: Token ids (cast to int32 for the CoreML model).
            attention_mask: Attention mask (cast to float32).

        Returns:
            Tuple of (hidden_states, logits) from the model outputs.

        Raises:
            RuntimeError: If the LLM model is not loaded.
        """
        if "llm" not in self.models:
            raise RuntimeError("LLM not loaded")
        output = self.models["llm"].predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return output["hidden_states"], output["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step through the diffusion head model.

        Raises:
            RuntimeError: If the diffusion head model is not loaded.
        """
        if "diffusion_head" not in self.models:
            raise RuntimeError("Diffusion head not loaded")
        output = self.models["diffusion_head"].predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return output["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion, then decode to audio.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim]
            num_tokens: Number of speech tokens to generate

        Returns:
            audio: Generated audio waveform

        Raises:
            ValueError: If ``hidden_states`` has fewer than ``num_tokens``
                sequence positions.
            RuntimeError: If required models are not loaded.
        """
        batch_size = hidden_states.shape[0]
        # Validate up front: with seq < num_tokens the negative slice below
        # would yield too few condition tokens and fail with an opaque
        # IndexError deep inside the diffusion loop.
        if hidden_states.shape[1] < num_tokens:
            raise ValueError(
                f"hidden_states has {hidden_states.shape[1]} sequence positions, "
                f"but num_tokens={num_tokens} are required"
            )
        # Latent width of the acoustic codec.
        # NOTE(review): assumes an acoustic latent dim of 64 — confirm against
        # the converter's export configuration.
        latent_dim = 64
        # Initialize with noise
        latents = np.random.randn(batch_size, num_tokens, latent_dim).astype(np.float32)
        # Condition each speech token on the trailing hidden states.
        condition = hidden_states[:, -num_tokens:, :]  # [batch, num_tokens, hidden_dim]
        # Diffusion loop: denoise every token at each scheduler timestep.
        for t in self.scheduler.timesteps:
            for i in range(num_tokens):
                noisy = latents[:, i, :]  # [batch, latent_dim]
                cond = condition[:, i, :]  # [batch, hidden_dim]
                # Model prediction
                pred = self.diffusion_step(noisy, float(t), cond)
                # Scheduler step
                latents[:, i, :] = self.scheduler.step(
                    pred, int(t), noisy,
                    self.config["inference"]["diffusion"]["prediction_type"]
                )
        # Decode to audio
        audio = self.decode_acoustic(latents)
        return audio
def main():
    """CLI entry point: parse arguments, check CoreML availability, and
    initialize the pipeline (component demo only; full TTS needs a tokenizer)."""
    parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", default="output.wav", help="Output audio file")
    args = parser.parse_args()
    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return
    pipeline = VibeVoicePipeline(args.models_dir)
    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")
    if args.text:
        # Fix: dropped the f-prefix from a placeholder-free f-string (ruff F541);
        # the emitted text is unchanged.
        print("Note: Full text-to-speech requires tokenizer and complete inference pipeline.")
        print("This script demonstrates individual component usage.")


if __name__ == "__main__":
    main()