aoiandroid committed on
Commit
a4e1d96
·
verified ·
1 Parent(s): 6bb898a

Upload inference.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. inference.py +256 -0
inference.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VibeVoice CoreML Inference Script
4
+
5
+ This script provides inference utilities for the converted VibeVoice models.
6
+ Note: This must be run on macOS to use CoreML models.
7
+
8
+ Usage:
9
+ python inference.py --models-dir ./models --text "Hello world"
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional, Tuple
16
+
17
+ import numpy as np
18
+
19
+ # CoreML is only available on macOS
20
+ try:
21
+ import coremltools as ct
22
+ COREML_AVAILABLE = True
23
+ except ImportError:
24
+ COREML_AVAILABLE = False
25
+ print("Warning: coremltools not available. Running in mock mode.")
26
+
27
+
28
class DPMSolverScheduler:
    """DPM-Solver scheduler for diffusion inference.

    Runs a deterministic (DDIM-style) denoising update over an evenly spaced
    subset of the training timesteps, supporting both "v_prediction" and
    epsilon model outputs.
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_inference_steps: int = 20,
        beta_schedule: str = "cosine"
    ):
        """
        Args:
            num_train_timesteps: Number of diffusion steps used at training time.
            num_inference_steps: Number of denoising steps to run at inference.
            beta_schedule: "cosine" for the squared-cosine schedule; any other
                value falls back to a linear schedule.
        """
        self.num_train_timesteps = num_train_timesteps
        self.num_inference_steps = num_inference_steps

        # Compute beta schedule
        if beta_schedule == "cosine":
            steps = num_train_timesteps + 1
            t = np.linspace(0, 1, steps)
            alpha_bar = np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2
            # Betas are derived from consecutive alpha_bar ratios, clipped for
            # numerical stability near t = 1.
            self.betas = np.clip(1 - alpha_bar[1:] / alpha_bar[:-1], 0, 0.999)
        else:
            self.betas = np.linspace(0.0001, 0.02, num_train_timesteps)

        self.alphas = 1 - self.betas
        self.alphas_cumprod = np.cumprod(self.alphas)

        # Evenly spaced timesteps, descending from num_train_timesteps - 1.
        step_ratio = num_train_timesteps / num_inference_steps
        self.timesteps = (num_train_timesteps - 1 - np.arange(num_inference_steps) * step_ratio).astype(np.int64)

    def add_noise(self, original: np.ndarray, noise: np.ndarray, timestep: int) -> np.ndarray:
        """Add noise to sample at given timestep (forward process q(x_t | x_0))."""
        sqrt_alpha = np.sqrt(self.alphas_cumprod[timestep])
        sqrt_one_minus_alpha = np.sqrt(1 - self.alphas_cumprod[timestep])
        return sqrt_alpha * original + sqrt_one_minus_alpha * noise

    def step(
        self,
        model_output: np.ndarray,
        timestep: int,
        sample: np.ndarray,
        prediction_type: str = "v_prediction"
    ) -> np.ndarray:
        """Single denoising step.

        Args:
            model_output: Raw model prediction (v or epsilon, per prediction_type).
            timestep: Current training-timestep index.
            sample: Current noisy sample x_t.
            prediction_type: "v_prediction" or epsilon-style (anything else).
        Returns:
            The denoised sample for the previous timestep.

        Bug fix: sqrt_alpha / sqrt_one_minus_alpha were previously computed
        only in the v-prediction branch, so any epsilon-style call raised
        NameError. They are now computed unconditionally before branching.
        """
        alpha = self.alphas_cumprod[timestep]
        # NOTE(review): alpha_prev uses timestep - 1 rather than the previous
        # entry of self.timesteps — presumably an intentional simplification;
        # confirm against the reference scheduler.
        alpha_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else 1.0

        sqrt_alpha = np.sqrt(alpha)
        sqrt_one_minus_alpha = np.sqrt(1 - alpha)

        if prediction_type == "v_prediction":
            # v-parameterization: recover x0 and epsilon from v.
            pred_original = sqrt_alpha * sample - sqrt_one_minus_alpha * model_output
            pred_epsilon = sqrt_alpha * model_output + sqrt_one_minus_alpha * sample
        else:
            # Epsilon parameterization: the model predicts the noise directly.
            pred_epsilon = model_output
            pred_original = (sample - sqrt_one_minus_alpha * pred_epsilon) / sqrt_alpha

        # Deterministic (eta = 0) update to the previous timestep.
        sqrt_alpha_prev = np.sqrt(alpha_prev)
        sqrt_one_minus_alpha_prev = np.sqrt(1 - alpha_prev)

        pred_sample_prev = sqrt_alpha_prev * pred_original + sqrt_one_minus_alpha_prev * pred_epsilon

        return pred_sample_prev
90
+
91
+
92
class VibeVoicePipeline:
    """VibeVoice CoreML inference pipeline."""

    def __init__(self, models_dir: Path):
        self.models_dir = Path(models_dir)
        self.models = {}

        # Prefer the on-disk pipeline config; otherwise use built-in defaults.
        config_path = self.models_dir / "vibevoice_pipeline_config.json"
        self.config = (
            json.loads(config_path.read_text())
            if config_path.exists()
            else self._default_config()
        )

        # Scheduler step count comes from the (possibly default) config.
        diffusion_cfg = self.config["inference"]["diffusion"]
        self.scheduler = DPMSolverScheduler(num_inference_steps=diffusion_cfg["num_steps"])

        if COREML_AVAILABLE:
            self._load_models()

    def _default_config(self):
        """Fallback configuration used when no config file is present."""
        return {
            "inference": {
                "audio": {"sample_rate": 24000, "downsample_factor": 3200},
                "diffusion": {"num_steps": 20, "prediction_type": "v_prediction"}
            }
        }

    def _load_models(self):
        """Load CoreML models."""
        model_files = {
            "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
            "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
            "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
            "llm": "vibevoice_llm.mlpackage",
            "diffusion_head": "vibevoice_diffusion_head.mlpackage"
        }

        # Load whichever packages exist on disk; a missing or broken package
        # is reported but does not abort pipeline construction.
        for name, filename in model_files.items():
            package_path = self.models_dir / filename
            if not package_path.exists():
                continue
            try:
                self.models[name] = ct.models.MLModel(str(package_path))
            except Exception as e:
                print(f"Failed to load {name}: {e}")
            else:
                print(f"Loaded {name}")

    def encode_acoustic(self, audio: np.ndarray) -> np.ndarray:
        """Encode audio to acoustic latent."""
        encoder = self.models.get("acoustic_encoder")
        if encoder is None:
            raise RuntimeError("Acoustic encoder not loaded")
        return encoder.predict({"audio": audio})["acoustic_latent"]

    def decode_acoustic(self, latent: np.ndarray) -> np.ndarray:
        """Decode acoustic latent to audio."""
        decoder = self.models.get("acoustic_decoder")
        if decoder is None:
            raise RuntimeError("Acoustic decoder not loaded")
        return decoder.predict({"acoustic_latent": latent})["audio"]

    def run_llm(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run LLM forward pass, returning (hidden_states, logits)."""
        llm = self.models.get("llm")
        if llm is None:
            raise RuntimeError("LLM not loaded")

        result = llm.predict({
            "input_ids": input_ids.astype(np.int32),
            "attention_mask": attention_mask.astype(np.float32)
        })
        return result["hidden_states"], result["logits"]

    def diffusion_step(
        self,
        noisy_latent: np.ndarray,
        timestep: float,
        condition: np.ndarray
    ) -> np.ndarray:
        """Single diffusion denoising step."""
        head = self.models.get("diffusion_head")
        if head is None:
            raise RuntimeError("Diffusion head not loaded")

        result = head.predict({
            "noisy_latent": noisy_latent.astype(np.float32),
            "timestep": np.array([timestep], dtype=np.float32),
            "condition": condition.astype(np.float32)
        })
        return result["prediction"]

    def generate_speech(
        self,
        hidden_states: np.ndarray,
        num_tokens: int = 8
    ) -> np.ndarray:
        """
        Generate speech latents using diffusion.

        Args:
            hidden_states: LLM hidden states [batch, seq, hidden_dim]
                (assumes seq >= num_tokens — TODO confirm at call sites)
            num_tokens: Number of speech tokens to generate
        Returns:
            audio: Generated audio waveform
        """
        batch_size = hidden_states.shape[0]
        latent_dim = 64

        # Start from pure Gaussian noise.
        latents = np.random.randn(batch_size, num_tokens, latent_dim).astype(np.float32)

        # Conditioning comes from the trailing hidden states, one per token.
        condition = hidden_states[:, -num_tokens:, :]  # [batch, num_tokens, hidden_dim]

        prediction_type = self.config["inference"]["diffusion"]["prediction_type"]

        # Denoise every token position at each scheduler timestep.
        for t in self.scheduler.timesteps:
            for token_idx in range(num_tokens):
                noisy = latents[:, token_idx, :]      # [batch, latent_dim]
                cond = condition[:, token_idx, :]     # [batch, hidden_dim]

                pred = self.diffusion_step(noisy, float(t), cond)

                latents[:, token_idx, :] = self.scheduler.step(
                    pred, int(t), noisy, prediction_type
                )

        # Convert the denoised latents back into a waveform.
        return self.decode_acoustic(latents)
232
+
233
+
234
def main():
    """CLI entry point: load the CoreML pipeline and report loaded models.

    Exits early (without error) when coremltools is unavailable, since the
    models can only run on macOS.
    """
    parser = argparse.ArgumentParser(description="VibeVoice CoreML Inference")
    parser.add_argument("--models-dir", required=True, help="Directory with CoreML models")
    parser.add_argument("--text", help="Text to synthesize")
    parser.add_argument("--output", default="output.wav", help="Output audio file")

    args = parser.parse_args()

    if not COREML_AVAILABLE:
        print("CoreML is only available on macOS. Exiting.")
        return

    pipeline = VibeVoicePipeline(args.models_dir)

    print(f"Pipeline initialized with models: {list(pipeline.models.keys())}")

    if args.text:
        # Fix: removed the f-prefix from a placeholder-free literal (ruff F541);
        # output is unchanged.
        print("Note: Full text-to-speech requires tokenizer and complete inference pipeline.")
        print("This script demonstrates individual component usage.")
253
+
254
+
255
# Script entry point: parse CLI arguments and run the demo pipeline.
if __name__ == "__main__":
    main()