from __future__ import annotations

from pathlib import Path

import numpy as np

# NOTE: torch and the project-local ``scripts.*`` modules are imported inside
# the functions that use them, so importing this module does not require them.


def load_tokenizer(repo_dir: str | Path):
    """Build the tokenizer from the token table bundled under *repo_dir*.

    The token table is expected at
    ``<repo_dir>/resources/zipvoice_hf/zipvoice/tokens.txt``.

    Args:
        repo_dir: Root directory that contains the ``resources`` folder.

    Returns:
        A ``LocalEmiliaTokenizer`` constructed with ``token_file`` set to the
        string path of ``tokens.txt``.

    Raises:
        FileNotFoundError: If ``tokens.txt`` is not an existing regular file.
    """
    token_file = (
        Path(repo_dir) / "resources" / "zipvoice_hf" / "zipvoice" / "tokens.txt"
    )
    # Validate the cheap filesystem precondition before pulling in the
    # project tokenizer module, so a missing resource is reported as such.
    if not token_file.is_file():
        raise FileNotFoundError(f"tokens.txt not found: {token_file}")

    from scripts.local_tokenizer import LocalEmiliaTokenizer

    return LocalEmiliaTokenizer(token_file=str(token_file))


def extract_prompt_features(
    prompt_wav: str | Path,
    repo_dir: str | Path,
    sampling_rate: int,
    feat_scale: float,
    target_rms: float,
):
    """Load a prompt recording and turn it into scaled features.

    Args:
        prompt_wav: Path of the prompt audio, passed to ``load_prompt_wav``.
        repo_dir: Not read by this function; kept so the signature stays
            compatible with existing callers.
        sampling_rate: Passed to both ``load_prompt_wav`` and the feature
            extractor.
        feat_scale: Multiplier applied to the extracted features.
        target_rms: Passed to ``rms_norm`` together with the loaded waveform.

    Returns:
        A ``(features, prompt_rms)`` tuple: ``features`` is a ``float32``
        NumPy array with one extra leading axis, ``prompt_rms`` is the second
        value returned by ``rms_norm`` converted to ``float``.
    """
    from scripts.local_audio import LocalVocosFbank, load_prompt_wav, rms_norm
    import torch

    wav = load_prompt_wav(prompt_wav, sampling_rate=sampling_rate)
    # presumably prompt_rms is the RMS level measured before normalisation --
    # TODO confirm against scripts.local_audio.rms_norm
    wav, prompt_rms = rms_norm(wav, target_rms)

    extractor = LocalVocosFbank()
    features = extractor.extract(wav, sampling_rate=sampling_rate)
    # The extractor may hand back something other than a tensor; in that case
    # it is treated as a NumPy array.
    if not isinstance(features, torch.Tensor):
        features = torch.from_numpy(features)

    # Add a leading axis and apply the feature scale.
    features = features.unsqueeze(0) * feat_scale
    return features.cpu().numpy().astype(np.float32), float(prompt_rms)


def load_vocoder(repo_dir: str | Path):
    """Load the local Vocos vocoder on CPU in evaluation mode.

    The model is expected in ``<repo_dir>/resources/vocos-mel-24khz`` and
    must provide both ``config.yaml`` and ``pytorch_model.bin``.

    Args:
        repo_dir: Root directory that contains the ``resources`` folder.

    Returns:
        The object returned by ``load_local_vocos``, moved to the CPU device
        and switched to ``eval()`` mode.

    Raises:
        FileNotFoundError: If either required file is missing.
    """
    vocoder_dir = Path(repo_dir) / "resources" / "vocos-mel-24khz"
    required_files = ("config.yaml", "pytorch_model.bin")
    # Check the files before importing torch / project code so a missing
    # model directory fails fast with a precise error.
    if not all((vocoder_dir / name).is_file() for name in required_files):
        raise FileNotFoundError(f"Local Vocos files not found in {vocoder_dir}")

    from scripts.local_audio import load_local_vocos
    import torch

    vocoder = load_local_vocos(vocoder_dir)
    vocoder = vocoder.to(torch.device("cpu"))
    vocoder.eval()
    return vocoder


def vocoder_decode_loaded(
    vocoder,
    features: np.ndarray,
    feat_scale: float,
    target_rms: float,
    prompt_rms: float,
) -> np.ndarray:
    """Decode features to a waveform with an already loaded vocoder.

    Args:
        vocoder: Object exposing ``decode(tensor)``, e.g. the result of
            ``load_vocoder``.
        features: 3-axis array; its last two axes are swapped before
            decoding. Assumes a (batch, frames, feature_bins) layout --
            TODO confirm against the caller.
        feat_scale: Divisor applied to the features before decoding (the
            inverse of the multiplication in ``extract_prompt_features``).
        target_rms: Passed to ``rms_norm`` and used in the final rescale.
        prompt_rms: Level of the prompt; when it is below ``target_rms`` the
            output is scaled by ``prompt_rms / target_rms``.

    Returns:
        The decoded waveform as a NumPy array with all size-1 axes removed.
    """
    from scripts.local_audio import rms_norm
    import torch

    # Swap the last two axes and undo the feature scaling.
    feat_tensor = torch.from_numpy(features).float().permute(0, 2, 1) / feat_scale

    with torch.no_grad():
        # Drop axis 1 of the decoder output and limit samples to [-1, 1].
        wav = vocoder.decode(feat_tensor).squeeze(1).clamp(-1, 1)

    wav = rms_norm(wav, target_rms)[0]
    # presumably restores the loudness of a prompt that was quieter than the
    # target -- verify against rms_norm's behaviour
    if prompt_rms < target_rms:
        wav = wav * prompt_rms / target_rms

    return wav.squeeze().cpu().numpy()