#!/usr/bin/env python3
"""HuggingFace Space entry point for OmniVoice demo.

Loads the OmniVoice model onto CUDA at import time, wraps the generation
routine for ZeroGPU scheduling, and launches the Gradio demo built by
``omnivoice.cli.demo.build_demo``.
"""

import logging
import os
from typing import Any, Dict

# Configure logging (and the package logger level) before the heavy imports
# below so that any log output emitted during import is already formatted.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)
logging.getLogger("omnivoice").setLevel(logging.DEBUG)

import numpy as np
import spaces
import torch

from omnivoice import OmniVoice, OmniVoiceGenerationConfig
from omnivoice.cli.demo import build_demo

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
CHECKPOINT = os.environ.get("OMNIVOICE_MODEL", "k2-fsa/OmniVoice")

print(f"Loading model from {CHECKPOINT} to cuda ...")
model = OmniVoice.from_pretrained(
    CHECKPOINT,
    device_map="cuda",
    dtype=torch.float16,
    load_asr=True,
)
sampling_rate = model.sampling_rate
print("Model loaded successfully!")


# ---------------------------------------------------------------------------
# Generation logic
# ---------------------------------------------------------------------------
def _gen_core(
    text,
    language,
    ref_audio,
    instruct,
    num_step,
    guidance_scale,
    denoise,
    speed,
    duration,
    preprocess_prompt,
    postprocess_output,
    mode,
    ref_text=None,
):
    """Run one synthesis request and return Gradio-friendly outputs.

    Args:
        text: Text to synthesize (required, whitespace-only is rejected).
        language: Language name, or "Auto"/empty for auto-detection.
        ref_audio: Reference audio for voice cloning (used when mode=="clone").
        instruct: Voice-design instruction (used when mode=="design").
        num_step: Diffusion/sampling steps; defaults to 32 when falsy.
        guidance_scale: CFG scale; defaults to 2.0 when None.
        denoise: Whether to denoise; defaults to True when None.
        speed: Speaking-rate multiplier; omitted from kwargs when 1.0.
        duration: Target duration in seconds; omitted when <= 0 or None.
        preprocess_prompt / postprocess_output: Pipeline toggles.
        mode: One of "clone" / "design" / anything else (plain TTS).
        ref_text: Optional transcript of the reference audio.

    Returns:
        ``((sampling_rate, int16_waveform), status_message)`` on success, or
        ``(None, error_message)`` on invalid input or generation failure.
    """
    if not text or not text.strip():
        return None, "Please enter the text to synthesize."

    gen_config = OmniVoiceGenerationConfig(
        num_step=int(num_step or 32),
        guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
        denoise=bool(denoise) if denoise is not None else True,
        preprocess_prompt=bool(preprocess_prompt),
        postprocess_output=bool(postprocess_output),
    )
    lang = language if (language and language != "Auto") else None
    kw: Dict[str, Any] = dict(
        text=text.strip(), language=lang, generation_config=gen_config
    )
    if speed is not None and float(speed) != 1.0:
        kw["speed"] = float(speed)
    if duration is not None and float(duration) > 0:
        kw["duration"] = float(duration)

    if mode == "clone":
        if not ref_audio:
            return None, "Please upload a reference audio."
        kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
            ref_audio=ref_audio,
            ref_text=ref_text,
        )
    elif mode == "design":  # modes are mutually exclusive; was two separate ifs
        if instruct and instruct.strip():
            kw["instruct"] = instruct.strip()

    # Broad catch is deliberate: this is the UI boundary, and the message is
    # surfaced to the user instead of crashing the Space.
    try:
        audio = model.generate(**kw)
    except Exception as e:
        return None, f"Error: {type(e).__name__}: {e}"

    # Bug fixes vs. the original:
    #  - The model runs on CUDA in fp16, and `.numpy()` raises on a CUDA
    #    tensor; `.detach().cpu().float()` is safe either way (no-op on a
    #    CPU/float32 tensor). NOTE(review): assumes `audio[0]` is a torch
    #    tensor of shape (1, num_samples) — confirm against model.generate.
    #  - Clip to [-1, 1] before int16 conversion so overshooting samples
    #    saturate instead of wrapping around (audible clicks).
    waveform = audio[0].squeeze(0).detach().cpu().float().numpy()
    waveform = np.clip(waveform, -1.0, 1.0)
    waveform = (waveform * 32767.0).astype(np.int16)
    return (sampling_rate, waveform), "Done."


# ---------------------------------------------------------------------------
# ZeroGPU wrapper
# ---------------------------------------------------------------------------
@spaces.GPU(duration=60)
def generate_fn(*args, **kwargs):
    """ZeroGPU entry point: runs `_gen_core` inside a 60 s GPU allocation."""
    return _gen_core(*args, **kwargs)


# ---------------------------------------------------------------------------
# Build and launch demo
# ---------------------------------------------------------------------------
demo = build_demo(model, CHECKPOINT, generate_fn=generate_fn)

if __name__ == "__main__":
    demo.queue().launch()