# HuggingFace Space — Running on ZeroGPU
#!/usr/bin/env python3
"""
HuggingFace Space entry point for OmniVoice demo.
"""
import logging
import os
from typing import Any, Dict

# Configure logging before importing the omnivoice package so that loggers
# created at import time pick up the handler: root stays quiet (WARNING),
# the package itself is verbose (DEBUG).
logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    level=logging.WARNING,
)
logging.getLogger("omnivoice").setLevel(logging.DEBUG)

import numpy as np
import spaces
import torch

from omnivoice import OmniVoice, OmniVoiceGenerationConfig
from omnivoice.cli.demo import build_demo
# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# The checkpoint can be overridden through the OMNIVOICE_MODEL env variable.
CHECKPOINT = os.getenv("OMNIVOICE_MODEL", "k2-fsa/OmniVoice")

print(f"Loading model from {CHECKPOINT} to cuda ...")
model = OmniVoice.from_pretrained(
    CHECKPOINT,
    device_map="cuda",
    dtype=torch.float16,
    load_asr=True,
)
# Kept at module level: the Gradio audio output tuple needs the native rate.
sampling_rate = model.sampling_rate
print("Model loaded successfully!")
| # --------------------------------------------------------------------------- | |
| # Generation logic | |
| # --------------------------------------------------------------------------- | |
| def _gen_core( | |
| text, | |
| language, | |
| ref_audio, | |
| instruct, | |
| num_step, | |
| guidance_scale, | |
| denoise, | |
| speed, | |
| duration, | |
| preprocess_prompt, | |
| postprocess_output, | |
| mode, | |
| ref_text=None, | |
| ): | |
| if not text or not text.strip(): | |
| return None, "Please enter the text to synthesize." | |
| gen_config = OmniVoiceGenerationConfig( | |
| num_step=int(num_step or 32), | |
| guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0, | |
| denoise=bool(denoise) if denoise is not None else True, | |
| preprocess_prompt=bool(preprocess_prompt), | |
| postprocess_output=bool(postprocess_output), | |
| ) | |
| lang = language if (language and language != "Auto") else None | |
| kw: Dict[str, Any] = dict( | |
| text=text.strip(), language=lang, generation_config=gen_config | |
| ) | |
| if speed is not None and float(speed) != 1.0: | |
| kw["speed"] = float(speed) | |
| if duration is not None and float(duration) > 0: | |
| kw["duration"] = float(duration) | |
| if mode == "clone": | |
| if not ref_audio: | |
| return None, "Please upload a reference audio." | |
| kw["voice_clone_prompt"] = model.create_voice_clone_prompt( | |
| ref_audio=ref_audio, | |
| ref_text=ref_text, | |
| ) | |
| if mode == "design": | |
| if instruct and instruct.strip(): | |
| kw["instruct"] = instruct.strip() | |
| try: | |
| audio = model.generate(**kw) | |
| except Exception as e: | |
| return None, f"Error: {type(e).__name__}: {e}" | |
| waveform = audio[0].squeeze(0).numpy() | |
| waveform = (waveform * 32767).astype(np.int16) | |
| return (sampling_rate, waveform), "Done." | |
# ---------------------------------------------------------------------------
# ZeroGPU wrapper
# ---------------------------------------------------------------------------
@spaces.GPU
def generate_fn(*args, **kwargs):
    """ZeroGPU entry point: allocate a GPU for the duration of one request.

    The ``spaces.GPU`` decorator is what makes a ZeroGPU Space attach a GPU
    to this call; without it generation runs without a GPU and fails, and
    the ``spaces`` import above is left unused.
    """
    return _gen_core(*args, **kwargs)
# ---------------------------------------------------------------------------
# Build and launch demo
# ---------------------------------------------------------------------------
demo = build_demo(model, CHECKPOINT, generate_fn=generate_fn)

if __name__ == "__main__":
    # queue() serializes concurrent requests before launching the UI.
    demo.queue().launch()