#!/usr/bin/env python3
"""
HuggingFace Space entry point for OmniVoice demo.
"""
import logging
import os
from typing import Any, Dict
# Configure logging once at import time: warnings and above for everything,
# full debug output for the omnivoice package itself.
_LOG_FORMAT = "%(asctime)s %(name)s %(levelname)s: %(message)s"
logging.basicConfig(level=logging.WARNING, format=_LOG_FORMAT)
logging.getLogger("omnivoice").setLevel(logging.DEBUG)
import numpy as np
import spaces
import torch
from omnivoice import OmniVoice, OmniVoiceGenerationConfig
from omnivoice.cli.demo import build_demo
# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# Checkpoint repo id; can be overridden via the OMNIVOICE_MODEL env var.
CHECKPOINT = os.environ.get("OMNIVOICE_MODEL", "k2-fsa/OmniVoice")
print(f"Loading model from {CHECKPOINT} to cuda ...")
# Load the model in fp16 directly onto the GPU. load_asr=True additionally
# loads the ASR component — presumably used to transcribe reference audio
# for voice cloning when no ref_text is given; TODO confirm against the
# omnivoice API docs.
model = OmniVoice.from_pretrained(
    CHECKPOINT,
    device_map="cuda",
    dtype=torch.float16,
    load_asr=True,
)
# Output sample rate of the model; reused when returning audio to Gradio.
sampling_rate = model.sampling_rate
print("Model loaded successfully!")
# ---------------------------------------------------------------------------
# Generation logic
# ---------------------------------------------------------------------------
def _gen_core(
text,
language,
ref_audio,
instruct,
num_step,
guidance_scale,
denoise,
speed,
duration,
preprocess_prompt,
postprocess_output,
mode,
ref_text=None,
):
if not text or not text.strip():
return None, "Please enter the text to synthesize."
gen_config = OmniVoiceGenerationConfig(
num_step=int(num_step or 32),
guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
denoise=bool(denoise) if denoise is not None else True,
preprocess_prompt=bool(preprocess_prompt),
postprocess_output=bool(postprocess_output),
)
lang = language if (language and language != "Auto") else None
kw: Dict[str, Any] = dict(
text=text.strip(), language=lang, generation_config=gen_config
)
if speed is not None and float(speed) != 1.0:
kw["speed"] = float(speed)
if duration is not None and float(duration) > 0:
kw["duration"] = float(duration)
if mode == "clone":
if not ref_audio:
return None, "Please upload a reference audio."
kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
ref_audio=ref_audio,
ref_text=ref_text,
)
if mode == "design":
if instruct and instruct.strip():
kw["instruct"] = instruct.strip()
try:
audio = model.generate(**kw)
except Exception as e:
return None, f"Error: {type(e).__name__}: {e}"
waveform = audio[0].squeeze(0).numpy()
waveform = (waveform * 32767).astype(np.int16)
return (sampling_rate, waveform), "Done."
# ---------------------------------------------------------------------------
# ZeroGPU wrapper
# ---------------------------------------------------------------------------
@spaces.GPU(duration=60)
def generate_fn(*args, **kwargs):
    """ZeroGPU entry point: the decorator allocates a GPU for this call
    (up to 60 s) and the body delegates unchanged to :func:`_gen_core`."""
    return _gen_core(*args, **kwargs)
# ---------------------------------------------------------------------------
# Build and launch demo
# ---------------------------------------------------------------------------
# Wire the shared demo UI to the ZeroGPU-wrapped generation function.
demo = build_demo(model, CHECKPOINT, generate_fn=generate_fn)
if __name__ == "__main__":
    # queue() enables Gradio's request queue so concurrent users are
    # serialized rather than contending for the GPU.
    demo.queue().launch()