# Source: Hugging Face Spaces file view (Space status: Running; file size: 3,758 Bytes; revision f03e375).
from __future__ import annotations
import os
import gradio as gr
from pyharp import *
from gradio_client import Client, handle_file
# Hugging Face Space that this frontend proxies synthesis requests to.
_BACKEND_SPACE = "Soul-AILab/SoulX-Singer"
# Endpoint name on the backend Space (the value supplied as `api_name` to `predict`).
_BACKEND_API_NAME = "/synthesis_function"
# Name of the environment variable read for the optional access token.
_BACKEND_TOKEN_ENV = "HF_TOKEN"
# Cached backend client; stays None until `_backend_client()` first creates it.
_client = None
def _backend_client():
    """Return the shared client for the backend Space, creating it on first use.

    The client is stored in the module-level ``_client`` so later calls reuse
    the same connection instead of reconnecting. The access token is read from
    the environment variable named by ``_BACKEND_TOKEN_ENV``; an unset or empty
    value is passed on as ``None``.
    """
    global _client

    # Fast path: a client was already built by an earlier call.
    if _client is not None:
        return _client

    token = os.environ.get(_BACKEND_TOKEN_ENV)
    if not token:
        # Normalise "unset" and "empty string" to None.
        token = None

    _client = Client(_BACKEND_SPACE, hf_token=token)
    return _client
# Metadata describing this model; passed to build_endpoint further down.
model_card = ModelCard(
    name="SoulX-Singer",
    # Split across lines for readability only; the concatenated value is unchanged.
    description=(
        "Zero-shot singing voice synthesis: sing a target melody in a "
        "reference voice. This is a thin HARP frontend that proxies to the "
        "upstream SoulX-Singer Gradio Space over gradio_client, so none of "
        "its heavy (and pyharp-conflicting) dependencies are installed here."
    ),
    author="Soul-AILab",
    tags=[
        "audio-to-audio",
        "singing-voice-synthesis",
        "zero-shot",
        "voice-conversion",
    ],
)
def process_fn(prompt_audio, target_audio, control, auto_shift, pitch_shift, seed, prompt_lyric_lang, target_lyric_lang, prompt_vocal_sep, target_vocal_sep):
    """Forward one synthesis request to the backend Space and return its first output.

    Args:
        prompt_audio: Reference clip, uploaded to the backend via ``handle_file``
            (presumably a file path from the "Reference voice" input -- confirm
            against how ``build_endpoint`` maps inputs to parameters).
        target_audio: Target clip, uploaded to the backend via ``handle_file``.
        control: Control type, forwarded unchanged.
        auto_shift: Auto pitch shift flag, forwarded unchanged.
        pitch_shift: Manual pitch shift; converted with ``int()`` before sending.
        seed: Random seed; converted with ``int()`` before sending.
        prompt_lyric_lang: Lyric language of the reference clip, forwarded unchanged.
        target_lyric_lang: Lyric language of the target clip, forwarded unchanged.
        prompt_vocal_sep: Vocal-separation flag for the reference clip, forwarded unchanged.
        target_vocal_sep: Vocal-separation flag for the target clip, forwarded unchanged.

    Returns:
        The first value returned by the backend call (presumably the path of the
        generated audio file -- confirm against the backend's API).

    Raises:
        gr.Error: If an audio input or a numeric field is missing, or if the
            backend returns no first output. In the latter case the message is
            made of any non-blank strings the backend returned, falling back to
            a generic hint.
    """
    # Fail early with a readable message instead of an opaque error from
    # handle_file(None) / int(None) when a field arrives empty.
    if not prompt_audio or not target_audio:
        raise gr.Error("Both a reference voice clip and a target melody clip are required.")
    if pitch_shift is None or seed is None:
        raise gr.Error("Pitch shift and seed must both be set to a number.")

    raw = _backend_client().predict(
        handle_file(prompt_audio),
        handle_file(target_audio),
        # Two backend inputs deliberately left unset; presumably optional
        # upstream parameters -- verify against the backend Space's API.
        None,
        None,
        control,
        auto_shift,
        int(pitch_shift),
        int(seed),
        prompt_lyric_lang,
        target_lyric_lang,
        prompt_vocal_sep,
        target_vocal_sep,
        # Use the module constant rather than repeating the literal.
        api_name=_BACKEND_API_NAME,
    )

    # The backend may return a single value or a sequence of values.
    outputs = list(raw) if isinstance(raw, (list, tuple)) else [raw]
    generated = outputs[0] if outputs else None
    if generated:
        return generated

    # Nothing usable came back: surface any text the backend did return.
    detail = " | ".join(
        value for value in outputs if isinstance(value, str) and value.strip()
    )
    raise gr.Error(detail or "The backend Space returned no 'generated' output. Check the backend Space's logs; if it uses ZeroGPU it may need a moment to warm up.")
# Build the UI. Components are created in the same order as before; the order
# of input_components presumably determines the positional order of
# process_fn's arguments -- confirm against pyharp's build_endpoint.
with gr.Blocks() as demo:
    input_components = [
        # --- audio inputs -------------------------------------------------
        (
            gr.Audio(type="filepath", label="Reference voice")
            .harp_required(True)
            .set_info("A clip of the target singer's voice (the timbre to sing with), max 30s.")
        ),
        (
            gr.Audio(type="filepath", label="Target melody")
            .harp_required(True)
            .set_info("The song/melody to reproduce in the reference voice, max 60s.")
        ),
        # --- synthesis controls -------------------------------------------
        gr.Dropdown(
            choices=["melody", "score"],
            value="melody",
            label="Control type",
            info="How SoulX follows the target: 'melody' tracks the sung pitch; 'score' follows note quantization.",
        ),
        gr.Checkbox(
            value=True,
            label="Auto pitch shift",
            info="Let the model choose the pitch shift automatically.",
        ),
        gr.Number(
            value=0,
            label="Manual pitch shift (semitones)",
            info="Used when auto pitch shift is off (-36 to 36).",
        ),
        gr.Number(
            value=12306,
            label="Seed",
            info="Random seed for reproducible generation.",
        ),
        # --- lyric languages ----------------------------------------------
        gr.Dropdown(
            choices=["Mandarin", "Cantonese", "English"],
            value="English",
            label="Reference lyric language",
        ),
        gr.Dropdown(
            choices=["Mandarin", "Cantonese", "English"],
            value="English",
            label="Target lyric language",
        ),
        # --- vocal separation toggles -------------------------------------
        gr.Checkbox(
            value=False,
            label="Separate vocals from reference",
            info="Run vocal separation on the reference clip first.",
        ),
        gr.Checkbox(
            value=True,
            label="Separate vocals from target",
            info="Run vocal separation on the target melody first.",
        ),
    ]

    output_components = [
        (
            gr.Audio(type="filepath", label="Generated singing")
            .set_info("The target melody sung in the reference voice.")
        ),
    ]

    # Register the model card, the components and the handler with pyharp.
    build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

# Enable request queuing, then start the server.
demo.queue().launch(
    share=True,
    show_error=False,
    pwa=True,
)
|