Spaces:

teamup-tech
/

SoulX-Singer

Running

File size: 3,758 Bytes

f03e375

from __future__ import annotations

import os

import gradio as gr

from pyharp import *
from gradio_client import Client, handle_file


# Identifier of the upstream Space that the backend client connects to.
_BACKEND_SPACE = "Soul-AILab/SoulX-Singer"
# Name of the backend API endpoint that performs the synthesis.
_BACKEND_API_NAME = "/synthesis_function"
# Name of the environment variable read for the (optional) access token.
_BACKEND_TOKEN_ENV = "HF_TOKEN"
# Cached backend client; stays None until _backend_client() first creates it.
_client = None


def _backend_client():
    """Return the shared backend ``Client``, connecting on first use.

    The client is kept in the module-level ``_client`` so that later calls
    reuse the same object instead of constructing a new one.
    """
    global _client
    if _client is not None:
        return _client

    # An unset or empty token variable is forwarded as ``None``.
    token = os.environ.get(_BACKEND_TOKEN_ENV)
    if not token:
        token = None

    _client = Client(_BACKEND_SPACE, hf_token=token)
    return _client


# Metadata describing this model; handed to build_endpoint further down.
model_card = ModelCard(
    name="SoulX-Singer",
    description=(
        "Zero-shot singing voice synthesis: sing a target melody in a "
        "reference voice. This is a thin HARP frontend that proxies to the "
        "upstream SoulX-Singer Gradio Space over gradio_client, so none of "
        "its heavy (and pyharp-conflicting) dependencies are installed here."
    ),
    author="Soul-AILab",
    tags=[
        "audio-to-audio",
        "singing-voice-synthesis",
        "zero-shot",
        "voice-conversion",
    ],
)


def process_fn(
    prompt_audio,
    target_audio,
    control,
    auto_shift,
    pitch_shift,
    seed,
    prompt_lyric_lang,
    target_lyric_lang,
    prompt_vocal_sep,
    target_vocal_sep,
):
    """Forward one synthesis request to the backend Space and return its output.

    Args:
        prompt_audio: File path of the reference-voice clip.
        target_audio: File path of the target-melody clip.
        control: Control type selected in the UI ("melody" or "score").
        auto_shift: Whether automatic pitch shift is enabled.
        pitch_shift: Manual pitch shift; converted to ``int`` before sending.
        seed: Random seed; converted to ``int`` before sending.
        prompt_lyric_lang: Lyric language of the reference clip.
        target_lyric_lang: Lyric language of the target clip.
        prompt_vocal_sep: Whether to run vocal separation on the reference.
        target_vocal_sep: Whether to run vocal separation on the target.

    Returns:
        The first value returned by the backend call (the generated output).

    Raises:
        gr.Error: If either audio input is missing, or if the backend's first
            returned value is empty. In the latter case the message is built
            from any non-blank strings the backend returned, if there are any.
    """
    # Fail early with a readable message instead of letting handle_file()
    # choke on a missing path.
    if not prompt_audio or not target_audio:
        raise gr.Error("Both a reference voice clip and a target melody clip are required.")

    raw = _backend_client().predict(
        handle_file(prompt_audio),
        handle_file(target_audio),
        # Two backend inputs that this frontend does not expose; left unset.
        None,
        None,
        control,
        auto_shift,
        int(pitch_shift),
        int(seed),
        prompt_lyric_lang,
        target_lyric_lang,
        prompt_vocal_sep,
        target_vocal_sep,
        # Use the shared constant rather than repeating the endpoint literal.
        api_name=_BACKEND_API_NAME,
    )

    # Normalise to a list so single-value and multi-value results are
    # handled the same way.
    values = list(raw) if isinstance(raw, (list, tuple)) else [raw]
    generated = values[0] if values else None
    if not generated:
        # Surface any textual output from the backend as the error detail.
        detail = " | ".join(
            str(value) for value in values if isinstance(value, str) and value.strip()
        )
        raise gr.Error(detail or "The backend Space returned no 'generated' output. Check the backend Space's logs; if it uses ZeroGPU it may need a moment to warm up.")
    return generated


# Build the UI. Components must be created inside the Blocks context.
with gr.Blocks() as demo:
    # The ten inputs are listed in the same order as process_fn's parameters
    # (prompt_audio ... target_vocal_sep); keep the two in sync when editing.
    # NOTE(review): presumably build_endpoint passes them positionally — confirm.
    input_components = [
        # harp_required / set_info are not standard gr.Audio methods;
        # presumably added by pyharp — verify against the pyharp version in use.
        gr.Audio(type="filepath", label="Reference voice").harp_required(True).set_info("A clip of the target singer's voice (the timbre to sing with), max 30s."),
        gr.Audio(type="filepath", label="Target melody").harp_required(True).set_info("The song/melody to reproduce in the reference voice, max 60s."),
        gr.Dropdown(choices=["melody", "score"], value="melody", label="Control type", info="How SoulX follows the target: 'melody' tracks the sung pitch; 'score' follows note quantization."),
        gr.Checkbox(value=True, label="Auto pitch shift", info="Let the model choose the pitch shift automatically."),
        # process_fn converts the two Number values to int before sending them.
        gr.Number(value=0, label="Manual pitch shift (semitones)", info="Used when auto pitch shift is off (-36 to 36)."),
        gr.Number(value=12306, label="Seed", info="Random seed for reproducible generation."),
        gr.Dropdown(choices=["Mandarin", "Cantonese", "English"], value="English", label="Reference lyric language"),
        gr.Dropdown(choices=["Mandarin", "Cantonese", "English"], value="English", label="Target lyric language"),
        gr.Checkbox(value=False, label="Separate vocals from reference", info="Run vocal separation on the reference clip first."),
        gr.Checkbox(value=True, label="Separate vocals from target", info="Run vocal separation on the target melody first."),
    ]
    # Single output: process_fn returns one value for this component.
    output_components = [
        gr.Audio(type="filepath", label="Generated singing").set_info("The target melody sung in the reference voice."),
    ]
    # Register the model card, components and processing function as one endpoint.
    build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

# Enable the request queue and start the app; this runs at import time
# because there is no __main__ guard.
demo.queue().launch(share=True, show_error=False, pwa=True)