from __future__ import annotations import gradio as gr try: import spaces except ImportError: # 'spaces' is only provided by Hugging Face Spaces import types as _types def _gpu(*args, **kwargs): if len(args) == 1 and callable(args[0]) and not kwargs: return args[0] def _decorator(func): return func return _decorator spaces = _types.SimpleNamespace(GPU=_gpu) from pyharp import * import os import torch import tempfile from huggingface_hub import hf_hub_download # Assuming 'inference.py' and its dependencies are available in the environment. # This typically means the SoulX-Singer repository is cloned or its modules are in sys.path. from inference import inference_one_song, load_model_and_config DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Download model files from Hugging Face Hub MODEL_REPO_ID = "Soul-AILab/SoulX-Singer" model_pt_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename="model.pt") config_yaml_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename="config.yaml") # Load model and config once at setup MODEL, CONFIG = load_model_and_config(model_pt_path, config_yaml_path, DEVICE) model_card = ModelCard( name="SoulX-Singer", description="SoulX-Singer is a high-fidelity, zero-shot singing voice synthesis model that enables users to generate realistic singing voices for unseen singers. It supports melody-conditioned (F0 contour) and score-conditioned (MIDI notes) control for precise pitch, rhythm, and expression.", author="Soul-AILab", tags=["huggingface_hub", "text-to-audio", "music", "singing-voice-synthesis", "svs", "zero-shot", "text-to-speech", "en", "zh", "arxiv:2602.07803", "license:apache-2.0", "region:us"], ) @spaces.GPU def process_fn(reference_audio, midi_file, lyrics, language, transpose): output_audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name inference_one_song( model=MODEL, config=CONFIG, input_path=reference_audio, output_path=output_audio_path, midi_path=midi_file, text_path=lyrics, lang=language, device=DEVICE, trans=transpose ) return output_audio_path with gr.Blocks() as demo: input_components = [ gr.Audio(type="filepath", label="Reference Audio (for timbre cloning)").harp_required(True).set_info("Upload an audio file to clone its singing timbre."), gr.File(type="filepath", label="MIDI File", file_types=[".mid", ".midi"]).harp_required(True).set_info("Upload a MIDI file to define the melody and rhythm."), gr.Textbox(label="Lyrics", info="Enter the lyrics to be sung. Ensure they match the MIDI notes.").harp_required(True), gr.Dropdown(choices=["en", "zh"], value="en", label="Language", info="Select the language of the lyrics."), gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Transpose (semitones)", info="Transpose the generated singing voice by this many semitones."), ] output_components = [ gr.Audio(type="filepath", label="Generated Singing Voice"), ] build_endpoint( model_card=model_card, input_components=input_components, output_components=output_components, process_fn=process_fn, ) demo.queue().launch(share=True, show_error=False, pwa=True)