SoulX-Singer / app.py
harp-dev's picture
Deploy HARP wrapper via model agent
5ae8b71 verified
Raw
History Blame Contribute Delete
3.37 kB
from __future__ import annotations
import gradio as gr
try:
    import spaces
except ImportError:  # 'spaces' is only provided by Hugging Face Spaces
    import types as _types

    def _gpu(*args, **kwargs):
        """No-op replacement for ``spaces.GPU`` used when ``spaces`` is missing.

        Works both as a bare decorator (``@spaces.GPU``) and as a
        parameterized one (``@spaces.GPU(...)``). In either form the
        decorated function is returned unchanged and any options are ignored.
        """
        # Bare form: the decorated callable is the only argument.
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]

        # Parameterized form: swallow the options, return an identity decorator.
        def _decorator(func):
            return func

        return _decorator

    # Expose the stub under the same attribute path the real package uses.
    spaces = _types.SimpleNamespace(GPU=_gpu)
from pyharp import *
import os
import torch
import tempfile
from huggingface_hub import hf_hub_download
# Assuming 'inference.py' and its dependencies are available in the environment.
# This typically means the SoulX-Singer repository is cloned or its modules are in sys.path.
from inference import inference_one_song, load_model_and_config
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Hugging Face Hub repository that hosts the checkpoint and its config.
MODEL_REPO_ID = "Soul-AILab/SoulX-Singer"

# Fetch both artifacts from the Hub, checkpoint first, then the YAML config.
model_pt_path, config_yaml_path = (
    hf_hub_download(repo_id=MODEL_REPO_ID, filename=artifact)
    for artifact in ("model.pt", "config.yaml")
)

# Load the model and config a single time at import so every request reuses them.
MODEL, CONFIG = load_model_and_config(model_pt_path, config_yaml_path, DEVICE)
# Metadata describing this model; passed to build_endpoint together with the
# input/output components and the processing function.
model_card = ModelCard(
    name="SoulX-Singer",
    description="SoulX-Singer is a high-fidelity, zero-shot singing voice synthesis model that enables users to generate realistic singing voices for unseen singers. It supports melody-conditioned (F0 contour) and score-conditioned (MIDI notes) control for precise pitch, rhythm, and expression.",
    author="Soul-AILab",
    tags=[
        "huggingface_hub",
        "text-to-audio",
        "music",
        "singing-voice-synthesis",
        "svs",
        "zero-shot",
        "text-to-speech",
        "en",
        "zh",
        "arxiv:2602.07803",
        "license:apache-2.0",
        "region:us",
    ],
)
@spaces.GPU
def process_fn(reference_audio, midi_file, lyrics, language, transpose):
    """Run SoulX-Singer inference for one request and return the output path.

    Args:
        reference_audio: Reference audio input, forwarded as ``input_path``.
        midi_file: MIDI input, forwarded as ``midi_path``.
        lyrics: Lyrics input, forwarded as ``text_path``.
        language: Language code, forwarded as ``lang``.
        transpose: Transposition amount, forwarded as ``trans``.

    Returns:
        Path of the temporary ``.wav`` file that was passed to
        ``inference_one_song`` as ``output_path``.
    """
    # Reserve a unique .wav path, then close the handle immediately so no file
    # descriptor is leaked and the path can be reopened for writing.
    # delete=False keeps the file on disk so it can be returned to the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        output_audio_path = tmp_wav.name

    # NOTE(review): `lyrics` comes from a Textbox (raw text) but is passed as
    # `text_path` -- confirm whether inference_one_song expects a file path.
    inference_one_song(
        model=MODEL,
        config=CONFIG,
        input_path=reference_audio,
        output_path=output_audio_path,
        midi_path=midi_file,
        text_path=lyrics,
        lang=language,
        device=DEVICE,
        trans=transpose,
    )
    return output_audio_path
with gr.Blocks() as demo:
    # Input widgets, created in the same order as process_fn's parameters.
    # harp_required / set_info are chained on the Gradio components;
    # presumably provided by pyharp -- confirm against the installed version.
    reference_audio_input = (
        gr.Audio(type="filepath", label="Reference Audio (for timbre cloning)")
        .harp_required(True)
        .set_info("Upload an audio file to clone its singing timbre.")
    )
    midi_file_input = (
        gr.File(type="filepath", label="MIDI File", file_types=[".mid", ".midi"])
        .harp_required(True)
        .set_info("Upload a MIDI file to define the melody and rhythm.")
    )
    lyrics_input = gr.Textbox(
        label="Lyrics",
        info="Enter the lyrics to be sung. Ensure they match the MIDI notes.",
    ).harp_required(True)
    language_input = gr.Dropdown(
        choices=["en", "zh"],
        value="en",
        label="Language",
        info="Select the language of the lyrics.",
    )
    transpose_input = gr.Slider(
        minimum=-12,
        maximum=12,
        step=1,
        value=0,
        label="Transpose (semitones)",
        info="Transpose the generated singing voice by this many semitones.",
    )
    input_components = [
        reference_audio_input,
        midi_file_input,
        lyrics_input,
        language_input,
        transpose_input,
    ]

    # Single output widget: the synthesized audio, exchanged as a file path.
    generated_audio_output = gr.Audio(type="filepath", label="Generated Singing Voice")
    output_components = [generated_audio_output]

    # Wire the model card, widgets and processing function into the endpoint.
    build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

# Enable request queuing, then start the app.
demo.queue().launch(share=True, show_error=False, pwa=True)