SoulX-Singer

Runtime error

App Files Files Community

SoulX-Singer / app.py

harp-dev

Deploy HARP wrapper via model agent

5ae8b71 verified 3 days ago

Raw

History Blame Contribute Delete

3.37 kB

	from __future__ import annotations

	import gradio as gr
	# Use the real 'spaces' package when it is installed; otherwise install a
	# no-op stand-in so that ``@spaces.GPU`` keeps working.
	try:
	    import spaces
	except ImportError:  # 'spaces' is only provided by Hugging Face Spaces
	    import types as _types

	    def _gpu(*args, **kwargs):
	        """No-op replacement for the ``spaces.GPU`` decorator.

	        Supports both decorator spellings:

	        * ``@spaces.GPU`` -- called with the function as the single
	          positional argument; the function is returned unchanged.
	        * ``@spaces.GPU(...)`` -- called with configuration arguments;
	          a pass-through decorator is returned.
	        """
	        # Bare usage: the only argument is the function being decorated.
	        if len(args) == 1 and callable(args[0]) and not kwargs:
	            return args[0]

	        # Parameterized usage: ignore the options and hand back an identity
	        # decorator.
	        def _decorator(func):
	            return func

	        return _decorator

	    spaces = _types.SimpleNamespace(GPU=_gpu)

	from pyharp import *


	import os
	import torch
	import tempfile
	from huggingface_hub import hf_hub_download

	# Assuming 'inference.py' and its dependencies are available in the environment.
	# This typically means the SoulX-Singer repository is cloned or its modules are in sys.path.
	from inference import inference_one_song, load_model_and_config

	# Hugging Face Hub repository that the model files are downloaded from.
	MODEL_REPO_ID = "Soul-AILab/SoulX-Singer"

	# Run on the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# Download the checkpoint first, then its config, from the Hub; each call
	# yields the value later passed to the loader as a file path.
	model_pt_path, config_yaml_path = (
	    hf_hub_download(repo_id=MODEL_REPO_ID, filename=_filename)
	    for _filename in ("model.pt", "config.yaml")
	)

	# Load the model and its config a single time at import, so every request
	# handled by the app reuses the same objects.
	MODEL, CONFIG = load_model_and_config(
	    model_pt_path,
	    config_yaml_path,
	    DEVICE,
	)


	# Metadata describing this model, passed to build_endpoint() further down.
	model_card = ModelCard(
	    name="SoulX-Singer",
	    # Adjacent literals are joined at compile time into one description.
	    description=(
	        "SoulX-Singer is a high-fidelity, zero-shot singing voice synthesis model "
	        "that enables users to generate realistic singing voices for unseen "
	        "singers. It supports melody-conditioned (F0 contour) and "
	        "score-conditioned (MIDI notes) control for precise pitch, rhythm, and "
	        "expression."
	    ),
	    author="Soul-AILab",
	    tags=[
	        "huggingface_hub",
	        "text-to-audio",
	        "music",
	        "singing-voice-synthesis",
	        "svs",
	        "zero-shot",
	        "text-to-speech",
	        "en",
	        "zh",
	        "arxiv:2602.07803",
	        "license:apache-2.0",
	        "region:us",
	    ],
	)


	@spaces.GPU
	def process_fn(reference_audio, midi_file, lyrics, language, transpose):
	    """Run SoulX-Singer inference for one request and return the output path.

	    Args:
	        reference_audio: Value of the "Reference Audio" input; forwarded as
	            ``input_path``.
	        midi_file: Value of the "MIDI File" input; forwarded as ``midi_path``.
	        lyrics: Value of the "Lyrics" textbox; forwarded as ``text_path``.
	        language: Value of the "Language" dropdown; forwarded as ``lang``.
	        transpose: Value of the "Transpose (semitones)" slider; forwarded as
	            ``trans``.

	    Returns:
	        Path of the ``.wav`` file handed to ``inference_one_song`` as its
	        ``output_path``.
	    """
	    # Reserve a unique .wav path and close the handle right away. The file
	    # must outlive this function (delete=False) so it can be returned to the
	    # caller, but the open descriptor must not leak on every request.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	        output_audio_path = tmp_file.name

	    # NOTE(review): the raw textbox content is passed as ``text_path``;
	    # confirm that inference_one_song accepts lyric text here rather than
	    # the path of a text file.
	    inference_one_song(
	        model=MODEL,
	        config=CONFIG,
	        input_path=reference_audio,
	        output_path=output_audio_path,
	        midi_path=midi_file,
	        text_path=lyrics,
	        lang=language,
	        device=DEVICE,
	        trans=transpose,
	    )

	    return output_audio_path


	# Build the Gradio app: declare the input and output components, then pass
	# them with the model card and processing function to build_endpoint().
	with gr.Blocks() as demo:
	    # Inputs are listed in the same order as process_fn's parameters.
	    input_components = [
	        # Reference recording (required).
	        gr.Audio(
	            type="filepath",
	            label="Reference Audio (for timbre cloning)",
	        )
	        .harp_required(True)
	        .set_info("Upload an audio file to clone its singing timbre."),
	        # MIDI file (required).
	        gr.File(
	            type="filepath",
	            label="MIDI File",
	            file_types=[".mid", ".midi"],
	        )
	        .harp_required(True)
	        .set_info("Upload a MIDI file to define the melody and rhythm."),
	        # Lyrics text (required).
	        gr.Textbox(
	            label="Lyrics",
	            info="Enter the lyrics to be sung. Ensure they match the MIDI notes.",
	        ).harp_required(True),
	        # Lyric language, defaulting to "en".
	        gr.Dropdown(
	            choices=["en", "zh"],
	            value="en",
	            label="Language",
	            info="Select the language of the lyrics.",
	        ),
	        # Pitch shift in whole semitones, -12 to 12.
	        gr.Slider(
	            minimum=-12,
	            maximum=12,
	            step=1,
	            value=0,
	            label="Transpose (semitones)",
	            info="Transpose the generated singing voice by this many semitones.",
	        ),
	    ]

	    # Single output: the generated audio, returned by process_fn as a path.
	    output_components = [
	        gr.Audio(type="filepath", label="Generated Singing Voice"),
	    ]

	    build_endpoint(
	        model_card=model_card,
	        input_components=input_components,
	        output_components=output_components,
	        process_fn=process_fn,
	    )

	# Enable the request queue, then start the server.
	queued_demo = demo.queue()
	queued_demo.launch(share=True, show_error=False, pwa=True)