from __future__ import annotations

import gradio as gr
try:
    import spaces
except ImportError:
    # The 'spaces' package is only provided by Hugging Face Spaces; elsewhere
    # install a stand-in whose GPU decorator leaves functions untouched.
    import types as _types

    def _gpu(*args, **kwargs):
        """No-op replacement for ``spaces.GPU``.

        Supports both the bare form (``@spaces.GPU``) and the called form
        (``@spaces.GPU(...)``). In either case the decorated function is
        returned unchanged.
        """
        used_bare = not kwargs and len(args) == 1 and callable(args[0])
        if used_bare:
            return args[0]
        # Called form: hand back a decorator that returns its target as-is.
        return lambda func: func

    spaces = _types.SimpleNamespace(GPU=_gpu)

from pyharp import *


import os
import torch
import tempfile
from huggingface_hub import hf_hub_download

# Assuming 'inference.py' and its dependencies are available in the environment.
# This typically means the SoulX-Singer repository is cloned or its modules are in sys.path.
from inference import inference_one_song, load_model_and_config

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Fetch the checkpoint and its config from the Hugging Face Hub repository.
MODEL_REPO_ID = "Soul-AILab/SoulX-Singer"
model_pt_path, config_yaml_path = (
    hf_hub_download(repo_id=MODEL_REPO_ID, filename=hub_filename)
    for hub_filename in ("model.pt", "config.yaml")
)

# Load once at import time so every request reuses the same model and config.
MODEL, CONFIG = load_model_and_config(
    model_pt_path,
    config_yaml_path,
    DEVICE,
)


# Metadata describing this model; handed to build_endpoint further down.
model_card = ModelCard(
    name="SoulX-Singer",
    description=(
        "SoulX-Singer is a high-fidelity, zero-shot singing voice synthesis "
        "model that enables users to generate realistic singing voices for "
        "unseen singers. It supports melody-conditioned (F0 contour) and "
        "score-conditioned (MIDI notes) control for precise pitch, rhythm, "
        "and expression."
    ),
    author="Soul-AILab",
    tags=[
        "huggingface_hub",
        "text-to-audio",
        "music",
        "singing-voice-synthesis",
        "svs",
        "zero-shot",
        "text-to-speech",
        "en",
        "zh",
        "arxiv:2602.07803",
        "license:apache-2.0",
        "region:us",
    ],
)


@spaces.GPU
def process_fn(reference_audio, midi_file, lyrics, language, transpose):
    """Run SoulX-Singer inference and return the path of the generated WAV.

    Args:
        reference_audio: Value of the reference-audio input; forwarded to
            ``inference_one_song`` as ``input_path``.
        midi_file: Value of the MIDI-file input; forwarded as ``midi_path``.
        lyrics: Value of the lyrics textbox; forwarded as ``text_path``.
        language: Selected language code; forwarded as ``lang``.
        transpose: Slider value in semitones; forwarded as ``trans``.

    Returns:
        Path of a newly created temporary ``.wav`` file that is passed to
        ``inference_one_song`` as ``output_path``.

    Raises:
        Exception: Anything raised by ``inference_one_song`` is re-raised
            after the temporary output file has been removed.
    """
    # Reserve a unique output path and close the handle immediately, so the
    # descriptor is not left to the garbage collector and the file can be
    # reopened by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_audio_path = tmp_file.name

    try:
        # NOTE(review): `lyrics` comes from a Textbox (raw text) but is passed
        # as `text_path` - confirm whether inference_one_song expects text or
        # a path to a text file.
        inference_one_song(
            model=MODEL,
            config=CONFIG,
            input_path=reference_audio,
            output_path=output_audio_path,
            midi_path=midi_file,
            text_path=lyrics,
            lang=language,
            device=DEVICE,
            trans=transpose,
        )
    except Exception:
        # The file was created with delete=False; remove it on failure so
        # failed requests do not accumulate stray files on disk.
        try:
            os.remove(output_audio_path)
        except OSError:
            pass  # Best-effort cleanup; the original error matters more.
        raise

    return output_audio_path


with gr.Blocks() as demo:
    # Inputs are listed in the same order as process_fn's parameters:
    # reference_audio, midi_file, lyrics, language, transpose.
    input_components = [
        gr.Audio(
            type="filepath",
            label="Reference Audio (for timbre cloning)",
        )
        .harp_required(True)
        .set_info("Upload an audio file to clone its singing timbre."),
        gr.File(
            type="filepath",
            label="MIDI File",
            file_types=[".mid", ".midi"],
        )
        .harp_required(True)
        .set_info("Upload a MIDI file to define the melody and rhythm."),
        gr.Textbox(
            label="Lyrics",
            info="Enter the lyrics to be sung. Ensure they match the MIDI notes.",
        ).harp_required(True),
        gr.Dropdown(
            choices=["en", "zh"],
            value="en",
            label="Language",
            info="Select the language of the lyrics.",
        ),
        gr.Slider(
            minimum=-12,
            maximum=12,
            step=1,
            value=0,
            label="Transpose (semitones)",
            info="Transpose the generated singing voice by this many semitones.",
        ),
    ]

    # Single output: the file path returned by process_fn.
    output_components = [
        gr.Audio(type="filepath", label="Generated Singing Voice"),
    ]

    # Connect the model card, the components and the processing function.
    build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

# Enable the request queue, then start the app.
demo.queue().launch(
    share=True,
    show_error=False,
    pwa=True,
)