import gradio as gr
import torch
import tempfile
import soundfile as sf
import librosa
import numpy as np

from demucs.pretrained import get_model
from demucs.apply import apply_model

# Fetch the "htdemucs" model once at import time, keep it on the CPU and
# switch it to evaluation mode so every request reuses the same instance.
model = get_model("htdemucs").cpu().eval()


def _match_channels(audio, channels):
    """Return *audio* as a ``(channels, samples)`` array.

    Mirrors the channel conversion used by Demucs' own inference code:
    mono is duplicated, extra channels are dropped, and a multi-channel
    signal is averaged when a single channel is requested.
    """
    if audio.ndim == 1:
        audio = audio[np.newaxis, :]

    present = audio.shape[0]
    if present == channels:
        return audio
    if channels == 1:
        return audio.mean(axis=0, keepdims=True)
    if present == 1:
        return np.repeat(audio, channels, axis=0)
    if present > channels:
        return audio[:channels]
    raise ValueError(
        f"Cannot convert audio with {present} channels to {channels} channels."
    )


def remove_vocals(audio_file):
    """Separate an audio file with Demucs and return the mix without vocals.

    Args:
        audio_file: Path to the audio file to process, as supplied by the
            Gradio ``Audio`` input (``type="filepath"``). It is ``None``
            when the form is submitted without an upload.

    Returns:
        Path to a temporary WAV file holding the sum of every separated
        stem except ``"vocals"``. The file is not deleted automatically.

    Raises:
        gr.Error: If no audio file was provided or the audio is empty.
        ValueError: If the channel layout cannot be converted to the one
            the model expects.
    """
    if not audio_file:
        raise gr.Error("Please upload an audio file first.")

    # Resample to the rate the model was trained on instead of a hard-coded
    # constant, so swapping the model does not silently break the pipeline.
    sample_rate = model.samplerate
    audio, _ = librosa.load(audio_file, sr=sample_rate, mono=False)

    if audio.shape[-1] == 0:
        raise gr.Error("The uploaded audio is empty.")

    # The model expects a fixed channel count (stereo for htdemucs); feeding
    # it a single-channel tensor fails, so mono input is up-mixed first.
    audio = _match_channels(audio, model.audio_channels)
    mix = torch.from_numpy(np.ascontiguousarray(audio, dtype=np.float32))

    # Standardise the input the same way Demucs' reference inference code
    # does, then undo it on the separated stems.
    reference = mix.mean(0)
    offset = reference.mean()
    scale = reference.std() + 1e-8
    mix = (mix - offset) / scale

    with torch.no_grad():
        # apply_model takes (batch, channels, samples) and returns
        # (batch, sources, channels, samples); keep the single batch item.
        sources = apply_model(model, mix.unsqueeze(0))[0]
    sources = sources * scale + offset

    # Pick stems by name rather than by position so the function keeps
    # working with models that expose a different set of sources.
    keep = [idx for idx, name in enumerate(model.sources) if name != "vocals"]
    background = sources[keep].sum(dim=0).cpu().numpy()

    # Close the temporary file handle before writing: the open handle would
    # otherwise leak and blocks re-opening the path on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        output_file = tmp.name

    # soundfile expects (samples, channels).
    sf.write(output_file, background.T, sample_rate)

    return output_file


# UI components: the input hands the upload to the callback as a file path,
# and the output plays back the file path the callback returns.
audio_input = gr.Audio(type="filepath", label="Upload Audio")
audio_output = gr.Audio(label="Background Music + Expressions")

# Wire the separation function to a single-input / single-output page.
interface = gr.Interface(
    fn=remove_vocals,
    inputs=audio_input,
    outputs=audio_output,
    title="Vocal Remover (Keep Music + Expressions)",
)

# Start the web server as soon as the module is executed.
interface.launch()