"""Gradio demo: add drum tracks to an input song with Stable Audio Open 1.0.

Takes a drum-less audio file plus a text prompt and generates a 30-second
drum-augmented version via audio-to-audio diffusion (init_audio + noise level).
"""

import functools
import os
import random
import uuid

import torch
import torchaudio
from einops import rearrange
import gradio as gr
import spaces

from stable_audio_tools.inference.generation import generate_diffusion_cond
from stable_audio_tools import get_pretrained_model

device = "cuda" if torch.cuda.is_available() else "cpu"


@functools.cache
def load_model():
    """Download Stable Audio Open 1.0 and return (model, sample_rate, sample_size).

    Cached so repeated calls (module warm-up + every inference request)
    construct the model only once instead of re-instantiating it per call.
    """
    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]
    model = model.to(device)
    return model, sample_rate, sample_size


@spaces.GPU(duration=120)
def inference(audio_path, prompt="drums beats with snares", noise_level=2.7):
    """Generate a 30 s drum track conditioned on *prompt* and the audio at
    *audio_path*, and return the path of the saved MP3 file.

    Args:
        audio_path: Filesystem path of the input (drum-less) audio clip.
        prompt: Text description of the drums to generate.
        noise_level: init_noise_level for audio-to-audio diffusion — higher
            values deviate more from the input audio.
    """
    # Only report whether the token is configured — never log the secret itself.
    hf_token = os.getenv("HF_TOKEN")
    print(f"Hugging Face token is {'set' if hf_token else 'NOT set'}")
    print(f"audio path: {audio_path}")

    model, sample_rate, sample_size = load_model()
    print(f"sample size is: {sample_size} and sample rate is: {sample_rate}.")

    # Text + timing conditioning for the diffusion model.
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 30,
    }]

    diffusion_steps = [100]
    len_in_sec = 30
    # Generate exactly 30 s of audio regardless of the model's default window.
    our_sample_size = sample_rate * len_in_sec

    with torch.no_grad():
        print(f"prompt: {prompt}")
        for steps in diffusion_steps:
            print(f"number of steps: {steps}")
            print(f"Noise level is: {noise_level}")
            audio, sr = torchaudio.load(audio_path)
            output = generate_diffusion_cond(
                model,
                steps=steps,
                cfg_scale=7,
                conditioning=conditioning,
                sample_size=our_sample_size,
                sigma_min=0.3,
                sigma_max=500,
                sampler_type="dpmpp-3m-sde",
                device=device,
                init_audio=(sr, audio),
                init_noise_level=noise_level,
            )

    # Rearrange audio batch to a single sequence.
    output = rearrange(output, "b d n -> d (b n)")
    print("rearranged the output into a single sequence")

    # Peak normalize, clip, convert to int16 for saving.
    output = output.to(torch.float32)
    peak = torch.max(torch.abs(output))
    if peak > 0:  # guard: dividing a silent (all-zero) output would produce NaNs
        output = output.div(peak)
    output = output.clamp(-1, 1).mul(32767).to(torch.int16).cpu()
    print("Normalized the output, clip and convert to int16")

    # Unique filename so concurrent requests never overwrite each other.
    unique_filename = f"output_{uuid.uuid4().hex}.mp3"
    print(f"Saving audio to file: {unique_filename}")
    torchaudio.save(unique_filename, output, sample_rate)
    print(f"saved to filename {unique_filename}")
    return unique_filename


interface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Audio without drums"),
        gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here"),
        gr.Slider(2.5, 3.5, step=0.1, value=2.7, label="Noise Level",
                  info="Choose between 2.5 and 3.5"),
    ],
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Stable Audio Generator",
    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
    examples=[
        [
            "the_chosen_ones/085838/no_drums.mp3",
            "A techno song with fast, outer space-themed drum beats.",
            2.7,
        ],
        [
            "the_chosen_ones/103522/no_drums.mp3",
            "A slow country melody accompanied by drum beats.",
            2.7,
        ],
        [
            "the_chosen_ones/103800/no_drums.mp3",
            "A rap song featuring slow, groovy drums with intermittent snares.",
            2.7,
        ],
        [
            "the_chosen_ones/103808/no_drums.mp3",
            "Smooth, slow piano grooves paired with intense, rapid drum rhythms.",
            2.7,
        ],
        [
            "the_chosen_ones/134796/no_drums.mp3",
            "A rap track with rapid drum beats and snares.",
            2.7,
        ],
    ],
    cache_examples=True,
)

# Warm the (now cached) model before serving so the first request is fast.
model, sample_rate, sample_size = load_model()
interface.launch()