import uuid

import gradio as gr
import numpy as np
import torch
import torchaudio
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

# Load the melody-conditioned MusicGen model once at startup.
# Heavy: downloads weights on first run and keeps them resident for all requests.
model = MusicGen.get_pretrained('facebook/musicgen-melody')

# --- STRUCTURE DEFINITIONS ---
# These act as "presets" for the song's roadmap: each value is the ordered
# list of sections the generator renders one after another.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": ["Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro"],
    "Club Extended (Intro-Build-Drop-Break-Drop)": ["Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro"],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": ["Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro"],
    "Custom (Single Prompt)": ["Full Track"]
}

# --- STYLE PROMPTS FOR SECTIONS ---
# How each section should sound based on the genre. Section names that have
# no entry here (e.g. "Drop/Chorus", "Verse 1") fall back to a generic prompt
# built inside the generation loop.
SECTION_PROMPTS = {
    "Bass House (Jayms Style)": {
        "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
        "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
        "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
        "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
        "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
        "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass."
    },
    "Future House (Mainstage)": {
        "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
        "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
        "Build-Up": "uplifting risers, snare build, hands in the air energy.",
        "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
        "Outro": "Stripped back drums, fading melody."
    },
    "Dubstep (Heavy)": {
        "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
        "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
        "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
        "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
        "Outro": "Dark fading textures, slow drum hit."
    }
}


def generate_structured_song(audio_input, genre_style, structure_preset,
                             total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a full track section-by-section from a short audio seed.

    The user's clip seeds the first section; each subsequent section is a
    MusicGen continuation of the tail of everything generated so far, with a
    per-section text prompt steering the energy.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, samples)``, or
            None when nothing was uploaded.
        genre_style: key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: key into STRUCTURE_PRESETS selecting the section list.
        total_duration: target song length in seconds, split evenly per section.
        bpm: tempo hint appended to every section prompt.
        temp: sampling temperature for MusicGen.
        cfg_scale: classifier-free-guidance coefficient.
        top_k: top-k sampling cutoff (cast to int before use).
        history: list of previously generated file paths (from gr.State).

    Returns:
        ``(output_path, updated_history, updated_history)`` — the list is
        returned twice so one copy feeds the Files widget and the other is
        written back into the gr.State component.
    """
    if audio_input is None:
        return None, history, history

    # 1. SETUP AUDIO INPUT (the "DNA" every section grows from)
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak  # peak-normalize so conditioning level is consistent

    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        # Mono arrives as (T,): just add the channel axis.
        # (The previous shape[0] > 1 check averaged a mono clip's samples
        # down to a single value — branch on rank instead.)
        audio_tensor = audio_tensor.unsqueeze(0)
    else:
        # Gradio delivers (T, channels); MusicGen wants (channels, T).
        # Downmix any multi-channel input to mono.
        audio_tensor = audio_tensor.t().mean(dim=0, keepdim=True)

    # MusicGen operates at 32 kHz; resample the seed if needed.
    if sr != 32000:
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    # "Custom (Single Prompt)" is just a one-section preset ("Full Track")
    # and is covered by the generic fallback prompt below.
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # Treat the user input as the seed context for the first section.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]

    print(f"Generating {len(sections)} sections...")
    generated_parts = []

    # 3. GENERATION LOOP (The Architect): one continuation per section.
    for section_name in sections:
        # Get the specific prompt for this section, falling back to a
        # generic "<genre> <section>" description when it's missing.
        if genre_style in SECTION_PROMPTS and section_name in SECTION_PROMPTS[genre_style]:
            specific_desc = SECTION_PROMPTS[genre_style][section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."

        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate a bit more than needed so sections flow into each other.
        model.set_generation_params(
            duration=sec_duration + 2,  # +2s for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k),
        )

        # Seed with the LAST 10 seconds of audio so the groove carries over
        # while the text prompt changes the energy of the new section.
        trim_len = int(10 * 32000)
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True,
        )

        # generate_continuation returns the FULL audio (seed + new);
        # keep only the newly generated part to avoid duplication.
        generated_parts.append(next_chunk[..., input_seed.shape[-1]:])
        current_context = next_chunk

    # 4. STITCHING: butt-join the new parts in order.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT with a collision-resistant name; audio_write appends ".wav".
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")

    out_path = f"{filename}.wav"  # fixed: previously returned a mangled literal path
    new_history = [out_path] + history
    return out_path, new_history, new_history


# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")

    # The third output writes the updated list back into the State so the
    # history actually accumulates across runs (it was never updated before).
    btn.click(
        generate_structured_song,
        [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
        [out_audio, history_list, history],
    )

demo.launch()