import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
import os
import tempfile
from typing import Tuple, Optional
import soundfile as sf

# Process-wide cache populated by load_model(); all three stay None until the
# first successful load. _tokenizer and _processor remain None when the
# diffusers fallback is the model that got loaded.
_model = None
_tokenizer = None
_processor = None

def load_model():
    """Lazily load the SongGen model and cache it in module globals.

    Two strategies are attempted in order:

    1. A transformers checkpoint (``songgen/songgen-base``) loaded through
       ``AutoTokenizer`` / ``AutoProcessor`` / ``AutoModelForCausalLM``.
    2. A diffusers ``DiffusionPipeline`` (``songgen/songgen-v1``).

    The cache is only written once a strategy has fully succeeded, so a
    half-finished attempt never leaves a tokenizer cached without a model.

    Returns:
        Tuple ``(model, tokenizer, processor)``. ``tokenizer`` and
        ``processor`` are ``None`` when the diffusers fallback was loaded.

    Raises:
        gr.Error: If both loading strategies fail. The second failure is
            chained as the cause.
    """
    global _model, _tokenizer, _processor

    # Only the model decides whether the cache is warm: the diffusers fallback
    # legitimately has no tokenizer, so testing the tokenizer as well would
    # force a full reload on every call.
    if _model is not None:
        return _model, _tokenizer, _processor

    print("Loading SongGen model...")

    # NOTE(review): repo ids are kept as found; confirm they exist and are
    # trusted, since trust_remote_code=True executes code from the repo.
    model_name = "songgen/songgen-base"

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        model = model.to("cpu")
        model.eval()
    except Exception as e:
        print(f"Primary model load failed: {e}")
        print("Attempting fallback to alternative SongGen implementation...")

        try:
            # diffusers is optional, so it is imported only when needed.
            # torch is deliberately NOT re-imported here: a function-level
            # `import torch` makes the name local to the whole function and
            # breaks the `torch.float32` reference in the primary path above.
            from diffusers import DiffusionPipeline

            pipeline = DiffusionPipeline.from_pretrained(
                "songgen/songgen-v1",
                torch_dtype=torch.float32,
                use_safetensors=True,
                variant="fp32",
            )
            pipeline = pipeline.to("cpu")
        except Exception as e2:
            print(f"Alternative load also failed: {e2}")
            raise gr.Error(
                f"Failed to load SongGen model. Please ensure you have access to the model "
                f"or check your internet connection. Error: {str(e2)}"
            ) from e2

        _model, _tokenizer, _processor = pipeline, None, None
        print("SongGen (diffusers) loaded successfully!")
    else:
        _model, _tokenizer, _processor = model, tokenizer, processor
        print("SongGen model loaded successfully!")

    return _model, _tokenizer, _processor

def _prepare_waveform(audio) -> np.ndarray:
    """Turn raw model output into a peak-normalized ``(frames, channels)`` array.

    Accepts a torch tensor or anything ``np.asarray`` understands. Leading
    batch dimensions are dropped by keeping the first item. For 2-D input the
    longer axis is taken to be time, so channel-first data is transposed into
    the frames-first layout that ``soundfile.write`` expects.

    Raises:
        ValueError: If the model output contains no samples.
    """
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().float().cpu().numpy()
    audio = np.atleast_1d(np.asarray(audio, dtype=np.float32))

    if audio.size == 0:
        raise ValueError("Model returned empty audio.")

    # (batch, channels, samples) and deeper -> first batch item.
    while audio.ndim > 2:
        audio = audio[0]

    if audio.ndim == 1:
        audio = audio[:, np.newaxis]
    elif audio.shape[0] < audio.shape[1]:
        # Fewer rows than columns: rows are channels, columns are samples.
        audio = audio.T

    # Peak-normalize; the epsilon keeps all-zero output from dividing by zero.
    return audio / (np.max(np.abs(audio)) + 1e-8)


def generate_music(
    prompt: str,
    duration: float,
    guidance_scale: float,
    num_inference_steps: int,
    temperature: float,
    top_k: int,
    top_p: float,
    progress: gr.Progress = gr.Progress()
) -> Tuple[Optional[str], str]:
    """
    Generate music from a text prompt and save it as a temporary WAV file.

    Depending on what ``load_model()`` returned, generation goes through
    ``model.generate`` (tokenizer available) or through a direct pipeline
    call (no tokenizer, or no ``generate`` method).

    Args:
        prompt: Text description of the music to generate
        duration: Requested duration of generated audio in seconds
        guidance_scale: Forwarded to the model as ``guidance_scale``
        num_inference_steps: Forwarded to the pipeline call (cast to int)
        temperature: Sampling temperature for ``model.generate``
        top_k: Top-k sampling parameter for ``model.generate`` (cast to int)
        top_p: Nucleus sampling parameter for ``model.generate``
        progress: Gradio progress tracker used to report stage updates

    Returns:
        Tuple of (audio_file_path, status_message). The path is ``None`` when
        the prompt is empty or generation fails; failures are reported through
        the status message instead of being raised.
    """

    if not prompt or not prompt.strip():
        return None, "❌ Please enter a prompt describing the music you want to generate."

    output_path = None
    try:
        # UI sliders may deliver floats; these two parameters must be integers.
        num_inference_steps = int(num_inference_steps)
        top_k = int(top_k)

        progress(0.1, desc="Loading SongGen model...")
        model, tokenizer, _ = load_model()

        progress(0.2, desc="Preparing inputs...")

        # Token budget assumes ~50 tokens per second of audio at 32 kHz
        # (figure carried over from the original code — TODO confirm).
        sample_rate = 32000
        num_tokens = max(1, int(duration * 50))

        formatted_prompt = f"Generate music: {prompt}. Duration: {duration}s."

        progress(0.3, desc="Encoding prompt...")

        use_generate = tokenizer is not None and hasattr(model, 'generate')
        if use_generate:
            inputs = tokenizer(
                formatted_prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

        progress(0.4, desc="Generating music with SongGen...")

        with torch.no_grad():
            if use_generate:
                # Transformers-style generation
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=num_tokens,
                    do_sample=True,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    guidance_scale=guidance_scale,
                )
                if hasattr(model, 'decode_audio'):
                    audio = model.decode_audio(outputs)
                else:
                    # Generic fallback - treat the generated ids as samples
                    audio = outputs
            else:
                # Diffusers-style pipeline call
                audio = model(
                    prompt=formatted_prompt,
                    num_inference_steps=num_inference_steps,
                    guidance_scale=guidance_scale,
                    audio_length_in_s=duration,
                ).audios[0]

        progress(0.8, desc="Processing audio...")

        waveform = _prepare_waveform(audio)
        actual_duration = waveform.shape[0] / sample_rate

        if actual_duration < duration * 0.5:
            # Surface the mismatch instead of silently ignoring it.
            print(
                f"Warning: generated audio is {actual_duration:.2f}s, "
                f"much shorter than the requested {duration}s."
            )

        progress(0.9, desc="Saving audio...")

        # Reserve a path that outlives this function so Gradio can serve it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        sf.write(output_path, waveform, sample_rate)

        progress(1.0, desc="Complete!")

        info_msg = f"""✅ Music generated successfully with SongGen!
        
🎵 **Prompt:** {prompt}
⏱️ **Duration:** {actual_duration:.2f}s
🎚️ **Sample Rate:** {sample_rate}Hz
🔧 **Settings:** guidance={guidance_scale}, steps={num_inference_steps}, temp={temperature}"""

        return output_path, info_msg

    except Exception as e:
        import traceback
        print(f"Error: {e}")
        print(traceback.format_exc())
        # Don't leave an orphaned temp file behind when saving failed.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, f"❌ Error generating music: {str(e)}\n\nPlease check that the SongGen model is properly installed and accessible."

def create_examples():
    """Return the preset rows offered in the "Quick Examples" table.

    Every row is a fresh list ordered as: prompt, duration (s), guidance
    scale, inference steps, temperature, top-k, top-p.
    """
    presets = (
        ("Upbeat electronic dance music with a strong bass line and energetic synths", 8.0, 3.0, 50, 1.0, 250, 0.99),
        ("Calm ambient piano music with soft strings, peaceful and relaxing", 10.0, 3.5, 50, 0.8, 250, 0.95),
        ("Epic orchestral soundtrack with brass and percussion, cinematic and dramatic", 10.0, 4.0, 50, 1.0, 250, 0.99),
        ("Lo-fi hip hop beats with jazzy chords, chill and study music", 8.0, 2.5, 50, 0.9, 250, 0.95),
        ("Acoustic guitar folk melody, warm and nostalgic", 6.0, 3.0, 50, 0.85, 250, 0.95),
        ("Cyberpunk synthwave with retro 80s vibes, driving and energetic", 8.0, 3.0, 50, 1.0, 250, 0.99),
    )
    return [list(row) for row in presets]

# App-wide look: start from the Soft preset (indigo/violet on slate, Inter
# font, large sizing) ...
custom_theme = gr.themes.Soft(
    font=gr.themes.GoogleFont("Inter"),
    neutral_hue="slate",
    primary_hue="indigo",
    secondary_hue="violet",
    radius_size="lg",
    spacing_size="lg",
    text_size="lg",
)
# ... then override individual theme variables for primary buttons and blocks.
custom_theme = custom_theme.set(
    block_background_fill="*neutral_50",
    block_border_color="*neutral_200",
    block_border_width="1px",
    block_title_text_size="*text_lg",
    block_title_text_weight="600",
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
)

# Gradio 6: title goes in Blocks, NOT in launch()
# Layout: header row, then a two-column row (controls on the left, audio
# output and tips on the right), followed by example presets, a footer and
# the click handler that wires the inputs to generate_music().
with gr.Blocks(title="SongGen AI Music Generator") as demo:
    # Header with branding
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            # 🎵 SongGen AI Music Generator
            
            Generate custom music from text descriptions using **SongGen** - 
            a state-of-the-art neural audio generation model.
            
            [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
            """)
    
    with gr.Row():
        # Left panel - Controls
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Generation Settings")
            
            prompt_input = gr.Textbox(
                label="Music Description",
                placeholder="Describe the music you want to generate...",
                lines=3,
                info="Be specific about genre, instruments, mood, and tempo",
                value="Upbeat electronic dance music with energetic synths and strong bass"
            )
            
            # Sampling / diffusion knobs; collapsed by default.
            with gr.Accordion("Advanced Settings", open=False):
                duration_slider = gr.Slider(
                    minimum=3,
                    maximum=20,
                    value=8,
                    step=0.5,
                    label="Duration (seconds)",
                    info="Longer durations take more time to generate"
                )
                
                guidance_slider = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.5,
                    label="Guidance Scale",
                    info="Higher = more prompt adherence, less diversity"
                )
                
                steps_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=5,
                    label="Inference Steps",
                    info="More steps = higher quality but slower"
                )
                
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="Temperature",
                    info="Higher = more random/creative"
                )
                
                # step=1 keeps the default (250), the example presets (250)
                # and the maximum (500) on the slider grid; with minimum=1 a
                # step of 10 only allows 1, 11, 21, ..., 491.
                topk_slider = gr.Slider(
                    minimum=1,
                    maximum=500,
                    value=250,
                    step=1,
                    label="Top-K",
                    info="Limits vocabulary for sampling"
                )
                
                topp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.99,
                    step=0.01,
                    label="Top-P (Nucleus)",
                    info="Cumulative probability threshold"
                )
            
            generate_btn = gr.Button(
                "🎵 Generate Music with SongGen",
                variant="primary",
                size="lg"
            )
            
            # Status and info (second return value of generate_music)
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False
            )
        
        # Right panel - Output
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Generated Music")
            
            # type="filepath": generate_music returns a path to a WAV file.
            # sample_rate matches the 32000 Hz used when the file is written.
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                autoplay=False,
                buttons=["download"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#4f46e5",
                    waveform_progress_color="#7c3aed",
                    show_recording_waveform=False,
                    sample_rate=32000
                )
            )
            
            # Tips section
            with gr.Accordion("💡 Tips for Better Results", open=True):
                gr.Markdown("""
                **Prompt Engineering Tips for SongGen:**
                
                1. **Be specific about genre:** "electronic", "classical", "jazz", "rock"
                
                2. **Mention instruments:** "piano", "synthesizers", "drums", "strings"
                
                3. **Describe the mood:** "upbeat", "melancholic", "energetic", "calm"
                
                4. **Add tempo hints:** "fast tempo", "slow ballad", "medium groove"
                
                5. **Use reference styles:** "like 80s synthwave", "cinematic soundtrack"
                
                **SongGen-specific tips:**
                - SongGen works best with clear, descriptive prompts
                - The model understands musical terminology well
                - Try mentioning specific artists or styles for guidance
                
                **Example prompts:**
                - "Upbeat pop with catchy synth melody and electronic drums"
                - "Sad piano ballad with emotional strings, slow tempo"
                - "Heavy metal with distorted guitars and aggressive drums"
                """)
    
    # Examples section: clicking a row fills the seven input components below.
    gr.Markdown("### 🎯 Quick Examples")
    
    examples = gr.Examples(
        examples=create_examples(),
        inputs=[prompt_input, duration_slider, guidance_slider, steps_slider, 
                temperature_slider, topk_slider, topp_slider],
        label="Click to load example",
        examples_per_page=3
    )
    
    # Footer
    gr.Markdown("""
    ---
    Made with ❤️ using Gradio and SongGen | 
    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """)
    
    # Event handlers: input order must match generate_music's parameter order.
    generate_btn.click(
        fn=generate_music,
        inputs=[
            prompt_input,
            duration_slider,
            guidance_slider,
            steps_slider,
            temperature_slider,
            topk_slider,
            topp_slider
        ],
        outputs=[audio_output, status_output],
        api_visibility="public"
    )

# Start the app. Under the Gradio 6 API the theme and the footer links are
# passed to launch() rather than to the Blocks constructor.
demo.launch(
    quiet=False,
    show_error=True,
    theme=custom_theme,
    footer_links=[
        dict(label="Built with anycoder", url="https://huggingface.co/spaces/akhaliq/anycoder"),
        dict(label="Gradio", url="https://gradio.app"),
        dict(label="Settings", url="#"),
    ],
)