import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import random

# Check GPU availability once at import time.
use_gpu = torch.cuda.is_available()

# Models are loaded lazily on first use (see load_models()).
processor, model, zephyr_generator = None, None, None


def load_models():
    """Load the BLIP captioning model and the SARA-Zephyr text-generation
    pipeline on first use.

    Safe to call repeatedly: it is a no-op once all three globals are set.
    Every function that touches ``zephyr_generator`` must call this first,
    otherwise the pipeline may still be ``None``.
    """
    global processor, model, zephyr_generator
    if processor is None or model is None or zephyr_generator is None:
        print("Loading BLIP model...")
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large",
            torch_dtype=torch.float32  # Use float32 for CPU
        )
        print("āœ… BLIP model loaded successfully!")

        print("Loading SARA-Zephyr fine-tuned model...")
        zephyr_generator = pipeline(
            "text-generation",
            model="Malaji71/SARA-Zephyr",  # fine-tuned model
            torch_dtype=torch.float32,  # Use float32 for CPU
            device_map="auto" if use_gpu else None  # auto device mapping only on GPU
        )
        print("āœ… SARA-Zephyr fine-tuned model loaded successfully!")


# Universal Video Prompting Guide combining Gen-4 + SARA (rendered as Markdown in Tab 1).
unified_instructions = """
# šŸŽ¬ Universal Video Prompting Guide

*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*

## Core Principles (Universal)

āœ… **Focus on MOTION, not static description**
āœ… **Use positive phrasing exclusively**
āœ… **Start simple, iterate progressively**
āœ… **Refer to subjects in general terms** ("the subject," "the woman")
āœ… **Keep prompts direct and easily understood**

## Two Complementary Approaches

### šŸš€ **Gen-4 Official Method** (Recommended for beginners)

**Structure**: Simple iterative building

1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions

**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."

### šŸŽÆ **SARA Framework** (Advanced precision)

**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]

- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style

**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]

**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
"""


def analyze_image_with_zephyr(image):
    """Analyze an uploaded image with BLIP (caption) + SARA-Zephyr (insights).

    Args:
        image: PIL Image or numpy array from the Gradio image input; may be None.

    Returns:
        Tuple of (markdown analysis text, scene_info dict). On failure the
        dict is empty so downstream handlers fall back gracefully.
    """
    if image is None:
        return "Please upload an image first.", {}

    try:
        # Lazy load models
        load_models()

        # Convert to PIL if Gradio handed us a numpy array
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Classify composition from the aspect ratio
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Generate caption with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Use Zephyr for advanced analysis
        enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)

        # Create comprehensive analysis
        analysis = f"""šŸ“Š **Image Analysis:**
• **Dimensions**: {width} x {height}
• **Composition**: {composition}
• **Aspect Ratio**: {aspect_ratio:.2f}

šŸŽØ **Scene Description**: "{basic_caption}"

šŸ¤– **AI Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}

šŸ’” **Motion Insights**:
{chr(10).join(f"• {insight}" for insight in enhanced_analysis['motion_insights'])}

šŸŽÆ **Recommended Approach**: {enhanced_analysis['recommended_approach']}"""

        # Scene info consumed by the prompt-generation handlers
        scene_info = {
            'basic_description': basic_caption,
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'enhanced_analysis': enhanced_analysis
        }

        return analysis, scene_info

    except Exception as e:
        return f"Error analyzing image: {str(e)}", {}


def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
    """Run SARA-Zephyr over a BLIP caption to extract motion/camera insights.

    Returns a dict with 'scene_interpretation', 'motion_insights' (at most 6
    strings) and 'recommended_approach'.
    """
    # Guard: this may be called from handlers that didn't load models yet.
    load_models()

    analysis_prompt = f"""<|system|>
You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
<|user|>
Image description: "{basic_caption}"
Image composition: {composition}
Aspect ratio: {aspect_ratio:.2f}

Please provide:
1. Type of motion that would work best
2. Recommended camera movements
3. Emotional tone/style suggestions
4. Best prompting approach (SARA vs Gen-4)

Be concise and practical.
<|assistant|>"""

    response = zephyr_generator(
        analysis_prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Keep only the assistant's continuation, discarding the prompt echo.
    ai_analysis = response[0]['generated_text'].split("<|assistant|>")[-1].strip()

    lines = ai_analysis.split('\n')
    motion_insights = []
    recommended_approach = "SARA framework recommended for precise control"

    for line in lines:
        if line.strip():
            if any(keyword in line.lower() for keyword in ['motion', 'movement', 'camera', 'lighting']):
                motion_insights.append(line.strip('- ').strip())
            elif 'sara' in line.lower() or 'gen-4' in line.lower():
                recommended_approach = line.strip('- ').strip()

    return {
        'scene_interpretation': ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed",
        'motion_insights': motion_insights[:6],
        'recommended_approach': recommended_approach
    }


def generate_sample_prompts_with_zephyr(scene_info=None):
    """Generate three sample video prompts, contextual when scene info exists.

    Fix 1: wrap the Zephyr call in try/except so the static fallback prompts
    are actually used when generation fails (the old code documented a
    fallback but let exceptions propagate).
    Fix 2: call load_models() before using the pipeline (it is None until
    the first image analysis).
    Fix 3: formatting a missing aspect ratio used ``'N/A':.2f`` which raises
    ValueError; format the number only when it is numeric.
    """
    if scene_info and scene_info.get('basic_description'):
        try:
            load_models()

            ar = scene_info.get('aspect_ratio')
            ar_text = f"{ar:.2f}" if isinstance(ar, (int, float)) else "N/A"

            context_prompt = f"""<|system|>
Generate 3 professional video prompts using the SARA framework based on this image analysis.
<|user|>
Image description: {scene_info['basic_description']}
Composition: {scene_info.get('composition', 'Balanced')}
Aspect Ratio: {ar_text}

Remember the SARA framework: Subject + Action + Reference + Atmosphere
<|assistant|>"""

            response = zephyr_generator(
                context_prompt,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.8,
                pad_token_id=zephyr_generator.tokenizer.eos_token_id
            )

            # Extract and clean prompts
            prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
            prompts = [p.strip('123.-• ') for p in prompts_text.split('\n') if p.strip()]

            # Return first 3 clean prompts
            if len(prompts) >= 3:
                return prompts[:3]
        except Exception as e:
            print(f"Sample prompt generation failed, using fallbacks: {e}")

    # Fallback prompts if Zephyr fails or no scene info
    base_prompts = [
        "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
        "A dramatic close-up captures the subject's expression as they speak directly to the camera.",
        "The scene transitions with a handheld camera following the subject through a bustling environment."
    ]
    return base_prompts


def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
    """Turn a free-form user idea into a SARA-structured video prompt.

    Fix: call load_models() first — this handler is reachable without a prior
    image analysis, in which case ``zephyr_generator`` was still None and the
    old code crashed with ``TypeError: 'NoneType' object is not callable``.
    """
    if not user_idea.strip():
        return "Please enter your idea first."

    try:
        load_models()
    except Exception as e:
        return f"Error loading models: {str(e)}"

    # Create context from scene if available
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    optimization_prompt = f"""<|system|>
You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.

Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
User's idea: "{user_idea}"
{context}

Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
<|assistant|>"""

    response = zephyr_generator(
        optimization_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Extract optimized prompt
    optimized = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    return optimized


def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
    """Refine ``current_prompt`` from user feedback; append to chat history.

    Fix 1: call load_models() — reachable before any image analysis.
    Fix 2: the chat display is ``gr.Chatbot(type='messages')``, which requires
    ``{'role': ..., 'content': ...}`` dicts; the old code appended
    ``[feedback, refined]`` pairs, which that component rejects.
    """
    if not feedback.strip():
        return current_prompt, chat_history

    try:
        load_models()
    except Exception:
        return current_prompt, chat_history

    # Create refinement context
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    refinement_prompt = f"""<|system|>
You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.

Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
Current prompt: "{current_prompt}"
Feedback: "{feedback}"
{context}

Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
<|assistant|>"""

    response = zephyr_generator(
        refinement_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Extract refined prompt
    refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()

    # Update chat history in the messages format gr.Chatbot(type='messages') expects
    new_chat_history = chat_history + [
        {"role": "user", "content": feedback},
        {"role": "assistant", "content": refined},
    ]

    return refined, new_chat_history


def generate_gen4_prompts(scene_info, foundation=""):
    """Build a Gen-4-style prompt iteratively (basic → motion → camera → style).

    Note: ``foundation`` is accepted for interface compatibility with the
    Gradio handler but is not currently used in prompt construction.
    """
    try:
        if scene_info and scene_info.get('basic_description'):
            description = scene_info['basic_description']

            # Detect subject from the caption text
            if 'man' in description.lower():
                subject = "The man"
            elif 'woman' in description.lower():
                subject = "The woman"
            elif 'person' in description.lower():
                subject = "The person"
            else:
                subject = "The subject"

            # Pick plausible actions based on scene keywords
            if any(word in description.lower() for word in ['sitting', 'seated']):
                actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
            elif any(word in description.lower() for word in ['standing', 'portrait']):
                actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
            else:
                actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']

            action = random.choice(actions)

            # Build Gen-4 iteratively
            basic = f"{subject} {action}"
            with_motion = f"{basic} smoothly"
            with_camera = f"{with_motion}. Camera captures steadily"

            # Add style based on composition
            composition = scene_info.get('composition', '')
            if 'Wide' in composition:
                style_addition = "Wide cinematic framing"
            elif 'Portrait' in composition:
                style_addition = "Intimate portrait lighting"
            else:
                style_addition = "Professional documentary style"

            with_style = f"{with_camera}. {style_addition}."

            return f"""šŸš€ **Gen-4 Iterative Building:**

**Basic**: {basic}

**+ Motion**: {with_motion}

**+ Camera**: {with_camera}

**+ Style**: {with_style}"""
        else:
            return """šŸš€ **Gen-4 Iterative Building:**

**Basic**: The subject walks forward

**+ Camera**: The subject walks forward. Handheld camera follows

**+ Scene**: The subject walks forward. Handheld camera follows. Dust trails behind

**+ Style**: The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."""

    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"


def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from the Custom Builder widgets.

    Args:
        foundation: Free-text opening clause.
        subject_motion / scene_motion: Lists from CheckboxGroups (may be empty).
        camera_motion / style: Dropdown selections (strings).
        approach: "SARA" → [Subject] [Action] while [Reference], [Atmosphere];
            anything else → Gen-4 sentence-by-sentence joining.
    """
    if approach == "SARA":
        parts = []
        if foundation:
            parts.append(foundation)

        # Add motion elements
        motion_parts = []
        if subject_motion:
            motion_parts.extend(subject_motion)
        if scene_motion:
            motion_parts.extend(scene_motion)
        if motion_parts:
            parts.append(", ".join(motion_parts))

        # Reference (camera stability)
        if camera_motion:
            parts.append(f"while {camera_motion}")
        else:
            parts.append("while background remains steady")

        # Atmosphere
        if style:
            parts.append(style)

        return " ".join(parts)
    else:  # Gen-4 style: simple iterative building
        parts = []
        if foundation:
            parts.append(foundation)
        if subject_motion:
            parts.extend(subject_motion)
        if camera_motion:
            parts.append(camera_motion)
        if scene_motion:
            parts.extend(scene_motion)
        if style:
            parts.append(style)

        return ". ".join(parts) if parts else "The subject moves naturally"


def create_interface():
    """Create the Gradio Blocks interface (tabs, widgets, event wiring)."""
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
        # Header
        gr.Markdown("# šŸŽ¬ AI Video Prompt Generator - šŸ¤– SARA-Zephyr AI Powered")
        gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")

        # Cross-tab state
        scene_state = gr.State({})
        chat_history_state = gr.State([])

        with gr.Tabs():
            # Tab 1: Learning Guide
            with gr.Tab("šŸ“š Prompting Guide"):
                gr.Markdown(unified_instructions)

                with gr.Accordion("šŸŽÆ Advanced Tips", open=False):
                    gr.Markdown("""
## Advanced Prompting Strategies

### šŸŽØ Style Integration
- **Cinematography**: "Dutch angle," "Extreme close-up," "Bird's eye view"
- **Lighting**: "Golden hour," "Neon glow," "Harsh shadows," "Soft diffused light"
- **Movement Quality**: "Fluid motion," "Mechanical precision," "Organic flow"

### ⚔ Motion Types
- **Subject Motion**: Walking, running, dancing, gesturing
- **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
- **Environmental**: Wind, water flow, particle effects, lighting changes
""")

            # Tab 2: Image Analysis
            with gr.Tab("šŸ“· Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="Upload Image for Analysis",
                            type="pil"
                        )
                        analyze_btn = gr.Button("šŸ” Analyze Image", variant="primary")
                    with gr.Column(scale=2):
                        analysis_output = gr.Markdown(label="AI Analysis Results")

                with gr.Group():
                    gr.Markdown("### šŸ’” Sample Prompts")
                    sample_btn = gr.Button("šŸŽ² Generate Sample Prompts")
                    sample_prompts = [
                        gr.Textbox(
                            label=f"Sample {i+1}",
                            lines=2,
                            interactive=False,
                            show_copy_button=True
                        )
                        for i in range(3)
                    ]

            # Tab 3: AI Prompt Generator
            with gr.Tab("šŸ¤– AI Prompt Generator"):
                with gr.Row():
                    with gr.Column():
                        user_idea = gr.Textbox(
                            label="Your Video Idea (any language)",
                            placeholder="e.g., 'el personaje se quita la nariz' or 'character walks slowly'",
                            lines=3
                        )
                        optimize_btn = gr.Button("šŸš€ Generate Optimized Prompt", variant="primary")
                        optimized_prompt = gr.Textbox(
                            label="AI-Optimized Video Prompt",
                            lines=4,
                            interactive=True,
                            show_copy_button=True
                        )
                    with gr.Column():
                        gr.Markdown("### šŸ”„ Refine Your Prompt")
                        feedback_input = gr.Textbox(
                            label="Feedback/Changes",
                            placeholder="e.g., 'make it more dramatic' or 'add camera movement'",
                            lines=2
                        )
                        refine_btn = gr.Button("šŸ”„ Refine Prompt")

                with gr.Accordion("šŸ’¬ Refinement History", open=False):
                    chat_display = gr.Chatbot(height=300, type='messages')

            # Tab 4: Gen-4 Method
            with gr.Tab("šŸ“ Gen-4 Official"):
                gr.Markdown("*Official Gen-4 method: Simple → Complex building*")
                with gr.Row():
                    foundation_gen4 = gr.Textbox(
                        label="Foundation (Optional)",
                        placeholder="e.g., 'The subject walks forward'",
                        lines=1
                    )
                    generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                gen4_output = gr.Textbox(
                    label="Gen-4 Style Prompts",
                    lines=8,
                    interactive=False,
                    show_copy_button=True
                )

            # Tab 5: Custom Builder
            with gr.Tab("šŸ› ļø Custom Builder"):
                gr.Markdown("## Build Your Custom Prompt")
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally",
                                 "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects",
                                 "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left",
                                 "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic",
                                 "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                build_custom_btn = gr.Button("šŸ”Ø Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True,
                    show_copy_button=True
                )

        # Event handlers
        analyze_btn.click(
            fn=analyze_image_with_zephyr,
            inputs=[image_input],
            outputs=[analysis_output, scene_state]
        )

        sample_btn.click(
            fn=generate_sample_prompts_with_zephyr,
            inputs=[scene_state],
            outputs=sample_prompts
        )

        optimize_btn.click(
            fn=optimize_user_prompt_with_zephyr,
            inputs=[user_idea, scene_state],
            outputs=[optimized_prompt]
        )

        refine_btn.click(
            fn=refine_prompt_with_zephyr,
            inputs=[optimized_prompt, feedback_input, chat_history_state, scene_state],
            outputs=[optimized_prompt, chat_history_state]
        )

        # Mirror the refinement history (messages format) into the Chatbot display.
        chat_history_state.change(
            fn=lambda history: history,
            inputs=[chat_history_state],
            outputs=[chat_display]
        )

        generate_gen4_btn.click(
            fn=generate_gen4_prompts,
            inputs=[scene_state, foundation_gen4],
            outputs=[gen4_output]
        )

        build_custom_btn.click(
            fn=build_custom_prompt,
            inputs=[custom_foundation, subject_motion, scene_motion,
                    camera_motion, style_motion, approach_selector],
            outputs=[custom_output]
        )

    return demo


# Launch the app
if __name__ == "__main__":
    print("šŸŽ¬ Starting AI Video Prompt Generator with SARA-Zephyr...")
    print(f"šŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
    print("šŸ”§ Loading models (this may take a few minutes)...")

    try:
        demo = create_interface()
        print("āœ… Interface created successfully!")
        print("šŸš€ Launching application...")
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            debug=True,
            show_error=True
        )
    except Exception as e:
        print(f"āŒ Error launching app: {e}")
        print("šŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
        print("šŸ“¦ Required packages:")
        print("   pip install torch transformers gradio pillow accelerate bitsandbytes")

        # Alternative launch attempt on localhost without sharing
        print("\nšŸ”„ Attempting alternative launch...")
        try:
            demo = create_interface()
            demo.launch(
                share=False,
                server_name="127.0.0.1",
                server_port=7860,
                debug=False
            )
        except Exception as e2:
            print(f"āŒ Alternative launch failed: {e2}")
            print("\nšŸ’” Troubleshooting tips:")
            print("1. Ensure CPU resources are sufficient.")
            print("2. Check CPU usage: top or htop")
            print("3. Try reducing model precision: set torch_dtype=torch.float32")
            print("4. Monitor memory usage: free -h")