import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import random

# Use lighter BLIP model instead of heavy LLaVA
print("Loading BLIP model (lighter version)...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Universal Video Prompting Guide combining Gen-4 + SARA
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*

## Core Principles (Universal)
✅ **Focus on MOTION, not static description**
✅ **Use positive phrasing exclusively**
✅ **Start simple, iterate progressively**
✅ **Refer to subjects in general terms** ("the subject," "the woman")
✅ **Keep prompts direct and easily understood**

## Two Complementary Approaches

### 🚀 **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions

**Example**: 
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."

### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style

**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"

## Essential Vocabulary

### Effective Verbs (Action)
- **Movement**: walks, runs, moves, glides, flows, drifts
- **Rotation**: turns, spins, rotates, pivots, tilts
- **Transformation**: transforms, morphs, transitions, evolves
- **Expression**: speaks, gestures, looks, smiles, nods

### Effective Adverbs (Quality)
- **Speed**: slowly, quickly, gradually, suddenly, steadily
- **Style**: smoothly, naturally, elegantly, gracefully, dramatically
- **Intensity**: gently, softly, powerfully, intensely, subtly

### Camera Motion Terms
- **Basic**: locked camera, handheld, steady cam
- **Movement**: pan left/right, tilt up/down, zoom in/out, dolly forward/back
- **Advanced**: tracking shot, crane movement, orbital movement

### Style Descriptors
- **Aesthetic**: cinematic, live-action, smooth animation, stop motion
- **Mood**: dramatic, peaceful, energetic, mysterious, professional
- **Technical**: 4K, slow motion, time-lapse, documentary style

## Multi-Subject Guidelines
- **Positional**: "The subject on the left walks. The subject on the right remains still."
- **Descriptive**: "The woman nods. The man waves."
- **Sequential**: "The woman speaks then the man responds."

## Scene Motion Approaches
- **Insinuated**: "The subject runs across the dusty desert" (natural)
- **Explicit**: "The subject runs across the desert. Dust trails behind them" (emphasized)

## Proven Examples (from SARA Framework)
### Character Motion
- "The woman speaks enthusiastically to camera while camera remains still, online tutorial"
- "The subject transitions from walking to jumping while background stays constant"

### Camera Motion  
- "The subject remains centered as camera smoothly moves left with steady background"
- "Handheld camera tracks the subject as they walk forward naturally"

### Environmental
- "Camera stays fixed while day cycles into night over the temple, stone structures remain still"
- "The red cup slides smoothly to the right on white table, maintaining background constant"

### Complex Scenes
- "The pile of rocks transforms into a humanoid made of rugged volcanic rocks. The rock humanoid walks around"
- "The woman inspects her reflection in mirror. Surface bubbles with translucent bubbles. Locked camera"

## Technical Notes
- **Gen-4/Runway**: Prefer SARA structure for precision
- **Sora/OpenAI**: Works well with both approaches
- **Pika/Stable**: Gen-4 method often more effective
- **All models**: Start simple, iterate based on results
"""

# Prompt templates from both Gen-4 and SARA research
SARA_TEMPLATES = {
    "character_motion": [
        "{subject} speaks {adverb} to camera while camera remains still, {genre}",
        "{subject} {action} {adverb} while background stays constant, {style}",
        "{subject} transitions from {action1} to {action2} while frame remains fixed, {genre}"
    ],
    "camera_motion": [
        "{subject} remains centered as camera {movement} {adverb} with steady background",
        "{camera_type} camera {action} the {subject} as they {movement} {adverb}",
        "Camera {movement} {adverb} while {subject} maintains position, {style}"
    ],
    "environmental": [
        "Camera stays fixed while {environment} {transformation} over {subject}, {reference} remain still",
        "{subject} {action} while {environmental_effect} around them, {style}",
        "{environmental_element} {movement} {adverb} as {subject} {action}, maintaining {reference}"
    ],
    "transformations": [
        "{object} transforms into {new_form} made of {material}. The {new_subject} {action} around",
        "{subject} {action} in {location}. {environmental_reaction} {adverb}. {camera_style}",
        "The {subject} {action} while {environmental_change} occurs {adverb}, {atmosphere}"
    ]
}

GEN4_TEMPLATES = {
    "basic": [
        "The subject {action}",
        "The {subject} {movement} {direction}",
        "{subject} {expression} to camera"
    ],
    "with_camera": [
        "The subject {action}. {camera_movement}",
        "{subject} {movement} {direction}. Camera {camera_action}",
        "Handheld camera {camera_behavior} as {subject} {action}"
    ],
    "with_scene": [
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}",
        "{subject} {movement} across {environment}. {environmental_reaction}",
        "Camera {camera_movement} while {subject} {action}, {scene_description}"
    ],
    "complete": [
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}. {style}",
        "{subject} {movement} {adverb} across {environment}. {camera_type} camera {camera_action}. {style}",
        "Camera {camera_movement} as {subject} {action}, {environmental_reaction}, {atmosphere}"
    ]
}

# Vocabulary databases
VOCABULARY = {
    "subjects": ["the subject", "the woman", "the man", "the person", "the character"],
    "actions": ["walks", "runs", "moves", "glides", "flows", "turns", "speaks", "gestures"],
    "adverbs": ["smoothly", "slowly", "quickly", "naturally", "gracefully", "steadily", "gently"],
    "camera_movements": ["locked camera", "handheld", "dolly forward", "pan left", "pan right", "tracking shot"],
    "environments": ["dusty desert", "forest", "urban street", "open field", "indoor space"],
    "styles": ["cinematic", "documentary", "live-action", "dramatic", "peaceful", "energetic"]
}

def analyze_image_simple(image):
    """Enhanced image analysis using BLIP + AI reasoning"""
    if image is None:
        return "Please upload an image first.", "", {}
    
    try:
        # Convert to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        
        # Get basic image info
        width, height = image.size
        aspect_ratio = width / height
        
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"
        
        # Generate caption with BLIP
        inputs = processor(image, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
            model.cuda()
        
        out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)
        
        # Enhanced analysis using AI reasoning
        enhanced_analysis = analyze_scene_with_ai(basic_caption, aspect_ratio, composition)
        
        # Create comprehensive analysis text
        analysis = f"""📊 **Image Analysis:**
• **Dimensions**: {width} x {height}
• **Composition**: {composition}
• **Aspect Ratio**: {aspect_ratio:.2f}

🎨 **Basic Description**: 
"{basic_caption}"

🧠 **AI-Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}

💡 **Motion & Cinematography Insights**:
{chr(10).join(f"• {insight}" for insight in enhanced_analysis['motion_insights'])}

🎯 **Recommended Approach**: 
{enhanced_analysis['recommended_approach']}"""
        
        # Enhanced scene info for prompt generation
        scene_info = {
            'basic_description': basic_caption,
            'enhanced_description': enhanced_analysis['detailed_description'],
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'has_person': enhanced_analysis['has_person'],
            'emotional_tone': enhanced_analysis['emotional_tone'],
            'visual_style': enhanced_analysis['visual_style'],
            'setting': enhanced_analysis['setting'],
            'distinctive_elements': enhanced_analysis['distinctive_elements'],
            'motion_potential': enhanced_analysis['motion_potential'],
            'cinematic_qualities': enhanced_analysis['cinematic_qualities']
        }
        
        return analysis, basic_caption, scene_info
        
    except Exception as e:
        return f"Error analyzing image: {str(e)}", "", {}

def analyze_scene_with_ai(basic_caption, aspect_ratio, composition):
    """Use AI reasoning to enhance basic image analysis"""
    text = basic_caption.lower() if isinstance(basic_caption, str) else ""
    
    # Interpret the scene beyond basic description
    scene_elements = extract_scene_elements(text)
    
    # Determine emotional tone and mood
    emotional_tone = determine_emotional_tone(text, scene_elements)
    
    # Analyze visual style potential
    visual_style = determine_visual_style(text, scene_elements, composition)
    
    # Identify distinctive elements for video potential
    distinctive_elements = identify_distinctive_elements(text)
    
    # Assess motion potential
    motion_potential = assess_motion_potential(text, scene_elements)
    
    # Generate cinematic insights
    cinematic_qualities = analyze_cinematic_potential(text, composition, aspect_ratio)
    
    # Create enhanced interpretation
    enhanced_description = create_enhanced_description(basic_caption, scene_elements, emotional_tone)
    
    # Generate motion and cinematography insights
    motion_insights = generate_motion_insights(scene_elements, emotional_tone, visual_style, composition)
    
    # Recommend best approach
    recommended_approach = recommend_approach(scene_elements, emotional_tone, visual_style)
    
    return {
        'detailed_description': enhanced_description,
        'scene_interpretation': f"Scene shows {scene_elements['subject']} in {scene_elements['setting']} with {emotional_tone} mood. Key elements: {', '.join(distinctive_elements)}",
        'motion_insights': motion_insights,
        'recommended_approach': recommended_approach,
        'has_person': scene_elements['has_person'],
        'emotional_tone': emotional_tone,
        'visual_style': visual_style,
        'setting': scene_elements['setting'],
        'distinctive_elements': distinctive_elements,
        'motion_potential': motion_potential,
        'cinematic_qualities': cinematic_qualities
    }

def extract_scene_elements(text):
    """Extract and interpret scene elements intelligently"""
    elements = {
        'subject': 'subject',
        'setting': 'neutral',
        'clothing': None,
        'colors': [],
        'objects': [],
        'has_person': False
    }
    
    # Detect subjects with context
    if any(word in text for word in ['man', 'male', 'gentleman']):
        elements['subject'] = 'man'
        elements['has_person'] = True
        # Detect what the man is wearing/doing
        if 'costume' in text:
            elements['subject'] = 'man in costume'
            elements['clothing'] = 'costume'
        elif 'suit' in text:
            elements['subject'] = 'man in suit'
            elements['clothing'] = 'suit'
    elif any(word in text for word in ['woman', 'female', 'lady']):
        elements['subject'] = 'woman'
        elements['has_person'] = True
        if 'dress' in text:
            elements['subject'] = 'woman in dress'
            elements['clothing'] = 'dress'
    
    # Detect setting with intelligence
    if any(word in text for word in ['outdoor', 'outside', 'street', 'nature', 'park']):
        elements['setting'] = 'outdoor'
    elif any(word in text for word in ['indoor', 'inside', 'room', 'office', 'studio']):
        elements['setting'] = 'indoor'
    elif any(word in text for word in ['stage', 'performance']):
        elements['setting'] = 'performance'
    
    # Extract colors intelligently
    color_words = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'pink', 'purple', 'orange', 'gold', 'silver']
    elements['colors'] = [color for color in color_words if color in text]
    
    # Extract objects
    objects = ['hat', 'cape', 'flag', 'chair', 'table', 'background', 'wall']
    elements['objects'] = [obj for obj in objects if obj in text]
    
    return elements

def determine_emotional_tone(text, scene_elements):
    """Intelligently determine the emotional tone of the scene"""
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    
    text_lower = text.lower()
    
    # Look for emotional indicators
    if any(word in text_lower for word in ['serious', 'formal', 'stern', 'professional']):
        return 'serious'
    elif any(word in text_lower for word in ['happy', 'smiling', 'cheerful', 'joyful']):
        return 'cheerful'
    elif any(word in text_lower for word in ['dramatic', 'intense', 'powerful', 'bold']):
        return 'dramatic'
    elif any(word in text_lower for word in ['elegant', 'graceful', 'refined']):
        return 'elegant'
    elif 'costume' in text_lower or 'performance' in text_lower:
        return 'theatrical'
    else:
        # Infer from context
        if scene_elements['setting'] == 'performance':
            return 'theatrical'
        elif scene_elements['clothing'] in ['suit', 'formal']:
            return 'professional'
        else:
            return 'neutral'

def determine_visual_style(text, scene_elements, composition):
    """Determine the most suitable visual style"""
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    
    text_lower = text.lower()
    
    # Analyze scene for style cues
    if scene_elements['setting'] == 'performance' or 'costume' in text_lower:
        return 'theatrical'
    elif scene_elements['setting'] == 'indoor' and 'formal' in text_lower:
        return 'professional'
    elif composition in ['Wide landscape shot']:
        return 'cinematic'
    elif any(color in scene_elements['colors'] for color in ['red', 'gold', 'dramatic']):
        return 'dramatic'
    else:
        return 'cinematic'

def identify_distinctive_elements(text):
    """Identify unique elements that can enhance video prompts"""
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    
    elements = []
    text_lower = text.lower()
    
    # Clothing and costume elements
    if 'costume' in text_lower:
        elements.append('elaborate costume')
    if 'cape' in text_lower:
        elements.append('flowing cape')
    if 'hat' in text_lower:
        elements.append('distinctive hat')
    if 'flag' in text_lower:
        elements.append('flag detail')
    
    # Color elements
    colors = ['red', 'blue', 'green', 'gold']
    found_colors = [color for color in colors if color in text_lower]
    if found_colors:
        elements.append(f"{', '.join(found_colors)} coloring")
    
    # Setting elements
    if 'background' in text_lower:
        elements.append('detailed background')
    
    return elements if elements else ['natural elements']

def assess_motion_potential(text, scene_elements):
    """Assess what types of motion would work best"""
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    
    potential = []
    text_lower = text.lower()
    
    if scene_elements['has_person']:
        potential.extend(['facial expressions', 'hand gestures', 'body movement'])
        
        if scene_elements['clothing'] == 'costume':
            potential.append('costume dynamics')
        if scene_elements['clothing'] == 'cape':
            potential.append('cape flow')
        if scene_elements['clothing'] == 'dress':
            potential.append('fabric movement')
    
    if scene_elements['setting'] == 'outdoor':
        potential.extend(['environmental effects', 'natural lighting changes'])
    elif scene_elements['setting'] == 'indoor':
        potential.extend(['controlled lighting', 'subtle environment shifts'])
    
    return potential

def analyze_cinematic_potential(text, composition, aspect_ratio):
    """Analyze the cinematic qualities and potential"""
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    
    qualities = []
    text_lower = text.lower()
    
    # Composition analysis
    if composition == 'Wide landscape shot':
        qualities.extend(['horizontal camera movements', 'panoramic reveals', 'environmental context'])
    elif composition == 'Vertical portrait shot':
        qualities.extend(['character focus', 'intimate framing', 'vertical movement'])
    else:
        qualities.extend(['balanced framing', 'versatile movement', 'centered composition'])
    
    # Content analysis
    if 'costume' in text_lower or 'dramatic' in text_lower:
        qualities.append('dramatic lighting potential')
    
    if any(color in text_lower for color in ['red', 'gold', 'rich']):
        qualities.append('color enhancement opportunities')
    
    return qualities

def create_enhanced_description(basic_caption, scene_elements, emotional_tone):
    """Create a richer description using AI analysis"""
    subject = scene_elements['subject']
    setting = scene_elements['setting']
    clothing = scene_elements['clothing']
    
    enhanced = f"A {emotional_tone} scene featuring {subject}"
    
    if clothing:
        enhanced += f" wearing {clothing}"
    
    enhanced += f" in a {setting} setting"
    
    if scene_elements['colors']:
        enhanced += f" with prominent {', '.join(scene_elements['colors'])} elements"
    
    return enhanced

def generate_motion_insights(scene_elements, emotional_tone, visual_style, composition):
    """Generate intelligent motion and cinematography insights"""
    insights = []
    
    # Subject-based insights
    if scene_elements['has_person']:
        if emotional_tone == 'dramatic':
            insights.append('Emphasize powerful gestures and dynamic poses')
        elif emotional_tone == 'elegant':
            insights.append('Focus on graceful, refined movements')
        elif emotional_tone == 'theatrical':
            insights.append('Capture performance-style expressions and gestures')
        
        if scene_elements['clothing']:
            clothing = scene_elements['clothing']
            if clothing == 'costume':
                insights.append('Highlight costume details with movement')
            elif clothing == 'cape':
                insights.append('Showcase cape flow and dramatic movement')
            elif clothing == 'dress':
                insights.append('Capture fabric dynamics and elegant motion')
    
    # Composition-based insights
    if composition == 'Wide landscape shot':
        insights.append('Utilize horizontal camera movements and wide reveals')
    elif composition == 'Vertical portrait shot':
        insights.append('Focus on vertical movement and character detail')
    
    # Style-based insights
    if visual_style == 'cinematic':
        insights.append('Use cinematic camera techniques and dramatic lighting')
    elif visual_style == 'dramatic':
        insights.append('Emphasize bold movements and high contrast lighting')
    elif visual_style == 'professional':
        insights.append('Maintain clean, controlled camera work')
    
    # Color-based insights
    if scene_elements['colors']:
        insights.append(f"Enhance {', '.join(scene_elements['colors'])} tones through lighting")
    
    return insights[:6]  # Limit to 6 most relevant insights

def recommend_approach(scene_elements, emotional_tone, visual_style):
    """Intelligently recommend the best prompting approach"""
    
    # For complex scenes with people in costume/formal wear
    if scene_elements['has_person'] and scene_elements['clothing'] in ['costume', 'suit', 'dress']:
        return "SARA Framework recommended for precise character and costume control"
    
    # For dramatic or theatrical scenes
    elif emotional_tone in ['dramatic', 'theatrical']:
        return "SARA Framework ideal for complex dramatic scenes with multiple elements"
    
    # For simple, natural scenes
    elif emotional_tone in ['neutral', 'peaceful'] and visual_style != 'dramatic':
        return "Gen-4 method perfect for natural, iterative scene building"
    
    # For professional or formal contexts
    elif emotional_tone == 'professional' or visual_style == 'professional':
        return "Either approach works - SARA for precision, Gen-4 for simplicity"
    
    else:
        return "Start with Gen-4 for base prompt, then refine with SARA for complexity"

def generate_motion_suggestions(description, aspect_ratio):
    """Generate contextual motion suggestions"""
    text = description.lower()
    suggestions = []
    
    # Content-based suggestions
    if any(word in text for word in ['person', 'woman', 'man', 'people']):
        suggestions.extend([
            'Focus on character expressions and gestures',
            'Use "the subject" or "the woman/man" for clarity',
            'Consider handheld camera for natural movement'
        ])
    
    if any(word in text for word in ['sitting', 'standing']):
        suggestions.extend([
            'Start with simple movements: speaking, gesturing',
            'Locked or steady camera works well for portraits'
        ])
    
    if any(word in text for word in ['outdoor', 'landscape', 'nature']):
        suggestions.extend([
            'Camera movement can explore the environment',
            'Consider environmental motion: wind, clouds',
            'Cinematic style complements outdoor scenes'
        ])
    
    if any(word in text for word in ['indoor', 'room']):
        suggestions.extend([
            'Controlled movements work best indoors',
            'Focus on subject motion within the space'
        ])
    
    # Composition-based suggestions
    if aspect_ratio > 1.5:
        suggestions.append('Wide format perfect for horizontal camera movements')
    elif aspect_ratio < 0.8:
        suggestions.append('Portrait format ideal for character-focused content')
    
    return suggestions[:6] if suggestions else [
        'Start with simple motion: "The subject moves"',
        'Add camera movement: "Camera follows naturally"',
        'Include environment: "Background remains steady"'
    ]

def get_recommended_approach(description):
    """Recommend best approach based on image content"""
    text = description.lower()
    
    if any(word in text for word in ['person', 'woman', 'man']):
        return "SARA Framework recommended for character precision"
    elif any(word in text for word in ['landscape', 'building', 'nature']):
        return "Gen-4 method works well for environmental scenes"
    else:
        return "Try both approaches - start with Gen-4, refine with SARA"

def detect_setting(description):
    """Detect setting type from description"""
    text = description.lower()
    
    if any(word in text for word in ['outdoor', 'outside', 'street', 'nature']):
        return 'outdoor'
    elif any(word in text for word in ['indoor', 'inside', 'room', 'building']):
        return 'indoor'
    else:
        return 'neutral'

def extract_specific_details(description):
    """Extract specific details from the image description"""
    details = {
        'colors': [],
        'clothing': None,
        'distinctive_feature': None,
        'main_object': None,
        'setting_clues': []
    }
    
    text = description.lower()
    
    # Extract colors
    colors = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'pink', 'purple', 'orange']
    found_colors = [color for color in colors if color in text]
    if found_colors:
        details['colors'] = ', '.join(found_colors)
    
    # Extract clothing/costume details
    clothing_items = ['cape', 'hat', 'dress', 'suit', 'shirt', 'coat', 'jacket', 'uniform', 'costume', 'robe']
    for item in clothing_items:
        if item in text:
            # Get the full clothing description
            if 'red cape' in text:
                details['clothing'] = 'red cape'
                details['distinctive_feature'] = 'flowing red cape'
            elif 'hat' in text:
                if 'red hat' in text:
                    details['clothing'] = 'red hat'
                    details['distinctive_feature'] = 'red hat'
                else:
                    details['clothing'] = 'hat'
                    details['distinctive_feature'] = 'hat'
            else:
                details['clothing'] = item
                details['distinctive_feature'] = item
            break
    
    # Extract main subject
    if 'man' in text:
        details['main_object'] = 'man'
    elif 'woman' in text:
        details['main_object'] = 'woman'
    elif 'person' in text:
        details['main_object'] = 'person'
    elif 'people' in text:
        details['main_object'] = 'people'
    
    # Extract setting clues
    setting_indicators = ['outdoor', 'indoor', 'street', 'room', 'building', 'nature', 'park', 'office']
    details['setting_clues'] = [indicator for indicator in setting_indicators if indicator in text]
    
    return details

def get_contextual_subject(description, details):
    """Get appropriate subject reference based on image content"""
    text = description.lower()
    
    if 'man' in text:
        if details.get('clothing'):
            return f"The man in the {details['clothing']}"
        else:
            return "The man"
    elif 'woman' in text:
        if details.get('clothing'):
            return f"The woman in the {details['clothing']}"
        else:
            return "The woman"
    elif 'person' in text:
        return "The person"
    else:
        return "The subject"

def get_contextual_actions(description, details):
    """Get actions appropriate for the scene"""
    base_actions = ['speaks', 'gestures', 'moves', 'looks', 'turns']
    
    # Add context-specific actions
    if details.get('clothing'):
        if 'cape' in details['clothing']:
            base_actions.extend(['adjusts cape', 'moves dramatically', 'gestures with cape flowing'])
        if 'hat' in details['clothing']:
            base_actions.extend(['tips hat', 'adjusts hat', 'nods with hat'])
    
    # Add character-appropriate actions
    if 'man' in description.lower():
        base_actions.extend(['speaks confidently', 'gestures authoritatively'])
    
    return base_actions

def get_contextual_adverbs(details):
    """Get adverbs that fit the scene"""
    base_adverbs = ['naturally', 'smoothly', 'slowly', 'gracefully']
    
    if details.get('clothing'):
        if 'cape' in details['clothing']:
            base_adverbs.extend(['dramatically', 'majestically', 'with flair'])
        if 'hat' in details['clothing']:
            base_adverbs.extend(['elegantly', 'with style', 'confidently'])
    
    return base_adverbs

def get_contextual_camera_movement(description, details):
    """Get camera movements appropriate for the scene"""
    base_movements = ['Camera follows steadily', 'Locked camera captures', 'Handheld camera tracks']
    
    if details.get('distinctive_feature'):
        if 'cape' in details['distinctive_feature']:
            base_movements.extend(['Camera captures cape movement', 'Tracking shot follows cape flow'])
        if 'hat' in details['distinctive_feature']:
            base_movements.extend(['Camera frames from chest up', 'Close tracking of upper body'])
    
    return base_movements

def get_contextual_environment(description, details):
    """Get environmental effects that complement the scene"""
    if details.get('colors'):
        if 'red' in details['colors']:
            return "lighting enhances red tones"
    
    if details.get('clothing'):
        if 'cape' in details['clothing']:
            return "cape fabric reacts to air movement"
    
    return None

def get_contextual_style(details):
    """Get style that fits the scene context"""
    if details.get('clothing'):
        if 'cape' in details['clothing']:
            return "dramatic cinematic style"
        if 'hat' in details['clothing']:
            return "classic portrait style"
    
    return "professional documentary style"

def get_contextual_atmosphere(details):
    """Get atmosphere that matches the scene"""
    if details.get('colors'):
        if 'red' in details['colors']:
            return "dramatic atmosphere with rich red tones"
    
    if details.get('clothing'):
        if 'cape' in details['clothing']:
            return "heroic cinematic atmosphere"
        if 'hat' in details['clothing']:
            return "elegant portrait atmosphere"
    
    return "professional cinematic atmosphere"

def optimize_user_prompt(user_idea, scene_info=None):
    """Optimize and structure user's prompt idea into professional video prompt"""
    if not user_idea.strip():
        return "Please enter your idea first."
    
    try:
        # Analyze the user's input
        idea = user_idea.strip()
        
        # Detect language and content
        analysis = analyze_user_idea(idea)
        
        # Generate optimized prompt
        optimized = create_optimized_prompt(idea, analysis, scene_info)
        
        return optimized
        
    except Exception as e:
        return f"Error optimizing prompt: {str(e)}"

def analyze_user_idea(idea):
    """Analyze user's idea to understand intent and content"""
    idea_lower = idea.lower()
    
    analysis = {
        'language': detect_language(idea),
        'has_action': False,
        'has_object': False,
        'has_emotion': False,
        'has_camera': False,
        'complexity': 'simple',
        'main_elements': []
    }
    
    # Detect actions (multilingual)
    action_words = {
        'en': ['removes', 'takes off', 'puts on', 'walks', 'runs', 'speaks', 'gestures', 'moves', 'turns', 'looks'],
        'es': ['quita', 'se quita', 'pone', 'camina', 'corre', 'habla', 'gesticula', 'mueve', 'gira', 'mira'],
        'fr': ['enlève', 'met', 'marche', 'court', 'parle', 'gesticule', 'bouge'],
        'de': ['nimmt ab', 'zieht aus', 'geht', 'läuft', 'spricht', 'bewegt']
    }
    
    for lang, actions in action_words.items():
        if any(action in idea_lower for action in actions):
            analysis['has_action'] = True
            break
    
    # Detect objects/elements
    object_words = ['nose', 'nariz', 'hat', 'sombrero', 'costume', 'traje', 'cape', 'capa', 'mask', 'máscara']
    if any(obj in idea_lower for obj in object_words):
        analysis['has_object'] = True
    
    # Detect emotions/style
    emotion_words = ['dramatic', 'dramático', 'slow', 'lento', 'fast', 'rápido', 'gentle', 'suave', 'powerful', 'poderoso']
    if any(emotion in idea_lower for emotion in emotion_words):
        analysis['has_emotion'] = True
    
    # Detect camera references
    camera_words = ['camera', 'cámara', 'shot', 'toma', 'angle', 'ángulo', 'close', 'cerca', 'wide', 'amplio']
    if any(camera in idea_lower for camera in camera_words):
        analysis['has_camera'] = True
    
    # Determine complexity
    element_count = sum([analysis['has_action'], analysis['has_object'], analysis['has_emotion'], analysis['has_camera']])
    if element_count >= 3:
        analysis['complexity'] = 'complex'
    elif element_count >= 2:
        analysis['complexity'] = 'medium'
    
    return analysis

def detect_language(text):
    """Simple language detection"""
    spanish_indicators = ['el', 'la', 'se', 'que', 'con', 'por', 'para', 'del', 'de la', 'nariz', 'payaso']
    french_indicators = ['le', 'la', 'se', 'que', 'avec', 'pour', 'du', 'de la', 'nez', 'clown']
    german_indicators = ['der', 'die', 'das', 'sich', 'mit', 'für', 'vom', 'nase', 'clown']
    
    text_lower = text.lower()
    
    if any(indicator in text_lower for indicator in spanish_indicators):
        return 'spanish'
    elif any(indicator in text_lower for indicator in french_indicators):
        return 'french'
    elif any(indicator in text_lower for indicator in german_indicators):
        return 'german'
    else:
        return 'english'

def create_optimized_prompt(idea, analysis, scene_info=None):
    """Create optimized English video prompt from user idea"""
    
    # Translation dictionary for common elements
    translations = {
        'spanish': {
            'se quita': 'removes',
            'quita': 'removes', 
            'pone': 'puts on',
            'camina': 'walks',
            'habla': 'speaks',
            'mueve': 'moves',
            'nariz': 'nose',
            'payaso': 'clown',
            'personaje': 'character',
            'sombrero': 'hat',
            'capa': 'cape',
            'lentamente': 'slowly',
            'rápidamente': 'quickly',
            'dramáticamente': 'dramatically'
        },
        'french': {
            'enlève': 'removes',
            'met': 'puts on',
            'marche': 'walks',
            'parle': 'speaks',
            'bouge': 'moves',
            'nez': 'nose',
            'clown': 'clown',
            'personnage': 'character',
            'chapeau': 'hat',
            'cape': 'cape'
        }
    }
    
    # Start with basic translation
    optimized_idea = idea
    if analysis['language'] in translations:
        for original, translation in translations[analysis['language']].items():
            optimized_idea = optimized_idea.replace(original, translation)
    
    # Structure the prompt professionally
    structured_prompt = structure_video_prompt(optimized_idea, analysis, scene_info)
    
    return structured_prompt

def structure_video_prompt(idea, analysis, scene_info=None):
    """Structure the idea into a professional video prompt"""
    
    # Extract main elements
    idea_lower = idea.lower()
    
    # Identify subject
    if 'character' in idea_lower or 'personaje' in idea_lower:
        subject = "The character"
    elif 'person' in idea_lower or 'persona' in idea_lower:
        subject = "The person"
    elif scene_info and scene_info.get('has_person'):
        # Use context from scene analysis
        subject = extract_intelligent_subject_reference(scene_info)
    else:
        subject = "The subject"
    
    # Extract and optimize action
    action = extract_action_from_idea(idea)
    
    # Add appropriate style modifiers
    if analysis['complexity'] == 'simple':
        # Simple structure: Subject + Action + naturally
        optimized = f"{subject} {action} naturally"
        
        # Add camera suggestion
        optimized += ". Camera captures the motion smoothly"
        
    elif analysis['complexity'] == 'medium':
        # Medium structure: Add more detail
        optimized = f"{subject} {action} while camera follows steadily"
        
        # Add environmental/lighting
        if analysis['has_emotion']:
            optimized += ", dramatic lighting enhances the mood"
        else:
            optimized += ", professional lighting"
            
    else:
        # Complex structure: Full SARA framework
        optimized = f"{subject} {action} expressively while camera tracks the motion"
        optimized += ", lighting and environment support the action, cinematic atmosphere"
    
    # Add technical improvements
    optimized = improve_technical_language(optimized)
    
    return optimized

def extract_action_from_idea(idea):
    """Extract and refine the main action from user's idea"""
    idea_lower = idea.lower()
    
    # Map common actions to video-optimized versions
    action_mappings = {
        'removes': 'removes',
        'quita': 'removes',
        'se quita': 'removes',
        'takes off': 'removes',
        'puts on': 'puts on',
        'pone': 'puts on',
        'walks': 'walks',
        'camina': 'walks',
        'speaks': 'speaks',
        'habla': 'speaks',
        'moves': 'moves',
        'mueve': 'moves',
        'turns': 'turns',
        'gira': 'turns',
        'looks': 'looks',
        'mira': 'looks'
    }
    
    # Find the action and object
    action = "moves"  # default
    object_part = ""
    
    for original, mapped in action_mappings.items():
        if original in idea_lower:
            action = mapped
            # Try to extract what's being acted upon
            if original in ['removes', 'quita', 'se quita', 'takes off']:
                # Look for what's being removed
                if 'nose' in idea_lower or 'nariz' in idea_lower:
                    if 'clown' in idea_lower or 'payaso' in idea_lower:
                        object_part = "the clown nose"
                    else:
                        object_part = "the nose piece"
                elif 'hat' in idea_lower or 'sombrero' in idea_lower:
                    object_part = "the hat"
                elif 'mask' in idea_lower or 'máscara' in idea_lower:
                    object_part = "the mask"
            break
    
    # Combine action with object
    if object_part:
        return f"{action} {object_part}"
    else:
        return action

def improve_technical_language(prompt):
    """Improve the prompt with professional video terminology"""
    
    # Enhance basic terms
    improvements = {
        'moves naturally': 'moves with natural grace',
        'Camera captures': 'Camera captures',
        'smoothly': 'with smooth motion',
        'follows steadily': 'follows with steady tracking',
        'dramatic lighting': 'dramatic lighting transitions',
        'professional lighting': 'professional lighting setup',
        'cinematic atmosphere': 'rich cinematic atmosphere'
    }
    
    improved_prompt = prompt
    for basic, enhanced in improvements.items():
        improved_prompt = improved_prompt.replace(basic, enhanced)
    
    return improved_prompt

def refine_prompt_with_feedback(current_prompt, feedback, chat_history, scene_info=None):
    """Use AI to intelligently refine prompts based on user feedback"""
    
    if not feedback.strip():
        return current_prompt, chat_history
    
    # Analyze the feedback with AI understanding
    refinement_analysis = analyze_refinement_request(feedback, current_prompt, scene_info)
    
    # Generate intelligent refinement
    refined_prompt = apply_intelligent_refinement(current_prompt, refinement_analysis, scene_info)
    
    # Create explanatory response
    explanation = create_refinement_explanation(refinement_analysis, current_prompt, refined_prompt)
    
    # Update chat history with intelligent conversation
    new_chat_history = chat_history + [
        [feedback, f"🤖 {explanation}\n\n✨ **Refined Prompt**: {refined_prompt}"]
    ]
    
    return refined_prompt, new_chat_history

def analyze_refinement_request(feedback, current_prompt, scene_info):
    """Analyze what the user wants to change using AI understanding"""
    feedback_lower = feedback.lower()
    analysis = {
        'request_type': 'general',
        'intensity': 'moderate',
        'focus_area': 'action',
        'style_preference': None,
        'specific_elements': [],
        'language': detect_language(feedback)
    }
    
    # Detect request type with AI understanding
    if any(word in feedback_lower for word in ['dramatic', 'dramático', 'dramatique', 'dramatisch']):
        analysis['request_type'] = 'dramatic'
        analysis['intensity'] = 'high'
    elif any(word in feedback_lower for word in ['slow', 'slower', 'lento', 'más lento', 'lentement']):
        analysis['request_type'] = 'pace'
        analysis['intensity'] = 'slow'
    elif any(word in feedback_lower for word in ['fast', 'faster', 'rápido', 'más rápido', 'rapide']):
        analysis['request_type'] = 'pace'
        analysis['intensity'] = 'fast'
    elif any(word in feedback_lower for word in ['camera', 'cámara', 'caméra', 'kamera']):
        analysis['request_type'] = 'camera'
        analysis['focus_area'] = 'cinematography'
    elif any(word in feedback_lower for word in ['lighting', 'light', 'luz', 'lumière', 'licht']):
        analysis['request_type'] = 'lighting'
        analysis['focus_area'] = 'atmosphere'
    elif any(word in feedback_lower for word in ['simple', 'simpler', 'más simple', 'plus simple']):
        analysis['request_type'] = 'simplify'
        analysis['intensity'] = 'low'
    elif any(word in feedback_lower for word in ['complex', 'complicated', 'detalle', 'detail', 'détail']):
        analysis['request_type'] = 'elaborate'
        analysis['intensity'] = 'high'
    elif any(word in feedback_lower for word in ['elegant', 'elegante', 'élégant']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'elegant'
    elif any(word in feedback_lower for word in ['powerful', 'poderoso', 'puissant']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'powerful'
    elif any(word in feedback_lower for word in ['natural', 'natural', 'naturel']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'natural'
    
    # Detect specific elements mentioned
    elements = ['costume', 'dress', 'cape', 'hat', 'background', 'face', 'hands', 'movement']
    for element in elements:
        if element in feedback_lower:
            analysis['specific_elements'].append(element)
    
    return analysis

def apply_intelligent_refinement(current_prompt, analysis, scene_info):
    """Apply intelligent refinement based on analysis"""
    
    # Start with current prompt
    refined = current_prompt
    
    # Apply refinements based on request type
    if analysis['request_type'] == 'dramatic':
        refined = enhance_dramatic_elements(refined, analysis, scene_info)
    elif analysis['request_type'] == 'pace':
        refined = adjust_pace(refined, analysis)
    elif analysis['request_type'] == 'camera':
        refined = enhance_camera_work(refined, analysis, scene_info)
    elif analysis['request_type'] == 'lighting':
        refined = enhance_lighting(refined, analysis, scene_info)
    elif analysis['request_type'] == 'simplify':
        refined = simplify_prompt(refined)
    elif analysis['request_type'] == 'elaborate':
        refined = elaborate_prompt(refined, scene_info)
    elif analysis['request_type'] == 'style':
        refined = apply_style_preference(refined, analysis, scene_info)
    else:
        # General enhancement
        refined = apply_general_enhancement(refined, analysis, scene_info)
    
    return refined

def enhance_dramatic_elements(prompt, analysis, scene_info):
    """Enhance dramatic elements intelligently"""
    # Replace gentle actions with dramatic ones
    dramatic_replacements = {
        'naturally': 'dramatically with intensity',
        'smoothly': 'with powerful emphasis',
        'gently': 'boldly',
        'moves': 'commands attention',
        'speaks': 'declares passionately',
        'gestures': 'gestures with commanding presence',
        'professional lighting': 'dramatic lighting with stark contrasts',
        'cinematic lighting': 'theatrical lighting with deep shadows'
    }
    
    enhanced = prompt
    for original, dramatic in dramatic_replacements.items():
        enhanced = enhanced.replace(original, dramatic)
    
    # Add dramatic elements based on scene context
    if scene_info and scene_info.get('distinctive_elements'):
        elements = scene_info['distinctive_elements']
        if 'costume' in str(elements):
            enhanced += ". Costume elements amplify the dramatic presence"
        if 'cape' in str(elements):
            enhanced += ". Cape billows dramatically with movement"
    
    # Enhance camera work for drama
    if 'Camera captures' in enhanced:
        enhanced = enhanced.replace('Camera captures', 'Dynamic camera captures')
    
    return enhanced

def adjust_pace(prompt, analysis):
    """Adjust the pace of action"""
    if analysis['intensity'] == 'slow':
        pace_replacements = {
            'naturally': 'slowly and deliberately',
            'smoothly': 'in measured slow motion',
            'moves': 'moves with deliberate slowness',
            'speaks': 'speaks thoughtfully',
            'gestures': 'gestures with careful precision'
        }
    else:  # fast
        pace_replacements = {
            'naturally': 'with energetic quickness',
            'slowly': 'rapidly',
            'smoothly': 'with swift fluidity',
            'deliberate': 'rapid',
            'measured': 'quick'
        }
    
    adjusted = prompt
    for original, paced in pace_replacements.items():
        adjusted = adjusted.replace(original, paced)
    
    return adjusted

def enhance_camera_work(prompt, analysis, scene_info):
    """Enhance camera work based on scene context"""
    # Analyze current camera work
    enhanced = prompt
    
    # Upgrade basic camera work
    camera_enhancements = {
        'Camera captures': 'Dynamic camera work captures',
        'camera follows': 'cinematic camera tracks',
        'handheld camera': 'fluid handheld camera movement',
        'steady camera': 'precision camera operation',
        'locked camera': 'artistically locked camera'
    }
    
    for basic, enhanced_version in camera_enhancements.items():
        enhanced = enhanced.replace(basic, enhanced_version)
    
    # Add specific camera techniques based on scene
    if scene_info:
        composition = scene_info.get('composition', '')
        if 'Wide' in composition:
            enhanced += ". Wide tracking shots reveal environmental context"
        elif 'Portrait' in composition:
            enhanced += ". Intimate camera framing emphasizes character details"
    
    # If no camera work exists, add it
    if 'camera' not in enhanced.lower():
        enhanced += ". Sophisticated camera movement enhances the narrative"
    
    return enhanced

def enhance_lighting(prompt, analysis, scene_info):
    """Enhance lighting based on scene context"""
    enhanced = prompt
    
    # Upgrade lighting descriptions
    lighting_enhancements = {
        'professional lighting': 'artistic lighting design',
        'cinematic lighting': 'masterful cinematic lighting',
        'dramatic lighting': 'sculptural dramatic lighting',
        'natural lighting': 'beautiful natural light'
    }
    
    for basic, enhanced_version in lighting_enhancements.items():
        enhanced = enhanced.replace(basic, enhanced_version)
    
    # Add lighting based on emotional tone
    if scene_info:
        emotional_tone = scene_info.get('emotional_tone', 'neutral')
        if emotional_tone == 'dramatic':
            enhanced += ". High-contrast lighting creates powerful shadows"
        elif emotional_tone == 'elegant':
            enhanced += ". Soft, sophisticated lighting enhances refinement"
        elif emotional_tone == 'theatrical':
            enhanced += ". Stage-quality lighting emphasizes performance"
    
    # If no lighting exists, add it
    if 'lighting' not in enhanced.lower() and 'light' not in enhanced.lower():
        enhanced += ". Expressive lighting design supports the mood"
    
    return enhanced

def simplify_prompt(prompt):
    """Simplify prompt to essential elements"""
    # Split into main components
    parts = prompt.split('.')
    
    # Keep the main action and one enhancement
    if len(parts) > 1:
        simplified = parts[0] + '.'
        # Add one simple enhancement
        if 'camera' in prompt.lower():
            simplified += " Camera follows naturally."
        elif 'lighting' in prompt.lower():
            simplified += " Natural lighting."
    else:
        simplified = prompt
    
    return simplified

def elaborate_prompt(prompt, scene_info):
    """Add sophisticated details to the prompt"""
    elaborated = prompt
    
    # Add environmental details
    if scene_info:
        setting = scene_info.get('setting', 'neutral')
        distinctive_elements = scene_info.get('distinctive_elements', [])
        
        if setting == 'outdoor':
            elaborated += ". Environmental elements respond subtly to the action"
        elif setting == 'indoor':
            elaborated += ". Interior atmosphere enhances intimate connection"
        elif setting == 'performance':
            elaborated += ". Stage environment supports theatrical presence"
        
        # Add details about distinctive elements
        if distinctive_elements:
            element = distinctive_elements[0] if distinctive_elements else ''
            if 'costume' in element:
                elaborated += ". Costume textures and details visible in motion"
            elif 'color' in element:
                elaborated += ". Color palette enhanced through dynamic lighting"
    
    # Add technical sophistication
    elaborated += ". Multi-layered composition with depth and visual interest"
    
    return elaborated

def apply_style_preference(prompt, analysis, scene_info):
    """Apply specific style preferences"""
    styled = prompt
    preference = analysis['style_preference']
    
    if preference == 'elegant':
        style_replacements = {
            'dramatically': 'with refined elegance',
            'boldly': 'gracefully',
            'powerfully': 'with sophisticated poise',
            'dramatic lighting': 'elegant lighting transitions',
            'intensive': 'refined'
        }
    elif preference == 'powerful':
        style_replacements = {
            'gently': 'with commanding force',
            'naturally': 'with authoritative presence',
            'smoothly': 'with decisive power',
            'professional lighting': 'bold, impactful lighting'
        }
    elif preference == 'natural':
        style_replacements = {
            'dramatically': 'naturally',
            'theatrical': 'authentic',
            'commanding': 'genuine',
            'dramatic lighting': 'natural lighting'
        }
    
    if preference in ['elegant', 'powerful', 'natural']:
        for original, styled_version in style_replacements.items():
            styled = styled.replace(original, styled_version)
    
    return styled

def apply_general_enhancement(prompt, analysis, scene_info):
    """Apply general enhancements based on context"""
    enhanced = prompt
    
    # Add sophistication to basic elements
    if 'moves' in enhanced and 'gracefully' not in enhanced:
        enhanced = enhanced.replace('moves', 'moves with purposeful grace')
    
    if 'speaks' in enhanced and 'expressively' not in enhanced:
        enhanced = enhanced.replace('speaks', 'speaks with genuine expression')
    
    # Enhance based on scene context
    if scene_info:
        emotional_tone = scene_info.get('emotional_tone', 'neutral')
        if emotional_tone != 'neutral' and emotional_tone not in enhanced:
            enhanced += f". {emotional_tone.capitalize()} energy throughout"
    
    return enhanced

def create_refinement_explanation(analysis, original, refined):
    """Create an explanation of what was changed"""
    explanations = {
        'dramatic': "I've enhanced the dramatic intensity by upgrading the actions and adding powerful lighting elements.",
        'pace': f"I've adjusted the pacing to be more {'slow and deliberate' if analysis['intensity'] == 'slow' else 'energetic and quick'}.",
        'camera': "I've enhanced the camera work with more sophisticated cinematography techniques.",
        'lighting': "I've upgraded the lighting description to create more visual impact.",
        'simplify': "I've simplified the prompt to focus on the essential action.",
        'elaborate': "I've added more sophisticated details and environmental context.",
        'style': f"I've adjusted the style to be more {analysis['style_preference']}."
    }
    
    base_explanation = explanations.get(analysis['request_type'], "I've enhanced the prompt based on your feedback.")
    
    # Add language-specific response
    if analysis['language'] != 'english':
        language_notes = {
            'spanish': "Entiendo tu sugerencia y ",
            'french': "Je comprends votre suggestion et ",
            'german': "Ich verstehe Ihren Vorschlag und "
        }
        prefix = language_notes.get(analysis['language'], "")
        base_explanation = prefix + base_explanation.lower()
    
    return base_explanation

def generate_gen4_prompts_local(scene_info, user_input=""):
    """Generate Gen-4 prompts using iterative building"""
    try:
        description = scene_info.get('description', '')
        has_person = scene_info.get('has_person', False)
        setting = scene_info.get('setting', 'neutral')
        
        # Extract specific details for contextual prompts
        specific_details = extract_specific_details(description)
        subject_ref = get_contextual_subject(description, specific_details)
        
        prompts = []
        
        # Basic - specific to what's in the image
        if has_person:
            actions = get_contextual_actions(description, specific_details)
            basic = f"{subject_ref} {random.choice(actions)} to camera"
        else:
            basic = f"The {specific_details.get('main_object', 'main element')} {random.choice(['moves', 'shifts', 'transforms'])}"
        prompts.append(f"**Basic**: {basic}")
        
        # + Subject Motion - add natural movement based on what's visible
        motion_adverbs = get_contextual_adverbs(specific_details)
        motion_addition = random.choice(motion_adverbs)
        with_subject = f"{basic} {motion_addition}"
        prompts.append(f"**+ Subject Motion**: {with_subject}")
        
        # + Camera Motion - appropriate for the scene
        camera_movements = get_contextual_camera_movement(description, specific_details)
        camera_addition = random.choice(camera_movements)
        with_camera = f"{with_subject}. {camera_addition}"
        prompts.append(f"**+ Camera Motion**: {with_camera}")
        
        # + Scene/Style - enhance the specific elements
        if specific_details.get('colors'):
            style_addition = f"{specific_details['colors']} tones enhanced by lighting. {get_contextual_atmosphere(specific_details)}"
        elif setting == 'outdoor':
            style_addition = "Natural lighting enhances the scene. Cinematic"
        else:
            style_addition = f"Professional lighting highlights {specific_details.get('distinctive_feature', 'the subject')}. Documentary style"
        complete = f"{with_camera}. {style_addition}"
        prompts.append(f"**+ Scene/Style**: {complete}")
        
        return "\n\n".join(prompts)
        
    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"

def build_custom_prompt_local(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Build custom prompt using selected approach"""
    if approach == "SARA":
        # SARA Structure: [Subject] [Action] while [Reference], [Atmosphere]
        parts = []
        
        if foundation:
            parts.append(foundation)
        
        # Add motion elements
        motion_parts = []
        if subject_motion:
            motion_parts.extend(subject_motion)
        if scene_motion:
            motion_parts.extend(scene_motion)
        
        if motion_parts:
            parts.append(", ".join(motion_parts))
        
        # Reference (camera stability)
        if camera_motion:
            parts.append(f"while {camera_motion}")
        else:
            parts.append("while background remains steady")
        
        # Atmosphere
        if style:
            parts.append(style)
        
        return " ".join(parts)
    
    else:  # Gen-4 style
        # Gen-4 Structure: Simple iterative building
        parts = []
        
        if foundation:
            parts.append(foundation)
        
        if subject_motion:
            parts.extend(subject_motion)
        
        if camera_motion:
            parts.append(camera_motion)
        
        if scene_motion:
            parts.extend(scene_motion)
        
        if style:
            parts.append(style)
        
        return ". ".join(parts) if parts else "The subject moves naturally"

def get_smart_suggestions_local(scene_info):
    """Generate intelligent suggestions using AI-enhanced analysis"""
    enhanced_description = scene_info.get('enhanced_description', '')
    emotional_tone = scene_info.get('emotional_tone', 'neutral')
    visual_style = scene_info.get('visual_style', 'cinematic')
    distinctive_elements = scene_info.get('distinctive_elements', [])
    motion_potential = scene_info.get('motion_potential', [])
    setting = scene_info.get('setting', 'neutral')
    
    if not enhanced_description:
        return "Please analyze an image first to generate smart suggestions."
    
    suggestions = []
    
    # AI-enhanced scene understanding
    subject_ref = extract_intelligent_subject_reference(scene_info)
    suggestions.append(f'🤖 **AI Analysis**: {enhanced_description}')
    suggestions.append(f'🎯 **Smart Reference**: Use "{subject_ref}" for optimal clarity')
    
    # Tone-based action suggestions
    actions = generate_tone_appropriate_actions(emotional_tone, scene_info)[:3]
    suggestions.append(f'🎭 **Tone-Matched Actions**: {", ".join(actions)}')
    
    # Motion potential insights
    if motion_potential:
        top_potential = motion_potential[:3]
        suggestions.append(f'🎬 **Motion Opportunities**: {", ".join(top_potential)}')
    
    # Distinctive element highlights
    if distinctive_elements:
        top_elements = distinctive_elements[:2]
        suggestions.append(f'✨ **Key Elements to Highlight**: {", ".join(top_elements)}')
    
    # Visual style recommendations
    style_cameras = generate_style_appropriate_cameras(visual_style, scene_info.get('cinematic_qualities', []))[:2]
    suggestions.append(f'🎥 **Style-Appropriate Cameras**: {", ".join(style_cameras)}')
    
    # Emotional tone guidance
    appropriate_adverbs = [get_tone_appropriate_adverb(emotional_tone) for _ in range(3)]
    suggestions.append(f'💫 **Emotional Adverbs**: {", ".join(appropriate_adverbs)}')
    
    # Setting-specific insights
    if setting == 'performance':
        suggestions.append('🎪 **Performance Context**: Focus on stage presence and audience engagement')
    elif setting == 'outdoor':
        suggestions.append('🌿 **Outdoor Setting**: Leverage natural lighting and environmental elements')
    elif setting == 'indoor':
        suggestions.append('🏠 **Indoor Context**: Utilize controlled lighting and intimate framing')
    
    # Cinematic quality suggestions
    cinematic_qualities = scene_info.get('cinematic_qualities', [])
    if cinematic_qualities:
        top_qualities = cinematic_qualities[:2]
        suggestions.append(f'🎬 **Cinematic Opportunities**: {", ".join(top_qualities)}')
    
    # Atmosphere recommendation
    atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
    suggestions.append(f'🌟 **Recommended Atmosphere**: {atmosphere}')
    
    return "\n".join(suggestions[:10])

def generate_instant_prompts(scene_info):
    """Generate sophisticated ready-to-use prompts based on AI-enhanced analysis"""
    enhanced_description = scene_info.get('enhanced_description', '')
    emotional_tone = scene_info.get('emotional_tone', 'neutral')
    visual_style = scene_info.get('visual_style', 'cinematic')
    distinctive_elements = scene_info.get('distinctive_elements', [])
    cinematic_qualities = scene_info.get('cinematic_qualities', [])
    motion_potential = scene_info.get('motion_potential', [])
    
    if not enhanced_description:
        return "Please analyze an image first to generate instant prompts."
    
    # Extract intelligent subject reference
    subject_ref = extract_intelligent_subject_reference(scene_info)
    
    # Generate tone-appropriate actions
    actions = generate_tone_appropriate_actions(emotional_tone, scene_info)
    
    # Generate style-appropriate camera work
    camera_movements = generate_style_appropriate_cameras(visual_style, cinematic_qualities)
    
    # Generate sophisticated prompts
    instant_prompts = []
    
    # === AI-POWERED SIMPLE PROMPTS ===
    instant_prompts.append("🤖 **AI-Powered Simple Prompts:**")
    for i in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        instant_prompts.append(f"   • {subject_ref} {action} {adverb}")
    
    # === CONTEXT-AWARE SARA PROMPTS ===
    instant_prompts.append("\n🧠 **Context-Aware SARA Prompts:**")
    for i in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        camera = random.choice(camera_movements)
        atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
        
        # Include distinctive elements
        if distinctive_elements and random.choice([True, False]):
            distinctive = random.choice(distinctive_elements)
            instant_prompts.append(f"   • {subject_ref} {action} {adverb} while {camera}, {distinctive} enhanced, {atmosphere}")
        else:
            instant_prompts.append(f"   • {subject_ref} {action} {adverb} while {camera}, {atmosphere}")
    
    # === INTELLIGENCE-ENHANCED GEN-4 ===
    instant_prompts.append("\n🎬 **Intelligence-Enhanced Gen-4:**")
    for i in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        camera = random.choice(camera_movements)
        
        # Build Gen-4 iteratively with intelligence
        basic = f"{subject_ref} {action}"
        with_motion = f"{basic} {adverb}"
        with_camera = f"{with_motion}. {camera}"
        
        # Add intelligent style enhancement
        if distinctive_elements:
            distinctive = random.choice(distinctive_elements)
            style_addition = f"{distinctive} highlighted by {get_lighting_for_style(visual_style)}"
        else:
            style_addition = f"{get_lighting_for_style(visual_style)} enhances {emotional_tone} mood"
        
        complete = f"{with_camera}. {style_addition}"
        instant_prompts.append(f"   • {complete}")
    
    # === SPECIALIZED INTELLIGENT PROMPTS ===
    instant_prompts.append("\n✨ **Specialized AI Prompts:**")
    
    # Motion-potential based prompts
    if 'costume dynamics' in motion_potential:
        instant_prompts.append(f"   🎭 **Costume Dynamics**: {subject_ref} {random.choice(actions)} while camera captures fabric textures, costume elements react to movement, theatrical lighting")
    
    if 'facial expressions' in motion_potential:
        instant_prompts.append(f"   😊 **Expression Focus**: {subject_ref} {random.choice(['expresses emotion', 'speaks meaningfully', 'reacts naturally'])} while camera maintains intimate framing, {emotional_tone} energy emphasized")
    
    # Cinematic quality based prompts
    if 'dramatic lighting potential' in cinematic_qualities:
        instant_prompts.append(f"   💡 **Dramatic Lighting**: {subject_ref} {random.choice(actions)} as lighting creates dramatic shadows, visual contrast enhances {emotional_tone} mood, cinematic depth")
    
    if 'color enhancement opportunities' in cinematic_qualities:
        colors = [elem for elem in distinctive_elements if 'coloring' in elem]
        if colors:
            instant_prompts.append(f"   🎨 **Color Enhanced**: {subject_ref} {random.choice(actions)} while lighting dramatically enhances {colors[0]}, color grading emphasizes mood, {visual_style} aesthetic")
    
    # Environmental integration
    setting = scene_info.get('setting', 'neutral')
    if setting == 'performance':
        instant_prompts.append(f"   🎪 **Performance Mode**: {subject_ref} {random.choice(['performs', 'presents', 'commands attention'])} while audience perspective maintained, {emotional_tone} stage presence, professional capture")
    elif setting == 'outdoor':
        instant_prompts.append(f"   🌿 **Environmental Harmony**: {subject_ref} {random.choice(actions)} as natural elements complement motion, environmental lighting, organic {visual_style} feel")
    
    # === ADVANCED COMPOSITE PROMPTS ===
    instant_prompts.append("\n🚀 **Advanced AI Composite:**")
    
    # Ultra-sophisticated prompt
    advanced_action = random.choice(actions)
    advanced_adverb = get_tone_appropriate_adverb(emotional_tone)
    advanced_camera = random.choice(camera_movements)
    advanced_atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
    
    if distinctive_elements:
        advanced_distinctive = random.choice(distinctive_elements)
        advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} as {advanced_camera} captures nuanced details, {advanced_distinctive} dynamically enhanced, lighting and color grading amplify {emotional_tone} undertones, {advanced_atmosphere} with {visual_style} cinematography"
    else:
        advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} while {advanced_camera} follows natural rhythm, environmental elements support the motion, {advanced_atmosphere} with intelligent {visual_style} direction"
    
    instant_prompts.append(f"   • {advanced_prompt}")
    
    return "\n".join(instant_prompts)

def extract_intelligent_subject_reference(scene_info):
    """Extract intelligent subject reference using AI analysis"""
    enhanced_desc = scene_info.get('enhanced_description', '')
    basic_desc = scene_info.get('basic_description', '')
    
    # Check if we have a person
    has_person = scene_info.get('has_person', False)
    
    if not has_person:
        return "The subject"
    
    # Use enhanced description for smarter reference
    if isinstance(enhanced_desc, str):
        enhanced_lower = enhanced_desc.lower()
        if 'man in costume' in enhanced_lower:
            return "The man in costume"
        elif 'woman in dress' in enhanced_lower:
            return "The woman in dress"
        elif 'man in suit' in enhanced_lower:
            return "The man in suit"
    
    # Fallback to basic description
    if isinstance(basic_desc, str):
        basic_lower = basic_desc.lower()
        if 'man' in basic_lower:
            return "The man"
        elif 'woman' in basic_lower:
            return "The woman"
        elif 'person' in basic_lower:
            return "The person"
    
    return "The subject"

def generate_tone_appropriate_actions(emotional_tone, scene_info):
    """Generate actions that match the emotional tone"""
    base_actions = {
        'dramatic': ['moves powerfully', 'gestures boldly', 'commands attention', 'strikes a pose', 'displays intensity'],
        'elegant': ['moves gracefully', 'gestures refined', 'poses elegantly', 'demonstrates poise', 'flows naturally'],
        'theatrical': ['performs dramatically', 'presents theatrically', 'expresses character', 'embodies role', 'captivates audience'],
        'serious': ['maintains composure', 'speaks authoritatively', 'gestures formally', 'projects confidence', 'demonstrates focus'],
        'cheerful': ['expresses joy', 'gestures enthusiastically', 'radiates energy', 'shows warmth', 'displays positivity'],
        'professional': ['presents professionally', 'maintains bearing', 'demonstrates expertise', 'projects authority', 'engages formally'],
        'neutral': ['moves naturally', 'gestures appropriately', 'maintains presence', 'expresses subtly', 'demonstrates character']
    }
    
    # Add context-specific actions based on scene elements
    actions = base_actions.get(emotional_tone, base_actions['neutral']).copy()
    
    # Add clothing-specific actions
    if scene_info.get('distinctive_elements'):
        for element in scene_info['distinctive_elements']:
            if 'costume' in element:
                actions.extend(['adjusts costume', 'displays costume details'])
            elif 'cape' in element:
                actions.extend(['gestures with cape', 'moves dramatically with cape'])
            elif 'flag' in element:
                actions.extend(['acknowledges flag', 'presents with flag'])
    
    return actions

def generate_style_appropriate_cameras(visual_style, cinematic_qualities):
    """Generate camera movements appropriate for the visual style"""
    base_cameras = {
        'cinematic': ['camera glides smoothly', 'tracking shot follows', 'camera orbits elegantly', 'dolly movement captures', 'crane shot reveals'],
        'dramatic': ['camera emphasizes motion', 'dynamic camera movement', 'camera captures intensity', 'bold camera work follows', 'dramatic camera angles'],
        'theatrical': ['camera frames performance', 'audience perspective maintained', 'camera captures stage presence', 'performance-focused framing', 'theatrical camera work'],
        'professional': ['steady camera captures', 'professional camera movement', 'controlled camera work', 'camera maintains stability', 'precise camera tracking'],
        'documentary': ['handheld camera follows', 'natural camera movement', 'camera observes genuinely', 'documentary-style capture', 'authentic camera work']
    }
    
    cameras = base_cameras.get(visual_style, base_cameras['cinematic']).copy()
    
    # Add cameras based on cinematic qualities
    if 'horizontal camera movements' in cinematic_qualities:
        cameras.extend(['camera pans horizontally', 'lateral camera movement'])
    if 'vertical movement' in cinematic_qualities:
        cameras.extend(['camera tilts vertically', 'vertical camera motion'])
    if 'environmental context' in cinematic_qualities:
        cameras.extend(['camera reveals environment', 'wide establishing shots'])
    
    return cameras

def get_tone_appropriate_adverb(emotional_tone):
    """Get adverbs that match the emotional tone"""
    adverbs = {
        'dramatic': ['powerfully', 'intensely', 'dramatically', 'boldly', 'majestically'],
        'elegant': ['gracefully', 'refinedly', 'elegantly', 'smoothly', 'sophisticatedly'],
        'theatrical': ['dramatically', 'expressively', 'theatrically', 'charismatically', 'captivating'],
        'serious': ['authoritatively', 'professionally', 'formally', 'confidently', 'purposefully'],
        'cheerful': ['enthusiastically', 'energetically', 'warmly', 'positively', 'vibrantly'],
        'professional': ['professionally', 'precisely', 'competently', 'expertly', 'authoritatively'],
        'neutral': ['naturally', 'smoothly', 'appropriately', 'genuinely', 'authentically']
    }
    
    return random.choice(adverbs.get(emotional_tone, adverbs['neutral']))

def get_style_appropriate_atmosphere(visual_style, emotional_tone):
    """Get atmosphere that combines style and tone"""
    style_atmospheres = {
        'cinematic': f'cinematic {emotional_tone} atmosphere',
        'dramatic': f'dramatic {emotional_tone} mood',
        'theatrical': f'theatrical {emotional_tone} presence',
        'professional': f'professional {emotional_tone} environment',
        'documentary': f'authentic {emotional_tone} feeling'
    }
    
    return style_atmospheres.get(visual_style, f'{visual_style} {emotional_tone} atmosphere')

def get_lighting_for_style(visual_style):
    """Get appropriate lighting description for visual style"""
    lighting = {
        'cinematic': 'cinematic lighting',
        'dramatic': 'dramatic lighting',
        'theatrical': 'stage lighting',
        'professional': 'professional lighting',
        'documentary': 'natural lighting'
    }
    
    return lighting.get(visual_style, 'cinematic lighting')
    
    # Gen-4 style prompts
    for i in range(3):
        action = random.choice(contextual_actions)
        adverb = random.choice(contextual_adverbs)
        camera = random.choice(camera_moves)
        
        # Build Gen-4 iteratively
        basic = f"{subject_ref} {action}"
        with_motion = f"{basic} {adverb}"
        with_camera = f"{with_motion}. {camera}"
        
        # Add style based on specific details
        if specific_details.get('colors'):
            style_addition = f"{specific_details['colors']} tones enhanced by lighting"
        else:
            style_addition = "Cinematic lighting"
        complete = f"{with_camera}. {style_addition}"
        
        instant_prompts.append(f"📝 **Gen-4**: {complete}")
    
    # Specialized prompts based on distinctive features
    if specific_details.get('clothing'):
        clothing = specific_details['clothing']
        if 'cape' in clothing:
            instant_prompts.append(f"🦸 **Cape Focus**: {subject_ref} moves dramatically while camera captures cape movement, wind effects enhance cape flow, heroic atmosphere")
        if 'dress' in clothing:
            instant_prompts.append(f"👗 **Dress Focus**: {subject_ref} moves gracefully while camera tracks smoothly, fabric reacts to movement, elegant atmosphere")
        if 'hat' in clothing:
            instant_prompts.append(f"🎩 **Hat Focus**: {subject_ref} tips hat confidently while camera frames from chest up, professional lighting")
    
    # Color-focused prompts
    if specific_details.get('colors'):
        colors = specific_details['colors']
        instant_prompts.append(f"🎨 **Color Enhanced**: {subject_ref} {random.choice(contextual_actions)} while lighting dramatically enhances {colors} tones, cinematic depth")
    
    return "\n\n".join(instant_prompts)

def copy_to_foundation(prompt_text, approach):
    """Extract the main prompt from formatted text for foundation field"""
    # Remove the emoji and label prefix to get clean prompt
    if "**" in prompt_text:
        # Extract text after the **: 
        parts = prompt_text.split("**: ", 1)
        if len(parts) > 1:
            return parts[1]
    return prompt_text

# Create optimized Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Video Prompting Tool") as demo:
    gr.Markdown("# 🎬 Universal Video Prompting Tool")
    gr.Markdown("*Compatible with Gen-4, Sora, Pika, Luma, Runway & all AI video models*")
    gr.Markdown("**Combines official Gen-4 guidelines with advanced SARA Framework**")
    
    with gr.Tabs():
        with gr.TabItem("📚 Prompting Guide"):
            gr.Markdown(unified_instructions)
        
        with gr.TabItem("🎬 Quick Video Prompt Generator"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Image upload and analysis
                    gr.Markdown("## 📷 Upload Your Frame 0")
                    image_input = gr.Image(type="pil", label="Upload your initial frame")
                    analyze_btn = gr.Button("🔍 Analyze Image (Fast)", variant="primary")
                    image_analysis = gr.Textbox(
                        label="Image Analysis Results",
                        placeholder="Upload an image and click 'Analyze Image' for instant analysis...",
                        lines=10,
                        interactive=False
                    )
                    
                    # Hidden state for scene info
                    scene_info_state = gr.State({})
                    
                    # Quick suggestions
                    with gr.Group():
                        gr.Markdown("### 💡 Smart Suggestions")
                        get_suggestions_btn = gr.Button("Get Smart Tips", variant="secondary")
                        smart_suggestions = gr.Textbox(
                            label="Context-Aware Suggestions",
                            placeholder="Click 'Get Smart Tips' after image analysis...",
                            lines=5,
                            interactive=False
                        )
                    
                    # Instant prompts - NEW SECTION
                    with gr.Group():
                        gr.Markdown("### 🚀 Ready-to-Use Prompts")
                        generate_instant_btn = gr.Button("Generate Instant Prompts", variant="primary")
                        instant_prompts = gr.Textbox(
                            label="Copy & Paste Ready Prompts",
                            placeholder="Click 'Generate Instant Prompts' to get ready-to-use prompts based on your image...",
                            lines=12,
                            interactive=True,
                            show_copy_button=True
                        )
                
                with gr.Column(scale=1):
                    # Prompt generation methods
                    gr.Markdown("## 🚀 Choose Your Method")
                    
                    with gr.Tabs():
                        with gr.TabItem("🤖 AI Prompt Assistant"):
                            gr.Markdown("*Describe your idea in any language - AI will create optimized English video prompts*")
                            
                            with gr.Row():
                                with gr.Column(scale=2):
                                    user_idea = gr.Textbox(
                                        label="Your Idea (any language)",
                                        placeholder="e.g., 'el personaje se quita la nariz de payaso' or 'character walks slowly towards camera'",
                                        lines=3
                                    )
                                with gr.Column(scale=1):
                                    optimize_btn = gr.Button("🚀 Optimize & Structure", variant="primary")
                            
                            ai_optimized = gr.Textbox(
                                label="AI-Optimized Video Prompt",
                                placeholder="Your optimized prompt will appear here...",
                                lines=4,
                                interactive=True,
                                show_copy_button=True
                            )
                            
                            # Chat interface for refinement
                            gr.Markdown("### 💬 Refine Your Prompt")
                            chat_history = gr.Chatbot(
                                label="Prompt Refinement Chat", 
                                height=250,
                                placeholder="Chat history will appear here as you refine your prompt..."
                            )
                            
                            with gr.Row():
                                refine_input = gr.Textbox(
                                    label="Refine further",
                                    placeholder="e.g., 'make it more dramatic' or 'add camera movement' or 'más lento'",
                                    scale=3
                                )
                                refine_btn = gr.Button("💬 Refine", scale=1)
                        
                        with gr.TabItem("📝 Gen-4 Official"):
                            gr.Markdown("*Official method: Simple → Complex building*")
                            foundation_gen4 = gr.Textbox(
                                label="Foundation (Optional)",
                                placeholder="e.g., 'The subject walks forward'",
                                lines=1
                            )
                            generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                            gen4_output = gr.Textbox(
                                label="Gen-4 Style Prompts",
                                lines=8,
                                interactive=False
                            )
            
            # Custom prompt builder
            with gr.Group():
                gr.Markdown("## 🛠️ Custom Prompt Builder")
                
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                
                build_custom_btn = gr.Button("🔨 Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True
                )
    
    # Event handlers
    analyze_btn.click(
        fn=analyze_image_simple,
        inputs=[image_input],
        outputs=[image_analysis, gr.State(), scene_info_state]
    )
    
    get_suggestions_btn.click(
        fn=get_smart_suggestions_local,
        inputs=[scene_info_state],
        outputs=[smart_suggestions]
    )
    
    # NEW: Generate instant prompts
    generate_instant_btn.click(
        fn=generate_instant_prompts,
        inputs=[scene_info_state],
        outputs=[instant_prompts]
    )
    
    # NEW: AI Prompt Assistant
    optimize_btn.click(
        fn=optimize_user_prompt,
        inputs=[user_idea, scene_info_state],
        outputs=[ai_optimized]
    )
    
    refine_btn.click(
        fn=refine_prompt_with_feedback,
        inputs=[ai_optimized, refine_input, chat_history, scene_info_state],
        outputs=[ai_optimized, chat_history]
    )
    
    generate_gen4_btn.click(
        fn=generate_gen4_prompts_local,
        inputs=[scene_info_state, foundation_gen4],
        outputs=[gen4_output]
    )
    
    build_custom_btn.click(
        fn=build_custom_prompt_local,
        inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
        outputs=[custom_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()