import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import random

# Use the lighter BLIP captioning model instead of a heavy LLaVA-style VLM.
print("Loading BLIP model (lighter version)...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# FIX: move the model to its device ONCE at startup.  The original called
# model.cuda() inside analyze_image_simple() on every request.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
model.eval()

# Universal Video Prompting Guide combining Gen-4 + SARA (shown in the UI).
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*

## Core Principles (Universal)
✅ **Focus on MOTION, not static description**
✅ **Use positive phrasing exclusively**
✅ **Start simple, iterate progressively**
✅ **Refer to subjects in general terms** ("the subject," "the woman")
✅ **Keep prompts direct and easily understood**

## Two Complementary Approaches

### 🚀 **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions

**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."

### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style

**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"

## Essential Vocabulary

### Effective Verbs (Action)
- **Movement**: walks, runs, moves, glides, flows, drifts
- **Rotation**: turns, spins, rotates, pivots, tilts
- **Transformation**: transforms, morphs, transitions, evolves
- **Expression**: speaks, gestures, looks, smiles, nods

### Effective Adverbs (Quality)
- **Speed**: slowly, quickly, gradually, suddenly, steadily
- **Style**: smoothly, naturally, elegantly, gracefully, dramatically
- **Intensity**: gently, softly, powerfully, intensely, subtly

### Camera Motion Terms
- **Basic**: locked camera, handheld, steady cam
- **Movement**: pan left/right, tilt up/down, zoom in/out, dolly forward/back
- **Advanced**: tracking shot, crane movement, orbital movement

### Style Descriptors
- **Aesthetic**: cinematic, live-action, smooth animation, stop motion
- **Mood**: dramatic, peaceful, energetic, mysterious, professional
- **Technical**: 4K, slow motion, time-lapse, documentary style

## Multi-Subject Guidelines
- **Positional**: "The subject on the left walks. The subject on the right remains still."
- **Descriptive**: "The woman nods. The man waves."
- **Sequential**: "The woman speaks then the man responds."

## Scene Motion Approaches
- **Insinuated**: "The subject runs across the dusty desert" (natural)
- **Explicit**: "The subject runs across the desert. Dust trails behind them" (emphasized)

## Proven Examples (from SARA Framework)

### Character Motion
- "The woman speaks enthusiastically to camera while camera remains still, online tutorial"
- "The subject transitions from walking to jumping while background stays constant"

### Camera Motion
- "The subject remains centered as camera smoothly moves left with steady background"
- "Handheld camera tracks the subject as they walk forward naturally"

### Environmental
- "Camera stays fixed while day cycles into night over the temple, stone structures remain still"
- "The red cup slides smoothly to the right on white table, maintaining background constant"

### Complex Scenes
- "The pile of rocks transforms into a humanoid made of rugged volcanic rocks. The rock humanoid walks around"
- "The woman inspects her reflection in mirror. Surface bubbles with translucent bubbles. Locked camera"

## Technical Notes
- **Gen-4/Runway**: Prefer SARA structure for precision
- **Sora/OpenAI**: Works well with both approaches
- **Pika/Stable**: Gen-4 method often more effective
- **All models**: Start simple, iterate based on results
"""

# Prompt templates from both Gen-4 and SARA research.
SARA_TEMPLATES = {
    "character_motion": [
        "{subject} speaks {adverb} to camera while camera remains still, {genre}",
        "{subject} {action} {adverb} while background stays constant, {style}",
        "{subject} transitions from {action1} to {action2} while frame remains fixed, {genre}"
    ],
    "camera_motion": [
        "{subject} remains centered as camera {movement} {adverb} with steady background",
        "{camera_type} camera {action} the {subject} as they {movement} {adverb}",
        "Camera {movement} {adverb} while {subject} maintains position, {style}"
    ],
    "environmental": [
        "Camera stays fixed while {environment} {transformation} over {subject}, {reference} remain still",
        "{subject} {action} while {environmental_effect} around them, {style}",
        "{environmental_element} {movement} {adverb} as {subject} {action}, maintaining {reference}"
    ],
    "transformations": [
        "{object} transforms into {new_form} made of {material}. The {new_subject} {action} around",
        "{subject} {action} in {location}. {environmental_reaction} {adverb}. {camera_style}",
        "The {subject} {action} while {environmental_change} occurs {adverb}, {atmosphere}"
    ]
}

GEN4_TEMPLATES = {
    "basic": [
        "The subject {action}",
        "The {subject} {movement} {direction}",
        "{subject} {expression} to camera"
    ],
    "with_camera": [
        "The subject {action}. {camera_movement}",
        "{subject} {movement} {direction}. Camera {camera_action}",
        "Handheld camera {camera_behavior} as {subject} {action}"
    ],
    "with_scene": [
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}",
        "{subject} {movement} across {environment}. {environmental_reaction}",
        "Camera {camera_movement} while {subject} {action}, {scene_description}"
    ],
    "complete": [
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}. {style}",
        "{subject} {movement} {adverb} across {environment}. {camera_type} camera {camera_action}. {style}",
        "Camera {camera_movement} as {subject} {action}, {environmental_reaction}, {atmosphere}"
    ]
}

# Vocabulary databases for template filling.
VOCABULARY = {
    "subjects": ["the subject", "the woman", "the man", "the person", "the character"],
    "actions": ["walks", "runs", "moves", "glides", "flows", "turns", "speaks", "gestures"],
    "adverbs": ["smoothly", "slowly", "quickly", "naturally", "gracefully", "steadily", "gently"],
    "camera_movements": ["locked camera", "handheld", "dolly forward", "pan left", "pan right", "tracking shot"],
    "environments": ["dusty desert", "forest", "urban street", "open field", "indoor space"],
    "styles": ["cinematic", "documentary", "live-action", "dramatic", "peaceful", "energetic"]
}


def analyze_image_simple(image):
    """Caption an uploaded image with BLIP and enrich it with AI reasoning.

    Args:
        image: PIL image or numpy array from the Gradio upload (or None).

    Returns:
        (analysis_markdown, basic_caption, scene_info_dict); on error the
        first element carries the error message and the others are empty.
    """
    if image is None:
        return "Please upload an image first.", "", {}
    try:
        # Gradio may hand us a numpy array.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Basic geometry-driven composition label.
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Generate caption with BLIP.  FIX: the model now lives on DEVICE
        # permanently (set at load time) and inference runs under no_grad()
        # instead of building autograd state per request.
        inputs = processor(image, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        with torch.no_grad():
            out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Enhanced analysis using rule-based AI reasoning.
        enhanced_analysis = analyze_scene_with_ai(basic_caption, aspect_ratio, composition)

        analysis = f"""📊 **Image Analysis:**
• **Dimensions**: {width} x {height}
• **Composition**: {composition}
• **Aspect Ratio**: {aspect_ratio:.2f}

🎨 **Basic Description**: "{basic_caption}"

🧠 **AI-Enhanced Analysis**: {enhanced_analysis['scene_interpretation']}

💡 **Motion & Cinematography Insights**:
{chr(10).join(f"• {insight}" for insight in enhanced_analysis['motion_insights'])}

🎯 **Recommended Approach**: {enhanced_analysis['recommended_approach']}"""

        # Enhanced scene info for prompt generation downstream.
        scene_info = {
            'basic_description': basic_caption,
            'enhanced_description': enhanced_analysis['detailed_description'],
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'has_person': enhanced_analysis['has_person'],
            'emotional_tone': enhanced_analysis['emotional_tone'],
            'visual_style': enhanced_analysis['visual_style'],
            'setting': enhanced_analysis['setting'],
            'distinctive_elements': enhanced_analysis['distinctive_elements'],
            'motion_potential': enhanced_analysis['motion_potential'],
            'cinematic_qualities': enhanced_analysis['cinematic_qualities']
        }
        return analysis, basic_caption, scene_info
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"Error analyzing image: {str(e)}", "", {}
def analyze_scene_with_ai(basic_caption, aspect_ratio, composition):
    """Fan a BLIP caption out through the rule-based analyzers.

    Returns a dict bundling the enhanced description, interpretation string,
    motion insights, recommended approach and the individual scene facts.
    Relies on sibling helpers defined elsewhere in this module.
    """
    text = basic_caption.lower() if isinstance(basic_caption, str) else ""

    scene_elements = extract_scene_elements(text)
    emotional_tone = determine_emotional_tone(text, scene_elements)
    visual_style = determine_visual_style(text, scene_elements, composition)
    distinctive_elements = identify_distinctive_elements(text)
    motion_potential = assess_motion_potential(text, scene_elements)
    cinematic_qualities = analyze_cinematic_potential(text, composition, aspect_ratio)
    enhanced_description = create_enhanced_description(basic_caption, scene_elements, emotional_tone)
    motion_insights = generate_motion_insights(scene_elements, emotional_tone, visual_style, composition)
    recommended_approach = recommend_approach(scene_elements, emotional_tone, visual_style)

    return {
        'detailed_description': enhanced_description,
        'scene_interpretation': f"Scene shows {scene_elements['subject']} in {scene_elements['setting']} with {emotional_tone} mood. Key elements: {', '.join(distinctive_elements)}",
        'motion_insights': motion_insights,
        'recommended_approach': recommended_approach,
        'has_person': scene_elements['has_person'],
        'emotional_tone': emotional_tone,
        'visual_style': visual_style,
        'setting': scene_elements['setting'],
        'distinctive_elements': distinctive_elements,
        'motion_potential': motion_potential,
        'cinematic_qualities': cinematic_qualities
    }


def extract_scene_elements(text):
    """Parse a lower-cased caption into coarse scene facts.

    Args:
        text: lower-cased caption string (callers lowercase before calling).

    Returns:
        dict with keys subject, setting, clothing, colors (list),
        objects (list), has_person.

    FIX: person detection previously used substring tests, so 'man' matched
    inside 'woman' / 'male' inside 'female' and female captions were always
    classified as a man.  Gender words now match whole tokens only.
    """
    import re

    elements = {
        'subject': 'subject',
        'setting': 'neutral',
        'clothing': None,
        'colors': [],
        'objects': [],
        'has_person': False
    }
    tokens = set(re.findall(r"[a-z]+", text))

    # Detect subjects with context (whole-word match, man branch first as before).
    if tokens & {'man', 'male', 'gentleman'}:
        elements['subject'] = 'man'
        elements['has_person'] = True
        if 'costume' in text:
            elements['subject'] = 'man in costume'
            elements['clothing'] = 'costume'
        elif 'suit' in text:
            elements['subject'] = 'man in suit'
            elements['clothing'] = 'suit'
    elif tokens & {'woman', 'female', 'lady'}:
        elements['subject'] = 'woman'
        elements['has_person'] = True
        if 'dress' in text:
            elements['subject'] = 'woman in dress'
            elements['clothing'] = 'dress'

    # Detect setting (substring checks kept for plural/compound words).
    if any(word in text for word in ['outdoor', 'outside', 'street', 'nature', 'park']):
        elements['setting'] = 'outdoor'
    elif any(word in text for word in ['indoor', 'inside', 'room', 'office', 'studio']):
        elements['setting'] = 'indoor'
    elif any(word in text for word in ['stage', 'performance']):
        elements['setting'] = 'performance'

    # Extract colors and notable objects.
    color_words = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown',
                   'pink', 'purple', 'orange', 'gold', 'silver']
    elements['colors'] = [color for color in color_words if color in text]

    objects = ['hat', 'cape', 'flag', 'chair', 'table', 'background', 'wall']
    elements['objects'] = [obj for obj in objects if obj in text]
    return elements
def determine_emotional_tone(text, scene_elements):
    """Classify the caption's emotional tone, falling back to scene context."""
    caption = text.lower() if isinstance(text, str) else ""

    # Keyword tables checked in priority order — first hit wins.
    tone_table = [
        ('serious', ['serious', 'formal', 'stern', 'professional']),
        ('cheerful', ['happy', 'smiling', 'cheerful', 'joyful']),
        ('dramatic', ['dramatic', 'intense', 'powerful', 'bold']),
        ('elegant', ['elegant', 'graceful', 'refined']),
    ]
    for tone, keywords in tone_table:
        if any(keyword in caption for keyword in keywords):
            return tone
    if 'costume' in caption or 'performance' in caption:
        return 'theatrical'

    # No textual cue: infer from the scene facts instead.
    if scene_elements['setting'] == 'performance':
        return 'theatrical'
    if scene_elements['clothing'] in ['suit', 'formal']:
        return 'professional'
    return 'neutral'


def determine_visual_style(text, scene_elements, composition):
    """Pick the most suitable visual style label for the scene."""
    caption = text.lower() if isinstance(text, str) else ""

    if scene_elements['setting'] == 'performance' or 'costume' in caption:
        return 'theatrical'
    if scene_elements['setting'] == 'indoor' and 'formal' in caption:
        return 'professional'
    if composition == 'Wide landscape shot':
        return 'cinematic'
    if any(hue in scene_elements['colors'] for hue in ['red', 'gold', 'dramatic']):
        return 'dramatic'
    return 'cinematic'


def identify_distinctive_elements(text):
    """List unique caption features that can enrich a video prompt."""
    caption = text.lower() if isinstance(text, str) else ""

    feature_map = [
        ('costume', 'elaborate costume'),
        ('cape', 'flowing cape'),
        ('hat', 'distinctive hat'),
        ('flag', 'flag detail'),
    ]
    found = [label for keyword, label in feature_map if keyword in caption]

    palette = [hue for hue in ['red', 'blue', 'green', 'gold'] if hue in caption]
    if palette:
        found.append(f"{', '.join(palette)} coloring")
    if 'background' in caption:
        found.append('detailed background')
    return found if found else ['natural elements']


def assess_motion_potential(text, scene_elements):
    """Suggest motion types that fit the detected subject and setting."""
    moves = []
    if scene_elements['has_person']:
        moves += ['facial expressions', 'hand gestures', 'body movement']

    # At most one clothing value is set, so a lookup table is equivalent
    # to the original chain of equality checks.
    clothing_motion = {
        'costume': 'costume dynamics',
        'cape': 'cape flow',
        'dress': 'fabric movement',
    }
    extra = clothing_motion.get(scene_elements['clothing'])
    if extra:
        moves.append(extra)

    setting_motion = {
        'outdoor': ['environmental effects', 'natural lighting changes'],
        'indoor': ['controlled lighting', 'subtle environment shifts'],
    }
    moves += setting_motion.get(scene_elements['setting'], [])
    return moves


def analyze_cinematic_potential(text, composition, aspect_ratio):
    """Describe the cinematic qualities implied by composition and content."""
    caption = text.lower() if isinstance(text, str) else ""

    framing = {
        'Wide landscape shot': ['horizontal camera movements', 'panoramic reveals', 'environmental context'],
        'Vertical portrait shot': ['character focus', 'intimate framing', 'vertical movement'],
    }
    traits = list(framing.get(composition, ['balanced framing', 'versatile movement', 'centered composition']))

    if 'costume' in caption or 'dramatic' in caption:
        traits.append('dramatic lighting potential')
    if any(hue in caption for hue in ['red', 'gold', 'rich']):
        traits.append('color enhancement opportunities')
    return traits
def create_enhanced_description(basic_caption, scene_elements, emotional_tone):
    """Compose a one-sentence enriched description from the analyzed scene."""
    fragments = [f"A {emotional_tone} scene featuring {scene_elements['subject']}"]
    if scene_elements['clothing']:
        fragments.append(f" wearing {scene_elements['clothing']}")
    fragments.append(f" in a {scene_elements['setting']} setting")
    if scene_elements['colors']:
        fragments.append(f" with prominent {', '.join(scene_elements['colors'])} elements")
    return "".join(fragments)


def generate_motion_insights(scene_elements, emotional_tone, visual_style, composition):
    """Produce up to six motion/cinematography recommendations for the scene."""
    notes = []

    # Subject-driven advice (tone first, then clothing).
    if scene_elements['has_person']:
        tone_note = {
            'dramatic': 'Emphasize powerful gestures and dynamic poses',
            'elegant': 'Focus on graceful, refined movements',
            'theatrical': 'Capture performance-style expressions and gestures',
        }.get(emotional_tone)
        if tone_note:
            notes.append(tone_note)
        clothing_note = {
            'costume': 'Highlight costume details with movement',
            'cape': 'Showcase cape flow and dramatic movement',
            'dress': 'Capture fabric dynamics and elegant motion',
        }.get(scene_elements['clothing'])
        if clothing_note:
            notes.append(clothing_note)

    # Framing-driven advice.
    comp_note = {
        'Wide landscape shot': 'Utilize horizontal camera movements and wide reveals',
        'Vertical portrait shot': 'Focus on vertical movement and character detail',
    }.get(composition)
    if comp_note:
        notes.append(comp_note)

    # Style-driven advice.
    style_note = {
        'cinematic': 'Use cinematic camera techniques and dramatic lighting',
        'dramatic': 'Emphasize bold movements and high contrast lighting',
        'professional': 'Maintain clean, controlled camera work',
    }.get(visual_style)
    if style_note:
        notes.append(style_note)

    if scene_elements['colors']:
        notes.append(f"Enhance {', '.join(scene_elements['colors'])} tones through lighting")

    # Keep only the six most relevant insights.
    return notes[:6]


def recommend_approach(scene_elements, emotional_tone, visual_style):
    """Recommend Gen-4, SARA, or a mix, based on the scene analysis."""
    # Complex scenes with people in costume/formal wear want SARA precision.
    if scene_elements['has_person'] and scene_elements['clothing'] in ['costume', 'suit', 'dress']:
        return "SARA Framework recommended for precise character and costume control"
    # Dramatic or theatrical scenes also benefit from SARA structure.
    if emotional_tone in ['dramatic', 'theatrical']:
        return "SARA Framework ideal for complex dramatic scenes with multiple elements"
    # Simple, natural scenes iterate well with Gen-4.
    if emotional_tone in ['neutral', 'peaceful'] and visual_style != 'dramatic':
        return "Gen-4 method perfect for natural, iterative scene building"
    # Professional/formal contexts can go either way.
    if emotional_tone == 'professional' or visual_style == 'professional':
        return "Either approach works - SARA for precision, Gen-4 for simplicity"
    return "Start with Gen-4 for base prompt, then refine with SARA for complexity"


def generate_motion_suggestions(description, aspect_ratio):
    """Generate up to six contextual motion tips from a caption + aspect ratio."""
    caption = description.lower()
    rules = [
        (['person', 'woman', 'man', 'people'],
         ['Focus on character expressions and gestures',
          'Use "the subject" or "the woman/man" for clarity',
          'Consider handheld camera for natural movement']),
        (['sitting', 'standing'],
         ['Start with simple movements: speaking, gesturing',
          'Locked or steady camera works well for portraits']),
        (['outdoor', 'landscape', 'nature'],
         ['Camera movement can explore the environment',
          'Consider environmental motion: wind, clouds',
          'Cinematic style complements outdoor scenes']),
        (['indoor', 'room'],
         ['Controlled movements work best indoors',
          'Focus on subject motion within the space']),
    ]

    tips = []
    for triggers, advice in rules:
        if any(trigger in caption for trigger in triggers):
            tips.extend(advice)

    # Composition-based suggestions.
    if aspect_ratio > 1.5:
        tips.append('Wide format perfect for horizontal camera movements')
    elif aspect_ratio < 0.8:
        tips.append('Portrait format ideal for character-focused content')

    if tips:
        return tips[:6]
    return [
        'Start with simple motion: "The subject moves"',
        'Add camera movement: "Camera follows naturally"',
        'Include environment: "Background remains steady"'
    ]
def get_recommended_approach(description):
    """Recommend a prompting approach from the raw caption text."""
    text = description.lower()
    if any(word in text for word in ['person', 'woman', 'man']):
        return "SARA Framework recommended for character precision"
    elif any(word in text for word in ['landscape', 'building', 'nature']):
        return "Gen-4 method works well for environmental scenes"
    else:
        return "Try both approaches - start with Gen-4, refine with SARA"


def detect_setting(description):
    """Classify the caption's setting as 'outdoor', 'indoor' or 'neutral'."""
    text = description.lower()
    if any(word in text for word in ['outdoor', 'outside', 'street', 'nature']):
        return 'outdoor'
    elif any(word in text for word in ['indoor', 'inside', 'room', 'building']):
        return 'indoor'
    else:
        return 'neutral'


def extract_specific_details(description):
    """Pull concrete visual details (colors, clothing, subject, setting clues)
    out of an image caption.

    Note: 'colors' stays [] when nothing is found, otherwise it becomes a
    comma-joined string — downstream helpers use substring checks, so this
    asymmetry is preserved deliberately.

    FIX 1: the clothing branch now keys off the item that actually matched;
    previously a caption with both a cape and a hat reported 'hat' even while
    iterating on 'cape'.
    FIX 2: main-subject detection matches whole words, so 'woman' is no
    longer mis-detected as 'man' via substring.
    """
    import re

    details = {
        'colors': [],
        'clothing': None,
        'distinctive_feature': None,
        'main_object': None,
        'setting_clues': []
    }
    text = description.lower()

    # Extract colors (joined into a display string when any are found).
    colors = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'pink', 'purple', 'orange']
    found_colors = [color for color in colors if color in text]
    if found_colors:
        details['colors'] = ', '.join(found_colors)

    # Extract clothing/costume details — first matching item wins.
    clothing_items = ['cape', 'hat', 'dress', 'suit', 'shirt', 'coat', 'jacket', 'uniform', 'costume', 'robe']
    for item in clothing_items:
        if item not in text:
            continue
        if item == 'cape' and 'red cape' in text:
            details['clothing'] = 'red cape'
            details['distinctive_feature'] = 'flowing red cape'
        elif item == 'hat' and 'red hat' in text:
            details['clothing'] = 'red hat'
            details['distinctive_feature'] = 'red hat'
        else:
            details['clothing'] = item
            details['distinctive_feature'] = item
        break

    # Extract main subject (whole-word match).
    tokens = set(re.findall(r"[a-z]+", text))
    if 'man' in tokens:
        details['main_object'] = 'man'
    elif 'woman' in tokens:
        details['main_object'] = 'woman'
    elif 'person' in tokens:
        details['main_object'] = 'person'
    elif 'people' in tokens:
        details['main_object'] = 'people'

    # Extract setting clues.
    setting_indicators = ['outdoor', 'indoor', 'street', 'room', 'building', 'nature', 'park', 'office']
    details['setting_clues'] = [indicator for indicator in setting_indicators if indicator in text]
    return details


def get_contextual_subject(description, details):
    """Return a prompt-friendly subject reference, e.g. "The man in the hat".

    FIX: whole-word matching so 'woman' no longer triggers the 'man' branch.
    """
    import re
    tokens = set(re.findall(r"[a-z]+", description.lower()))
    clothing = details.get('clothing')
    if 'man' in tokens:
        return f"The man in the {clothing}" if clothing else "The man"
    if 'woman' in tokens:
        return f"The woman in the {clothing}" if clothing else "The woman"
    if 'person' in tokens:
        return "The person"
    return "The subject"


def get_contextual_actions(description, details):
    """List plausible subject actions: a base set plus clothing-aware extras."""
    import re
    base_actions = ['speaks', 'gestures', 'moves', 'looks', 'turns']
    clothing = details.get('clothing') or ''
    if 'cape' in clothing:
        base_actions.extend(['adjusts cape', 'moves dramatically', 'gestures with cape flowing'])
    if 'hat' in clothing:
        base_actions.extend(['tips hat', 'adjusts hat', 'nods with hat'])
    # FIX: whole-word check so 'woman' captions don't get man-specific verbs.
    if 'man' in set(re.findall(r"[a-z]+", description.lower())):
        base_actions.extend(['speaks confidently', 'gestures authoritatively'])
    return base_actions
def get_contextual_adverbs(details):
    """Adverbs that fit the detected clothing context."""
    adverbs = ['naturally', 'smoothly', 'slowly', 'gracefully']
    clothing = details.get('clothing') or ''
    if 'cape' in clothing:
        adverbs += ['dramatically', 'majestically', 'with flair']
    if 'hat' in clothing:
        adverbs += ['elegantly', 'with style', 'confidently']
    return adverbs


def get_contextual_camera_movement(description, details):
    """Camera moves appropriate for the scene's distinctive feature."""
    moves = ['Camera follows steadily', 'Locked camera captures', 'Handheld camera tracks']
    feature = details.get('distinctive_feature') or ''
    if 'cape' in feature:
        moves += ['Camera captures cape movement', 'Tracking shot follows cape flow']
    if 'hat' in feature:
        moves += ['Camera frames from chest up', 'Close tracking of upper body']
    return moves


def get_contextual_environment(description, details):
    """Environmental effect complementing the scene, or None when nothing fits."""
    if 'red' in (details.get('colors') or ''):
        return "lighting enhances red tones"
    if 'cape' in (details.get('clothing') or ''):
        return "cape fabric reacts to air movement"
    return None


def get_contextual_style(details):
    """Visual style string derived from the clothing context."""
    clothing = details.get('clothing') or ''
    if 'cape' in clothing:
        return "dramatic cinematic style"
    if 'hat' in clothing:
        return "classic portrait style"
    return "professional documentary style"


def get_contextual_atmosphere(details):
    """Atmosphere phrase matching the scene's colors and clothing."""
    if 'red' in (details.get('colors') or ''):
        return "dramatic atmosphere with rich red tones"
    clothing = details.get('clothing') or ''
    if 'cape' in clothing:
        return "heroic cinematic atmosphere"
    if 'hat' in clothing:
        return "elegant portrait atmosphere"
    return "professional cinematic atmosphere"


def optimize_user_prompt(user_idea, scene_info=None):
    """Turn a raw (possibly non-English) user idea into a structured prompt."""
    cleaned = user_idea.strip()
    if not cleaned:
        return "Please enter your idea first."
    try:
        # Understand the idea, then rebuild it as a professional prompt.
        return create_optimized_prompt(cleaned, analyze_user_idea(cleaned), scene_info)
    except Exception as e:
        return f"Error optimizing prompt: {str(e)}"


def analyze_user_idea(idea):
    """Classify a free-form idea: language, detected element kinds, complexity."""
    lowered = idea.lower()
    profile = {
        'language': detect_language(idea),
        'has_action': False,
        'has_object': False,
        'has_emotion': False,
        'has_camera': False,
        'complexity': 'simple',
        'main_elements': []
    }

    # Multilingual action verbs — a hit in any language counts.
    action_words = {
        'en': ['removes', 'takes off', 'puts on', 'walks', 'runs', 'speaks', 'gestures', 'moves', 'turns', 'looks'],
        'es': ['quita', 'se quita', 'pone', 'camina', 'corre', 'habla', 'gesticula', 'mueve', 'gira', 'mira'],
        'fr': ['enlève', 'met', 'marche', 'court', 'parle', 'gesticule', 'bouge'],
        'de': ['nimmt ab', 'zieht aus', 'geht', 'läuft', 'spricht', 'bewegt']
    }
    profile['has_action'] = any(
        any(verb in lowered for verb in verbs) for verbs in action_words.values()
    )

    object_words = ['nose', 'nariz', 'hat', 'sombrero', 'costume', 'traje', 'cape', 'capa', 'mask', 'máscara']
    profile['has_object'] = any(term in lowered for term in object_words)

    emotion_words = ['dramatic', 'dramático', 'slow', 'lento', 'fast', 'rápido', 'gentle', 'suave', 'powerful', 'poderoso']
    profile['has_emotion'] = any(term in lowered for term in emotion_words)

    camera_words = ['camera', 'cámara', 'shot', 'toma', 'angle', 'ángulo', 'close', 'cerca', 'wide', 'amplio']
    profile['has_camera'] = any(term in lowered for term in camera_words)

    # Complexity scales with how many element kinds were detected.
    hits = sum([profile['has_action'], profile['has_object'], profile['has_emotion'], profile['has_camera']])
    if hits >= 3:
        profile['complexity'] = 'complex'
    elif hits >= 2:
        profile['complexity'] = 'medium'
    return profile
def detect_language(text):
    """Best-effort language sniff (english/spanish/french/german).

    FIX: the original used plain substring tests, so e.g. 'el' matched inside
    'elephant' and ordinary English text was tagged Spanish.  Single-word
    indicators now match whole words only; multi-word indicators ('de la')
    still use substring matching.  Check order (es, fr, de) is preserved, so
    shared words like 'la'/'se' still resolve to Spanish first.
    """
    import re
    text_lower = text.lower()
    words = set(re.findall(r"\w+", text_lower))

    def _hits(indicators):
        return any(
            (ind in text_lower) if ' ' in ind else (ind in words)
            for ind in indicators
        )

    spanish_indicators = ['el', 'la', 'se', 'que', 'con', 'por', 'para', 'del', 'de la', 'nariz', 'payaso']
    french_indicators = ['le', 'la', 'se', 'que', 'avec', 'pour', 'du', 'de la', 'nez', 'clown']
    german_indicators = ['der', 'die', 'das', 'sich', 'mit', 'für', 'vom', 'nase', 'clown']

    if _hits(spanish_indicators):
        return 'spanish'
    if _hits(french_indicators):
        return 'french'
    if _hits(german_indicators):
        return 'german'
    return 'english'


def create_optimized_prompt(idea, analysis, scene_info=None):
    """Translate common Spanish/French terms to English, then structure the idea."""
    translations = {
        'spanish': {
            'se quita': 'removes', 'quita': 'removes', 'pone': 'puts on',
            'camina': 'walks', 'habla': 'speaks', 'mueve': 'moves',
            'nariz': 'nose', 'payaso': 'clown', 'personaje': 'character',
            'sombrero': 'hat', 'capa': 'cape', 'lentamente': 'slowly',
            'rápidamente': 'quickly', 'dramáticamente': 'dramatically'
        },
        'french': {
            'enlève': 'removes', 'met': 'puts on', 'marche': 'walks',
            'parle': 'speaks', 'bouge': 'moves', 'nez': 'nose',
            'clown': 'clown', 'personnage': 'character', 'chapeau': 'hat',
            'cape': 'cape'
        }
    }
    # NOTE(review): naive substring replacement — 'se quita' is listed before
    # 'quita' so the longer phrase wins, but substrings inside other words can
    # still be rewritten.
    optimized_idea = idea
    if analysis['language'] in translations:
        for original, translation in translations[analysis['language']].items():
            optimized_idea = optimized_idea.replace(original, translation)

    structured_prompt = structure_video_prompt(optimized_idea, analysis, scene_info)
    return structured_prompt


def structure_video_prompt(idea, analysis, scene_info=None):
    """Assemble a professional video prompt scaled by detected complexity."""
    idea_lower = idea.lower()

    # Identify subject.
    if 'character' in idea_lower or 'personaje' in idea_lower:
        subject = "The character"
    elif 'person' in idea_lower or 'persona' in idea_lower:
        subject = "The person"
    elif scene_info and scene_info.get('has_person'):
        # NOTE(review): extract_intelligent_subject_reference is not defined in
        # this file's visible portion — confirm it exists elsewhere.
        subject = extract_intelligent_subject_reference(scene_info)
    else:
        subject = "The subject"

    action = extract_action_from_idea(idea)

    # Style modifiers scale with complexity: simple / medium / full SARA.
    if analysis['complexity'] == 'simple':
        optimized = f"{subject} {action} naturally"
        optimized += ". Camera captures the motion smoothly"
    elif analysis['complexity'] == 'medium':
        optimized = f"{subject} {action} while camera follows steadily"
        if analysis['has_emotion']:
            optimized += ", dramatic lighting enhances the mood"
        else:
            optimized += ", professional lighting"
    else:
        optimized = f"{subject} {action} expressively while camera tracks the motion"
        optimized += ", lighting and environment support the action, cinematic atmosphere"

    optimized = improve_technical_language(optimized)
    return optimized


def extract_action_from_idea(idea):
    """Extract the main action (and its object, if any) from the user's idea.

    Returns e.g. "removes the clown nose", "walks", or the default "moves".
    """
    idea_lower = idea.lower()

    # Map multilingual action phrases to video-optimized English verbs.
    action_mappings = {
        'removes': 'removes', 'quita': 'removes', 'se quita': 'removes',
        'takes off': 'removes', 'puts on': 'puts on', 'pone': 'puts on',
        'walks': 'walks', 'camina': 'walks',
        'speaks': 'speaks', 'habla': 'speaks',
        'moves': 'moves', 'mueve': 'moves',
        'turns': 'turns', 'gira': 'turns',
        'looks': 'looks', 'mira': 'looks'
    }

    action = "moves"  # default when nothing matches
    object_part = ""
    for original, mapped in action_mappings.items():
        if original in idea_lower:
            action = mapped
            # For removal verbs, try to identify what is being removed.
            if original in ['removes', 'quita', 'se quita', 'takes off']:
                if 'nose' in idea_lower or 'nariz' in idea_lower:
                    if 'clown' in idea_lower or 'payaso' in idea_lower:
                        object_part = "the clown nose"
                    else:
                        object_part = "the nose piece"
                elif 'hat' in idea_lower or 'sombrero' in idea_lower:
                    object_part = "the hat"
                elif 'mask' in idea_lower or 'máscara' in idea_lower:
                    object_part = "the mask"
            break

    if object_part:
        return f"{action} {object_part}"
    return action
'gira': 'turns', 'looks': 'looks', 'mira': 'looks' } # Find the action and object action = "moves" # default object_part = "" for original, mapped in action_mappings.items(): if original in idea_lower: action = mapped # Try to extract what's being acted upon if original in ['removes', 'quita', 'se quita', 'takes off']: # Look for what's being removed if 'nose' in idea_lower or 'nariz' in idea_lower: if 'clown' in idea_lower or 'payaso' in idea_lower: object_part = "the clown nose" else: object_part = "the nose piece" elif 'hat' in idea_lower or 'sombrero' in idea_lower: object_part = "the hat" elif 'mask' in idea_lower or 'máscara' in idea_lower: object_part = "the mask" break # Combine action with object if object_part: return f"{action} {object_part}" else: return action def improve_technical_language(prompt): """Improve the prompt with professional video terminology""" # Enhance basic terms improvements = { 'moves naturally': 'moves with natural grace', 'Camera captures': 'Camera captures', 'smoothly': 'with smooth motion', 'follows steadily': 'follows with steady tracking', 'dramatic lighting': 'dramatic lighting transitions', 'professional lighting': 'professional lighting setup', 'cinematic atmosphere': 'rich cinematic atmosphere' } improved_prompt = prompt for basic, enhanced in improvements.items(): improved_prompt = improved_prompt.replace(basic, enhanced) return improved_prompt def refine_prompt_with_feedback(current_prompt, feedback, chat_history, scene_info=None): """Use AI to intelligently refine prompts based on user feedback""" if not feedback.strip(): return current_prompt, chat_history # Analyze the feedback with AI understanding refinement_analysis = analyze_refinement_request(feedback, current_prompt, scene_info) # Generate intelligent refinement refined_prompt = apply_intelligent_refinement(current_prompt, refinement_analysis, scene_info) # Create explanatory response explanation = create_refinement_explanation(refinement_analysis, 
current_prompt, refined_prompt) # Update chat history with intelligent conversation new_chat_history = chat_history + [ [feedback, f"🤖 {explanation}\n\n✨ **Refined Prompt**: {refined_prompt}"] ] return refined_prompt, new_chat_history def analyze_refinement_request(feedback, current_prompt, scene_info): """Analyze what the user wants to change using AI understanding""" feedback_lower = feedback.lower() analysis = { 'request_type': 'general', 'intensity': 'moderate', 'focus_area': 'action', 'style_preference': None, 'specific_elements': [], 'language': detect_language(feedback) } # Detect request type with AI understanding if any(word in feedback_lower for word in ['dramatic', 'dramático', 'dramatique', 'dramatisch']): analysis['request_type'] = 'dramatic' analysis['intensity'] = 'high' elif any(word in feedback_lower for word in ['slow', 'slower', 'lento', 'más lento', 'lentement']): analysis['request_type'] = 'pace' analysis['intensity'] = 'slow' elif any(word in feedback_lower for word in ['fast', 'faster', 'rápido', 'más rápido', 'rapide']): analysis['request_type'] = 'pace' analysis['intensity'] = 'fast' elif any(word in feedback_lower for word in ['camera', 'cámara', 'caméra', 'kamera']): analysis['request_type'] = 'camera' analysis['focus_area'] = 'cinematography' elif any(word in feedback_lower for word in ['lighting', 'light', 'luz', 'lumière', 'licht']): analysis['request_type'] = 'lighting' analysis['focus_area'] = 'atmosphere' elif any(word in feedback_lower for word in ['simple', 'simpler', 'más simple', 'plus simple']): analysis['request_type'] = 'simplify' analysis['intensity'] = 'low' elif any(word in feedback_lower for word in ['complex', 'complicated', 'detalle', 'detail', 'détail']): analysis['request_type'] = 'elaborate' analysis['intensity'] = 'high' elif any(word in feedback_lower for word in ['elegant', 'elegante', 'élégant']): analysis['request_type'] = 'style' analysis['style_preference'] = 'elegant' elif any(word in feedback_lower for word 
in ['powerful', 'poderoso', 'puissant']): analysis['request_type'] = 'style' analysis['style_preference'] = 'powerful' elif any(word in feedback_lower for word in ['natural', 'natural', 'naturel']): analysis['request_type'] = 'style' analysis['style_preference'] = 'natural' # Detect specific elements mentioned elements = ['costume', 'dress', 'cape', 'hat', 'background', 'face', 'hands', 'movement'] for element in elements: if element in feedback_lower: analysis['specific_elements'].append(element) return analysis def apply_intelligent_refinement(current_prompt, analysis, scene_info): """Apply intelligent refinement based on analysis""" # Start with current prompt refined = current_prompt # Apply refinements based on request type if analysis['request_type'] == 'dramatic': refined = enhance_dramatic_elements(refined, analysis, scene_info) elif analysis['request_type'] == 'pace': refined = adjust_pace(refined, analysis) elif analysis['request_type'] == 'camera': refined = enhance_camera_work(refined, analysis, scene_info) elif analysis['request_type'] == 'lighting': refined = enhance_lighting(refined, analysis, scene_info) elif analysis['request_type'] == 'simplify': refined = simplify_prompt(refined) elif analysis['request_type'] == 'elaborate': refined = elaborate_prompt(refined, scene_info) elif analysis['request_type'] == 'style': refined = apply_style_preference(refined, analysis, scene_info) else: # General enhancement refined = apply_general_enhancement(refined, analysis, scene_info) return refined def enhance_dramatic_elements(prompt, analysis, scene_info): """Enhance dramatic elements intelligently""" # Replace gentle actions with dramatic ones dramatic_replacements = { 'naturally': 'dramatically with intensity', 'smoothly': 'with powerful emphasis', 'gently': 'boldly', 'moves': 'commands attention', 'speaks': 'declares passionately', 'gestures': 'gestures with commanding presence', 'professional lighting': 'dramatic lighting with stark contrasts', 'cinematic 
lighting': 'theatrical lighting with deep shadows' } enhanced = prompt for original, dramatic in dramatic_replacements.items(): enhanced = enhanced.replace(original, dramatic) # Add dramatic elements based on scene context if scene_info and scene_info.get('distinctive_elements'): elements = scene_info['distinctive_elements'] if 'costume' in str(elements): enhanced += ". Costume elements amplify the dramatic presence" if 'cape' in str(elements): enhanced += ". Cape billows dramatically with movement" # Enhance camera work for drama if 'Camera captures' in enhanced: enhanced = enhanced.replace('Camera captures', 'Dynamic camera captures') return enhanced def adjust_pace(prompt, analysis): """Adjust the pace of action""" if analysis['intensity'] == 'slow': pace_replacements = { 'naturally': 'slowly and deliberately', 'smoothly': 'in measured slow motion', 'moves': 'moves with deliberate slowness', 'speaks': 'speaks thoughtfully', 'gestures': 'gestures with careful precision' } else: # fast pace_replacements = { 'naturally': 'with energetic quickness', 'slowly': 'rapidly', 'smoothly': 'with swift fluidity', 'deliberate': 'rapid', 'measured': 'quick' } adjusted = prompt for original, paced in pace_replacements.items(): adjusted = adjusted.replace(original, paced) return adjusted def enhance_camera_work(prompt, analysis, scene_info): """Enhance camera work based on scene context""" # Analyze current camera work enhanced = prompt # Upgrade basic camera work camera_enhancements = { 'Camera captures': 'Dynamic camera work captures', 'camera follows': 'cinematic camera tracks', 'handheld camera': 'fluid handheld camera movement', 'steady camera': 'precision camera operation', 'locked camera': 'artistically locked camera' } for basic, enhanced_version in camera_enhancements.items(): enhanced = enhanced.replace(basic, enhanced_version) # Add specific camera techniques based on scene if scene_info: composition = scene_info.get('composition', '') if 'Wide' in composition: 
enhanced += ". Wide tracking shots reveal environmental context" elif 'Portrait' in composition: enhanced += ". Intimate camera framing emphasizes character details" # If no camera work exists, add it if 'camera' not in enhanced.lower(): enhanced += ". Sophisticated camera movement enhances the narrative" return enhanced def enhance_lighting(prompt, analysis, scene_info): """Enhance lighting based on scene context""" enhanced = prompt # Upgrade lighting descriptions lighting_enhancements = { 'professional lighting': 'artistic lighting design', 'cinematic lighting': 'masterful cinematic lighting', 'dramatic lighting': 'sculptural dramatic lighting', 'natural lighting': 'beautiful natural light' } for basic, enhanced_version in lighting_enhancements.items(): enhanced = enhanced.replace(basic, enhanced_version) # Add lighting based on emotional tone if scene_info: emotional_tone = scene_info.get('emotional_tone', 'neutral') if emotional_tone == 'dramatic': enhanced += ". High-contrast lighting creates powerful shadows" elif emotional_tone == 'elegant': enhanced += ". Soft, sophisticated lighting enhances refinement" elif emotional_tone == 'theatrical': enhanced += ". Stage-quality lighting emphasizes performance" # If no lighting exists, add it if 'lighting' not in enhanced.lower() and 'light' not in enhanced.lower(): enhanced += ". Expressive lighting design supports the mood" return enhanced def simplify_prompt(prompt): """Simplify prompt to essential elements""" # Split into main components parts = prompt.split('.') # Keep the main action and one enhancement if len(parts) > 1: simplified = parts[0] + '.' # Add one simple enhancement if 'camera' in prompt.lower(): simplified += " Camera follows naturally." elif 'lighting' in prompt.lower(): simplified += " Natural lighting." 
else: simplified = prompt return simplified def elaborate_prompt(prompt, scene_info): """Add sophisticated details to the prompt""" elaborated = prompt # Add environmental details if scene_info: setting = scene_info.get('setting', 'neutral') distinctive_elements = scene_info.get('distinctive_elements', []) if setting == 'outdoor': elaborated += ". Environmental elements respond subtly to the action" elif setting == 'indoor': elaborated += ". Interior atmosphere enhances intimate connection" elif setting == 'performance': elaborated += ". Stage environment supports theatrical presence" # Add details about distinctive elements if distinctive_elements: element = distinctive_elements[0] if distinctive_elements else '' if 'costume' in element: elaborated += ". Costume textures and details visible in motion" elif 'color' in element: elaborated += ". Color palette enhanced through dynamic lighting" # Add technical sophistication elaborated += ". Multi-layered composition with depth and visual interest" return elaborated def apply_style_preference(prompt, analysis, scene_info): """Apply specific style preferences""" styled = prompt preference = analysis['style_preference'] if preference == 'elegant': style_replacements = { 'dramatically': 'with refined elegance', 'boldly': 'gracefully', 'powerfully': 'with sophisticated poise', 'dramatic lighting': 'elegant lighting transitions', 'intensive': 'refined' } elif preference == 'powerful': style_replacements = { 'gently': 'with commanding force', 'naturally': 'with authoritative presence', 'smoothly': 'with decisive power', 'professional lighting': 'bold, impactful lighting' } elif preference == 'natural': style_replacements = { 'dramatically': 'naturally', 'theatrical': 'authentic', 'commanding': 'genuine', 'dramatic lighting': 'natural lighting' } if preference in ['elegant', 'powerful', 'natural']: for original, styled_version in style_replacements.items(): styled = styled.replace(original, styled_version) return styled def 
apply_general_enhancement(prompt, analysis, scene_info): """Apply general enhancements based on context""" enhanced = prompt # Add sophistication to basic elements if 'moves' in enhanced and 'gracefully' not in enhanced: enhanced = enhanced.replace('moves', 'moves with purposeful grace') if 'speaks' in enhanced and 'expressively' not in enhanced: enhanced = enhanced.replace('speaks', 'speaks with genuine expression') # Enhance based on scene context if scene_info: emotional_tone = scene_info.get('emotional_tone', 'neutral') if emotional_tone != 'neutral' and emotional_tone not in enhanced: enhanced += f". {emotional_tone.capitalize()} energy throughout" return enhanced def create_refinement_explanation(analysis, original, refined): """Create an explanation of what was changed""" explanations = { 'dramatic': "I've enhanced the dramatic intensity by upgrading the actions and adding powerful lighting elements.", 'pace': f"I've adjusted the pacing to be more {'slow and deliberate' if analysis['intensity'] == 'slow' else 'energetic and quick'}.", 'camera': "I've enhanced the camera work with more sophisticated cinematography techniques.", 'lighting': "I've upgraded the lighting description to create more visual impact.", 'simplify': "I've simplified the prompt to focus on the essential action.", 'elaborate': "I've added more sophisticated details and environmental context.", 'style': f"I've adjusted the style to be more {analysis['style_preference']}." 
} base_explanation = explanations.get(analysis['request_type'], "I've enhanced the prompt based on your feedback.") # Add language-specific response if analysis['language'] != 'english': language_notes = { 'spanish': "Entiendo tu sugerencia y ", 'french': "Je comprends votre suggestion et ", 'german': "Ich verstehe Ihren Vorschlag und " } prefix = language_notes.get(analysis['language'], "") base_explanation = prefix + base_explanation.lower() return base_explanation def generate_gen4_prompts_local(scene_info, user_input=""): """Generate Gen-4 prompts using iterative building""" try: description = scene_info.get('description', '') has_person = scene_info.get('has_person', False) setting = scene_info.get('setting', 'neutral') # Extract specific details for contextual prompts specific_details = extract_specific_details(description) subject_ref = get_contextual_subject(description, specific_details) prompts = [] # Basic - specific to what's in the image if has_person: actions = get_contextual_actions(description, specific_details) basic = f"{subject_ref} {random.choice(actions)} to camera" else: basic = f"The {specific_details.get('main_object', 'main element')} {random.choice(['moves', 'shifts', 'transforms'])}" prompts.append(f"**Basic**: {basic}") # + Subject Motion - add natural movement based on what's visible motion_adverbs = get_contextual_adverbs(specific_details) motion_addition = random.choice(motion_adverbs) with_subject = f"{basic} {motion_addition}" prompts.append(f"**+ Subject Motion**: {with_subject}") # + Camera Motion - appropriate for the scene camera_movements = get_contextual_camera_movement(description, specific_details) camera_addition = random.choice(camera_movements) with_camera = f"{with_subject}. {camera_addition}" prompts.append(f"**+ Camera Motion**: {with_camera}") # + Scene/Style - enhance the specific elements if specific_details.get('colors'): style_addition = f"{specific_details['colors']} tones enhanced by lighting. 
{get_contextual_atmosphere(specific_details)}" elif setting == 'outdoor': style_addition = "Natural lighting enhances the scene. Cinematic" else: style_addition = f"Professional lighting highlights {specific_details.get('distinctive_feature', 'the subject')}. Documentary style" complete = f"{with_camera}. {style_addition}" prompts.append(f"**+ Scene/Style**: {complete}") return "\n\n".join(prompts) except Exception as e: return f"Error generating Gen-4 prompts: {str(e)}" def build_custom_prompt_local(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"): """Build custom prompt using selected approach""" if approach == "SARA": # SARA Structure: [Subject] [Action] while [Reference], [Atmosphere] parts = [] if foundation: parts.append(foundation) # Add motion elements motion_parts = [] if subject_motion: motion_parts.extend(subject_motion) if scene_motion: motion_parts.extend(scene_motion) if motion_parts: parts.append(", ".join(motion_parts)) # Reference (camera stability) if camera_motion: parts.append(f"while {camera_motion}") else: parts.append("while background remains steady") # Atmosphere if style: parts.append(style) return " ".join(parts) else: # Gen-4 style # Gen-4 Structure: Simple iterative building parts = [] if foundation: parts.append(foundation) if subject_motion: parts.extend(subject_motion) if camera_motion: parts.append(camera_motion) if scene_motion: parts.extend(scene_motion) if style: parts.append(style) return ". 
".join(parts) if parts else "The subject moves naturally" def get_smart_suggestions_local(scene_info): """Generate intelligent suggestions using AI-enhanced analysis""" enhanced_description = scene_info.get('enhanced_description', '') emotional_tone = scene_info.get('emotional_tone', 'neutral') visual_style = scene_info.get('visual_style', 'cinematic') distinctive_elements = scene_info.get('distinctive_elements', []) motion_potential = scene_info.get('motion_potential', []) setting = scene_info.get('setting', 'neutral') if not enhanced_description: return "Please analyze an image first to generate smart suggestions." suggestions = [] # AI-enhanced scene understanding subject_ref = extract_intelligent_subject_reference(scene_info) suggestions.append(f'🤖 **AI Analysis**: {enhanced_description}') suggestions.append(f'🎯 **Smart Reference**: Use "{subject_ref}" for optimal clarity') # Tone-based action suggestions actions = generate_tone_appropriate_actions(emotional_tone, scene_info)[:3] suggestions.append(f'🎭 **Tone-Matched Actions**: {", ".join(actions)}') # Motion potential insights if motion_potential: top_potential = motion_potential[:3] suggestions.append(f'🎬 **Motion Opportunities**: {", ".join(top_potential)}') # Distinctive element highlights if distinctive_elements: top_elements = distinctive_elements[:2] suggestions.append(f'✨ **Key Elements to Highlight**: {", ".join(top_elements)}') # Visual style recommendations style_cameras = generate_style_appropriate_cameras(visual_style, scene_info.get('cinematic_qualities', []))[:2] suggestions.append(f'🎥 **Style-Appropriate Cameras**: {", ".join(style_cameras)}') # Emotional tone guidance appropriate_adverbs = [get_tone_appropriate_adverb(emotional_tone) for _ in range(3)] suggestions.append(f'💫 **Emotional Adverbs**: {", ".join(appropriate_adverbs)}') # Setting-specific insights if setting == 'performance': suggestions.append('🎪 **Performance Context**: Focus on stage presence and audience engagement') elif 
setting == 'outdoor': suggestions.append('🌿 **Outdoor Setting**: Leverage natural lighting and environmental elements') elif setting == 'indoor': suggestions.append('🏠 **Indoor Context**: Utilize controlled lighting and intimate framing') # Cinematic quality suggestions cinematic_qualities = scene_info.get('cinematic_qualities', []) if cinematic_qualities: top_qualities = cinematic_qualities[:2] suggestions.append(f'🎬 **Cinematic Opportunities**: {", ".join(top_qualities)}') # Atmosphere recommendation atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone) suggestions.append(f'🌟 **Recommended Atmosphere**: {atmosphere}') return "\n".join(suggestions[:10]) def generate_instant_prompts(scene_info): """Generate sophisticated ready-to-use prompts based on AI-enhanced analysis""" enhanced_description = scene_info.get('enhanced_description', '') emotional_tone = scene_info.get('emotional_tone', 'neutral') visual_style = scene_info.get('visual_style', 'cinematic') distinctive_elements = scene_info.get('distinctive_elements', []) cinematic_qualities = scene_info.get('cinematic_qualities', []) motion_potential = scene_info.get('motion_potential', []) if not enhanced_description: return "Please analyze an image first to generate instant prompts." 
# Extract intelligent subject reference subject_ref = extract_intelligent_subject_reference(scene_info) # Generate tone-appropriate actions actions = generate_tone_appropriate_actions(emotional_tone, scene_info) # Generate style-appropriate camera work camera_movements = generate_style_appropriate_cameras(visual_style, cinematic_qualities) # Generate sophisticated prompts instant_prompts = [] # === AI-POWERED SIMPLE PROMPTS === instant_prompts.append("🤖 **AI-Powered Simple Prompts:**") for i in range(3): action = random.choice(actions) adverb = get_tone_appropriate_adverb(emotional_tone) instant_prompts.append(f" • {subject_ref} {action} {adverb}") # === CONTEXT-AWARE SARA PROMPTS === instant_prompts.append("\n🧠 **Context-Aware SARA Prompts:**") for i in range(3): action = random.choice(actions) adverb = get_tone_appropriate_adverb(emotional_tone) camera = random.choice(camera_movements) atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone) # Include distinctive elements if distinctive_elements and random.choice([True, False]): distinctive = random.choice(distinctive_elements) instant_prompts.append(f" • {subject_ref} {action} {adverb} while {camera}, {distinctive} enhanced, {atmosphere}") else: instant_prompts.append(f" • {subject_ref} {action} {adverb} while {camera}, {atmosphere}") # === INTELLIGENCE-ENHANCED GEN-4 === instant_prompts.append("\n🎬 **Intelligence-Enhanced Gen-4:**") for i in range(3): action = random.choice(actions) adverb = get_tone_appropriate_adverb(emotional_tone) camera = random.choice(camera_movements) # Build Gen-4 iteratively with intelligence basic = f"{subject_ref} {action}" with_motion = f"{basic} {adverb}" with_camera = f"{with_motion}. 
{camera}" # Add intelligent style enhancement if distinctive_elements: distinctive = random.choice(distinctive_elements) style_addition = f"{distinctive} highlighted by {get_lighting_for_style(visual_style)}" else: style_addition = f"{get_lighting_for_style(visual_style)} enhances {emotional_tone} mood" complete = f"{with_camera}. {style_addition}" instant_prompts.append(f" • {complete}") # === SPECIALIZED INTELLIGENT PROMPTS === instant_prompts.append("\n✨ **Specialized AI Prompts:**") # Motion-potential based prompts if 'costume dynamics' in motion_potential: instant_prompts.append(f" 🎭 **Costume Dynamics**: {subject_ref} {random.choice(actions)} while camera captures fabric textures, costume elements react to movement, theatrical lighting") if 'facial expressions' in motion_potential: instant_prompts.append(f" 😊 **Expression Focus**: {subject_ref} {random.choice(['expresses emotion', 'speaks meaningfully', 'reacts naturally'])} while camera maintains intimate framing, {emotional_tone} energy emphasized") # Cinematic quality based prompts if 'dramatic lighting potential' in cinematic_qualities: instant_prompts.append(f" 💡 **Dramatic Lighting**: {subject_ref} {random.choice(actions)} as lighting creates dramatic shadows, visual contrast enhances {emotional_tone} mood, cinematic depth") if 'color enhancement opportunities' in cinematic_qualities: colors = [elem for elem in distinctive_elements if 'coloring' in elem] if colors: instant_prompts.append(f" 🎨 **Color Enhanced**: {subject_ref} {random.choice(actions)} while lighting dramatically enhances {colors[0]}, color grading emphasizes mood, {visual_style} aesthetic") # Environmental integration setting = scene_info.get('setting', 'neutral') if setting == 'performance': instant_prompts.append(f" 🎪 **Performance Mode**: {subject_ref} {random.choice(['performs', 'presents', 'commands attention'])} while audience perspective maintained, {emotional_tone} stage presence, professional capture") elif setting == 'outdoor': 
instant_prompts.append(f" 🌿 **Environmental Harmony**: {subject_ref} {random.choice(actions)} as natural elements complement motion, environmental lighting, organic {visual_style} feel") # === ADVANCED COMPOSITE PROMPTS === instant_prompts.append("\n🚀 **Advanced AI Composite:**") # Ultra-sophisticated prompt advanced_action = random.choice(actions) advanced_adverb = get_tone_appropriate_adverb(emotional_tone) advanced_camera = random.choice(camera_movements) advanced_atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone) if distinctive_elements: advanced_distinctive = random.choice(distinctive_elements) advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} as {advanced_camera} captures nuanced details, {advanced_distinctive} dynamically enhanced, lighting and color grading amplify {emotional_tone} undertones, {advanced_atmosphere} with {visual_style} cinematography" else: advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} while {advanced_camera} follows natural rhythm, environmental elements support the motion, {advanced_atmosphere} with intelligent {visual_style} direction" instant_prompts.append(f" • {advanced_prompt}") return "\n".join(instant_prompts) def extract_intelligent_subject_reference(scene_info): """Extract intelligent subject reference using AI analysis""" enhanced_desc = scene_info.get('enhanced_description', '') basic_desc = scene_info.get('basic_description', '') # Check if we have a person has_person = scene_info.get('has_person', False) if not has_person: return "The subject" # Use enhanced description for smarter reference if isinstance(enhanced_desc, str): enhanced_lower = enhanced_desc.lower() if 'man in costume' in enhanced_lower: return "The man in costume" elif 'woman in dress' in enhanced_lower: return "The woman in dress" elif 'man in suit' in enhanced_lower: return "The man in suit" # Fallback to basic description if isinstance(basic_desc, str): basic_lower = basic_desc.lower() if 
'man' in basic_lower: return "The man" elif 'woman' in basic_lower: return "The woman" elif 'person' in basic_lower: return "The person" return "The subject" def generate_tone_appropriate_actions(emotional_tone, scene_info): """Generate actions that match the emotional tone""" base_actions = { 'dramatic': ['moves powerfully', 'gestures boldly', 'commands attention', 'strikes a pose', 'displays intensity'], 'elegant': ['moves gracefully', 'gestures refined', 'poses elegantly', 'demonstrates poise', 'flows naturally'], 'theatrical': ['performs dramatically', 'presents theatrically', 'expresses character', 'embodies role', 'captivates audience'], 'serious': ['maintains composure', 'speaks authoritatively', 'gestures formally', 'projects confidence', 'demonstrates focus'], 'cheerful': ['expresses joy', 'gestures enthusiastically', 'radiates energy', 'shows warmth', 'displays positivity'], 'professional': ['presents professionally', 'maintains bearing', 'demonstrates expertise', 'projects authority', 'engages formally'], 'neutral': ['moves naturally', 'gestures appropriately', 'maintains presence', 'expresses subtly', 'demonstrates character'] } # Add context-specific actions based on scene elements actions = base_actions.get(emotional_tone, base_actions['neutral']).copy() # Add clothing-specific actions if scene_info.get('distinctive_elements'): for element in scene_info['distinctive_elements']: if 'costume' in element: actions.extend(['adjusts costume', 'displays costume details']) elif 'cape' in element: actions.extend(['gestures with cape', 'moves dramatically with cape']) elif 'flag' in element: actions.extend(['acknowledges flag', 'presents with flag']) return actions def generate_style_appropriate_cameras(visual_style, cinematic_qualities): """Generate camera movements appropriate for the visual style""" base_cameras = { 'cinematic': ['camera glides smoothly', 'tracking shot follows', 'camera orbits elegantly', 'dolly movement captures', 'crane shot reveals'], 
'dramatic': ['camera emphasizes motion', 'dynamic camera movement', 'camera captures intensity', 'bold camera work follows', 'dramatic camera angles'], 'theatrical': ['camera frames performance', 'audience perspective maintained', 'camera captures stage presence', 'performance-focused framing', 'theatrical camera work'], 'professional': ['steady camera captures', 'professional camera movement', 'controlled camera work', 'camera maintains stability', 'precise camera tracking'], 'documentary': ['handheld camera follows', 'natural camera movement', 'camera observes genuinely', 'documentary-style capture', 'authentic camera work'] } cameras = base_cameras.get(visual_style, base_cameras['cinematic']).copy() # Add cameras based on cinematic qualities if 'horizontal camera movements' in cinematic_qualities: cameras.extend(['camera pans horizontally', 'lateral camera movement']) if 'vertical movement' in cinematic_qualities: cameras.extend(['camera tilts vertically', 'vertical camera motion']) if 'environmental context' in cinematic_qualities: cameras.extend(['camera reveals environment', 'wide establishing shots']) return cameras def get_tone_appropriate_adverb(emotional_tone): """Get adverbs that match the emotional tone""" adverbs = { 'dramatic': ['powerfully', 'intensely', 'dramatically', 'boldly', 'majestically'], 'elegant': ['gracefully', 'refinedly', 'elegantly', 'smoothly', 'sophisticatedly'], 'theatrical': ['dramatically', 'expressively', 'theatrically', 'charismatically', 'captivating'], 'serious': ['authoritatively', 'professionally', 'formally', 'confidently', 'purposefully'], 'cheerful': ['enthusiastically', 'energetically', 'warmly', 'positively', 'vibrantly'], 'professional': ['professionally', 'precisely', 'competently', 'expertly', 'authoritatively'], 'neutral': ['naturally', 'smoothly', 'appropriately', 'genuinely', 'authentically'] } return random.choice(adverbs.get(emotional_tone, adverbs['neutral'])) def get_style_appropriate_atmosphere(visual_style, 
emotional_tone): """Get atmosphere that combines style and tone""" style_atmospheres = { 'cinematic': f'cinematic {emotional_tone} atmosphere', 'dramatic': f'dramatic {emotional_tone} mood', 'theatrical': f'theatrical {emotional_tone} presence', 'professional': f'professional {emotional_tone} environment', 'documentary': f'authentic {emotional_tone} feeling' } return style_atmospheres.get(visual_style, f'{visual_style} {emotional_tone} atmosphere') def get_lighting_for_style(visual_style): """Get appropriate lighting description for visual style""" lighting = { 'cinematic': 'cinematic lighting', 'dramatic': 'dramatic lighting', 'theatrical': 'stage lighting', 'professional': 'professional lighting', 'documentary': 'natural lighting' } return lighting.get(visual_style, 'cinematic lighting') # Gen-4 style prompts for i in range(3): action = random.choice(contextual_actions) adverb = random.choice(contextual_adverbs) camera = random.choice(camera_moves) # Build Gen-4 iteratively basic = f"{subject_ref} {action}" with_motion = f"{basic} {adverb}" with_camera = f"{with_motion}. {camera}" # Add style based on specific details if specific_details.get('colors'): style_addition = f"{specific_details['colors']} tones enhanced by lighting" else: style_addition = "Cinematic lighting" complete = f"{with_camera}. 
{style_addition}"
    # NOTE(review): the line above is the tail of an f-string begun on the
    # previous (out-of-view) line; formatting reconstructed from a collapsed source.
    instant_prompts.append(f"📝 **Gen-4**: {complete}")

    # Specialized prompts based on distinctive features
    if specific_details.get('clothing'):
        clothing = specific_details['clothing']
        # Each recognized garment keyword contributes one extra themed prompt.
        if 'cape' in clothing:
            instant_prompts.append(f"🦸 **Cape Focus**: {subject_ref} moves dramatically while camera captures cape movement, wind effects enhance cape flow, heroic atmosphere")
        if 'dress' in clothing:
            instant_prompts.append(f"👗 **Dress Focus**: {subject_ref} moves gracefully while camera tracks smoothly, fabric reacts to movement, elegant atmosphere")
        if 'hat' in clothing:
            instant_prompts.append(f"🎩 **Hat Focus**: {subject_ref} tips hat confidently while camera frames from chest up, professional lighting")

    # Color-focused prompts
    if specific_details.get('colors'):
        colors = specific_details['colors']
        instant_prompts.append(f"🎨 **Color Enhanced**: {subject_ref} {random.choice(contextual_actions)} while lighting dramatically enhances {colors} tones, cinematic depth")

    # Prompts are joined with a blank line between them so the Textbox shows
    # each one as a separate, copyable paragraph.
    return "\n\n".join(instant_prompts)


def copy_to_foundation(prompt_text, approach):
    """Extract the main prompt from formatted text for foundation field.

    Strips the leading emoji/label prefix (everything up to and including the
    first "**: ") from a generated prompt line, e.g.
    "📝 **Gen-4**: The subject walks" -> "The subject walks".
    Returns the input unchanged when no "**" marker is present.

    NOTE(review): `approach` is currently unused inside this function; it is
    presumably kept so the signature matches a Gradio event wiring — confirm
    before removing.
    """
    # Remove the emoji and label prefix to get clean prompt
    if "**" in prompt_text:
        # Extract text after the **:  (split at most once so the prompt body
        # may itself contain "**: " without being truncated)
        parts = prompt_text.split("**: ", 1)
        if len(parts) > 1:
            return parts[1]
    return prompt_text


# Create optimized Gradio interface.
# Layout: a tabbed page with a static guide tab and a two-column generator tab
# (left: image analysis + instant prompts, right: prompt-building methods).
# All event wiring is declared at the bottom of the `with` block.
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Video Prompting Tool") as demo:
    gr.Markdown("# 🎬 Universal Video Prompting Tool")
    gr.Markdown("*Compatible with Gen-4, Sora, Pika, Luma, Runway & all AI video models*")
    gr.Markdown("**Combines official Gen-4 guidelines with advanced SARA Framework**")

    with gr.Tabs():
        with gr.TabItem("📚 Prompting Guide"):
            # Static reference text defined at module top (unified_instructions).
            gr.Markdown(unified_instructions)

        with gr.TabItem("🎬 Quick Video Prompt Generator"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Image upload and analysis
                    gr.Markdown("## 📷 Upload Your Frame 0")
                    image_input = gr.Image(type="pil", label="Upload your initial frame")
                    analyze_btn = gr.Button("🔍 Analyze Image (Fast)", variant="primary")
                    image_analysis = gr.Textbox(
                        label="Image Analysis Results",
                        placeholder="Upload an image and click 'Analyze Image' for instant analysis...",
                        lines=10,
                        interactive=False
                    )

                    # Hidden state for scene info — shared dict carrying the
                    # analysis result into the other handlers below.
                    scene_info_state = gr.State({})

                    # Quick suggestions
                    with gr.Group():
                        gr.Markdown("### 💡 Smart Suggestions")
                        get_suggestions_btn = gr.Button("Get Smart Tips", variant="secondary")
                        smart_suggestions = gr.Textbox(
                            label="Context-Aware Suggestions",
                            placeholder="Click 'Get Smart Tips' after image analysis...",
                            lines=5,
                            interactive=False
                        )

                    # Instant prompts - NEW SECTION
                    with gr.Group():
                        gr.Markdown("### 🚀 Ready-to-Use Prompts")
                        generate_instant_btn = gr.Button("Generate Instant Prompts", variant="primary")
                        instant_prompts = gr.Textbox(
                            label="Copy & Paste Ready Prompts",
                            placeholder="Click 'Generate Instant Prompts' to get ready-to-use prompts based on your image...",
                            lines=12,
                            interactive=True,
                            show_copy_button=True
                        )

                with gr.Column(scale=1):
                    # Prompt generation methods
                    gr.Markdown("## 🚀 Choose Your Method")

                    with gr.Tabs():
                        with gr.TabItem("🤖 AI Prompt Assistant"):
                            gr.Markdown("*Describe your idea in any language - AI will create optimized English video prompts*")
                            with gr.Row():
                                with gr.Column(scale=2):
                                    user_idea = gr.Textbox(
                                        label="Your Idea (any language)",
                                        placeholder="e.g., 'el personaje se quita la nariz de payaso' or 'character walks slowly towards camera'",
                                        lines=3
                                    )
                                with gr.Column(scale=1):
                                    optimize_btn = gr.Button("🚀 Optimize & Structure", variant="primary")

                            ai_optimized = gr.Textbox(
                                label="AI-Optimized Video Prompt",
                                placeholder="Your optimized prompt will appear here...",
                                lines=4,
                                interactive=True,
                                show_copy_button=True
                            )

                            # Chat interface for refinement
                            gr.Markdown("### 💬 Refine Your Prompt")
                            # NOTE(review): Chatbot(placeholder=...) requires a
                            # recent Gradio (4.x+) — confirm the pinned version.
                            chat_history = gr.Chatbot(
                                label="Prompt Refinement Chat",
                                height=250,
                                placeholder="Chat history will appear here as you refine your prompt..."
                            )
                            with gr.Row():
                                refine_input = gr.Textbox(
                                    label="Refine further",
                                    placeholder="e.g., 'make it more dramatic' or 'add camera movement' or 'más lento'",
                                    scale=3
                                )
                                refine_btn = gr.Button("💬 Refine", scale=1)

                        with gr.TabItem("📝 Gen-4 Official"):
                            gr.Markdown("*Official method: Simple → Complex building*")
                            foundation_gen4 = gr.Textbox(
                                label="Foundation (Optional)",
                                placeholder="e.g., 'The subject walks forward'",
                                lines=1
                            )
                            generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                            gen4_output = gr.Textbox(
                                label="Gen-4 Style Prompts",
                                lines=8,
                                interactive=False
                            )

                    # Custom prompt builder
                    with gr.Group():
                        gr.Markdown("## 🛠️ Custom Prompt Builder")
                        with gr.Row():
                            approach_selector = gr.Radio(
                                choices=["SARA", "Gen-4"],
                                value="SARA",
                                label="Approach",
                                interactive=True
                            )
                            custom_foundation = gr.Textbox(
                                label="Foundation",
                                placeholder="The subject...",
                                lines=1
                            )
                        with gr.Row():
                            subject_motion = gr.CheckboxGroup(
                                choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
                                label="Subject Motion"
                            )
                            scene_motion = gr.CheckboxGroup(
                                choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
                                label="Scene Motion"
                            )
                        with gr.Row():
                            camera_motion = gr.Dropdown(
                                choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
                                label="Camera Motion",
                                value="camera remains steady"
                            )
                            style_motion = gr.Dropdown(
                                choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
                                label="Style/Atmosphere",
                                value="cinematic"
                            )
                        build_custom_btn = gr.Button("🔨 Build Custom Prompt", variant="secondary")
                        custom_output = gr.Textbox(
                            label="Your Custom Prompt",
                            lines=3,
                            interactive=True
                        )

    # Event handlers
    # NOTE(review): the inline gr.State() below looks like a throwaway slot for
    # a second return value of analyze_image_simple — confirm that handler
    # returns three values and that the middle one is intentionally discarded.
    analyze_btn.click(
        fn=analyze_image_simple,
        inputs=[image_input],
        outputs=[image_analysis, gr.State(), scene_info_state]
    )

    get_suggestions_btn.click(
        fn=get_smart_suggestions_local,
        inputs=[scene_info_state],
        outputs=[smart_suggestions]
    )

    # NEW: Generate instant prompts
    generate_instant_btn.click(
        fn=generate_instant_prompts,
        inputs=[scene_info_state],
        outputs=[instant_prompts]
    )

    # NEW: AI Prompt Assistant
    optimize_btn.click(
        fn=optimize_user_prompt,
        inputs=[user_idea, scene_info_state],
        outputs=[ai_optimized]
    )

    # Refinement both reads and rewrites ai_optimized, and appends the
    # exchange to the chat history.
    refine_btn.click(
        fn=refine_prompt_with_feedback,
        inputs=[ai_optimized, refine_input, chat_history, scene_info_state],
        outputs=[ai_optimized, chat_history]
    )

    generate_gen4_btn.click(
        fn=generate_gen4_prompts_local,
        inputs=[scene_info_state, foundation_gen4],
        outputs=[gen4_output]
    )

    build_custom_btn.click(
        fn=build_custom_prompt_local,
        inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
        outputs=[custom_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()