# Malaji71's picture
# Update app.py
# 5114ebd verified
import random
import re

import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
# Load the lighter BLIP captioning checkpoint (used here instead of the heavier LLaVA).
print("Loading BLIP model (lighter version)...")
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
# Half precision is requested only when CUDA is available; float32 otherwise.
_blip_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = BlipForConditionalGeneration.from_pretrained(
    _BLIP_CHECKPOINT,
    torch_dtype=_blip_dtype,
)
# Universal Video Prompting Guide combining Gen-4 + SARA.
# User-facing markdown; the emoji and arrows below were previously stored as
# mis-decoded UTF-8 ("βœ…", "β†’", ...) and are restored to the intended characters.
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*
## Core Principles (Universal)
✅ **Focus on MOTION, not static description**
✅ **Use positive phrasing exclusively**
✅ **Start simple, iterate progressively**
✅ **Refer to subjects in general terms** ("the subject," "the woman")
✅ **Keep prompts direct and easily understood**
## Two Complementary Approaches
### 🚀 **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions
**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."
### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style
**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
## Essential Vocabulary
### Effective Verbs (Action)
- **Movement**: walks, runs, moves, glides, flows, drifts
- **Rotation**: turns, spins, rotates, pivots, tilts
- **Transformation**: transforms, morphs, transitions, evolves
- **Expression**: speaks, gestures, looks, smiles, nods
### Effective Adverbs (Quality)
- **Speed**: slowly, quickly, gradually, suddenly, steadily
- **Style**: smoothly, naturally, elegantly, gracefully, dramatically
- **Intensity**: gently, softly, powerfully, intensely, subtly
### Camera Motion Terms
- **Basic**: locked camera, handheld, steady cam
- **Movement**: pan left/right, tilt up/down, zoom in/out, dolly forward/back
- **Advanced**: tracking shot, crane movement, orbital movement
### Style Descriptors
- **Aesthetic**: cinematic, live-action, smooth animation, stop motion
- **Mood**: dramatic, peaceful, energetic, mysterious, professional
- **Technical**: 4K, slow motion, time-lapse, documentary style
## Multi-Subject Guidelines
- **Positional**: "The subject on the left walks. The subject on the right remains still."
- **Descriptive**: "The woman nods. The man waves."
- **Sequential**: "The woman speaks then the man responds."
## Scene Motion Approaches
- **Insinuated**: "The subject runs across the dusty desert" (natural)
- **Explicit**: "The subject runs across the desert. Dust trails behind them" (emphasized)
## Proven Examples (from SARA Framework)
### Character Motion
- "The woman speaks enthusiastically to camera while camera remains still, online tutorial"
- "The subject transitions from walking to jumping while background stays constant"
### Camera Motion
- "The subject remains centered as camera smoothly moves left with steady background"
- "Handheld camera tracks the subject as they walk forward naturally"
### Environmental
- "Camera stays fixed while day cycles into night over the temple, stone structures remain still"
- "The red cup slides smoothly to the right on white table, maintaining background constant"
### Complex Scenes
- "The pile of rocks transforms into a humanoid made of rugged volcanic rocks. The rock humanoid walks around"
- "The woman inspects her reflection in mirror. Surface bubbles with translucent bubbles. Locked camera"
## Technical Notes
- **Gen-4/Runway**: Prefer SARA structure for precision
- **Sora/OpenAI**: Works well with both approaches
- **Pika/Stable**: Gen-4 method often more effective
- **All models**: Start simple, iterate based on results
"""
# SARA-style prompt templates, grouped by the kind of motion they describe.
# Each entry is a str.format-style pattern; the {placeholders} are filled by callers.
SARA_TEMPLATES = dict(
    character_motion=[
        "{subject} speaks {adverb} to camera while camera remains still, {genre}",
        "{subject} {action} {adverb} while background stays constant, {style}",
        "{subject} transitions from {action1} to {action2} while frame remains fixed, {genre}",
    ],
    camera_motion=[
        "{subject} remains centered as camera {movement} {adverb} with steady background",
        "{camera_type} camera {action} the {subject} as they {movement} {adverb}",
        "Camera {movement} {adverb} while {subject} maintains position, {style}",
    ],
    environmental=[
        "Camera stays fixed while {environment} {transformation} over {subject}, {reference} remain still",
        "{subject} {action} while {environmental_effect} around them, {style}",
        "{environmental_element} {movement} {adverb} as {subject} {action}, maintaining {reference}",
    ],
    transformations=[
        "{object} transforms into {new_form} made of {material}. The {new_subject} {action} around",
        "{subject} {action} in {location}. {environmental_reaction} {adverb}. {camera_style}",
        "The {subject} {action} while {environmental_change} occurs {adverb}, {atmosphere}",
    ],
)
# Gen-4-style prompt templates, ordered from the simplest form ("basic") to the
# fullest ("complete"). Each entry is a str.format-style pattern.
GEN4_TEMPLATES = dict(
    basic=[
        "The subject {action}",
        "The {subject} {movement} {direction}",
        "{subject} {expression} to camera",
    ],
    with_camera=[
        "The subject {action}. {camera_movement}",
        "{subject} {movement} {direction}. Camera {camera_action}",
        "Handheld camera {camera_behavior} as {subject} {action}",
    ],
    with_scene=[
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}",
        "{subject} {movement} across {environment}. {environmental_reaction}",
        "Camera {camera_movement} while {subject} {action}, {scene_description}",
    ],
    complete=[
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}. {style}",
        "{subject} {movement} {adverb} across {environment}. {camera_type} camera {camera_action}. {style}",
        "Camera {camera_movement} as {subject} {action}, {environmental_reaction}, {atmosphere}",
    ],
)
# Word lists available for filling prompt templates, keyed by category.
VOCABULARY = dict(
    subjects=["the subject", "the woman", "the man", "the person", "the character"],
    actions=["walks", "runs", "moves", "glides", "flows", "turns", "speaks", "gestures"],
    adverbs=["smoothly", "slowly", "quickly", "naturally", "gracefully", "steadily", "gently"],
    camera_movements=[
        "locked camera",
        "handheld",
        "dolly forward",
        "pan left",
        "pan right",
        "tracking shot",
    ],
    environments=["dusty desert", "forest", "urban street", "open field", "indoor space"],
    styles=["cinematic", "documentary", "live-action", "dramatic", "peaceful", "energetic"],
)
def analyze_image_simple(image):
    """Caption an image with BLIP and build a rule-based analysis report.

    Args:
        image: A PIL image, an array accepted by ``Image.fromarray``, or None.

    Returns:
        A 3-tuple ``(analysis_text, basic_caption, scene_info)``. When no image
        is supplied, or any step raises, ``basic_caption`` is ``""`` and
        ``scene_info`` is ``{}``; ``analysis_text`` then carries the message.
    """
    if image is None:
        return "Please upload an image first.", "", {}
    try:
        # Convert to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Basic geometry drives the composition label.
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Generate caption with BLIP
        inputs = processor(image, return_tensors="pt")
        if torch.cuda.is_available():
            model.cuda()
        # The model is loaded in float16 when CUDA is available. Moving the
        # inputs with .cuda() alone leaves floating tensors in their original
        # dtype, so cast them to the model's dtype (and device) explicitly.
        # Non-floating tensors are only moved, never cast.
        target_device = model.device
        target_dtype = model.dtype
        inputs = {
            name: (
                tensor.to(device=target_device, dtype=target_dtype)
                if torch.is_floating_point(tensor)
                else tensor.to(target_device)
            )
            for name, tensor in inputs.items()
        }
        out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Enhanced analysis using the rule-based helpers
        enhanced_analysis = analyze_scene_with_ai(basic_caption, aspect_ratio, composition)

        # Create comprehensive analysis text (one report line per list entry)
        insight_lines = "\n".join(
            f"• {insight}" for insight in enhanced_analysis['motion_insights']
        )
        analysis = "\n".join([
            "📊 **Image Analysis:**",
            f"• **Dimensions**: {width} x {height}",
            f"• **Composition**: {composition}",
            f"• **Aspect Ratio**: {aspect_ratio:.2f}",
            "🎨 **Basic Description**:",
            f'"{basic_caption}"',
            "🧠 **AI-Enhanced Analysis**:",
            f"{enhanced_analysis['scene_interpretation']}",
            "💡 **Motion & Cinematography Insights**:",
            insight_lines,
            "🎯 **Recommended Approach**:",
            f"{enhanced_analysis['recommended_approach']}",
        ])

        # Enhanced scene info for prompt generation
        scene_info = {
            'basic_description': basic_caption,
            'enhanced_description': enhanced_analysis['detailed_description'],
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'has_person': enhanced_analysis['has_person'],
            'emotional_tone': enhanced_analysis['emotional_tone'],
            'visual_style': enhanced_analysis['visual_style'],
            'setting': enhanced_analysis['setting'],
            'distinctive_elements': enhanced_analysis['distinctive_elements'],
            'motion_potential': enhanced_analysis['motion_potential'],
            'cinematic_qualities': enhanced_analysis['cinematic_qualities'],
        }
        return analysis, basic_caption, scene_info
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error analyzing image: {str(e)}", "", {}
def analyze_scene_with_ai(basic_caption, aspect_ratio, composition):
    """Expand a caption into a structured scene analysis via the rule-based helpers.

    Args:
        basic_caption: Caption text; any non-string value is treated as "".
        aspect_ratio: Width divided by height of the analysed image.
        composition: Composition label computed by the caller.

    Returns:
        A dict with the keys 'detailed_description', 'scene_interpretation',
        'motion_insights', 'recommended_approach', 'has_person',
        'emotional_tone', 'visual_style', 'setting', 'distinctive_elements',
        'motion_potential' and 'cinematic_qualities'.
    """
    # Helpers receive the lower-cased caption; the enhanced description gets the raw one.
    caption_text = basic_caption.lower() if isinstance(basic_caption, str) else ""

    elements = extract_scene_elements(caption_text)
    tone = determine_emotional_tone(caption_text, elements)
    style = determine_visual_style(caption_text, elements, composition)
    highlights = identify_distinctive_elements(caption_text)
    motion = assess_motion_potential(caption_text, elements)
    cinematic = analyze_cinematic_potential(caption_text, composition, aspect_ratio)
    description = create_enhanced_description(basic_caption, elements, tone)
    insights = generate_motion_insights(elements, tone, style, composition)
    approach = recommend_approach(elements, tone, style)

    interpretation = (
        f"Scene shows {elements['subject']} in {elements['setting']} "
        f"with {tone} mood. Key elements: {', '.join(highlights)}"
    )

    result = {}
    result['detailed_description'] = description
    result['scene_interpretation'] = interpretation
    result['motion_insights'] = insights
    result['recommended_approach'] = approach
    result['has_person'] = elements['has_person']
    result['emotional_tone'] = tone
    result['visual_style'] = style
    result['setting'] = elements['setting']
    result['distinctive_elements'] = highlights
    result['motion_potential'] = motion
    result['cinematic_qualities'] = cinematic
    return result
def extract_scene_elements(text):
    """Extract subject, setting, clothing, colours and objects from a caption.

    Person, clothing, colour and object keywords are matched as whole words
    (an optional plural "s"/"es" is tolerated). Plain substring tests would
    find 'man' in "woman", 'male' in "female", 'hat' in "that", 'red' in
    "covered" and 'table' in "vegetables".

    Args:
        text: Caption text; any non-string value is treated as "".

    Returns:
        A dict with keys 'subject', 'setting', 'clothing', 'colors',
        'objects' and 'has_person'.
    """
    if not isinstance(text, str):
        text = ""
    text = text.lower()

    def mentions(words):
        """Return True when any of `words` occurs as a whole word in the caption."""
        alternatives = "|".join(re.escape(word) for word in words)
        return re.search(r"\b(?:" + alternatives + r")(?:e?s)?\b", text) is not None

    elements = {
        'subject': 'subject',
        'setting': 'neutral',
        'clothing': None,
        'colors': [],
        'objects': [],
        'has_person': False,
    }

    # Detect subjects with context (a man takes precedence when both appear).
    if mentions(['man', 'male', 'gentleman']):
        elements['subject'] = 'man'
        elements['has_person'] = True
        if mentions(['costume']):
            elements['subject'] = 'man in costume'
            elements['clothing'] = 'costume'
        elif mentions(['suit']):
            elements['subject'] = 'man in suit'
            elements['clothing'] = 'suit'
    elif mentions(['woman', 'female', 'lady']):
        elements['subject'] = 'woman'
        elements['has_person'] = True
        if mentions(['dress']):
            elements['subject'] = 'woman in dress'
            elements['clothing'] = 'dress'

    # Setting keywords stay substring-based on purpose so that compounds such
    # as "outdoors", "bedroom" or "streets" still count.
    if any(word in text for word in ['outdoor', 'outside', 'street', 'nature', 'park']):
        elements['setting'] = 'outdoor'
    elif any(word in text for word in ['indoor', 'inside', 'room', 'office', 'studio']):
        elements['setting'] = 'indoor'
    elif any(word in text for word in ['stage', 'performance']):
        elements['setting'] = 'performance'

    color_words = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown',
                   'pink', 'purple', 'orange', 'gold', 'silver']
    elements['colors'] = [color for color in color_words if mentions([color])]

    objects = ['hat', 'cape', 'flag', 'chair', 'table', 'background', 'wall']
    elements['objects'] = [obj for obj in objects if mentions([obj])]
    return elements
def determine_emotional_tone(text, scene_elements):
    """Pick an emotional-tone label for the scene.

    Keyword groups are tried in a fixed order against the lower-cased text;
    the first group with a substring hit decides the tone. When nothing
    matches, the tone is inferred from the scene's setting and clothing.

    Args:
        text: Caption text; any non-string value is treated as "".
        scene_elements: Dict providing 'setting' and 'clothing' (only read
            when no keyword matches).

    Returns:
        One of 'serious', 'cheerful', 'dramatic', 'elegant', 'theatrical',
        'professional' or 'neutral'.
    """
    lowered = text.lower() if isinstance(text, str) else ""

    keyword_tones = (
        (('serious', 'formal', 'stern', 'professional'), 'serious'),
        (('happy', 'smiling', 'cheerful', 'joyful'), 'cheerful'),
        (('dramatic', 'intense', 'powerful', 'bold'), 'dramatic'),
        (('elegant', 'graceful', 'refined'), 'elegant'),
        (('costume', 'performance'), 'theatrical'),
    )
    for keywords, tone in keyword_tones:
        for keyword in keywords:
            if keyword in lowered:
                return tone

    # No textual cue: fall back to scene context.
    if scene_elements['setting'] == 'performance':
        return 'theatrical'
    if scene_elements['clothing'] in ['suit', 'formal']:
        return 'professional'
    return 'neutral'
def determine_visual_style(text, scene_elements, composition):
    """Choose a visual-style label from the caption, scene and composition.

    Args:
        text: Caption text; any non-string value is treated as "".
        scene_elements: Dict providing 'setting' and 'colors'.
        composition: Composition label of the image.

    Returns:
        'theatrical', 'professional', 'dramatic' or 'cinematic' (the default).
    """
    lowered = text.lower() if isinstance(text, str) else ""
    setting = scene_elements['setting']

    if setting == 'performance' or 'costume' in lowered:
        return 'theatrical'
    if setting == 'indoor' and 'formal' in lowered:
        return 'professional'
    if composition == 'Wide landscape shot':
        return 'cinematic'

    # Certain palette entries push the style towards 'dramatic'.
    palette = scene_elements['colors']
    for accent in ('red', 'gold', 'dramatic'):
        if accent in palette:
            return 'dramatic'
    return 'cinematic'
def identify_distinctive_elements(text):
    """List distinctive visual elements mentioned in a caption.

    Keywords are matched as whole words (an optional plural "s"/"es" is
    tolerated). Substring tests would report a 'flowing cape' for
    "landscape" and a 'distinctive hat' for any caption containing "that".

    Args:
        text: Caption text; any non-string value is treated as "".

    Returns:
        A list of descriptive labels, or ``['natural elements']`` when
        nothing distinctive is found.
    """
    if not isinstance(text, str):
        text = ""
    text_lower = text.lower()

    def has_word(word):
        """Return True when `word` occurs as a whole word in the caption."""
        pattern = r"\b" + re.escape(word) + r"(?:e?s)?\b"
        return re.search(pattern, text_lower) is not None

    # Clothing and costume elements, reported in this fixed order.
    keyword_labels = (
        ('costume', 'elaborate costume'),
        ('cape', 'flowing cape'),
        ('hat', 'distinctive hat'),
        ('flag', 'flag detail'),
    )
    elements = [label for word, label in keyword_labels if has_word(word)]

    # Color elements are merged into a single label.
    found_colors = [color for color in ('red', 'blue', 'green', 'gold') if has_word(color)]
    if found_colors:
        elements.append(f"{', '.join(found_colors)} coloring")

    # Setting elements
    if has_word('background'):
        elements.append('detailed background')

    return elements if elements else ['natural elements']
def assess_motion_potential(text, scene_elements):
    """List the kinds of motion that suit the scene.

    Args:
        text: Caption text. Accepted for signature compatibility; the result
            depends only on `scene_elements`.
        scene_elements: Dict providing 'has_person', 'clothing' and 'setting'.

    Returns:
        A list of motion categories, possibly empty.
    """
    ideas = []

    if scene_elements['has_person']:
        ideas += ['facial expressions', 'hand gestures', 'body movement']

    # Clothing-specific motion, checked in a fixed order.
    clothing_motion = (
        ('costume', 'costume dynamics'),
        ('cape', 'cape flow'),
        ('dress', 'fabric movement'),
    )
    for garment, motion in clothing_motion:
        if scene_elements['clothing'] == garment:
            ideas.append(motion)

    setting = scene_elements['setting']
    if setting == 'outdoor':
        ideas += ['environmental effects', 'natural lighting changes']
    elif setting == 'indoor':
        ideas += ['controlled lighting', 'subtle environment shifts']

    return ideas
def analyze_cinematic_potential(text, composition, aspect_ratio):
    """Describe the cinematic qualities suggested by composition and caption.

    Args:
        text: Caption text; any non-string value is treated as "".
        composition: Composition label of the image.
        aspect_ratio: Accepted for signature compatibility; not consulted.

    Returns:
        A list holding three framing qualities followed by any content-based
        extras.
    """
    lowered = text.lower() if isinstance(text, str) else ""

    # Framing qualities by composition; anything unrecognised is "balanced".
    framing = {
        'Wide landscape shot': ['horizontal camera movements', 'panoramic reveals', 'environmental context'],
        'Vertical portrait shot': ['character focus', 'intimate framing', 'vertical movement'],
    }
    balanced = ['balanced framing', 'versatile movement', 'centered composition']
    qualities = list(balanced)
    for label, framing_qualities in framing.items():
        if composition == label:
            qualities = list(framing_qualities)
            break

    # Content-based extras (substring checks on the caption).
    if 'costume' in lowered or 'dramatic' in lowered:
        qualities.append('dramatic lighting potential')
    if any(cue in lowered for cue in ('red', 'gold', 'rich')):
        qualities.append('color enhancement opportunities')

    return qualities
def create_enhanced_description(basic_caption, scene_elements, emotional_tone):
    """Compose a one-sentence scene description from the analysed elements.

    Fixes over the earlier version: the indefinite article now agrees with
    the following word ("An elegant scene", "in an indoor setting"), and the
    "wearing ..." clause is skipped when the subject text already names the
    clothing (e.g. subject "man in suit" with clothing "suit").

    Args:
        basic_caption: Original caption. Accepted for signature
            compatibility; not consulted.
        scene_elements: Dict providing 'subject', 'setting', 'clothing' and
            'colors'.
        emotional_tone: Tone label placed at the start of the sentence.

    Returns:
        The description string.
    """
    def with_article(word, capitalize=False):
        """Prefix `word` with "a"/"an" chosen from its first letter."""
        starts_with_vowel = str(word).lower().startswith(('a', 'e', 'i', 'o', 'u'))
        article = "an" if starts_with_vowel else "a"
        if capitalize:
            article = article.capitalize()
        return f"{article} {word}"

    subject = scene_elements['subject']
    setting = scene_elements['setting']
    clothing = scene_elements['clothing']

    enhanced = f"{with_article(emotional_tone, capitalize=True)} scene featuring {subject}"
    # Avoid "man in suit wearing suit" when the subject already names the garment.
    if clothing and str(clothing) not in str(subject):
        enhanced += f" wearing {clothing}"
    enhanced += f" in {with_article(setting)} setting"
    if scene_elements['colors']:
        enhanced += f" with prominent {', '.join(scene_elements['colors'])} elements"
    return enhanced
def generate_motion_insights(scene_elements, emotional_tone, visual_style, composition):
    """Build a short list of motion and cinematography tips for the scene.

    Tips come, in order, from: the emotional tone and clothing (only when a
    person is present), the composition, the visual style, and the colours.

    Args:
        scene_elements: Dict providing 'has_person', 'colors' and, when a
            person is present, 'clothing'.
        emotional_tone: Tone label of the scene.
        visual_style: Style label of the scene.
        composition: Composition label of the image.

    Returns:
        At most six tip strings.
    """
    tone_tips = {
        'dramatic': 'Emphasize powerful gestures and dynamic poses',
        'elegant': 'Focus on graceful, refined movements',
        'theatrical': 'Capture performance-style expressions and gestures',
    }
    clothing_tips = {
        'costume': 'Highlight costume details with movement',
        'cape': 'Showcase cape flow and dramatic movement',
        'dress': 'Capture fabric dynamics and elegant motion',
    }
    framing_tips = {
        'Wide landscape shot': 'Utilize horizontal camera movements and wide reveals',
        'Vertical portrait shot': 'Focus on vertical movement and character detail',
    }
    style_tips = {
        'cinematic': 'Use cinematic camera techniques and dramatic lighting',
        'dramatic': 'Emphasize bold movements and high contrast lighting',
        'professional': 'Maintain clean, controlled camera work',
    }

    # Unknown labels yield None and are filtered out at the end.
    candidates = []
    if scene_elements['has_person']:
        candidates.append(tone_tips.get(emotional_tone))
        worn = scene_elements['clothing']
        if worn:
            candidates.append(clothing_tips.get(worn))
    candidates.append(framing_tips.get(composition))
    candidates.append(style_tips.get(visual_style))

    palette = scene_elements['colors']
    if palette:
        candidates.append(f"Enhance {', '.join(palette)} tones through lighting")

    tips = [tip for tip in candidates if tip is not None]
    return tips[:6]
def recommend_approach(scene_elements, emotional_tone, visual_style):
    """Recommend a prompting approach (SARA, Gen-4 or both) for the scene.

    Args:
        scene_elements: Dict providing 'has_person' and, when a person is
            present, 'clothing'.
        emotional_tone: Tone label of the scene.
        visual_style: Style label of the scene.

    Returns:
        A one-sentence recommendation string.
    """
    # People in costume/formal wear: precise character control.
    dressed_up = scene_elements['has_person'] and scene_elements['clothing'] in ['costume', 'suit', 'dress']
    if dressed_up:
        return "SARA Framework recommended for precise character and costume control"

    # Dramatic or theatrical scenes.
    if emotional_tone in ('dramatic', 'theatrical'):
        return "SARA Framework ideal for complex dramatic scenes with multiple elements"

    # Simple, natural scenes.
    if emotional_tone in ('neutral', 'peaceful') and visual_style != 'dramatic':
        return "Gen-4 method perfect for natural, iterative scene building"

    # Professional or formal contexts.
    if 'professional' in (emotional_tone, visual_style):
        return "Either approach works - SARA for precision, Gen-4 for simplicity"

    return "Start with Gen-4 for base prompt, then refine with SARA for complexity"
def generate_motion_suggestions(description, aspect_ratio):
    """Suggest motion ideas from a description and the image aspect ratio.

    Args:
        description: Scene description; matched case-insensitively by
            substring.
        aspect_ratio: Width divided by height of the image.

    Returns:
        Up to six suggestions, or three generic starter tips when nothing in
        the description or aspect ratio triggers a suggestion.
    """
    lowered = description.lower()

    # (trigger keywords, suggestions added when any keyword appears)
    keyword_tips = (
        (('person', 'woman', 'man', 'people'), [
            'Focus on character expressions and gestures',
            'Use "the subject" or "the woman/man" for clarity',
            'Consider handheld camera for natural movement',
        ]),
        (('sitting', 'standing'), [
            'Start with simple movements: speaking, gesturing',
            'Locked or steady camera works well for portraits',
        ]),
        (('outdoor', 'landscape', 'nature'), [
            'Camera movement can explore the environment',
            'Consider environmental motion: wind, clouds',
            'Cinematic style complements outdoor scenes',
        ]),
        (('indoor', 'room'), [
            'Controlled movements work best indoors',
            'Focus on subject motion within the space',
        ]),
    )

    collected = []
    for keywords, tips in keyword_tips:
        if any(keyword in lowered for keyword in keywords):
            collected += tips

    # Composition-based suggestions
    if aspect_ratio > 1.5:
        collected.append('Wide format perfect for horizontal camera movements')
    elif aspect_ratio < 0.8:
        collected.append('Portrait format ideal for character-focused content')

    if not collected:
        return [
            'Start with simple motion: "The subject moves"',
            'Add camera movement: "Camera follows naturally"',
            'Include environment: "Background remains steady"',
        ]
    return collected[:6]
def get_recommended_approach(description):
    """Recommend a prompting approach from keywords in the description.

    Args:
        description: Scene description; matched case-insensitively by
            substring.

    Returns:
        A recommendation string: SARA for people, Gen-4 for environments,
        otherwise a suggestion to try both.
    """
    lowered = description.lower()
    rules = (
        (('person', 'woman', 'man'), "SARA Framework recommended for character precision"),
        (('landscape', 'building', 'nature'), "Gen-4 method works well for environmental scenes"),
    )
    for keywords, recommendation in rules:
        for keyword in keywords:
            if keyword in lowered:
                return recommendation
    return "Try both approaches - start with Gen-4, refine with SARA"
def detect_setting(description):
    """Classify a description as 'outdoor', 'indoor' or 'neutral'.

    Outdoor keywords are checked before indoor ones; matching is by
    case-insensitive substring.
    """
    lowered = description.lower()
    cues = (
        ('outdoor', ('outdoor', 'outside', 'street', 'nature')),
        ('indoor', ('indoor', 'inside', 'room', 'building')),
    )
    for label, keywords in cues:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return 'neutral'
def extract_specific_details(description):
    """Extract colours, clothing, main subject and setting clues from a description.

    Colours, clothing and subject nouns are matched as whole words (an
    optional plural "s"/"es" is tolerated). Substring tests would read
    "woman" as 'man', "that" as 'hat', "landscape" as 'cape' and "covered"
    as 'red'.

    Args:
        description: Scene description; any non-string value is treated as "".

    Returns:
        A dict with:
          - 'colors': comma-separated string of found colours, or [] when
            none are found (the mixed type is kept for existing callers)
          - 'clothing' / 'distinctive_feature': str or None
          - 'main_object': 'man', 'woman', 'person', 'people' or None
          - 'setting_clues': list of matched setting keywords
    """
    text = description.lower() if isinstance(description, str) else ""

    def has_word(word):
        """Return True when `word` occurs as a whole word in the description."""
        pattern = r"\b" + re.escape(word) + r"(?:e?s)?\b"
        return re.search(pattern, text) is not None

    details = {
        'colors': [],
        'clothing': None,
        'distinctive_feature': None,
        'main_object': None,
        'setting_clues': [],
    }

    # Extract colors
    colors = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'pink', 'purple', 'orange']
    found_colors = [color for color in colors if has_word(color)]
    if found_colors:
        details['colors'] = ', '.join(found_colors)

    # Extract clothing/costume details. Precedence is unchanged from before:
    # a red cape wins, then any hat, then the first listed item that appears.
    clothing_items = ['cape', 'hat', 'dress', 'suit', 'shirt', 'coat', 'jacket', 'uniform', 'costume', 'robe']
    worn = [item for item in clothing_items if has_word(item)]
    if worn:
        if has_word('red cape'):
            details['clothing'] = 'red cape'
            details['distinctive_feature'] = 'flowing red cape'
        elif 'hat' in worn:
            hat_label = 'red hat' if has_word('red hat') else 'hat'
            details['clothing'] = hat_label
            details['distinctive_feature'] = hat_label
        else:
            details['clothing'] = worn[0]
            details['distinctive_feature'] = worn[0]

    # Extract main subject (first match in this order wins)
    for noun in ('man', 'woman', 'person', 'people'):
        if has_word(noun):
            details['main_object'] = noun
            break

    # Setting clues stay substring-based on purpose so compounds such as
    # "outdoors" or "bedroom" still count.
    setting_indicators = ['outdoor', 'indoor', 'street', 'room', 'building', 'nature', 'park', 'office']
    details['setting_clues'] = [indicator for indicator in setting_indicators if indicator in text]
    return details
def get_contextual_subject(description, details):
    """Return a subject phrase ("The man in the hat", "The woman", ...) for prompts.

    Subject nouns are matched as whole words; a substring test for 'man'
    would also fire on "woman" and "human" and mislabel the subject.

    Args:
        description: Scene description; any non-string value is treated as "".
        details: Dict that may provide 'clothing'; None is treated as empty.

    Returns:
        "The man" / "The woman" (with " in the <clothing>" appended when
        clothing is known), "The person", or "The subject" as the fallback.
    """
    text = description.lower() if isinstance(description, str) else ""
    clothing = (details or {}).get('clothing')

    # 'man' is checked before 'woman', as before.
    for noun in ('man', 'woman'):
        if re.search(r"\b" + noun + r"\b", text):
            if clothing:
                return f"The {noun} in the {clothing}"
            return f"The {noun}"

    if re.search(r"\bperson\b", text):
        return "The person"
    return "The subject"
def get_contextual_actions(description, details):
    """Return the action phrases available for the scene.

    Starts from five generic verbs and extends them with cape/hat actions
    when the clothing mentions those items, then with two extra actions when
    the lower-cased description contains the substring 'man'.

    Args:
        description: Scene description.
        details: Dict that may provide 'clothing'.

    Returns:
        A new list of action phrases.
    """
    actions = ['speaks', 'gestures', 'moves', 'looks', 'turns']

    clothing = details.get('clothing')
    if clothing:
        garment_actions = (
            ('cape', ['adjusts cape', 'moves dramatically', 'gestures with cape flowing']),
            ('hat', ['tips hat', 'adjusts hat', 'nods with hat']),
        )
        for garment, extras in garment_actions:
            if garment in clothing:
                actions += extras

    if 'man' in description.lower():
        actions += ['speaks confidently', 'gestures authoritatively']

    return actions
def get_contextual_adverbs(details):
    """Return adverbs (and adverbial phrases) that suit the scene.

    Args:
        details: Dict that may provide 'clothing'.

    Returns:
        Four generic adverbs, extended with cape- and/or hat-specific ones
        when the clothing mentions those items.
    """
    adverbs = ['naturally', 'smoothly', 'slowly', 'gracefully']

    clothing = details.get('clothing')
    if not clothing:
        return adverbs

    if 'cape' in clothing:
        adverbs += ['dramatically', 'majestically', 'with flair']
    if 'hat' in clothing:
        adverbs += ['elegantly', 'with style', 'confidently']
    return adverbs
def get_contextual_camera_movement(description, details):
    """Return camera-movement phrases that suit the scene.

    Args:
        description: Scene description. Accepted for signature
            compatibility; not consulted.
        details: Dict that may provide 'distinctive_feature'.

    Returns:
        Three generic movements, extended with cape- and/or hat-specific
        ones when the distinctive feature mentions those items.
    """
    movements = ['Camera follows steadily', 'Locked camera captures', 'Handheld camera tracks']

    feature = details.get('distinctive_feature')
    if not feature:
        return movements

    if 'cape' in feature:
        movements += ['Camera captures cape movement', 'Tracking shot follows cape flow']
    if 'hat' in feature:
        movements += ['Camera frames from chest up', 'Close tracking of upper body']
    return movements
def get_contextual_environment(description, details):
    """Return an environmental effect phrase for the scene, or None.

    Red colouring takes precedence over a cape; when neither applies the
    result is None.

    Args:
        description: Scene description. Accepted for signature
            compatibility; not consulted.
        details: Dict that may provide 'colors' and 'clothing'.
    """
    palette = details.get('colors')
    if palette and 'red' in palette:
        return "lighting enhances red tones"

    clothing = details.get('clothing')
    if clothing and 'cape' in clothing:
        return "cape fabric reacts to air movement"

    return None
def get_contextual_style(details):
    """Return a style phrase for the scene based on its clothing.

    A cape takes precedence over a hat; without either, the documentary
    default is returned.

    Args:
        details: Dict that may provide 'clothing'.
    """
    clothing = details.get('clothing')
    if clothing:
        garment_styles = (
            ('cape', "dramatic cinematic style"),
            ('hat', "classic portrait style"),
        )
        for garment, style in garment_styles:
            if garment in clothing:
                return style
    return "professional documentary style"
def get_contextual_atmosphere(details):
    """Return an atmosphere phrase for the scene.

    Precedence: red colouring, then a cape, then a hat, then the generic
    default.

    Args:
        details: Dict that may provide 'colors' and 'clothing'.
    """
    palette = details.get('colors')
    if palette and 'red' in palette:
        return "dramatic atmosphere with rich red tones"

    clothing = details.get('clothing')
    if clothing:
        garment_moods = (
            ('cape', "heroic cinematic atmosphere"),
            ('hat', "elegant portrait atmosphere"),
        )
        for garment, mood in garment_moods:
            if garment in clothing:
                return mood

    return "professional cinematic atmosphere"
def optimize_user_prompt(user_idea, scene_info=None):
    """Turn a free-form user idea into a structured video prompt.

    Args:
        user_idea: The user's idea text. None, non-string and blank values
            are all treated as "nothing entered" (previously None raised
            AttributeError before the error handling was reached).
        scene_info: Optional scene-analysis dict passed on to prompt creation.

    Returns:
        The optimized prompt, a request to enter an idea, or an error
        message when optimization fails.
    """
    if not isinstance(user_idea, str) or not user_idea.strip():
        return "Please enter your idea first."
    try:
        idea = user_idea.strip()
        # Analyse the idea, then build the prompt from it.
        analysis = analyze_user_idea(idea)
        return create_optimized_prompt(idea, analysis, scene_info)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error optimizing prompt: {str(e)}"
def analyze_user_idea(idea):
    """Analyse a user's idea for language, content flags and complexity.

    Keywords are matched at word boundaries instead of as raw substrings,
    so 'hat' no longer fires on "that" nor 'fast' on "breakfast". The
    accented keywords were previously stored as mis-decoded UTF-8 (for
    example 'mΓ‘scara') and could never match real input; they are restored.

    Args:
        idea: The user's idea text.

    Returns:
        A dict with 'language', the booleans 'has_action', 'has_object',
        'has_emotion' and 'has_camera', 'complexity' ('simple', 'medium' or
        'complex') and 'main_elements' (always an empty list here).
    """
    idea_lower = idea.lower()

    def mentions(terms, tail=r"\b"):
        """Return True when any term starts at a word boundary, followed by `tail`."""
        return any(
            re.search(r"\b" + re.escape(term) + tail, idea_lower) is not None
            for term in terms
        )

    # Action verbs, grouped by language for readability only.
    action_words = {
        'en': ['removes', 'takes off', 'puts on', 'walks', 'runs', 'speaks', 'gestures', 'moves', 'turns', 'looks'],
        'es': ['quita', 'se quita', 'pone', 'camina', 'corre', 'habla', 'gesticula', 'mueve', 'gira', 'mira'],
        'fr': ['enlève', 'met', 'marche', 'court', 'parle', 'gesticule', 'bouge'],
        'de': ['nimmt ab', 'zieht aus', 'geht', 'läuft', 'spricht', 'bewegt'],
    }
    object_words = ['nose', 'nariz', 'hat', 'sombrero', 'costume', 'traje', 'cape', 'capa', 'mask', 'máscara']
    emotion_words = ['dramatic', 'dramático', 'slow', 'lento', 'fast', 'rápido', 'gentle', 'suave', 'powerful', 'poderoso']
    camera_words = ['camera', 'cámara', 'shot', 'toma', 'angle', 'ángulo', 'close', 'cerca', 'wide', 'amplio']

    analysis = {
        'language': detect_language(idea),
        # Verbs must match as whole words/phrases.
        'has_action': any(mentions(verbs) for verbs in action_words.values()),
        # Nouns may carry a plural ending ("hats", "masks").
        'has_object': mentions(object_words, tail=r"(?:e?s)?\b"),
        # Prefix match so derived forms count ("slowly", "close-up", "wider").
        'has_emotion': mentions(emotion_words, tail=""),
        'has_camera': mentions(camera_words, tail=""),
        'complexity': 'simple',
        'main_elements': [],
    }

    # Determine complexity from how many kinds of content are present.
    element_count = sum([
        analysis['has_action'],
        analysis['has_object'],
        analysis['has_emotion'],
        analysis['has_camera'],
    ])
    if element_count >= 3:
        analysis['complexity'] = 'complex'
    elif element_count >= 2:
        analysis['complexity'] = 'medium'
    return analysis
def detect_language(text):
    """Guess the language of `text` from a few indicator words.

    Indicators are matched as whole words and the language with the most
    hits wins; ties go to the earlier language in the order Spanish, French,
    German. Previously indicators were raw substrings ('el' inside "hello",
    'se' inside "nose"), which labelled most English text as Spanish, and
    the first language with any hit won even with more evidence for another.

    Args:
        text: Text to inspect; any non-string value is treated as "".

    Returns:
        'spanish', 'french', 'german', or 'english' when nothing matches.
    """
    indicators_by_language = (
        ('spanish', ['el', 'la', 'se', 'que', 'con', 'por', 'para', 'del', 'de la', 'nariz', 'payaso']),
        ('french', ['le', 'la', 'se', 'que', 'avec', 'pour', 'du', 'de la', 'nez', 'clown']),
        ('german', ['der', 'die', 'das', 'sich', 'mit', 'für', 'vom', 'nase', 'clown']),
    )
    text_lower = text.lower() if isinstance(text, str) else ""

    best_language, best_hits = 'english', 0
    for language, indicators in indicators_by_language:
        hits = sum(
            1
            for indicator in indicators
            if re.search(r"\b" + re.escape(indicator) + r"\b", text_lower)
        )
        # Strict '>' keeps the earlier language on ties.
        if hits > best_hits:
            best_language, best_hits = language, hits
    return best_language
def create_optimized_prompt(idea, analysis, scene_info=None):
    """Create an optimized English video prompt from a user idea.

    Known Spanish/French words are replaced with their English equivalents
    and the result is passed to ``structure_video_prompt``.

    Replacement is now done on whole words, case-insensitively, longest
    phrase first. The earlier ``str.replace`` also rewrote the inside of
    unrelated words ('met' in "metal", 'capa' in "capable") and skipped
    capitalised input. Accented glossary keys that were stored as
    mis-decoded UTF-8 are restored.

    Args:
        idea: The user's idea text.
        analysis: Dict providing 'language' (as from ``analyze_user_idea``).
        scene_info: Optional scene-analysis dict, passed through.

    Returns:
        The structured prompt string.
    """
    # Translation dictionary for common elements
    translations = {
        'spanish': {
            'se quita': 'removes',
            'quita': 'removes',
            'pone': 'puts on',
            'camina': 'walks',
            'habla': 'speaks',
            'mueve': 'moves',
            'nariz': 'nose',
            'payaso': 'clown',
            'personaje': 'character',
            'sombrero': 'hat',
            'capa': 'cape',
            'lentamente': 'slowly',
            'rápidamente': 'quickly',
            'dramáticamente': 'dramatically',
        },
        'french': {
            'enlève': 'removes',
            'met': 'puts on',
            'marche': 'walks',
            'parle': 'speaks',
            'bouge': 'moves',
            'nez': 'nose',
            'clown': 'clown',
            'personnage': 'character',
            'chapeau': 'hat',
            'cape': 'cape',
        },
    }

    optimized_idea = idea
    glossary = translations.get(analysis['language'], {})
    # Longest phrases first so 'se quita' is consumed before 'quita'.
    for original in sorted(glossary, key=len, reverse=True):
        english = glossary[original]
        optimized_idea = re.sub(
            r"\b" + re.escape(original) + r"\b",
            lambda _match, english=english: english,
            optimized_idea,
            flags=re.IGNORECASE,
        )

    # Structure the prompt professionally
    return structure_video_prompt(optimized_idea, analysis, scene_info)
def structure_video_prompt(idea, analysis, scene_info=None):
    """Assemble a video prompt from an idea, its analysis and optional scene info.

    Args:
        idea: The (possibly translated) idea text.
        analysis: Dict providing 'complexity' and 'has_emotion'.
        scene_info: Optional scene-analysis dict; its 'has_person' flag
            selects a scene-derived subject reference.

    Returns:
        The prompt string after the technical-language pass.
    """
    lowered = idea.lower()

    # Identify the subject: explicit wording first, then scene context.
    if 'character' in lowered or 'personaje' in lowered:
        subject = "The character"
    elif 'person' in lowered or 'persona' in lowered:
        subject = "The person"
    elif scene_info and scene_info.get('has_person'):
        subject = extract_intelligent_subject_reference(scene_info)
    else:
        subject = "The subject"

    action = extract_action_from_idea(idea)
    complexity = analysis['complexity']

    if complexity == 'simple':
        # Subject + action, plus a camera suggestion.
        parts = [f"{subject} {action} naturally", ". Camera captures the motion smoothly"]
    elif complexity == 'medium':
        # Add camera tracking and a lighting clause.
        lighting = (
            ", dramatic lighting enhances the mood"
            if analysis['has_emotion']
            else ", professional lighting"
        )
        parts = [f"{subject} {action} while camera follows steadily", lighting]
    else:
        # Full structure with environment and atmosphere.
        parts = [
            f"{subject} {action} expressively while camera tracks the motion",
            ", lighting and environment support the action, cinematic atmosphere",
        ]

    return improve_technical_language("".join(parts))
def extract_action_from_idea(idea):
    """Extract the main action (and, for removals, its object) from an idea.

    The idea is scanned, case-insensitively, for the first known action
    phrase (English or Spanish) in the order listed below; the phrase is
    mapped to a canonical English verb.  For removal verbs the text is also
    searched for a known object (nose, hat, mask) which is appended to the
    verb.  When nothing matches, ``"moves"`` is returned.

    Args:
        idea: Free-form description typed by the user.

    Returns:
        A verb phrase such as ``"walks"`` or ``"removes the clown nose"``.
    """
    idea_lower = idea.lower()

    # Phrase -> canonical verb.  Order matters: the first phrase found wins.
    action_mappings = {
        'removes': 'removes',
        'quita': 'removes',
        'se quita': 'removes',
        'takes off': 'removes',
        'puts on': 'puts on',
        'pone': 'puts on',
        'walks': 'walks',
        'camina': 'walks',
        'speaks': 'speaks',
        'habla': 'speaks',
        'moves': 'moves',
        'mueve': 'moves',
        'turns': 'turns',
        'gira': 'turns',
        'looks': 'looks',
        'mira': 'looks',
    }
    removal_phrases = ('removes', 'quita', 'se quita', 'takes off')

    def mentions(*words):
        return any(word in idea_lower for word in words)

    for phrase, canonical in action_mappings.items():
        if phrase not in idea_lower:
            continue

        object_part = ""
        if phrase in removal_phrases:
            # Work out what is being removed.
            if mentions('nose', 'nariz'):
                if mentions('clown', 'payaso'):
                    object_part = "the clown nose"
                else:
                    object_part = "the nose piece"
            elif mentions('hat', 'sombrero'):
                object_part = "the hat"
            # 'máscara' was previously stored as mis-decoded text and could
            # never match what a user actually types.
            elif mentions('mask', 'máscara'):
                object_part = "the mask"

        return f"{canonical} {object_part}" if object_part else canonical

    return "moves"  # default when no known action is present
def improve_technical_language(prompt):
    """Swap plain phrases in *prompt* for more polished video terminology.

    Substitutions are case-sensitive substring replacements, applied one
    after another in the order listed.
    """
    phrase_upgrades = (
        ('moves naturally', 'moves with natural grace'),
        ('Camera captures', 'Camera captures'),  # identity entry: text is left as-is
        ('smoothly', 'with smooth motion'),
        ('follows steadily', 'follows with steady tracking'),
        ('dramatic lighting', 'dramatic lighting transitions'),
        ('professional lighting', 'professional lighting setup'),
        ('cinematic atmosphere', 'rich cinematic atmosphere'),
    )

    polished = prompt
    for plain, upgraded in phrase_upgrades:
        polished = polished.replace(plain, upgraded)
    return polished
def refine_prompt_with_feedback(current_prompt, feedback, chat_history, scene_info=None):
    """Refine *current_prompt* according to the user's *feedback*.

    Args:
        current_prompt: The prompt to refine.
        feedback: Free-form user feedback.  ``None``, empty or
            whitespace-only feedback leaves everything untouched.
        chat_history: Sequence of ``[user_message, bot_message]`` pairs, or
            ``None`` for an empty conversation.  It is never mutated.
        scene_info: Optional scene-analysis dict forwarded to the helpers.

    Returns:
        ``(refined_prompt, new_chat_history)``.  When there is no usable
        feedback the inputs are returned unchanged.
    """
    # Guard against None as well as blank text so an empty UI field cannot crash.
    if not feedback or not feedback.strip():
        return current_prompt, chat_history

    refinement_analysis = analyze_refinement_request(feedback, current_prompt, scene_info)
    refined_prompt = apply_intelligent_refinement(current_prompt, refinement_analysis, scene_info)
    explanation = create_refinement_explanation(refinement_analysis, current_prompt, refined_prompt)

    reply = f"🤖 {explanation}\n\n✨ **Refined Prompt**: {refined_prompt}"

    # Build a new list so the caller's history object is not modified; a
    # missing (None) history is treated as an empty conversation.
    new_chat_history = list(chat_history or []) + [[feedback, reply]]
    return refined_prompt, new_chat_history
def analyze_refinement_request(feedback, current_prompt, scene_info):
    """Classify what kind of change the user's feedback is asking for.

    The feedback is matched, case-insensitively and by substring, against
    keyword groups in priority order; the first group with a hit decides the
    request type.  Keywords cover English, Spanish, French and German.

    Args:
        feedback: The user's feedback text.
        current_prompt: Unused; kept for interface compatibility.
        scene_info: Unused; kept for interface compatibility.

    Returns:
        A dict with keys ``request_type``, ``intensity``, ``focus_area``,
        ``style_preference``, ``specific_elements`` and ``language``.
    """
    feedback_lower = feedback.lower()

    analysis = {
        'request_type': 'general',
        'intensity': 'moderate',
        'focus_area': 'action',
        'style_preference': None,
        'specific_elements': [],
        'language': detect_language(feedback),
    }

    # (keywords, fields to set) in priority order.  Accented keywords are
    # written as real Unicode text; they were previously stored mis-decoded
    # and therefore never matched user input.
    rules = (
        (('dramatic', 'dramático', 'dramatique', 'dramatisch'),
         {'request_type': 'dramatic', 'intensity': 'high'}),
        (('slow', 'slower', 'lento', 'más lento', 'lentement'),
         {'request_type': 'pace', 'intensity': 'slow'}),
        (('fast', 'faster', 'rápido', 'más rápido', 'rapide'),
         {'request_type': 'pace', 'intensity': 'fast'}),
        (('camera', 'cámara', 'caméra', 'kamera'),
         {'request_type': 'camera', 'focus_area': 'cinematography'}),
        (('lighting', 'light', 'luz', 'lumière', 'licht'),
         {'request_type': 'lighting', 'focus_area': 'atmosphere'}),
        (('simple', 'simpler', 'más simple', 'plus simple'),
         {'request_type': 'simplify', 'intensity': 'low'}),
        (('complex', 'complicated', 'detalle', 'detail', 'détail'),
         {'request_type': 'elaborate', 'intensity': 'high'}),
        (('elegant', 'elegante', 'élégant'),
         {'request_type': 'style', 'style_preference': 'elegant'}),
        (('powerful', 'poderoso', 'puissant'),
         {'request_type': 'style', 'style_preference': 'powerful'}),
        (('natural', 'naturel'),
         {'request_type': 'style', 'style_preference': 'natural'}),
    )
    for keywords, updates in rules:
        if any(keyword in feedback_lower for keyword in keywords):
            analysis.update(updates)
            break

    # Record which concrete scene elements the user mentioned.
    elements = ['costume', 'dress', 'cape', 'hat', 'background', 'face', 'hands', 'movement']
    analysis['specific_elements'].extend(
        element for element in elements if element in feedback_lower
    )

    return analysis
def apply_intelligent_refinement(current_prompt, analysis, scene_info):
    """Route the prompt to the refinement helper for ``analysis['request_type']``.

    Unrecognised request types fall back to ``apply_general_enhancement``.
    Returns the refined prompt produced by the selected helper.
    """
    # Each entry defers the call so only the selected helper actually runs.
    handlers = {
        'dramatic': lambda: enhance_dramatic_elements(current_prompt, analysis, scene_info),
        'pace': lambda: adjust_pace(current_prompt, analysis),
        'camera': lambda: enhance_camera_work(current_prompt, analysis, scene_info),
        'lighting': lambda: enhance_lighting(current_prompt, analysis, scene_info),
        'simplify': lambda: simplify_prompt(current_prompt),
        'elaborate': lambda: elaborate_prompt(current_prompt, scene_info),
        'style': lambda: apply_style_preference(current_prompt, analysis, scene_info),
    }

    def general():
        return apply_general_enhancement(current_prompt, analysis, scene_info)

    return handlers.get(analysis['request_type'], general)()
def enhance_dramatic_elements(prompt, analysis, scene_info):
    """Rewrite *prompt* with more dramatic wording.

    Gentle words and phrases are replaced by dramatic equivalents, extra
    sentences are appended when the scene's distinctive elements mention a
    costume or cape, and a leading "Camera captures" becomes "Dynamic camera
    captures".

    Args:
        prompt: Prompt text to enhance.
        analysis: Refinement analysis (unused; kept for interface parity).
        scene_info: Optional scene dict; ``distinctive_elements`` is read.

    Returns:
        The enhanced prompt string.
    """
    import re  # local import: the module does not import ``re`` at file level

    dramatic_replacements = {
        'naturally': 'dramatically with intensity',
        'smoothly': 'with powerful emphasis',
        'gently': 'boldly',
        'moves': 'commands attention',
        'speaks': 'declares passionately',
        'gestures': 'gestures with commanding presence',
        'professional lighting': 'dramatic lighting with stark contrasts',
        'cinematic lighting': 'theatrical lighting with deep shadows',
    }

    # Match whole words/phrases only.  Plain substring replacement turned
    # "removes" into "recommands attention" because it contains "moves".
    # A single pass also guarantees replacement text is never re-scanned.
    whole_phrase = re.compile(
        r'(?<!\w)(?:'
        + '|'.join(re.escape(phrase) for phrase in dramatic_replacements)
        + r')(?!\w)'
    )
    enhanced = whole_phrase.sub(
        lambda match: dramatic_replacements[match.group(0)], prompt
    )

    # Add dramatic elements based on scene context.
    if scene_info and scene_info.get('distinctive_elements'):
        elements_text = str(scene_info['distinctive_elements'])
        if 'costume' in elements_text:
            enhanced += ". Costume elements amplify the dramatic presence"
        if 'cape' in elements_text:
            enhanced += ". Cape billows dramatically with movement"

    # Enhance camera work for drama.
    return enhanced.replace('Camera captures', 'Dynamic camera captures')
def adjust_pace(prompt, analysis):
    """Slow down or speed up the wording of *prompt*.

    Args:
        prompt: Prompt text to adjust.
        analysis: Dict whose ``'intensity'`` is ``'slow'`` for a slower pace;
            any other value selects the fast vocabulary.

    Returns:
        The prompt with pace-related words replaced.
    """
    import re  # local import: the module does not import ``re`` at file level

    if analysis['intensity'] == 'slow':
        pace_replacements = {
            'naturally': 'slowly and deliberately',
            'smoothly': 'in measured slow motion',
            'moves': 'moves with deliberate slowness',
            'speaks': 'speaks thoughtfully',
            'gestures': 'gestures with careful precision',
        }
    else:  # fast
        pace_replacements = {
            'naturally': 'with energetic quickness',
            'slowly': 'rapidly',
            'smoothly': 'with swift fluidity',
            'deliberate': 'rapid',
            'measured': 'quick',
        }

    # A phrase must begin at the start of a word, so "removes" is no longer
    # rewritten through its embedded "moves".  Suffixed forms such as
    # "deliberately" still match via their stem, as they did before.
    word_start = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(key) for key in pace_replacements) + r')'
    )
    return word_start.sub(lambda match: pace_replacements[match.group(0)], prompt)
def enhance_camera_work(prompt, analysis, scene_info):
    """Upgrade camera phrasing in *prompt* and add scene-specific framing.

    Basic camera phrases are replaced in order; a framing sentence is added
    when the scene's ``composition`` mentions "Wide" or "Portrait"; and if
    the result still never mentions a camera, a generic camera sentence is
    appended.  *analysis* is accepted but not used.
    """
    camera_upgrades = (
        ('Camera captures', 'Dynamic camera work captures'),
        ('camera follows', 'cinematic camera tracks'),
        ('handheld camera', 'fluid handheld camera movement'),
        ('steady camera', 'precision camera operation'),
        ('locked camera', 'artistically locked camera'),
    )

    result = prompt
    for plain, upgraded in camera_upgrades:
        result = result.replace(plain, upgraded)

    # Scene-specific technique; an absent scene behaves like an empty composition.
    framing = scene_info.get('composition', '') if scene_info else ''
    if 'Wide' in framing:
        result += ". Wide tracking shots reveal environmental context"
    elif 'Portrait' in framing:
        result += ". Intimate camera framing emphasizes character details"

    # Guarantee at least one camera direction in the final text.
    if 'camera' not in result.lower():
        result += ". Sophisticated camera movement enhances the narrative"

    return result
def enhance_lighting(prompt, analysis, scene_info):
    """Upgrade lighting phrasing in *prompt* and add tone-specific lighting.

    Known lighting phrases are replaced in order; a sentence matching the
    scene's ``emotional_tone`` (dramatic / elegant / theatrical) is appended
    when available; and if the result still never mentions light, a generic
    lighting sentence is appended.  *analysis* is accepted but not used.
    """
    lighting_upgrades = (
        ('professional lighting', 'artistic lighting design'),
        ('cinematic lighting', 'masterful cinematic lighting'),
        ('dramatic lighting', 'sculptural dramatic lighting'),
        ('natural lighting', 'beautiful natural light'),
    )

    result = prompt
    for plain, upgraded in lighting_upgrades:
        result = result.replace(plain, upgraded)

    if scene_info:
        tone = scene_info.get('emotional_tone', 'neutral')
        if tone == 'dramatic':
            result += ". High-contrast lighting creates powerful shadows"
        elif tone == 'elegant':
            result += ". Soft, sophisticated lighting enhances refinement"
        elif tone == 'theatrical':
            result += ". Stage-quality lighting emphasizes performance"

    # "lighting" contains "light", so one check covers both words.
    if 'light' not in result.lower():
        result += ". Expressive lighting design supports the mood"

    return result
def simplify_prompt(prompt):
    """Reduce *prompt* to its first sentence plus at most one simple extra.

    A prompt without any period is returned unchanged.  Otherwise the text
    up to the first period is kept, followed by a short camera note if the
    original mentioned a camera, or else a lighting note if it mentioned
    lighting.
    """
    first_sentence, period, _ = prompt.partition('.')
    if not period:
        # Nothing to cut: the prompt is already a single clause.
        return prompt

    lowered = prompt.lower()
    if 'camera' in lowered:
        extra = " Camera follows naturally."
    elif 'lighting' in lowered:
        extra = " Natural lighting."
    else:
        extra = ""

    return first_sentence + '.' + extra
def elaborate_prompt(prompt, scene_info):
    """Append richer environmental and compositional detail to *prompt*.

    When *scene_info* is provided, one sentence is added for a recognised
    ``setting`` (outdoor / indoor / performance) and one based on the first
    entry of ``distinctive_elements`` (costume or color).  A closing
    composition sentence is always appended.
    """
    extras = []

    if scene_info:
        location = scene_info.get('setting', 'neutral')
        highlights = scene_info.get('distinctive_elements', [])

        if location == 'outdoor':
            extras.append(". Environmental elements respond subtly to the action")
        elif location == 'indoor':
            extras.append(". Interior atmosphere enhances intimate connection")
        elif location == 'performance':
            extras.append(". Stage environment supports theatrical presence")

        # Only the first distinctive element is considered.
        if highlights:
            lead = highlights[0]
            if 'costume' in lead:
                extras.append(". Costume textures and details visible in motion")
            elif 'color' in lead:
                extras.append(". Color palette enhanced through dynamic lighting")

    extras.append(". Multi-layered composition with depth and visual interest")
    return prompt + "".join(extras)
def apply_style_preference(prompt, analysis, scene_info):
    """Re-word *prompt* to match ``analysis['style_preference']``.

    Supported preferences are 'elegant', 'powerful' and 'natural'; each has
    an ordered list of substring replacements.  Any other preference
    (including ``None``) returns the prompt unchanged.  *scene_info* is
    accepted but not used.
    """
    vocabularies = {
        'elegant': (
            ('dramatically', 'with refined elegance'),
            ('boldly', 'gracefully'),
            ('powerfully', 'with sophisticated poise'),
            ('dramatic lighting', 'elegant lighting transitions'),
            ('intensive', 'refined'),
        ),
        'powerful': (
            ('gently', 'with commanding force'),
            ('naturally', 'with authoritative presence'),
            ('smoothly', 'with decisive power'),
            ('professional lighting', 'bold, impactful lighting'),
        ),
        'natural': (
            ('dramatically', 'naturally'),
            ('theatrical', 'authentic'),
            ('commanding', 'genuine'),
            ('dramatic lighting', 'natural lighting'),
        ),
    }

    styled = prompt
    for before, after in vocabularies.get(analysis['style_preference'], ()):
        styled = styled.replace(before, after)
    return styled
def apply_general_enhancement(prompt, analysis, scene_info):
    """Apply mild, general-purpose polish to *prompt*.

    The verbs "moves" and "speaks" gain a descriptive phrase, and a sentence
    naming the scene's emotional tone is appended when the tone is not
    neutral and not already mentioned.  The function is idempotent: running
    it again on its own output changes nothing.

    Args:
        prompt: Prompt text to enhance.
        analysis: Refinement analysis (unused; kept for interface parity).
        scene_info: Optional scene dict; ``emotional_tone`` is read.

    Returns:
        The enhanced prompt string.
    """
    import re  # local import: the module does not import ``re`` at file level

    enhanced = prompt

    # (verb, phrase to attach, adverb that makes the phrase redundant)
    verb_flourishes = (
        ('moves', 'with purposeful grace', 'gracefully'),
        ('speaks', 'with genuine expression', 'expressively'),
    )
    for verb, flourish, redundant_adverb in verb_flourishes:
        if redundant_adverb in enhanced:
            continue
        # Whole-word match keeps "removes" intact; the lookahead skips verbs
        # that already carry the phrase, so repeated refinements no longer
        # stack "with purposeful grace with purposeful grace".
        already_polished = rf'\b{verb}\b(?! {re.escape(flourish)})'
        enhanced = re.sub(already_polished, f'{verb} {flourish}', enhanced)

    # Enhance based on scene context.
    if scene_info:
        emotional_tone = scene_info.get('emotional_tone', 'neutral')
        # Compare case-insensitively: the appended sentence capitalises the
        # tone, which a case-sensitive check would fail to recognise.
        if emotional_tone != 'neutral' and emotional_tone.lower() not in enhanced.lower():
            enhanced += f". {emotional_tone.capitalize()} energy throughout"

    return enhanced
def create_refinement_explanation(analysis, original, refined):
    """Describe, in one sentence, the refinement that was applied.

    The sentence is selected by ``analysis['request_type']``.  For any
    language other than 'english' the sentence is lower-cased and prefixed
    with a short acknowledgement in Spanish, French or German when one is
    available.  *original* and *refined* are accepted but not used.
    """
    pace_description = (
        'slow and deliberate' if analysis['intensity'] == 'slow' else 'energetic and quick'
    )
    chosen_style = analysis['style_preference']

    messages = {
        'dramatic': "I've enhanced the dramatic intensity by upgrading the actions and adding powerful lighting elements.",
        'pace': f"I've adjusted the pacing to be more {pace_description}.",
        'camera': "I've enhanced the camera work with more sophisticated cinematography techniques.",
        'lighting': "I've upgraded the lighting description to create more visual impact.",
        'simplify': "I've simplified the prompt to focus on the essential action.",
        'elaborate': "I've added more sophisticated details and environmental context.",
        'style': f"I've adjusted the style to be more {chosen_style}.",
    }
    message = messages.get(
        analysis['request_type'], "I've enhanced the prompt based on your feedback."
    )

    language = analysis['language']
    if language == 'english':
        return message

    acknowledgements = {
        'spanish': "Entiendo tu sugerencia y ",
        'french': "Je comprends votre suggestion et ",
        'german': "Ich verstehe Ihren Vorschlag und ",
    }
    return acknowledgements.get(language, "") + message.lower()
def generate_gen4_prompts_local(scene_info, user_input=""):
    """Build the four-step Gen-4 prompt progression for the analysed scene.

    Each step extends the previous one: basic action, subject motion, camera
    motion, then scene/style.  Wording is chosen at random from the
    contextual helper lists.  Any exception is converted into an error
    message string instead of being raised.  *user_input* is accepted but
    not used.
    """
    try:
        caption = scene_info.get('description', '')
        person_present = scene_info.get('has_person', False)
        location = scene_info.get('setting', 'neutral')

        # Details extracted from the caption drive all contextual choices.
        details = extract_specific_details(caption)
        subject = get_contextual_subject(caption, details)

        # Step 1: what is in the image, doing one thing.
        if person_present:
            verb = random.choice(get_contextual_actions(caption, details))
            step_basic = f"{subject} {verb} to camera"
        else:
            focus = details.get('main_object', 'main element')
            verb = random.choice(['moves', 'shifts', 'transforms'])
            step_basic = f"The {focus} {verb}"

        # Step 2: how the subject moves.
        adverb = random.choice(get_contextual_adverbs(details))
        step_motion = f"{step_basic} {adverb}"

        # Step 3: what the camera does.
        camera = random.choice(get_contextual_camera_movement(caption, details))
        step_camera = f"{step_motion}. {camera}"

        # Step 4: lighting and style, preferring detected colors.
        if details.get('colors'):
            finish = f"{details['colors']} tones enhanced by lighting. {get_contextual_atmosphere(details)}"
        elif location == 'outdoor':
            finish = "Natural lighting enhances the scene. Cinematic"
        else:
            finish = f"Professional lighting highlights {details.get('distinctive_feature', 'the subject')}. Documentary style"
        step_complete = f"{step_camera}. {finish}"

        return "\n\n".join([
            f"**Basic**: {step_basic}",
            f"**+ Subject Motion**: {step_motion}",
            f"**+ Camera Motion**: {step_camera}",
            f"**+ Scene/Style**: {step_complete}",
        ])
    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"
def build_custom_prompt_local(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Combine the builder fields into one prompt using the chosen approach.

    With ``approach == "SARA"`` the pieces are space-joined as
    foundation, motions (comma-separated), a "while ..." reference and
    style; a default reference is used when no camera motion is given.
    Any other approach produces the Gen-4 form: period-joined pieces in the
    order foundation, subject motion, camera, scene motion, style, with a
    default sentence when everything is empty.
    """
    subject_items = list(subject_motion) if subject_motion else []
    scene_items = list(scene_motion) if scene_motion else []

    if approach != "SARA":
        # Gen-4: simple iterative building, one sentence per element.
        sentences = [foundation] if foundation else []
        sentences += subject_items
        if camera_motion:
            sentences.append(camera_motion)
        sentences += scene_items
        if style:
            sentences.append(style)
        if not sentences:
            return "The subject moves naturally"
        return ". ".join(sentences)

    # SARA: [Subject] [Action] while [Reference], [Atmosphere]
    segments = [foundation] if foundation else []

    motions = subject_items + scene_items
    if motions:
        segments.append(", ".join(motions))

    # Reference anchor: the camera motion, or a steady background by default.
    segments.append(
        f"while {camera_motion}" if camera_motion else "while background remains steady"
    )

    if style:
        segments.append(style)

    return " ".join(segments)
def get_smart_suggestions_local(scene_info):
    """Produce a short list of prompting tips tailored to the analysed scene.

    Args:
        scene_info: Scene-analysis dict (may be ``None`` or empty when no
            image has been analysed yet).

    Returns:
        A newline-separated string of at most ten suggestions, or a message
        asking the user to analyse an image first when no enhanced
        description is available.
    """
    # A missing state is treated like an un-analysed image instead of crashing.
    scene_info = scene_info or {}

    enhanced_description = scene_info.get('enhanced_description', '')
    if not enhanced_description:
        return "Please analyze an image first to generate smart suggestions."

    emotional_tone = scene_info.get('emotional_tone', 'neutral')
    visual_style = scene_info.get('visual_style', 'cinematic')
    distinctive_elements = scene_info.get('distinctive_elements', [])
    motion_potential = scene_info.get('motion_potential', [])
    cinematic_qualities = scene_info.get('cinematic_qualities', [])
    setting = scene_info.get('setting', 'neutral')

    # Scene understanding.
    subject_ref = extract_intelligent_subject_reference(scene_info)
    suggestions = [
        f'🤖 **AI Analysis**: {enhanced_description}',
        f'🎯 **Smart Reference**: Use "{subject_ref}" for optimal clarity',
    ]

    # Tone-based action suggestions.
    actions = generate_tone_appropriate_actions(emotional_tone, scene_info)[:3]
    suggestions.append(f'🎭 **Tone-Matched Actions**: {", ".join(actions)}')

    # Motion potential insights.
    if motion_potential:
        suggestions.append(f'🎬 **Motion Opportunities**: {", ".join(motion_potential[:3])}')

    # Distinctive element highlights.
    if distinctive_elements:
        suggestions.append(f'✨ **Key Elements to Highlight**: {", ".join(distinctive_elements[:2])}')

    # Visual style recommendations.
    style_cameras = generate_style_appropriate_cameras(visual_style, cinematic_qualities)[:2]
    suggestions.append(f'🎥 **Style-Appropriate Cameras**: {", ".join(style_cameras)}')

    # Emotional tone guidance: three random draws, with repeats removed so
    # the same adverb is not listed twice.
    drawn_adverbs = [get_tone_appropriate_adverb(emotional_tone) for _ in range(3)]
    appropriate_adverbs = list(dict.fromkeys(drawn_adverbs))
    suggestions.append(f'💫 **Emotional Adverbs**: {", ".join(appropriate_adverbs)}')

    # Setting-specific insights.
    if setting == 'performance':
        suggestions.append('🎪 **Performance Context**: Focus on stage presence and audience engagement')
    elif setting == 'outdoor':
        suggestions.append('🌿 **Outdoor Setting**: Leverage natural lighting and environmental elements')
    elif setting == 'indoor':
        suggestions.append('🏠 **Indoor Context**: Utilize controlled lighting and intimate framing')

    # Cinematic quality suggestions.
    if cinematic_qualities:
        suggestions.append(f'🎬 **Cinematic Opportunities**: {", ".join(cinematic_qualities[:2])}')

    # Atmosphere recommendation.
    atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
    suggestions.append(f'🌟 **Recommended Atmosphere**: {atmosphere}')

    return "\n".join(suggestions[:10])
def generate_instant_prompts(scene_info):
    """Generate ready-to-use prompts, grouped by style, for the analysed scene.

    Sections produced: simple prompts, SARA prompts, Gen-4 prompts, optional
    specialised prompts (only when the scene offers matching opportunities)
    and one advanced composite prompt.  Wording is chosen at random from the
    tone- and style-specific helper lists.

    Args:
        scene_info: Scene-analysis dict (may be ``None`` or empty when no
            image has been analysed yet).

    Returns:
        A newline-separated string of prompts, or a message asking the user
        to analyse an image first when no enhanced description is available.
    """
    # A missing state is treated like an un-analysed image instead of crashing.
    scene_info = scene_info or {}

    enhanced_description = scene_info.get('enhanced_description', '')
    if not enhanced_description:
        return "Please analyze an image first to generate instant prompts."

    emotional_tone = scene_info.get('emotional_tone', 'neutral')
    visual_style = scene_info.get('visual_style', 'cinematic')
    distinctive_elements = scene_info.get('distinctive_elements', [])
    cinematic_qualities = scene_info.get('cinematic_qualities', [])
    motion_potential = scene_info.get('motion_potential', [])
    setting = scene_info.get('setting', 'neutral')

    subject_ref = extract_intelligent_subject_reference(scene_info)
    actions = generate_tone_appropriate_actions(emotional_tone, scene_info)
    camera_movements = generate_style_appropriate_cameras(visual_style, cinematic_qualities)

    instant_prompts = []

    # === SIMPLE PROMPTS ===
    instant_prompts.append("🤖 **AI-Powered Simple Prompts:**")
    for _ in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        instant_prompts.append(f" • {subject_ref} {action} {adverb}")

    # === SARA PROMPTS ===
    instant_prompts.append("\n🧠 **Context-Aware SARA Prompts:**")
    for _ in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        camera = random.choice(camera_movements)
        atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
        # Mention a distinctive element in roughly half of the prompts.
        if distinctive_elements and random.choice([True, False]):
            distinctive = random.choice(distinctive_elements)
            instant_prompts.append(f" • {subject_ref} {action} {adverb} while {camera}, {distinctive} enhanced, {atmosphere}")
        else:
            instant_prompts.append(f" • {subject_ref} {action} {adverb} while {camera}, {atmosphere}")

    # === GEN-4 PROMPTS ===
    instant_prompts.append("\n🎬 **Intelligence-Enhanced Gen-4:**")
    for _ in range(3):
        action = random.choice(actions)
        adverb = get_tone_appropriate_adverb(emotional_tone)
        camera = random.choice(camera_movements)
        # Built up the Gen-4 way: action, then motion, then camera, then style.
        with_camera = f"{subject_ref} {action} {adverb}. {camera}"
        if distinctive_elements:
            distinctive = random.choice(distinctive_elements)
            style_addition = f"{distinctive} highlighted by {get_lighting_for_style(visual_style)}"
        else:
            style_addition = f"{get_lighting_for_style(visual_style)} enhances {emotional_tone} mood"
        instant_prompts.append(f" • {with_camera}. {style_addition}")

    # === SPECIALIZED PROMPTS ===
    specialized = []
    if 'costume dynamics' in motion_potential:
        specialized.append(f" 🎭 **Costume Dynamics**: {subject_ref} {random.choice(actions)} while camera captures fabric textures, costume elements react to movement, theatrical lighting")
    if 'facial expressions' in motion_potential:
        specialized.append(f" 😊 **Expression Focus**: {subject_ref} {random.choice(['expresses emotion', 'speaks meaningfully', 'reacts naturally'])} while camera maintains intimate framing, {emotional_tone} energy emphasized")
    if 'dramatic lighting potential' in cinematic_qualities:
        specialized.append(f" 💡 **Dramatic Lighting**: {subject_ref} {random.choice(actions)} as lighting creates dramatic shadows, visual contrast enhances {emotional_tone} mood, cinematic depth")
    if 'color enhancement opportunities' in cinematic_qualities:
        colors = [elem for elem in distinctive_elements if 'coloring' in elem]
        if colors:
            specialized.append(f" 🎨 **Color Enhanced**: {subject_ref} {random.choice(actions)} while lighting dramatically enhances {colors[0]}, color grading emphasizes mood, {visual_style} aesthetic")
    if setting == 'performance':
        specialized.append(f" 🎪 **Performance Mode**: {subject_ref} {random.choice(['performs', 'presents', 'commands attention'])} while audience perspective maintained, {emotional_tone} stage presence, professional capture")
    elif setting == 'outdoor':
        specialized.append(f" 🌿 **Environmental Harmony**: {subject_ref} {random.choice(actions)} as natural elements complement motion, environmental lighting, organic {visual_style} feel")
    # Only show the section header when at least one specialised prompt exists.
    if specialized:
        instant_prompts.append("\n✨ **Specialized AI Prompts:**")
        instant_prompts.extend(specialized)

    # === ADVANCED COMPOSITE PROMPT ===
    instant_prompts.append("\n🚀 **Advanced AI Composite:**")
    advanced_action = random.choice(actions)
    advanced_adverb = get_tone_appropriate_adverb(emotional_tone)
    advanced_camera = random.choice(camera_movements)
    advanced_atmosphere = get_style_appropriate_atmosphere(visual_style, emotional_tone)
    if distinctive_elements:
        advanced_distinctive = random.choice(distinctive_elements)
        advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} as {advanced_camera} captures nuanced details, {advanced_distinctive} dynamically enhanced, lighting and color grading amplify {emotional_tone} undertones, {advanced_atmosphere} with {visual_style} cinematography"
    else:
        advanced_prompt = f"{subject_ref} {advanced_action} {advanced_adverb} while {advanced_camera} follows natural rhythm, environmental elements support the motion, {advanced_atmosphere} with intelligent {visual_style} direction"
    instant_prompts.append(f" • {advanced_prompt}")

    return "\n".join(instant_prompts)
def extract_intelligent_subject_reference(scene_info):
    """Choose a short noun phrase for referring to the scene's subject.

    Args:
        scene_info: Scene-analysis dict.  Reads ``has_person``,
            ``enhanced_description`` and ``basic_description``.

    Returns:
        A specific reference such as "The man in costume" when the enhanced
        description supports one, otherwise "The man" / "The woman" /
        "The person" from the basic description, otherwise "The subject".
    """
    import re  # local import: the module does not import ``re`` at file level

    if not scene_info.get('has_person', False):
        return "The subject"

    def first_match(text, candidates):
        # Whole-word matching is essential here: as a plain substring, "man"
        # is found inside "woman", which made every woman "The man".
        for phrase, reference in candidates:
            if re.search(rf'\b{re.escape(phrase)}\b', text):
                return reference
        return None

    # Prefer the richer description for a more specific reference.
    enhanced_desc = scene_info.get('enhanced_description', '')
    if isinstance(enhanced_desc, str):
        reference = first_match(enhanced_desc.lower(), (
            ('man in costume', "The man in costume"),
            ('woman in dress', "The woman in dress"),
            ('man in suit', "The man in suit"),
        ))
        if reference:
            return reference

    # Fall back to the basic description.
    basic_desc = scene_info.get('basic_description', '')
    if isinstance(basic_desc, str):
        reference = first_match(basic_desc.lower(), (
            ('man', "The man"),
            ('woman', "The woman"),
            ('person', "The person"),
        ))
        if reference:
            return reference

    return "The subject"
def generate_tone_appropriate_actions(emotional_tone, scene_info):
    """List action phrases suited to *emotional_tone* and the scene's props.

    Starts from the tone's base actions (the 'neutral' set for unknown
    tones) and, for every entry of ``scene_info['distinctive_elements']``,
    adds the actions for the first matching keyword among costume, cape and
    flag.  A new list is returned on every call.
    """
    tone_actions = {
        'dramatic': ['moves powerfully', 'gestures boldly', 'commands attention', 'strikes a pose', 'displays intensity'],
        'elegant': ['moves gracefully', 'gestures refined', 'poses elegantly', 'demonstrates poise', 'flows naturally'],
        'theatrical': ['performs dramatically', 'presents theatrically', 'expresses character', 'embodies role', 'captivates audience'],
        'serious': ['maintains composure', 'speaks authoritatively', 'gestures formally', 'projects confidence', 'demonstrates focus'],
        'cheerful': ['expresses joy', 'gestures enthusiastically', 'radiates energy', 'shows warmth', 'displays positivity'],
        'professional': ['presents professionally', 'maintains bearing', 'demonstrates expertise', 'projects authority', 'engages formally'],
        'neutral': ['moves naturally', 'gestures appropriately', 'maintains presence', 'expresses subtly', 'demonstrates character'],
    }
    # Checked in this order; only the first keyword found in an element applies.
    prop_actions = (
        ('costume', ['adjusts costume', 'displays costume details']),
        ('cape', ['gestures with cape', 'moves dramatically with cape']),
        ('flag', ['acknowledges flag', 'presents with flag']),
    )

    chosen = list(tone_actions.get(emotional_tone, tone_actions['neutral']))

    for element in scene_info.get('distinctive_elements') or ():
        for keyword, extras in prop_actions:
            if keyword in element:
                chosen.extend(extras)
                break

    return chosen
def generate_style_appropriate_cameras(visual_style, cinematic_qualities):
    """List camera-movement phrases suited to *visual_style*.

    Starts from the style's base movements (the 'cinematic' set for unknown
    styles) and appends extra movements for each recognised entry found in
    *cinematic_qualities*.  A new list is returned on every call.
    """
    style_moves = {
        'cinematic': ['camera glides smoothly', 'tracking shot follows', 'camera orbits elegantly', 'dolly movement captures', 'crane shot reveals'],
        'dramatic': ['camera emphasizes motion', 'dynamic camera movement', 'camera captures intensity', 'bold camera work follows', 'dramatic camera angles'],
        'theatrical': ['camera frames performance', 'audience perspective maintained', 'camera captures stage presence', 'performance-focused framing', 'theatrical camera work'],
        'professional': ['steady camera captures', 'professional camera movement', 'controlled camera work', 'camera maintains stability', 'precise camera tracking'],
        'documentary': ['handheld camera follows', 'natural camera movement', 'camera observes genuinely', 'documentary-style capture', 'authentic camera work'],
    }
    quality_moves = (
        ('horizontal camera movements', ['camera pans horizontally', 'lateral camera movement']),
        ('vertical movement', ['camera tilts vertically', 'vertical camera motion']),
        ('environmental context', ['camera reveals environment', 'wide establishing shots']),
    )

    moves = list(style_moves.get(visual_style, style_moves['cinematic']))
    for quality, extras in quality_moves:
        if quality in cinematic_qualities:
            moves.extend(extras)
    return moves
def get_tone_appropriate_adverb(emotional_tone):
    """Pick one random adverb that fits *emotional_tone*.

    Unknown tones draw from the 'neutral' list.
    """
    tone_adverbs = {
        'dramatic': ['powerfully', 'intensely', 'dramatically', 'boldly', 'majestically'],
        'elegant': ['gracefully', 'refinedly', 'elegantly', 'smoothly', 'sophisticatedly'],
        'theatrical': ['dramatically', 'expressively', 'theatrically', 'charismatically', 'captivating'],
        'serious': ['authoritatively', 'professionally', 'formally', 'confidently', 'purposefully'],
        'cheerful': ['enthusiastically', 'energetically', 'warmly', 'positively', 'vibrantly'],
        'professional': ['professionally', 'precisely', 'competently', 'expertly', 'authoritatively'],
        'neutral': ['naturally', 'smoothly', 'appropriately', 'genuinely', 'authentically'],
    }
    candidates = tone_adverbs.get(emotional_tone)
    if candidates is None:
        candidates = tone_adverbs['neutral']
    return random.choice(candidates)
def get_style_appropriate_atmosphere(visual_style, emotional_tone):
    """Compose an atmosphere phrase from the visual style and emotional tone.

    Known styles use their own wording; any other style falls back to
    "<visual_style> <emotional_tone> atmosphere".
    """
    templates = {
        'cinematic': 'cinematic {} atmosphere',
        'dramatic': 'dramatic {} mood',
        'theatrical': 'theatrical {} presence',
        'professional': 'professional {} environment',
        'documentary': 'authentic {} feeling',
    }
    template = templates.get(visual_style)
    if template is None:
        return f'{visual_style} {emotional_tone} atmosphere'
    return template.format(emotional_tone)
def get_lighting_for_style(visual_style):
    """Return the lighting description that matches *visual_style*.

    Unknown styles fall back to 'cinematic lighting'.
    """
    # NOTE: a block of leftover prompt-building code used to follow the
    # return statement.  It could never execute and referenced names that
    # are not defined in this function, so it has been removed.
    lighting = {
        'cinematic': 'cinematic lighting',
        'dramatic': 'dramatic lighting',
        'theatrical': 'stage lighting',
        'professional': 'professional lighting',
        'documentary': 'natural lighting',
    }
    return lighting.get(visual_style, 'cinematic lighting')
def copy_to_foundation(prompt_text, approach):
    """Strip the emoji/label prefix from a formatted prompt.

    Formatted prompts have the shape ``<emoji> **Label**: <prompt>``.
    Everything after the first ``"**: "`` separator is returned; text that
    does not contain the separator is returned unchanged.

    Args:
        prompt_text: Formatted prompt text. ``None`` or an empty string
            yields ``""`` instead of raising.
        approach: Not read by this function; kept so the existing call
            signature stays valid.

    Returns:
        The prompt with its label prefix removed, or the input text itself
        when no ``"**: "`` separator is present.
    """
    # Guard against a missing value: the membership test on None would
    # otherwise raise TypeError.
    if not prompt_text:
        return ""
    # partition splits on the first separator only, so any later labels in
    # the text are kept as part of the returned prompt.
    _label, separator, remainder = prompt_text.partition("**: ")
    return remainder if separator else prompt_text
# Create optimized Gradio interface
# Builds the `demo` Blocks app: a guide tab that renders `unified_instructions`
# and a generator tab with image analysis, suggestion/prompt buttons and a
# custom prompt builder. The event wiring at the end connects each button to
# its handler function, which is defined elsewhere in this file.
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Video Prompting Tool") as demo:
gr.Markdown("# 🎬 Universal Video Prompting Tool")
gr.Markdown("*Compatible with Gen-4, Sora, Pika, Luma, Runway & all AI video models*")
gr.Markdown("**Combines official Gen-4 guidelines with advanced SARA Framework**")
with gr.Tabs():
with gr.TabItem("πŸ“š Prompting Guide"):
gr.Markdown(unified_instructions)
with gr.TabItem("🎬 Quick Video Prompt Generator"):
with gr.Row():
with gr.Column(scale=1):
# Image upload and analysis
gr.Markdown("## πŸ“· Upload Your Frame 0")
# type="pil" is requested so the analysis handler receives a PIL image.
image_input = gr.Image(type="pil", label="Upload your initial frame")
analyze_btn = gr.Button("πŸ” Analyze Image (Fast)", variant="primary")
image_analysis = gr.Textbox(
label="Image Analysis Results",
placeholder="Upload an image and click 'Analyze Image' for instant analysis...",
lines=10,
interactive=False
)
# Hidden state for scene info
# Starts as an empty dict; it is an output of the analyze handler and an
# input to the suggestion, instant-prompt, optimize, refine and Gen-4 handlers.
scene_info_state = gr.State({})
# Quick suggestions
with gr.Group():
gr.Markdown("### πŸ’‘ Smart Suggestions")
get_suggestions_btn = gr.Button("Get Smart Tips", variant="secondary")
smart_suggestions = gr.Textbox(
label="Context-Aware Suggestions",
placeholder="Click 'Get Smart Tips' after image analysis...",
lines=5,
interactive=False
)
# Instant prompts - NEW SECTION
with gr.Group():
gr.Markdown("### πŸš€ Ready-to-Use Prompts")
generate_instant_btn = gr.Button("Generate Instant Prompts", variant="primary")
instant_prompts = gr.Textbox(
label="Copy & Paste Ready Prompts",
placeholder="Click 'Generate Instant Prompts' to get ready-to-use prompts based on your image...",
lines=12,
interactive=True,
show_copy_button=True
)
with gr.Column(scale=1):
# Prompt generation methods
gr.Markdown("## πŸš€ Choose Your Method")
with gr.Tabs():
with gr.TabItem("πŸ€– AI Prompt Assistant"):
gr.Markdown("*Describe your idea in any language - AI will create optimized English video prompts*")
with gr.Row():
with gr.Column(scale=2):
user_idea = gr.Textbox(
label="Your Idea (any language)",
placeholder="e.g., 'el personaje se quita la nariz de payaso' or 'character walks slowly towards camera'",
lines=3
)
with gr.Column(scale=1):
optimize_btn = gr.Button("πŸš€ Optimize & Structure", variant="primary")
# Editable so the user can adjust the text; it is also fed back in as the
# first input of the refine handler.
ai_optimized = gr.Textbox(
label="AI-Optimized Video Prompt",
placeholder="Your optimized prompt will appear here...",
lines=4,
interactive=True,
show_copy_button=True
)
# Chat interface for refinement
gr.Markdown("### πŸ’¬ Refine Your Prompt")
chat_history = gr.Chatbot(
label="Prompt Refinement Chat",
height=250,
placeholder="Chat history will appear here as you refine your prompt..."
)
with gr.Row():
refine_input = gr.Textbox(
label="Refine further",
placeholder="e.g., 'make it more dramatic' or 'add camera movement' or 'mΓ‘s lento'",
scale=3
)
refine_btn = gr.Button("πŸ’¬ Refine", scale=1)
with gr.TabItem("πŸ“ Gen-4 Official"):
gr.Markdown("*Official method: Simple β†’ Complex building*")
foundation_gen4 = gr.Textbox(
label="Foundation (Optional)",
placeholder="e.g., 'The subject walks forward'",
lines=1
)
generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
gen4_output = gr.Textbox(
label="Gen-4 Style Prompts",
lines=8,
interactive=False
)
# Custom prompt builder
# All six controls below are passed together to build_custom_prompt_local.
with gr.Group():
gr.Markdown("## πŸ› οΈ Custom Prompt Builder")
with gr.Row():
approach_selector = gr.Radio(
choices=["SARA", "Gen-4"],
value="SARA",
label="Approach",
interactive=True
)
custom_foundation = gr.Textbox(
label="Foundation",
placeholder="The subject...",
lines=1
)
with gr.Row():
subject_motion = gr.CheckboxGroup(
choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
label="Subject Motion"
)
scene_motion = gr.CheckboxGroup(
choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
label="Scene Motion"
)
with gr.Row():
camera_motion = gr.Dropdown(
choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
label="Camera Motion",
value="camera remains steady"
)
style_motion = gr.Dropdown(
choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
label="Style/Atmosphere",
value="cinematic"
)
build_custom_btn = gr.Button("πŸ”¨ Build Custom Prompt", variant="secondary")
custom_output = gr.Textbox(
label="Your Custom Prompt",
lines=3,
interactive=True
)
# Event handlers
# NOTE(review): copy_to_foundation is not attached to any event in this block.
# The second output slot is a throwaway gr.State() that nothing else in this
# block references; presumably analyze_image_simple returns three values and
# the middle one is not needed by the UI - confirm against the handler.
analyze_btn.click(
fn=analyze_image_simple,
inputs=[image_input],
outputs=[image_analysis, gr.State(), scene_info_state]
)
get_suggestions_btn.click(
fn=get_smart_suggestions_local,
inputs=[scene_info_state],
outputs=[smart_suggestions]
)
# NEW: Generate instant prompts
generate_instant_btn.click(
fn=generate_instant_prompts,
inputs=[scene_info_state],
outputs=[instant_prompts]
)
# NEW: AI Prompt Assistant
optimize_btn.click(
fn=optimize_user_prompt,
inputs=[user_idea, scene_info_state],
outputs=[ai_optimized]
)
# Refinement writes back to both the optimized prompt box and the chat log.
refine_btn.click(
fn=refine_prompt_with_feedback,
inputs=[ai_optimized, refine_input, chat_history, scene_info_state],
outputs=[ai_optimized, chat_history]
)
generate_gen4_btn.click(
fn=generate_gen4_prompts_local,
inputs=[scene_info_state, foundation_gen4],
outputs=[gen4_output]
)
# Input order here is the positional argument order the handler receives.
build_custom_btn.click(
fn=build_custom_prompt_local,
inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
outputs=[custom_output]
)
# Launch the app
# Only start the Gradio app when this file is run as a script, so importing
# the module does not call demo.launch().
if __name__ == "__main__":
demo.launch()