# app.py β€” AI Video Prompt Generator (Hugging Face Space by Malaji71, rev f9202e1)
import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import random
# Check GPU availability once at import time; decides the pipeline device mapping below.
use_gpu = torch.cuda.is_available()
# Lazy loading of models: all three stay None until load_models() populates them.
processor, model, zephyr_generator = None, None, None
def load_models():
    """Load the BLIP captioning model and the SARA-Zephyr pipeline on first use.

    Populates the module-level globals ``processor``, ``model`` and
    ``zephyr_generator``; subsequent calls are no-ops once all three exist.
    """
    global processor, model, zephyr_generator
    everything_loaded = (
        processor is not None
        and model is not None
        and zephyr_generator is not None
    )
    if everything_loaded:
        return
    print("Loading BLIP model...")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-large",
        torch_dtype=torch.float32,  # float32 keeps the model CPU-friendly
    )
    print("βœ… BLIP model loaded successfully!")
    print("Loading SARA-Zephyr fine-tuned model...")
    zephyr_generator = pipeline(
        "text-generation",
        model="Malaji71/SARA-Zephyr",  # fine-tuned model (replaces base Zephyr)
        torch_dtype=torch.float32,  # float32 keeps the model CPU-friendly
        device_map="auto" if use_gpu else None,  # auto device mapping when a GPU exists
    )
    print("βœ… SARA-Zephyr fine-tuned model loaded successfully!")
# Universal Video Prompting Guide combining Gen-4 + SARA.
# Rendered verbatim as Markdown in the "Prompting Guide" tab of the UI.
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*
## Core Principles (Universal)
βœ… **Focus on MOTION, not static description**
βœ… **Use positive phrasing exclusively**
βœ… **Start simple, iterate progressively**
βœ… **Refer to subjects in general terms** ("the subject," "the woman")
βœ… **Keep prompts direct and easily understood**
## Two Complementary Approaches
### πŸš€ **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion β†’ Camera Motion β†’ Scene Motion β†’ Style Descriptors
3. Use general terms and avoid complex descriptions
**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."
### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style
**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
"""
def analyze_image_with_zephyr(image):
    """Analyze an uploaded image with BLIP captioning plus SARA-Zephyr enrichment.

    Returns a tuple of (markdown analysis text, scene-info dict). The dict is
    empty when no image was supplied or when analysis fails.
    """
    if image is None:
        return "Please upload an image first.", {}
    try:
        # Models are loaded lazily on the first request.
        load_models()
        # Accept raw arrays from Gradio as well as PIL images.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        width, height = image.size
        aspect_ratio = width / height
        # Classify the framing purely from the aspect ratio.
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"
        # Caption the image with BLIP (beam search for a stable caption).
        blip_inputs = processor(image, return_tensors="pt")
        generated = model.generate(**blip_inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(generated[0], skip_special_tokens=True)
        # Enrich the plain caption via the Zephyr analysis helper.
        enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)
        # Pre-render the bullet list so the report template stays flat.
        insight_lines = chr(10).join(
            f"β€’ {insight}" for insight in enhanced_analysis['motion_insights']
        )
        analysis = f"""πŸ“Š **Image Analysis:**
β€’ **Dimensions**: {width} x {height}
β€’ **Composition**: {composition}
β€’ **Aspect Ratio**: {aspect_ratio:.2f}
🎨 **Scene Description**:
"{basic_caption}"
πŸ€– **AI Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}
πŸ’‘ **Motion Insights**:
{insight_lines}
🎯 **Recommended Approach**:
{enhanced_analysis['recommended_approach']}"""
        # Structured context handed to the prompt-generation handlers.
        scene_info = {
            'basic_description': basic_caption,
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'enhanced_analysis': enhanced_analysis,
        }
        return analysis, scene_info
    except Exception as e:
        return f"Error analyzing image: {str(e)}", {}
def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
    """Ask SARA-Zephyr to assess the video-motion potential of a captioned scene.

    Returns a dict with 'scene_interpretation' (first response line),
    'motion_insights' (up to 6 motion/camera/lighting lines) and
    'recommended_approach' (last SARA/Gen-4 recommendation found).
    """
    analysis_prompt = f"""<|system|>
You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
<|user|>
Image description: "{basic_caption}"
Image composition: {composition}
Aspect ratio: {aspect_ratio:.2f}
Please provide:
1. Type of motion that would work best
2. Recommended camera movements
3. Emotional tone/style suggestions
4. Best prompting approach (SARA vs Gen-4)
Be concise and practical.
<|assistant|>"""
    response = zephyr_generator(
        analysis_prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id,
    )
    # Everything after the assistant tag is the model's answer.
    ai_analysis = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    motion_keywords = ('motion', 'movement', 'camera', 'lighting')
    motion_insights = []
    recommended_approach = "SARA framework recommended for precise control"
    for raw_line in ai_analysis.split('\n'):
        text = raw_line.strip()
        if not text:
            continue
        lowered = text.lower()
        if any(keyword in lowered for keyword in motion_keywords):
            motion_insights.append(text.strip('- ').strip())
        elif 'sara' in lowered or 'gen-4' in lowered:
            recommended_approach = text.strip('- ').strip()
    first_line = ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed"
    return {
        'scene_interpretation': first_line,
        'motion_insights': motion_insights[:6],
        'recommended_approach': recommended_approach,
    }
def generate_sample_prompts_with_zephyr(scene_info=None):
    """Generate three sample video prompts, using SARA-Zephyr when scene info exists.

    Always returns a list of exactly three prompt strings. Static SARA-style
    fallbacks are returned when there is no scene context, when generation
    fails, or when the model yields fewer than three usable lines.
    """
    if scene_info and scene_info.get('basic_description'):
        try:
            # Bug fix: this is wired directly to a button, so the models may not
            # have been loaded yet β€” previously zephyr_generator could be None.
            load_models()
            # Bug fix: the old code formatted the 'N/A' default with :.2f, which
            # raises ValueError whenever aspect_ratio is missing.
            ratio = scene_info.get('aspect_ratio')
            ratio_text = f"{ratio:.2f}" if isinstance(ratio, (int, float)) else "N/A"
            context_prompt = f"""<|system|>
Generate 3 professional video prompts using the SARA framework based on this image analysis.
<|user|>
Image description: {scene_info['basic_description']}
Composition: {scene_info.get('composition', 'Balanced')}
Aspect Ratio: {ratio_text}
Remember the SARA framework: Subject + Action + Reference + Atmosphere
<|assistant|>"""
            response = zephyr_generator(
                context_prompt,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.8,
                pad_token_id=zephyr_generator.tokenizer.eos_token_id,
            )
            # Extract and clean prompts (strip list markers/bullets).
            prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
            prompts = [p.strip('123.-β€’ ') for p in prompts_text.split('\n') if p.strip()]
            if len(prompts) >= 3:
                return prompts[:3]
        except Exception as e:
            # Best-effort feature: log and fall through to the static prompts.
            print(f"⚠️ Sample prompt generation failed, using fallbacks: {e}")
    # Fallback prompts if Zephyr fails or no scene info is available.
    base_prompts = [
        "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
        "A dramatic close-up captures the subject's expression as they speak directly to the camera.",
        "The scene transitions with a handheld camera following the subject through a bustling environment."
    ]
    return base_prompts
def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
    """Turn a free-form user idea into a SARA-structured video prompt.

    Returns the optimized prompt string, a请-enter message for empty input, or
    an error message string if generation fails (matching the error-string
    convention used by analyze_image_with_zephyr).
    """
    if not user_idea.strip():
        return "Please enter your idea first."
    # Include the analyzed image caption as extra context when available.
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"
    optimization_prompt = f"""<|system|>
You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
User's idea: "{user_idea}"
{context}
Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
<|assistant|>"""
    try:
        # Bug fix: this handler can run before any other tab loaded the models,
        # in which case zephyr_generator was still None and the call crashed.
        load_models()
        response = zephyr_generator(
            optimization_prompt,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=zephyr_generator.tokenizer.eos_token_id,
        )
        # The text after the assistant tag is the optimized prompt.
        return response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    except Exception as e:
        return f"Error optimizing prompt: {str(e)}"
def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
    """Refine a prompt based on user feedback using SARA-Zephyr.

    Returns (refined_prompt, updated_history). History entries use the
    openai-style {"role": ..., "content": ...} dicts required by the UI's
    gr.Chatbot(type='messages') display.
    """
    if not feedback.strip():
        return current_prompt, chat_history
    # Include the analyzed image caption as extra context when available.
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"
    refinement_prompt = f"""<|system|>
You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
Current prompt: "{current_prompt}"
Feedback: "{feedback}"
{context}
Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
<|assistant|>"""
    try:
        # Bug fix: ensure models exist before use (button can fire first).
        load_models()
        response = zephyr_generator(
            refinement_prompt,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=zephyr_generator.tokenizer.eos_token_id,
        )
        refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    except Exception as e:
        # Keep the history intact and surface the error in the prompt box.
        return f"Error refining prompt: {str(e)}", chat_history
    # Bug fix: history was stored as [user, assistant] list pairs, but the
    # chat display is created with type='messages' and rejects that format.
    new_chat_history = chat_history + [
        {"role": "user", "content": feedback},
        {"role": "assistant", "content": refined},
    ]
    return refined, new_chat_history
def generate_gen4_prompts(scene_info, foundation=""):
    """Generate Gen-4 style prompts iteratively from the analyzed scene.

    `foundation` is accepted for interface compatibility with the UI wiring
    but is not currently used. Falls back to the canonical Gen-4 example when
    no scene description is available; returns an error string on failure.
    """
    try:
        if scene_info and scene_info.get('basic_description'):
            description = scene_info['basic_description'].lower()
            # Detect subject. Bug fix: test 'woman' before 'man' β€” "woman"
            # contains the substring "man", so women were labelled "The man".
            if 'woman' in description:
                subject = "The woman"
            elif 'man' in description:
                subject = "The man"
            elif 'person' in description:
                subject = "The person"
            else:
                subject = "The subject"
            # Choose plausible actions based on the scene description.
            if any(word in description for word in ['sitting', 'seated']):
                actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
            elif any(word in description for word in ['standing', 'portrait']):
                actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
            else:
                actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']
            action = random.choice(actions)
            # Build the prompt iteratively, one Gen-4 layer at a time.
            basic = f"{subject} {action}"
            with_motion = f"{basic} smoothly"
            with_camera = f"{with_motion}. Camera captures steadily"
            # Add style based on composition. Bug fix: compare case-insensitively β€”
            # the analyzer emits "Vertical portrait shot" (lowercase 'p'), so the
            # old check for 'Portrait' could never match.
            composition = scene_info.get('composition', '').lower()
            if 'wide' in composition:
                style_addition = "Wide cinematic framing"
            elif 'portrait' in composition:
                style_addition = "Intimate portrait lighting"
            else:
                style_addition = "Professional documentary style"
            with_style = f"{with_camera}. {style_addition}."
            return f"""πŸš€ **Gen-4 Iterative Building:**
**Basic**: {basic}
**+ Motion**: {with_motion}
**+ Camera**: {with_camera}
**+ Style**: {with_style}"""
        else:
            return """πŸš€ **Gen-4 Iterative Building:**
**Basic**: The subject walks forward
**+ Camera**: The subject walks forward. Handheld camera follows
**+ Scene**: The subject walks forward. Handheld camera follows. Dust trails behind
**+ Style**: The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."""
    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"
def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from the Custom Builder selections.

    SARA mode yields "[foundation] [motions] while [reference] [style]";
    Gen-4 mode joins every selected element into period-separated sentences.
    """
    if approach == "SARA":
        # SARA shape: [Subject] [Action] while [Reference], [Atmosphere]
        segments = [foundation] if foundation else []
        movements = list(subject_motion or []) + list(scene_motion or [])
        if movements:
            segments.append(", ".join(movements))
        # Spatial reference defaults to a steady background.
        reference = f"while {camera_motion}" if camera_motion else "while background remains steady"
        segments.append(reference)
        if style:
            segments.append(style)
        return " ".join(segments)
    # Gen-4 shape: flat, one element per sentence, in build order.
    element_groups = (
        [foundation] if foundation else [],
        list(subject_motion) if subject_motion else [],
        [camera_motion] if camera_motion else [],
        list(scene_motion) if scene_motion else [],
        [style] if style else [],
    )
    pieces = [item for group in element_groups for item in group]
    return ". ".join(pieces) if pieces else "The subject moves naturally"
# Create the Gradio interface
def create_interface():
    """Create the Gradio interface.

    Builds a five-tab Blocks app (guide, image analysis, AI generator,
    Gen-4 builder, custom builder) and wires component events to the
    module-level handler functions. Returns the gr.Blocks instance.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
        # Header
        gr.Markdown("# 🎬 AI Video Prompt Generator - πŸ€– SARA-Zephyr AI Powered")
        gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")
        # State variables shared across tabs: last image analysis + refinement chat log.
        scene_state = gr.State({})
        chat_history_state = gr.State([])
        with gr.Tabs():
            # Tab 1: Learning Guide (static markdown only, no events)
            with gr.Tab("πŸ“š Prompting Guide"):
                gr.Markdown(unified_instructions)
                # Advanced tips
                with gr.Accordion("🎯 Advanced Tips", open=False):
                    gr.Markdown("""
## Advanced Prompting Strategies
### 🎨 Style Integration
- **Cinematography**: "Dutch angle," "Extreme close-up," "Bird's eye view"
- **Lighting**: "Golden hour," "Neon glow," "Harsh shadows," "Soft diffused light"
- **Movement Quality**: "Fluid motion," "Mechanical precision," "Organic flow"
### ⚑ Motion Types
- **Subject Motion**: Walking, running, dancing, gesturing
- **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
- **Environmental**: Wind, water flow, particle effects, lighting changes
""")
            # Tab 2: Image Analysis (BLIP caption + Zephyr insights + samples)
            with gr.Tab("πŸ“· Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="Upload Image for Analysis",
                            type="pil"
                        )
                        analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary")
                    with gr.Column(scale=2):
                        analysis_output = gr.Markdown(label="AI Analysis Results")
                # Sample prompts section: three read-only copyable textboxes.
                with gr.Group():
                    gr.Markdown("### πŸ’‘ Sample Prompts")
                    sample_btn = gr.Button("🎲 Generate Sample Prompts")
                    sample_prompts = [
                        gr.Textbox(
                            label=f"Sample {i+1}",
                            lines=2,
                            interactive=False,
                            show_copy_button=True
                        )
                        for i in range(3)
                    ]
            # Tab 3: AI Prompt Generator (free-form idea -> optimized prompt + refinement loop)
            with gr.Tab("πŸ€– AI Prompt Generator"):
                with gr.Row():
                    with gr.Column():
                        user_idea = gr.Textbox(
                            label="Your Video Idea (any language)",
                            placeholder="e.g., 'el personaje se quita la nariz' or 'character walks slowly'",
                            lines=3
                        )
                        optimize_btn = gr.Button("πŸš€ Generate Optimized Prompt", variant="primary")
                        optimized_prompt = gr.Textbox(
                            label="AI-Optimized Video Prompt",
                            lines=4,
                            interactive=True,
                            show_copy_button=True
                        )
                    with gr.Column():
                        gr.Markdown("### πŸ”„ Refine Your Prompt")
                        feedback_input = gr.Textbox(
                            label="Feedback/Changes",
                            placeholder="e.g., 'make it more dramatic' or 'add camera movement'",
                            lines=2
                        )
                        refine_btn = gr.Button("πŸ”„ Refine Prompt")
                        # Chat history
                        with gr.Accordion("πŸ’¬ Refinement History", open=False):
                            # type='messages' expects role/content dict entries in the history.
                            chat_display = gr.Chatbot(height=300, type='messages')
            # Tab 4: Gen-4 Method (iterative simple -> complex prompt building)
            with gr.Tab("πŸ“ Gen-4 Official"):
                gr.Markdown("*Official Gen-4 method: Simple β†’ Complex building*")
                with gr.Row():
                    foundation_gen4 = gr.Textbox(
                        label="Foundation (Optional)",
                        placeholder="e.g., 'The subject walks forward'",
                        lines=1
                    )
                generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                gen4_output = gr.Textbox(
                    label="Gen-4 Style Prompts",
                    lines=8,
                    interactive=False,
                    show_copy_button=True
                )
            # Tab 5: Custom Builder (checkbox/dropdown driven prompt assembly)
            with gr.Tab("πŸ› οΈ Custom Builder"):
                gr.Markdown("## Build Your Custom Prompt")
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                build_custom_btn = gr.Button("πŸ”¨ Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True,
                    show_copy_button=True
                )
        # Event handlers: wire buttons/state to the module-level handler functions.
        analyze_btn.click(
            fn=analyze_image_with_zephyr,
            inputs=[image_input],
            outputs=[analysis_output, scene_state]
        )
        sample_btn.click(
            fn=generate_sample_prompts_with_zephyr,
            inputs=[scene_state],
            outputs=sample_prompts
        )
        optimize_btn.click(
            fn=optimize_user_prompt_with_zephyr,
            inputs=[user_idea, scene_state],
            outputs=[optimized_prompt]
        )
        refine_btn.click(
            fn=refine_prompt_with_zephyr,
            inputs=[optimized_prompt, feedback_input, chat_history_state, scene_state],
            outputs=[optimized_prompt, chat_history_state]
        )
        # Update chat display when history changes (identity passthrough).
        chat_history_state.change(
            fn=lambda history: history,
            inputs=[chat_history_state],
            outputs=[chat_display]
        )
        generate_gen4_btn.click(
            fn=generate_gen4_prompts,
            inputs=[scene_state, foundation_gen4],
            outputs=[gen4_output]
        )
        build_custom_btn.click(
            fn=build_custom_prompt,
            inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
            outputs=[custom_output]
        )
    return demo
# Launch the app
if __name__ == "__main__":
    print("🎬 Starting AI Video Prompt Generator with SARA-Zephyr...")
    print(f"πŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
    print("πŸ”§ Loading models (this may take a few minutes)...")
    try:
        # Primary launch: public share link, bind all interfaces, debug output on.
        demo = create_interface()
        print("βœ… Interface created successfully!")
        print("πŸš€ Launching application...")
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            debug=True,
            show_error=True
        )
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        print("πŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
        print("πŸ“¦ Required packages:")
        print(" pip install torch transformers gradio pillow accelerate bitsandbytes")
        # Alternative launch attempt: local-only, no share link, debug off.
        print("\nπŸ”„ Attempting alternative launch...")
        try:
            demo = create_interface()
            demo.launch(
                share=False,
                server_name="127.0.0.1",
                server_port=7860,
                debug=False
            )
        except Exception as e2:
            print(f"❌ Alternative launch failed: {e2}")
            print("\nπŸ’‘ Troubleshooting tips:")
            print("1. Ensure CPU resources are sufficient.")
            print("2. Check CPU usage: top or htop")
            print("3. Try reducing model precision: set torch_dtype=torch.float32")
            print("4. Monitor memory usage: free -h")