# app.py β€” AI Video Prompt Generator (Hugging Face Space by Malaji71, rev f9202e1)
import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import random
# Check GPU availability once at import time; decides the pipeline device mapping below.
use_gpu = torch.cuda.is_available()
# Lazy loading of models: all three stay None until load_models() populates them.
processor, model, zephyr_generator = None, None, None
def load_models():
    """Load the BLIP captioning model and the SARA-Zephyr pipeline on first use.

    Populates the module-level globals ``processor``, ``model`` and
    ``zephyr_generator``; subsequent calls are no-ops once all three exist.
    """
    global processor, model, zephyr_generator
    everything_loaded = (
        processor is not None
        and model is not None
        and zephyr_generator is not None
    )
    if everything_loaded:
        return
    print("Loading BLIP model...")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-large",
        torch_dtype=torch.float32,  # float32 keeps the model CPU-friendly
    )
    print("βœ… BLIP model loaded successfully!")
    print("Loading SARA-Zephyr fine-tuned model...")
    zephyr_generator = pipeline(
        "text-generation",
        model="Malaji71/SARA-Zephyr",  # fine-tuned model (replaces base Zephyr)
        torch_dtype=torch.float32,  # float32 keeps the model CPU-friendly
        device_map="auto" if use_gpu else None,  # auto device mapping when a GPU exists
    )
    print("βœ… SARA-Zephyr fine-tuned model loaded successfully!")
# Universal Video Prompting Guide combining Gen-4 + SARA.
# Rendered verbatim as Markdown in the "Prompting Guide" tab of the UI.
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*
## Core Principles (Universal)
βœ… **Focus on MOTION, not static description**
βœ… **Use positive phrasing exclusively**
βœ… **Start simple, iterate progressively**
βœ… **Refer to subjects in general terms** ("the subject," "the woman")
βœ… **Keep prompts direct and easily understood**
## Two Complementary Approaches
### πŸš€ **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion β†’ Camera Motion β†’ Scene Motion β†’ Style Descriptors
3. Use general terms and avoid complex descriptions
**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."
### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style
**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
"""
def analyze_image_with_zephyr(image):
    """Analyze an uploaded image with BLIP captioning plus SARA-Zephyr enrichment.

    Returns a tuple of (markdown analysis text, scene-info dict). The dict is
    empty when no image was supplied or when analysis fails.
    """
    if image is None:
        return "Please upload an image first.", {}
    try:
        # Models are loaded lazily on the first request.
        load_models()
        # Accept raw arrays from Gradio as well as PIL images.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        width, height = image.size
        aspect_ratio = width / height
        # Classify the framing purely from the aspect ratio.
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"
        # Caption the image with BLIP (beam search for a stable caption).
        blip_inputs = processor(image, return_tensors="pt")
        generated = model.generate(**blip_inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(generated[0], skip_special_tokens=True)
        # Enrich the plain caption via the Zephyr analysis helper.
        enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)
        # Pre-render the bullet list so the report template stays flat.
        insight_lines = chr(10).join(
            f"β€’ {insight}" for insight in enhanced_analysis['motion_insights']
        )
        analysis = f"""πŸ“Š **Image Analysis:**
β€’ **Dimensions**: {width} x {height}
β€’ **Composition**: {composition}
β€’ **Aspect Ratio**: {aspect_ratio:.2f}
🎨 **Scene Description**:
"{basic_caption}"
πŸ€– **AI Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}
πŸ’‘ **Motion Insights**:
{insight_lines}
🎯 **Recommended Approach**:
{enhanced_analysis['recommended_approach']}"""
        # Structured context handed to the prompt-generation handlers.
        scene_info = {
            'basic_description': basic_caption,
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'enhanced_analysis': enhanced_analysis,
        }
        return analysis, scene_info
    except Exception as e:
        return f"Error analyzing image: {str(e)}", {}
def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
    """Ask SARA-Zephyr to assess the video-motion potential of a captioned scene.

    Returns a dict with 'scene_interpretation' (first response line),
    'motion_insights' (up to 6 motion/camera/lighting lines) and
    'recommended_approach' (last SARA/Gen-4 recommendation found).
    """
    analysis_prompt = f"""<|system|>
You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
<|user|>
Image description: "{basic_caption}"
Image composition: {composition}
Aspect ratio: {aspect_ratio:.2f}
Please provide:
1. Type of motion that would work best
2. Recommended camera movements
3. Emotional tone/style suggestions
4. Best prompting approach (SARA vs Gen-4)
Be concise and practical.
<|assistant|>"""
    response = zephyr_generator(
        analysis_prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id,
    )
    # Everything after the assistant tag is the model's answer.
    ai_analysis = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    motion_keywords = ('motion', 'movement', 'camera', 'lighting')
    motion_insights = []
    recommended_approach = "SARA framework recommended for precise control"
    for raw_line in ai_analysis.split('\n'):
        text = raw_line.strip()
        if not text:
            continue
        lowered = text.lower()
        if any(keyword in lowered for keyword in motion_keywords):
            motion_insights.append(text.strip('- ').strip())
        elif 'sara' in lowered or 'gen-4' in lowered:
            recommended_approach = text.strip('- ').strip()
    first_line = ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed"
    return {
        'scene_interpretation': first_line,
        'motion_insights': motion_insights[:6],
        'recommended_approach': recommended_approach,
    }
def generate_sample_prompts_with_zephyr(scene_info=None):
    """Generate three sample video prompts, using SARA-Zephyr when scene info exists.

    Always returns a list of exactly three prompt strings. Static SARA-style
    fallbacks are returned when there is no scene context, when generation
    fails, or when the model yields fewer than three usable lines.
    """
    if scene_info and scene_info.get('basic_description'):
        try:
            # Bug fix: this is wired directly to a button, so the models may not
            # have been loaded yet β€” previously zephyr_generator could be None.
            load_models()
            # Bug fix: the old code formatted the 'N/A' default with :.2f, which
            # raises ValueError whenever aspect_ratio is missing.
            ratio = scene_info.get('aspect_ratio')
            ratio_text = f"{ratio:.2f}" if isinstance(ratio, (int, float)) else "N/A"
            context_prompt = f"""<|system|>
Generate 3 professional video prompts using the SARA framework based on this image analysis.
<|user|>
Image description: {scene_info['basic_description']}
Composition: {scene_info.get('composition', 'Balanced')}
Aspect Ratio: {ratio_text}
Remember the SARA framework: Subject + Action + Reference + Atmosphere
<|assistant|>"""
            response = zephyr_generator(
                context_prompt,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.8,
                pad_token_id=zephyr_generator.tokenizer.eos_token_id,
            )
            # Extract and clean prompts (strip list markers/bullets).
            prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
            prompts = [p.strip('123.-β€’ ') for p in prompts_text.split('\n') if p.strip()]
            if len(prompts) >= 3:
                return prompts[:3]
        except Exception as e:
            # Best-effort feature: log and fall through to the static prompts.
            print(f"⚠️ Sample prompt generation failed, using fallbacks: {e}")
    # Fallback prompts if Zephyr fails or no scene info is available.
    base_prompts = [
        "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
        "A dramatic close-up captures the subject's expression as they speak directly to the camera.",
        "The scene transitions with a handheld camera following the subject through a bustling environment."
    ]
    return base_prompts
def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
    """Turn a free-form user idea into a SARA-structured video prompt.

    Returns the optimized prompt string, a请-enter message for empty input, or
    an error message string if generation fails (matching the error-string
    convention used by analyze_image_with_zephyr).
    """
    if not user_idea.strip():
        return "Please enter your idea first."
    # Include the analyzed image caption as extra context when available.
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"
    optimization_prompt = f"""<|system|>
You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
User's idea: "{user_idea}"
{context}
Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
<|assistant|>"""
    try:
        # Bug fix: this handler can run before any other tab loaded the models,
        # in which case zephyr_generator was still None and the call crashed.
        load_models()
        response = zephyr_generator(
            optimization_prompt,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=zephyr_generator.tokenizer.eos_token_id,
        )
        # The text after the assistant tag is the optimized prompt.
        return response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    except Exception as e:
        return f"Error optimizing prompt: {str(e)}"
def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
    """Refine a prompt based on user feedback using SARA-Zephyr.

    Returns (refined_prompt, updated_history). History entries use the
    openai-style {"role": ..., "content": ...} dicts required by the UI's
    gr.Chatbot(type='messages') display.
    """
    if not feedback.strip():
        return current_prompt, chat_history
    # Include the analyzed image caption as extra context when available.
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"
    refinement_prompt = f"""<|system|>
You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
Current prompt: "{current_prompt}"
Feedback: "{feedback}"
{context}
Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
<|assistant|>"""
    try:
        # Bug fix: ensure models exist before use (button can fire first).
        load_models()
        response = zephyr_generator(
            refinement_prompt,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=zephyr_generator.tokenizer.eos_token_id,
        )
        refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    except Exception as e:
        # Keep the history intact and surface the error in the prompt box.
        return f"Error refining prompt: {str(e)}", chat_history
    # Bug fix: history was stored as [user, assistant] list pairs, but the
    # chat display is created with type='messages' and rejects that format.
    new_chat_history = chat_history + [
        {"role": "user", "content": feedback},
        {"role": "assistant", "content": refined},
    ]
    return refined, new_chat_history
def generate_gen4_prompts(scene_info, foundation=""):
    """Generate Gen-4 style prompts iteratively from the analyzed scene.

    `foundation` is accepted for interface compatibility with the UI wiring
    but is not currently used. Falls back to the canonical Gen-4 example when
    no scene description is available; returns an error string on failure.
    """
    try:
        if scene_info and scene_info.get('basic_description'):
            description = scene_info['basic_description'].lower()
            # Detect subject. Bug fix: test 'woman' before 'man' β€” "woman"
            # contains the substring "man", so women were labelled "The man".
            if 'woman' in description:
                subject = "The woman"
            elif 'man' in description:
                subject = "The man"
            elif 'person' in description:
                subject = "The person"
            else:
                subject = "The subject"
            # Choose plausible actions based on the scene description.
            if any(word in description for word in ['sitting', 'seated']):
                actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
            elif any(word in description for word in ['standing', 'portrait']):
                actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
            else:
                actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']
            action = random.choice(actions)
            # Build the prompt iteratively, one Gen-4 layer at a time.
            basic = f"{subject} {action}"
            with_motion = f"{basic} smoothly"
            with_camera = f"{with_motion}. Camera captures steadily"
            # Add style based on composition. Bug fix: compare case-insensitively β€”
            # the analyzer emits "Vertical portrait shot" (lowercase 'p'), so the
            # old check for 'Portrait' could never match.
            composition = scene_info.get('composition', '').lower()
            if 'wide' in composition:
                style_addition = "Wide cinematic framing"
            elif 'portrait' in composition:
                style_addition = "Intimate portrait lighting"
            else:
                style_addition = "Professional documentary style"
            with_style = f"{with_camera}. {style_addition}."
            return f"""πŸš€ **Gen-4 Iterative Building:**
**Basic**: {basic}
**+ Motion**: {with_motion}
**+ Camera**: {with_camera}
**+ Style**: {with_style}"""
        else:
            return """πŸš€ **Gen-4 Iterative Building:**
**Basic**: The subject walks forward
**+ Camera**: The subject walks forward. Handheld camera follows
**+ Scene**: The subject walks forward. Handheld camera follows. Dust trails behind
**+ Style**: The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."""
    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"
def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from the Custom Builder selections.

    SARA mode yields "[foundation] [motions] while [reference] [style]";
    Gen-4 mode joins every selected element into period-separated sentences.
    """
    if approach == "SARA":
        # SARA shape: [Subject] [Action] while [Reference], [Atmosphere]
        segments = [foundation] if foundation else []
        movements = list(subject_motion or []) + list(scene_motion or [])
        if movements:
            segments.append(", ".join(movements))
        # Spatial reference defaults to a steady background.
        reference = f"while {camera_motion}" if camera_motion else "while background remains steady"
        segments.append(reference)
        if style:
            segments.append(style)
        return " ".join(segments)
    # Gen-4 shape: flat, one element per sentence, in build order.
    element_groups = (
        [foundation] if foundation else [],
        list(subject_motion) if subject_motion else [],
        [camera_motion] if camera_motion else [],
        list(scene_motion) if scene_motion else [],
        [style] if style else [],
    )
    pieces = [item for group in element_groups for item in group]
    return ". ".join(pieces) if pieces else "The subject moves naturally"
# Create the Gradio interface
def create_interface():
    """Create the Gradio interface.

    Builds a five-tab Blocks app (guide, image analysis, AI generator,
    Gen-4 builder, custom builder) and wires component events to the
    module-level handler functions. Returns the gr.Blocks instance.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
        # Header
        gr.Markdown("# 🎬 AI Video Prompt Generator - πŸ€– SARA-Zephyr AI Powered")
        gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")
        # State variables shared across tabs: last image analysis + refinement chat log.
        scene_state = gr.State({})
        chat_history_state = gr.State([])
        with gr.Tabs():
            # Tab 1: Learning Guide (static markdown only, no events)
            with gr.Tab("πŸ“š Prompting Guide"):
                gr.Markdown(unified_instructions)
                # Advanced tips
                with gr.Accordion("🎯 Advanced Tips", open=False):
                    gr.Markdown("""
## Advanced Prompting Strategies
### 🎨 Style Integration
- **Cinematography**: "Dutch angle," "Extreme close-up," "Bird's eye view"
- **Lighting**: "Golden hour," "Neon glow," "Harsh shadows," "Soft diffused light"
- **Movement Quality**: "Fluid motion," "Mechanical precision," "Organic flow"
### ⚑ Motion Types
- **Subject Motion**: Walking, running, dancing, gesturing
- **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
- **Environmental**: Wind, water flow, particle effects, lighting changes
""")
            # Tab 2: Image Analysis (BLIP caption + Zephyr insights + samples)
            with gr.Tab("πŸ“· Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="Upload Image for Analysis",
                            type="pil"
                        )
                        analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary")
                    with gr.Column(scale=2):
                        analysis_output = gr.Markdown(label="AI Analysis Results")
                # Sample prompts section: three read-only copyable textboxes.
                with gr.Group():
                    gr.Markdown("### πŸ’‘ Sample Prompts")
                    sample_btn = gr.Button("🎲 Generate Sample Prompts")
                    sample_prompts = [
                        gr.Textbox(
                            label=f"Sample {i+1}",
                            lines=2,
                            interactive=False,
                            show_copy_button=True
                        )
                        for i in range(3)
                    ]
            # Tab 3: AI Prompt Generator (free-form idea -> optimized prompt + refinement loop)
            with gr.Tab("πŸ€– AI Prompt Generator"):
                with gr.Row():
                    with gr.Column():
                        user_idea = gr.Textbox(
                            label="Your Video Idea (any language)",
                            placeholder="e.g., 'el personaje se quita la nariz' or 'character walks slowly'",
                            lines=3
                        )
                        optimize_btn = gr.Button("πŸš€ Generate Optimized Prompt", variant="primary")
                        optimized_prompt = gr.Textbox(
                            label="AI-Optimized Video Prompt",
                            lines=4,
                            interactive=True,
                            show_copy_button=True
                        )
                    with gr.Column():
                        gr.Markdown("### πŸ”„ Refine Your Prompt")
                        feedback_input = gr.Textbox(
                            label="Feedback/Changes",
                            placeholder="e.g., 'make it more dramatic' or 'add camera movement'",
                            lines=2
                        )
                        refine_btn = gr.Button("πŸ”„ Refine Prompt")
                        # Chat history
                        with gr.Accordion("πŸ’¬ Refinement History", open=False):
                            # type='messages' expects role/content dict entries in the history.
                            chat_display = gr.Chatbot(height=300, type='messages')
            # Tab 4: Gen-4 Method (iterative simple -> complex prompt building)
            with gr.Tab("πŸ“ Gen-4 Official"):
                gr.Markdown("*Official Gen-4 method: Simple β†’ Complex building*")
                with gr.Row():
                    foundation_gen4 = gr.Textbox(
                        label="Foundation (Optional)",
                        placeholder="e.g., 'The subject walks forward'",
                        lines=1
                    )
                generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                gen4_output = gr.Textbox(
                    label="Gen-4 Style Prompts",
                    lines=8,
                    interactive=False,
                    show_copy_button=True
                )
            # Tab 5: Custom Builder (checkbox/dropdown driven prompt assembly)
            with gr.Tab("πŸ› οΈ Custom Builder"):
                gr.Markdown("## Build Your Custom Prompt")
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                build_custom_btn = gr.Button("πŸ”¨ Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True,
                    show_copy_button=True
                )
        # Event handlers: wire buttons/state to the module-level handler functions.
        analyze_btn.click(
            fn=analyze_image_with_zephyr,
            inputs=[image_input],
            outputs=[analysis_output, scene_state]
        )
        sample_btn.click(
            fn=generate_sample_prompts_with_zephyr,
            inputs=[scene_state],
            outputs=sample_prompts
        )
        optimize_btn.click(
            fn=optimize_user_prompt_with_zephyr,
            inputs=[user_idea, scene_state],
            outputs=[optimized_prompt]
        )
        refine_btn.click(
            fn=refine_prompt_with_zephyr,
            inputs=[optimized_prompt, feedback_input, chat_history_state, scene_state],
            outputs=[optimized_prompt, chat_history_state]
        )
        # Update chat display when history changes (identity passthrough).
        chat_history_state.change(
            fn=lambda history: history,
            inputs=[chat_history_state],
            outputs=[chat_display]
        )
        generate_gen4_btn.click(
            fn=generate_gen4_prompts,
            inputs=[scene_state, foundation_gen4],
            outputs=[gen4_output]
        )
        build_custom_btn.click(
            fn=build_custom_prompt,
            inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
            outputs=[custom_output]
        )
    return demo
# Launch the app
if __name__ == "__main__":
    print("🎬 Starting AI Video Prompt Generator with SARA-Zephyr...")
    print(f"πŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
    print("πŸ”§ Loading models (this may take a few minutes)...")
    try:
        # Primary launch: public share link, bind all interfaces, debug output on.
        demo = create_interface()
        print("βœ… Interface created successfully!")
        print("πŸš€ Launching application...")
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            debug=True,
            show_error=True
        )
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        print("πŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
        print("πŸ“¦ Required packages:")
        print(" pip install torch transformers gradio pillow accelerate bitsandbytes")
        # Alternative launch attempt: local-only, no share link, debug off.
        print("\nπŸ”„ Attempting alternative launch...")
        try:
            demo = create_interface()
            demo.launch(
                share=False,
                server_name="127.0.0.1",
                server_port=7860,
                debug=False
            )
        except Exception as e2:
            print(f"❌ Alternative launch failed: {e2}")
            print("\nπŸ’‘ Troubleshooting tips:")
            print("1. Ensure CPU resources are sufficient.")
            print("2. Check CPU usage: top or htop")
            print("3. Try reducing model precision: set torch_dtype=torch.float32")
            print("4. Monitor memory usage: free -h")