import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import random

# Check GPU availability once at import time.
use_gpu = torch.cuda.is_available()

# Models are loaded lazily on first use (see load_models()).
processor, model, zephyr_generator = None, None, None


def load_models():
    """Load the BLIP captioning model and the SARA-Zephyr text-generation
    pipeline on first use.

    Safe to call repeatedly: it is a no-op once all three globals are set.
    Every function that touches ``zephyr_generator`` must call this first,
    otherwise the pipeline may still be ``None``.
    """
    global processor, model, zephyr_generator
    if processor is None or model is None or zephyr_generator is None:
        print("Loading BLIP model...")
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large",
            torch_dtype=torch.float32  # Use float32 for CPU
        )
        print("āœ… BLIP model loaded successfully!")

        print("Loading SARA-Zephyr fine-tuned model...")
        zephyr_generator = pipeline(
            "text-generation",
            model="Malaji71/SARA-Zephyr",  # fine-tuned model
            torch_dtype=torch.float32,  # Use float32 for CPU
            device_map="auto" if use_gpu else None  # auto device mapping only on GPU
        )
        print("āœ… SARA-Zephyr fine-tuned model loaded successfully!")


# Universal Video Prompting Guide combining Gen-4 + SARA (rendered as Markdown in Tab 1).
unified_instructions = """
# šŸŽ¬ Universal Video Prompting Guide

*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*

## Core Principles (Universal)

āœ… **Focus on MOTION, not static description**
āœ… **Use positive phrasing exclusively**
āœ… **Start simple, iterate progressively**
āœ… **Refer to subjects in general terms** ("the subject," "the woman")
āœ… **Keep prompts direct and easily understood**

## Two Complementary Approaches

### šŸš€ **Gen-4 Official Method** (Recommended for beginners)

**Structure**: Simple iterative building

1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions

**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."

### šŸŽÆ **SARA Framework** (Advanced precision)

**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]

- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style

**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]

**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
"""


def analyze_image_with_zephyr(image):
    """Analyze an uploaded image with BLIP (caption) + SARA-Zephyr (insights).

    Args:
        image: PIL Image or numpy array from the Gradio image input; may be None.

    Returns:
        Tuple of (markdown analysis text, scene_info dict). On failure the
        dict is empty so downstream handlers fall back gracefully.
    """
    if image is None:
        return "Please upload an image first.", {}

    try:
        # Lazy load models
        load_models()

        # Convert to PIL if Gradio handed us a numpy array
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Classify composition from the aspect ratio
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Generate caption with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Use Zephyr for advanced analysis
        enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)

        # Create comprehensive analysis
        analysis = f"""šŸ“Š **Image Analysis:**
• **Dimensions**: {width} x {height}
• **Composition**: {composition}
• **Aspect Ratio**: {aspect_ratio:.2f}

šŸŽØ **Scene Description**: "{basic_caption}"

šŸ¤– **AI Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}

šŸ’” **Motion Insights**:
{chr(10).join(f"• {insight}" for insight in enhanced_analysis['motion_insights'])}

šŸŽÆ **Recommended Approach**: {enhanced_analysis['recommended_approach']}"""

        # Scene info consumed by the prompt-generation handlers
        scene_info = {
            'basic_description': basic_caption,
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'enhanced_analysis': enhanced_analysis
        }

        return analysis, scene_info

    except Exception as e:
        return f"Error analyzing image: {str(e)}", {}


def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
    """Run SARA-Zephyr over a BLIP caption to extract motion/camera insights.

    Returns a dict with 'scene_interpretation', 'motion_insights' (at most 6
    strings) and 'recommended_approach'.
    """
    # Guard: this may be called from handlers that didn't load models yet.
    load_models()

    analysis_prompt = f"""<|system|>
You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
<|user|>
Image description: "{basic_caption}"
Image composition: {composition}
Aspect ratio: {aspect_ratio:.2f}

Please provide:
1. Type of motion that would work best
2. Recommended camera movements
3. Emotional tone/style suggestions
4. Best prompting approach (SARA vs Gen-4)

Be concise and practical.
<|assistant|>"""

    response = zephyr_generator(
        analysis_prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Keep only the assistant's continuation, discarding the prompt echo.
    ai_analysis = response[0]['generated_text'].split("<|assistant|>")[-1].strip()

    lines = ai_analysis.split('\n')
    motion_insights = []
    recommended_approach = "SARA framework recommended for precise control"

    for line in lines:
        if line.strip():
            if any(keyword in line.lower() for keyword in ['motion', 'movement', 'camera', 'lighting']):
                motion_insights.append(line.strip('- ').strip())
            elif 'sara' in line.lower() or 'gen-4' in line.lower():
                recommended_approach = line.strip('- ').strip()

    return {
        'scene_interpretation': ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed",
        'motion_insights': motion_insights[:6],
        'recommended_approach': recommended_approach
    }


def generate_sample_prompts_with_zephyr(scene_info=None):
    """Generate three sample video prompts, contextual when scene info exists.

    Fix 1: wrap the Zephyr call in try/except so the static fallback prompts
    are actually used when generation fails (the old code documented a
    fallback but let exceptions propagate).
    Fix 2: call load_models() before using the pipeline (it is None until
    the first image analysis).
    Fix 3: formatting a missing aspect ratio used ``'N/A':.2f`` which raises
    ValueError; format the number only when it is numeric.
    """
    if scene_info and scene_info.get('basic_description'):
        try:
            load_models()

            ar = scene_info.get('aspect_ratio')
            ar_text = f"{ar:.2f}" if isinstance(ar, (int, float)) else "N/A"

            context_prompt = f"""<|system|>
Generate 3 professional video prompts using the SARA framework based on this image analysis.
<|user|>
Image description: {scene_info['basic_description']}
Composition: {scene_info.get('composition', 'Balanced')}
Aspect Ratio: {ar_text}

Remember the SARA framework: Subject + Action + Reference + Atmosphere
<|assistant|>"""

            response = zephyr_generator(
                context_prompt,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.8,
                pad_token_id=zephyr_generator.tokenizer.eos_token_id
            )

            # Extract and clean prompts
            prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
            prompts = [p.strip('123.-• ') for p in prompts_text.split('\n') if p.strip()]

            # Return first 3 clean prompts
            if len(prompts) >= 3:
                return prompts[:3]
        except Exception as e:
            print(f"Sample prompt generation failed, using fallbacks: {e}")

    # Fallback prompts if Zephyr fails or no scene info
    base_prompts = [
        "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
        "A dramatic close-up captures the subject's expression as they speak directly to the camera.",
        "The scene transitions with a handheld camera following the subject through a bustling environment."
    ]
    return base_prompts


def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
    """Turn a free-form user idea into a SARA-structured video prompt.

    Fix: call load_models() first — this handler is reachable without a prior
    image analysis, in which case ``zephyr_generator`` was still None and the
    old code crashed with ``TypeError: 'NoneType' object is not callable``.
    """
    if not user_idea.strip():
        return "Please enter your idea first."

    try:
        load_models()
    except Exception as e:
        return f"Error loading models: {str(e)}"

    # Create context from scene if available
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    optimization_prompt = f"""<|system|>
You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.

Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
User's idea: "{user_idea}"
{context}

Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
<|assistant|>"""

    response = zephyr_generator(
        optimization_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Extract optimized prompt
    optimized = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    return optimized


def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
    """Refine ``current_prompt`` from user feedback; append to chat history.

    Fix 1: call load_models() — reachable before any image analysis.
    Fix 2: the chat display is ``gr.Chatbot(type='messages')``, which requires
    ``{'role': ..., 'content': ...}`` dicts; the old code appended
    ``[feedback, refined]`` pairs, which that component rejects.
    """
    if not feedback.strip():
        return current_prompt, chat_history

    try:
        load_models()
    except Exception:
        return current_prompt, chat_history

    # Create refinement context
    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    refinement_prompt = f"""<|system|>
You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.

Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
Current prompt: "{current_prompt}"
Feedback: "{feedback}"
{context}

Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
<|assistant|>"""

    response = zephyr_generator(
        refinement_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )

    # Extract refined prompt
    refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()

    # Update chat history in the messages format gr.Chatbot(type='messages') expects
    new_chat_history = chat_history + [
        {"role": "user", "content": feedback},
        {"role": "assistant", "content": refined},
    ]

    return refined, new_chat_history


def generate_gen4_prompts(scene_info, foundation=""):
    """Build a Gen-4-style prompt iteratively (basic → motion → camera → style).

    Note: ``foundation`` is accepted for interface compatibility with the
    Gradio handler but is not currently used in prompt construction.
    """
    try:
        if scene_info and scene_info.get('basic_description'):
            description = scene_info['basic_description']

            # Detect subject from the caption text
            if 'man' in description.lower():
                subject = "The man"
            elif 'woman' in description.lower():
                subject = "The woman"
            elif 'person' in description.lower():
                subject = "The person"
            else:
                subject = "The subject"

            # Pick plausible actions based on scene keywords
            if any(word in description.lower() for word in ['sitting', 'seated']):
                actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
            elif any(word in description.lower() for word in ['standing', 'portrait']):
                actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
            else:
                actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']

            action = random.choice(actions)

            # Build Gen-4 iteratively
            basic = f"{subject} {action}"
            with_motion = f"{basic} smoothly"
            with_camera = f"{with_motion}. Camera captures steadily"

            # Add style based on composition
            composition = scene_info.get('composition', '')
            if 'Wide' in composition:
                style_addition = "Wide cinematic framing"
            elif 'Portrait' in composition:
                style_addition = "Intimate portrait lighting"
            else:
                style_addition = "Professional documentary style"

            with_style = f"{with_camera}. {style_addition}."

            return f"""šŸš€ **Gen-4 Iterative Building:**

**Basic**: {basic}

**+ Motion**: {with_motion}

**+ Camera**: {with_camera}

**+ Style**: {with_style}"""
        else:
            return """šŸš€ **Gen-4 Iterative Building:**

**Basic**: The subject walks forward

**+ Camera**: The subject walks forward. Handheld camera follows

**+ Scene**: The subject walks forward. Handheld camera follows. Dust trails behind

**+ Style**: The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."""

    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"


def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from the Custom Builder widgets.

    Args:
        foundation: Free-text opening clause.
        subject_motion / scene_motion: Lists from CheckboxGroups (may be empty).
        camera_motion / style: Dropdown selections (strings).
        approach: "SARA" → [Subject] [Action] while [Reference], [Atmosphere];
            anything else → Gen-4 sentence-by-sentence joining.
    """
    if approach == "SARA":
        parts = []
        if foundation:
            parts.append(foundation)

        # Add motion elements
        motion_parts = []
        if subject_motion:
            motion_parts.extend(subject_motion)
        if scene_motion:
            motion_parts.extend(scene_motion)
        if motion_parts:
            parts.append(", ".join(motion_parts))

        # Reference (camera stability)
        if camera_motion:
            parts.append(f"while {camera_motion}")
        else:
            parts.append("while background remains steady")

        # Atmosphere
        if style:
            parts.append(style)

        return " ".join(parts)
    else:  # Gen-4 style: simple iterative building
        parts = []
        if foundation:
            parts.append(foundation)
        if subject_motion:
            parts.extend(subject_motion)
        if camera_motion:
            parts.append(camera_motion)
        if scene_motion:
            parts.extend(scene_motion)
        if style:
            parts.append(style)

        return ". ".join(parts) if parts else "The subject moves naturally"


def create_interface():
    """Create the Gradio Blocks interface (tabs, widgets, event wiring)."""
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
        # Header
        gr.Markdown("# šŸŽ¬ AI Video Prompt Generator - šŸ¤– SARA-Zephyr AI Powered")
        gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")

        # Cross-tab state
        scene_state = gr.State({})
        chat_history_state = gr.State([])

        with gr.Tabs():
            # Tab 1: Learning Guide
            with gr.Tab("šŸ“š Prompting Guide"):
                gr.Markdown(unified_instructions)

                with gr.Accordion("šŸŽÆ Advanced Tips", open=False):
                    gr.Markdown("""
## Advanced Prompting Strategies

### šŸŽØ Style Integration
- **Cinematography**: "Dutch angle," "Extreme close-up," "Bird's eye view"
- **Lighting**: "Golden hour," "Neon glow," "Harsh shadows," "Soft diffused light"
- **Movement Quality**: "Fluid motion," "Mechanical precision," "Organic flow"

### ⚔ Motion Types
- **Subject Motion**: Walking, running, dancing, gesturing
- **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
- **Environmental**: Wind, water flow, particle effects, lighting changes
""")

            # Tab 2: Image Analysis
            with gr.Tab("šŸ“· Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="Upload Image for Analysis",
                            type="pil"
                        )
                        analyze_btn = gr.Button("šŸ” Analyze Image", variant="primary")
                    with gr.Column(scale=2):
                        analysis_output = gr.Markdown(label="AI Analysis Results")

                with gr.Group():
                    gr.Markdown("### šŸ’” Sample Prompts")
                    sample_btn = gr.Button("šŸŽ² Generate Sample Prompts")
                    sample_prompts = [
                        gr.Textbox(
                            label=f"Sample {i+1}",
                            lines=2,
                            interactive=False,
                            show_copy_button=True
                        )
                        for i in range(3)
                    ]

            # Tab 3: AI Prompt Generator
            with gr.Tab("šŸ¤– AI Prompt Generator"):
                with gr.Row():
                    with gr.Column():
                        user_idea = gr.Textbox(
                            label="Your Video Idea (any language)",
                            placeholder="e.g., 'el personaje se quita la nariz' or 'character walks slowly'",
                            lines=3
                        )
                        optimize_btn = gr.Button("šŸš€ Generate Optimized Prompt", variant="primary")
                        optimized_prompt = gr.Textbox(
                            label="AI-Optimized Video Prompt",
                            lines=4,
                            interactive=True,
                            show_copy_button=True
                        )
                    with gr.Column():
                        gr.Markdown("### šŸ”„ Refine Your Prompt")
                        feedback_input = gr.Textbox(
                            label="Feedback/Changes",
                            placeholder="e.g., 'make it more dramatic' or 'add camera movement'",
                            lines=2
                        )
                        refine_btn = gr.Button("šŸ”„ Refine Prompt")

                with gr.Accordion("šŸ’¬ Refinement History", open=False):
                    chat_display = gr.Chatbot(height=300, type='messages')

            # Tab 4: Gen-4 Method
            with gr.Tab("šŸ“ Gen-4 Official"):
                gr.Markdown("*Official Gen-4 method: Simple → Complex building*")
                with gr.Row():
                    foundation_gen4 = gr.Textbox(
                        label="Foundation (Optional)",
                        placeholder="e.g., 'The subject walks forward'",
                        lines=1
                    )
                    generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                gen4_output = gr.Textbox(
                    label="Gen-4 Style Prompts",
                    lines=8,
                    interactive=False,
                    show_copy_button=True
                )

            # Tab 5: Custom Builder
            with gr.Tab("šŸ› ļø Custom Builder"):
                gr.Markdown("## Build Your Custom Prompt")
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally",
                                 "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects",
                                 "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left",
                                 "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic",
                                 "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                build_custom_btn = gr.Button("šŸ”Ø Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True,
                    show_copy_button=True
                )

        # Event handlers
        analyze_btn.click(
            fn=analyze_image_with_zephyr,
            inputs=[image_input],
            outputs=[analysis_output, scene_state]
        )

        sample_btn.click(
            fn=generate_sample_prompts_with_zephyr,
            inputs=[scene_state],
            outputs=sample_prompts
        )

        optimize_btn.click(
            fn=optimize_user_prompt_with_zephyr,
            inputs=[user_idea, scene_state],
            outputs=[optimized_prompt]
        )

        refine_btn.click(
            fn=refine_prompt_with_zephyr,
            inputs=[optimized_prompt, feedback_input, chat_history_state, scene_state],
            outputs=[optimized_prompt, chat_history_state]
        )

        # Mirror the refinement history (messages format) into the Chatbot display.
        chat_history_state.change(
            fn=lambda history: history,
            inputs=[chat_history_state],
            outputs=[chat_display]
        )

        generate_gen4_btn.click(
            fn=generate_gen4_prompts,
            inputs=[scene_state, foundation_gen4],
            outputs=[gen4_output]
        )

        build_custom_btn.click(
            fn=build_custom_prompt,
            inputs=[custom_foundation, subject_motion, scene_motion,
                    camera_motion, style_motion, approach_selector],
            outputs=[custom_output]
        )

    return demo


# Launch the app
if __name__ == "__main__":
    print("šŸŽ¬ Starting AI Video Prompt Generator with SARA-Zephyr...")
    print(f"šŸ“Š Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
    print("šŸ”§ Loading models (this may take a few minutes)...")

    try:
        demo = create_interface()
        print("āœ… Interface created successfully!")
        print("šŸš€ Launching application...")
        demo.launch(
            share=True,
            server_name="0.0.0.0",
            server_port=7860,
            debug=True,
            show_error=True
        )
    except Exception as e:
        print(f"āŒ Error launching app: {e}")
        print("šŸ”§ Make sure you have sufficient CPU resources and all dependencies installed.")
        print("šŸ“¦ Required packages:")
        print("   pip install torch transformers gradio pillow accelerate bitsandbytes")

        # Alternative launch attempt on localhost without sharing
        print("\nšŸ”„ Attempting alternative launch...")
        try:
            demo = create_interface()
            demo.launch(
                share=False,
                server_name="127.0.0.1",
                server_port=7860,
                debug=False
            )
        except Exception as e2:
            print(f"āŒ Alternative launch failed: {e2}")
            print("\nšŸ’” Troubleshooting tips:")
            print("1. Ensure CPU resources are sufficient.")
            print("2. Check CPU usage: top or htop")
            print("3. Try reducing model precision: set torch_dtype=torch.float32")
            print("4. Monitor memory usage: free -h")