import spaces
import torch
import gradio as gr
import numpy as np
import random
from PIL import Image
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video
import tempfile
import os

# Model configuration
MODEL_ID = "THUDM/CogVideoX-5b-I2V"
MAX_SEED = np.iinfo(np.int32).max

# Load the pipeline once at module import. enable_model_cpu_offload()
# installs accelerate hooks that move each submodule to the GPU on demand,
# so the pipeline must NOT be moved wholesale with .to("cuda") later —
# doing so conflicts with the offload hooks.
print("Loading CogVideoX pipeline...")
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
print("Pipeline loaded!")


def resize_image(image: Image.Image, max_size: int = 720) -> Image.Image:
    """Resize ``image`` so its longest side fits within ``max_size``,
    preserving aspect ratio, and snap both dimensions to multiples of 16.

    The divisible-by-16 snap is applied even when no downscaling is needed,
    because the video pipeline expects 16-aligned spatial dimensions
    (the original code only aligned images that were being downscaled).

    Args:
        image: Source image.
        max_size: Maximum allowed size of the longest side, in pixels.

    Returns:
        The resized (or untouched, if already conforming) image.
    """
    width, height = image.size
    if max(width, height) > max_size:
        if width > height:
            new_width = max_size
            new_height = int(height * max_size / width)
        else:
            new_height = max_size
            new_width = int(width * max_size / height)
    else:
        new_width, new_height = width, height
    # Make dimensions divisible by 16; floor at 16 so an extreme aspect
    # ratio cannot collapse a dimension to zero.
    new_width = max((new_width // 16) * 16, 16)
    new_height = max((new_height // 16) * 16, 16)
    if (new_width, new_height) != (width, height):
        image = image.resize((new_width, new_height), Image.LANCZOS)
    return image


@spaces.GPU(duration=300)
def generate_video(
    image: Image.Image,
    prompt: str,
    negative_prompt: str = "",
    num_frames: int = 49,
    guidance_scale: float = 6.0,
    num_inference_steps: int = 50,
    seed: int = -1,
):
    """Generate a short video from ``image`` guided by ``prompt``.

    Args:
        image: Input image to animate.
        prompt: Motion description; a default is substituted when empty.
        negative_prompt: Qualities to steer away from.
        num_frames: Number of output frames.
        guidance_scale: Classifier-free guidance strength.
        num_inference_steps: Denoising steps.
        seed: RNG seed; -1 picks a random seed.

    Returns:
        Tuple of (path to the generated .mp4 file, seed actually used).

    Raises:
        gr.Error: If no image was uploaded.
    """
    if image is None:
        raise gr.Error("Please upload an image!")

    if not prompt:
        prompt = "Make this image come alive with smooth, cinematic motion"

    # Gradio Number/Slider components deliver floats; the pipeline wants
    # real ints and torch.Generator.manual_seed() requires one.
    num_frames = int(num_frames)
    num_inference_steps = int(num_inference_steps)
    seed = int(seed)
    if seed == -1:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    # Constrain the input to model-friendly dimensions.
    image = resize_image(image)

    # NOTE: no pipe.to("cuda") here — enable_model_cpu_offload() already
    # manages device placement, and moving an offloaded pipeline manually
    # is rejected by diffusers and defeats the memory savings.
    with torch.inference_mode():
        video_frames = pipe(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            generator=generator,
        ).frames[0]

    # Export to a temp .mp4 that Gradio will serve; delete=False because
    # the file must outlive this function call.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        export_to_video(video_frames, f.name, fps=8)
        return f.name, seed


# Gradio UI
with gr.Blocks(title="Video Generator") as demo:
    gr.Markdown("""
    # 🎬 Image to Video Generator

    Upload an image and describe the motion you want. Powered by CogVideoX.

    **Tips:**
    - Use clear, descriptive prompts about motion (e.g., "the person waves hello", "the flower blooms")
    - Keep images simple with clear subjects for best results
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the motion you want...",
                value="Make this image come alive with smooth, cinematic motion"
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt (optional)",
                placeholder="What to avoid...",
                value="blurry, low quality, distorted"
            )
            with gr.Row():
                num_frames = gr.Slider(
                    minimum=17, maximum=81, value=49, step=8,
                    label="Number of Frames"
                )
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=15.0, value=6.0, step=0.5,
                    label="Guidance Scale"
                )
            with gr.Row():
                num_steps = gr.Slider(
                    minimum=20, maximum=100, value=50, step=5,
                    label="Inference Steps"
                )
                # precision=0 would be ideal, but keep the original widget
                # contract; generate_video() casts to int defensively.
                seed_input = gr.Number(
                    value=-1,
                    label="Seed (-1 for random)"
                )
            generate_btn = gr.Button("🎬 Generate Video", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            seed_output = gr.Number(label="Seed Used")

    generate_btn.click(
        fn=generate_video,
        inputs=[image_input, prompt_input, negative_prompt, num_frames,
                guidance_scale, num_steps, seed_input],
        outputs=[video_output, seed_output]
    )

if __name__ == "__main__":
    demo.launch()