Spaces:

efecelik
/

video-generator

Paused

File size: 4,613 Bytes

c60e9e5

import spaces
import torch
import gradio as gr
import numpy as np
import random
from PIL import Image
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video
import tempfile
import os

# Model configuration
# Checkpoint identifier handed to `from_pretrained` below.
MODEL_ID = "THUDM/CogVideoX-5b-I2V"
# Largest signed 32-bit integer; used as the upper bound when a random seed is drawn.
MAX_SEED = np.iinfo(np.int32).max

# Load the pipeline once at import time so that every call to `generate_video`
# reuses the same `pipe` object instead of reloading the weights.
print("Loading CogVideoX pipeline...")
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # load weights in bfloat16
)
# Memory-saving switches provided by diffusers: model CPU offload for the
# pipeline, plus sliced and tiled processing in the VAE.
# NOTE(review): diffusers advises against manually moving a pipeline to the GPU
# once model CPU offload is enabled — confirm callers do not rely on `pipe.to(...)`.
pipe.enable_model_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
print("Pipeline loaded!")

def resize_image(image: Image.Image, max_size: int = 720) -> Image.Image:
    """Resize image to fit within max_size while maintaining aspect ratio.

    An image whose longest side is already at most ``max_size`` is returned
    unchanged. A larger image is scaled down so that its longest side equals
    ``max_size``; both target dimensions are then rounded down to a multiple
    of 16, but never below 16.

    Args:
        image: The PIL image to resize.
        max_size: Maximum allowed length, in pixels, of the longest side.

    Returns:
        The original image if no resize was needed, otherwise a new image
        resampled with LANCZOS filtering.
    """
    width, height = image.size
    if max(width, height) <= max_size:
        return image

    if width > height:
        new_width = max_size
        new_height = int(height * max_size / width)
    else:
        new_height = max_size
        new_width = int(width * max_size / height)

    # Make dimensions divisible by 16. Clamp to one full 16-pixel block so a
    # very elongated input (e.g. 1000x10) cannot round down to a zero-sized
    # side, which Image.resize would reject.
    new_width = max(16, (new_width // 16) * 16)
    new_height = max(16, (new_height // 16) * 16)
    return image.resize((new_width, new_height), Image.LANCZOS)

@spaces.GPU(duration=300)
def generate_video(
    image: Image.Image,
    prompt: str,
    negative_prompt: str = "",
    num_frames: int = 49,
    guidance_scale: float = 6.0,
    num_inference_steps: int = 50,
    seed: int = -1,
):
    """Generate a video from an image and a text prompt.

    Args:
        image: Source image that conditions the video.
        prompt: Description of the desired motion. An empty or
            whitespace-only prompt is replaced by a generic default.
        negative_prompt: Text describing what the result should avoid.
        num_frames: Number of frames to generate.
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_inference_steps: Number of denoising steps.
        seed: Seed for the random generator; ``-1`` (or ``None``) draws a
            random seed in ``[0, MAX_SEED]``.

    Returns:
        A ``(video_path, seed)`` tuple: the path of the exported ``.mp4``
        file (written at 8 fps) and the integer seed that was actually used.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        raise gr.Error("Please upload an image!")

    if not prompt or not prompt.strip():
        prompt = "Make this image come alive with smooth, cinematic motion"

    # UI widgets can deliver numbers as floats (or None for a cleared field),
    # while torch.Generator.manual_seed and the step/frame counts need ints.
    if seed is None or int(seed) == -1:
        seed = random.randint(0, MAX_SEED)
    else:
        seed = int(seed)
    num_frames = int(num_frames)
    num_inference_steps = int(num_inference_steps)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    # Resize image
    image = resize_image(image)

    # No explicit `pipe.to("cuda")` here: the pipeline is set up at import
    # time with `enable_model_cpu_offload()`, and diffusers advises against
    # manually moving an offloaded pipeline to the GPU, since doing so gives
    # up the memory savings.
    with torch.inference_mode():
        video_frames = pipe(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            generator=generator,
        ).frames[0]

    # Reserve a unique output path, then close the handle before exporting so
    # the encoder is not writing to a file this process still holds open.
    # delete=False keeps the file on disk for the caller to serve.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name
    export_to_video(video_frames, video_path, fps=8)
    return video_path, seed

# Gradio UI
# Two-column layout: inputs and generation settings on the left, the resulting
# video and the seed that produced it on the right.
with gr.Blocks(title="Video Generator") as demo:
    gr.Markdown("""
    # 🎬 Image to Video Generator

    Upload an image and describe the motion you want. Powered by CogVideoX.

    **Tips:**
    - Use clear, descriptive prompts about motion (e.g., "the person waves hello", "the flower blooms")
    - Keep images simple with clear subjects for best results
    """)

    with gr.Row():
        with gr.Column():
            # type="pil" makes Gradio hand generate_video a PIL image.
            image_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the motion you want...",
                value="Make this image come alive with smooth, cinematic motion"
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt (optional)",
                placeholder="What to avoid...",
                value="blurry, low quality, distorted"
            )

            with gr.Row():
                # NOTE(review): the slider allows up to 81 frames; confirm the
                # installed pipeline/checkpoint accepts values above the
                # default of 49.
                num_frames = gr.Slider(
                    minimum=17, maximum=81, value=49, step=8,
                    label="Number of Frames"
                )
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=15.0, value=6.0, step=0.5,
                    label="Guidance Scale"
                )

            with gr.Row():
                num_steps = gr.Slider(
                    minimum=20, maximum=100, value=50, step=5,
                    label="Inference Steps"
                )
                # precision=0 rounds the entry to an integer, so the handler
                # always receives a whole-number seed.
                seed_input = gr.Number(
                    value=-1, label="Seed (-1 for random)", precision=0
                )

            generate_btn = gr.Button("🎬 Generate Video", variant="primary")

        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            # Shows the seed actually used so a random run can be reproduced.
            seed_output = gr.Number(label="Seed Used", precision=0)

    # The order of `inputs` must match generate_video's positional parameters.
    generate_btn.click(
        fn=generate_video,
        inputs=[image_input, prompt_input, negative_prompt, num_frames, guidance_scale, num_steps, seed_input],
        outputs=[video_output, seed_output]
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()