import spaces
import torch
import gradio as gr
from PIL import Image
from diffusers import WanPipeline, WanImageToVideoPipeline, AutoencoderKLWan
from diffusers.utils import export_to_video

MODEL_ID = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Lazily loaded pipeline cache, keyed by mode. WanPipeline only handles
# text-to-video; when an input image is supplied, image-to-video is served
# through WanImageToVideoPipeline built from the same checkpoint.
pipes = {}


def initialize_pipeline(mode: str = "t2v"):
    """Load and cache the Wan2.2 pipeline for the given mode ("t2v" or "i2v")."""
    if mode not in pipes:
        print(f"Loading Wan2.2-TI2V-5B model ({mode})...")
        # The VAE is kept in float32 for numerical stability, matching the
        # reference diffusers example for this checkpoint.
        vae = AutoencoderKLWan.from_pretrained(
            MODEL_ID,
            subfolder="vae",
            torch_dtype=torch.float32,
        )
        pipeline_cls = WanImageToVideoPipeline if mode == "i2v" else WanPipeline
        pipes[mode] = pipeline_cls.from_pretrained(
            MODEL_ID,
            vae=vae,
            torch_dtype=dtype,
        )
        pipes[mode].to(device)
        print("Model loaded successfully!")
    return pipes[mode]


@spaces.GPU(duration=180)
def generate_video(
    prompt: str,
    image: Image.Image = None,
    width: int = 1280,
    height: int = 704,
    num_frames: int = 73,
    num_inference_steps: int = 35,
    guidance_scale: float = 5.0,
    seed: int = -1,
):
    """
    Generate a video from a text prompt and an optional input image.

    Args:
        prompt: Text description of the video to generate
        image: Optional input image for image-to-video generation
        width: Video width in pixels (default: 1280)
        height: Video height in pixels (default: 704)
        num_frames: Number of frames to generate (default: 73, about 3 seconds at 24 fps)
        num_inference_steps: Number of denoising steps (default: 35 for faster generation)
        guidance_scale: Guidance scale for generation (default: 5.0)
        seed: Random seed for reproducibility (-1 for random)
    """
    try:
        # An input image switches to the image-to-video pipeline.
        pipeline = initialize_pipeline("i2v" if image is not None else "t2v")

        if seed == -1:
            seed = torch.randint(0, 2**32 - 1, (1,)).item()
        generator = torch.Generator(device=device).manual_seed(seed)

        gen_params = {
            "prompt": prompt,
            "height": height,
            "width": width,
            "num_frames": num_frames,
            "guidance_scale": guidance_scale,
            "num_inference_steps": num_inference_steps,
            "generator": generator,
        }
        if image is not None:
            gen_params["image"] = image

        print(f"Generating video with prompt: {prompt}")
        print(f"Parameters: {width}x{height}, {num_frames} frames, seed: {seed}")

        output = pipeline(**gen_params).frames[0]

        output_path = "output.mp4"
        export_to_video(output, output_path, fps=24)

        return output_path, f"Video generated successfully! Seed used: {seed}"
    except Exception as e:
        error_msg = f"Error generating video: {str(e)}"
        print(error_msg)
        return None, error_msg
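

# A minimal direct-call sketch (hypothetical values) for smoke-testing the
# generator without the Gradio UI; uncomment to try it locally:
#
# video_path, status = generate_video(
#     "A koi pond at dawn, gentle ripples, soft morning light",
#     num_frames=49,           # 49 = 4 * 12 + 1, a valid Wan frame count
#     num_inference_steps=25,  # fewer steps for a quicker sanity check
#     seed=7,                  # fixed seed so reruns are reproducible
# )
# print(status, video_path)

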
with gr.Blocks(title="Wan2.2 Video Generation") as demo:
    gr.Markdown(
        """
        # Wan2.2 Video Generation

        Generate high-quality videos from text prompts or images with the Wan2.2-TI2V-5B model.
        It supports both **Text-to-Video** and **Image-to-Video** generation at 720P/24fps.

        **Note:** Generation takes 2-3 minutes. The default settings are tuned to fit the Zero GPU 3-minute limit.
        """
    )

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the video you want to generate...",
                lines=3,
                value="Two anthropomorphic cats in comfy boxing gear fight on stage",
            )
            image_input = gr.Image(
                label="Input Image (Optional - for Image-to-Video)",
                type="pil",
                sources=["upload"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    width_input = gr.Slider(
                        label="Width",
                        minimum=512,
                        maximum=1920,
                        step=64,
                        value=1280,
                    )
                    height_input = gr.Slider(
                        label="Height",
                        minimum=512,
                        maximum=1088,  # on the 64-px step grid (1080 is not reachable from 512)
                        step=64,
                        value=704,
                    )

                # Stepping by 24 from a minimum of 25 keeps every choice at
                # 4k + 1 frames, matching the Wan VAE's 4x temporal compression.
                num_frames_input = gr.Slider(
                    label="Number of Frames (more frames = longer video)",
                    minimum=25,
                    maximum=145,
                    step=24,
                    value=73,
                    info="73 frames ≈ 3 seconds at 24fps (optimized for Zero GPU limits)",
                )

                num_steps_input = gr.Slider(
                    label="Inference Steps (more steps = better quality, slower)",
                    minimum=20,
                    maximum=60,
                    step=5,
                    value=35,
                )

                guidance_scale_input = gr.Slider(
                    label="Guidance Scale (higher = closer to prompt)",
                    minimum=1.0,
                    maximum=15.0,
                    step=0.5,
                    value=5.0,
                )

                seed_input = gr.Number(
                    label="Seed (-1 for random)",
                    value=-1,
                    precision=0,
                )

            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

        with gr.Column():
            video_output = gr.Video(
                label="Generated Video",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                lines=2,
            )

    gr.Examples(
        examples=[
            ["Two anthropomorphic cats in comfy boxing gear fight on stage",
             None, 1280, 704, 73, 35, 5.0, 42],
            ["A serene underwater scene with colorful coral reefs and tropical fish swimming gracefully",
             None, 1280, 704, 73, 35, 5.0, 123],
            ["A bustling futuristic city at night with neon lights and flying cars",
             None, 1280, 704, 73, 35, 5.0, 456],
            ["A peaceful mountain landscape with snow-capped peaks and a flowing river",
             None, 1280, 704, 73, 35, 5.0, 789],
        ],
        inputs=[
            prompt_input,
            image_input,
            width_input,
            height_input,
            num_frames_input,
            num_steps_input,
            guidance_scale_input,
            seed_input,
        ],
        outputs=[video_output, status_output],
        fn=generate_video,
        cache_examples=False,
    )

    generate_btn.click(
        fn=generate_video,
        inputs=[
            prompt_input,
            image_input,
            width_input,
            height_input,
            num_frames_input,
            num_steps_input,
            guidance_scale_input,
            seed_input,
        ],
        outputs=[video_output, status_output],
    )

    gr.Markdown(
        """
        ## Tips for Best Results:
        - Use detailed, descriptive prompts
        - For image-to-video: upload a clear image that matches your prompt
        - More inference steps give better quality but slower generation
        - Adjust the guidance scale to balance creativity against prompt adherence
        - Reuse a seed to reproduce a result
        - Keep generation under 3 minutes to fit Zero GPU limits

        ## Model Information:
        - Model: Wan2.2-TI2V-5B (5B parameters)
        - Resolution: 720P (1280x704 or custom)
        - Frame Rate: 24 fps
        - Default Duration: ~3 seconds (optimized for Zero GPU)
        - Generation Time: ~2-3 minutes on Zero GPU with the default settings
        """
    )


if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()
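    # Spaces needs no launch() arguments; for local debugging one might pass
    # standard Gradio kwargs instead (illustrative, not required here):
    # demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)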