Transformers
Diffusers
Safetensors
wruisi committed on
Commit
0970deb
·
verified ·
1 Parent(s): b6219b6

Upload example.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. example.py +93 -0
example.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VBVR-Wan2.2 Image-to-Video Inference Example
4
+
5
+ Generate a video from a reference image using the VBVR-Wan2.2 model.
6
+ Usage:
7
+ python inference.py --model_path /path/to/VBVR-Wan2.2
8
+ """
9
+
10
+ import os
11
+ import torch
12
+ from PIL import Image
13
+ from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
14
+ from diffusers.utils import export_to_video
15
+
16
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Configuration (only change model_path) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
17
+ import argparse
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument("--model_path", type=str, default="VBVR-Wan2.2")
20
+ args = parser.parse_args()
21
+ model_path = args.model_path
22
+
23
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
24
+
25
+ # Paths derived from model_path
26
+ image_path = os.path.join(model_path, "assets", "first_frame.png")
27
+ output_path = "output.mp4"
28
+
29
+ # Prompt
30
+ prompt = (
31
+ "The scene contains two types of shapes, each type has three shapes of "
32
+ "different sizes arranged randomly. Keep all shapes unchanged in appearance "
33
+ "(type, size, and color). Only rearrange their positions: first group the "
34
+ "shapes by type, then within each group, sort the shapes from smallest to "
35
+ "largest (left to right), and arrange all shapes in a single horizontal "
36
+ "line from left to right."
37
+ )
38
+ negative_prompt = (
39
+ "่‰ฒ่ฐƒ่‰ณไธฝ๏ผŒ่ฟ‡ๆ›๏ผŒ้™ๆ€๏ผŒ็ป†่Š‚ๆจก็ณŠไธๆธ…๏ผŒๅญ—ๅน•๏ผŒ้ฃŽๆ ผ๏ผŒไฝœๅ“๏ผŒ็”ปไฝœ๏ผŒ็”ป้ข๏ผŒ้™ๆญข๏ผŒ"
40
+ "ๆ•ดไฝ“ๅ‘็ฐ๏ผŒๆœ€ๅทฎ่ดจ้‡๏ผŒไฝŽ่ดจ้‡๏ผŒJPEGๅŽ‹็ผฉๆฎ‹็•™๏ผŒไธ‘้™‹็š„๏ผŒๆฎ‹็ผบ็š„๏ผŒๅคšไฝ™็š„ๆ‰‹ๆŒ‡๏ผŒ"
41
+ "็”ปๅพ—ไธๅฅฝ็š„ๆ‰‹้ƒจ๏ผŒ็”ปๅพ—ไธๅฅฝ็š„่„ธ้ƒจ๏ผŒ็•ธๅฝข็š„๏ผŒๆฏๅฎน็š„๏ผŒๅฝขๆ€็•ธๅฝข็š„่‚ขไฝ“๏ผŒๆ‰‹ๆŒ‡่žๅˆ๏ผŒ"
42
+ "้™ๆญขไธๅŠจ็š„็”ป้ข๏ผŒๆ‚ไนฑ็š„่ƒŒๆ™ฏ๏ผŒไธ‰ๆก่…ฟ๏ผŒ่ƒŒๆ™ฏไบบๅพˆๅคš๏ผŒๅ€’็€่ตฐ"
43
+ )
44
+
45
+ # Generation settings
46
+ num_frames = 96
47
+ num_inference_steps = 50
48
+ guidance_scale = 5.0
49
+ seed = 1
50
+
51
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Load Pipeline โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
52
+
53
+ print(f"Loading model from: {model_path}")
54
+
55
+ vae = AutoencoderKLWan.from_pretrained(
56
+ model_path, subfolder="vae", torch_dtype=torch.float32
57
+ )
58
+
59
+ pipe = WanImageToVideoPipeline.from_pretrained(
60
+ model_path,
61
+ vae=vae,
62
+ torch_dtype=torch.bfloat16,
63
+ )
64
+ pipe.enable_model_cpu_offload()
65
+
66
+ print(f"Pipeline loaded. boundary_ratio = {pipe.config.boundary_ratio}")
67
+
68
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Load Image โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
69
+
70
+ print(f"Loading image: {image_path}")
71
+ image = Image.open(image_path).convert("RGB")
72
+ width, height = image.size
73
+ print(f"Image size: {width}x{height}")
74
+
75
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Generate Video โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
76
+
77
+ print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps")
78
+ generator = torch.Generator(device="cpu").manual_seed(seed)
79
+
80
+ output = pipe(
81
+ image=image,
82
+ prompt=prompt,
83
+ negative_prompt=negative_prompt,
84
+ height=height,
85
+ width=width,
86
+ num_frames=num_frames,
87
+ num_inference_steps=num_inference_steps,
88
+ guidance_scale=guidance_scale,
89
+ generator=generator,
90
+ )
91
+
92
+ export_to_video(output.frames[0], output_path, fps=16)
93
+ print(f"Video saved to: {output_path}")