"""Example script: Wan2.1 T2V-1.3B text-to-video inference with InstanceV arguments.

Builds a ``WanVideoPipeline`` on a CUDA device in bfloat16, generates one clip
from a global prompt plus a single instance prompt tied to a fixed bounding
box, and writes the result to ``video_1_instancev.mp4``.
"""
import torch

from diffsynth.utils.data import save_video, VideoData
from diffsynth.pipelines.wan_video_instanceV import WanVideoPipeline, ModelConfig

# Single source of truth for the checkpoint repository, so the weights and the
# tokenizer are always taken from the same model id.
MODEL_ID = "Wan-AI/Wan2.1-T2V-1.3B"

# File patterns of the checkpoint parts to load (judging by the file names:
# diffusion model weights, UMT5 text encoder, VAE).
WEIGHT_FILE_PATTERNS = (
    "diffusion_pytorch_model*.safetensors",
    "models_t5_umt5-xxl-enc-bf16.pth",
    "Wan2.1_VAE.pth",
)

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id=MODEL_ID, origin_file_pattern=pattern)
        for pattern in WEIGHT_FILE_PATTERNS
    ],
    tokenizer_config=ModelConfig(
        model_id=MODEL_ID,
        origin_file_pattern="google/umt5-xxl/",
    ),
)

H, W, num_frames = 480, 832, 81

# Latent temporal length: (num_frames - 1) // 4 + 1 = 21 for 81 frames.
# Per the original author's note this mirrors the computation in
# NoiseInitializer -- verify against the pipeline if num_frames changes.
f_lat = (num_frames - 1) // 4 + 1

# Example with a single instance (the puppy): one rough bounding box, held
# fixed for the whole clip. Format is (x0, y0, x1, y1), scaled by the output
# width/height.
dog_box = (0.20 * W, 0.35 * H, 0.80 * W, 0.92 * H)

# One entry per latent frame; each entry lists the boxes of all instances
# (here just the one), in the same order as `instance_prompts` below.
instance_bboxes = [[dog_box] for _ in range(f_lat)]

video = pipe(
    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。中景侧面移动视角。",
    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,最差质量,低质量,JPEG压缩残留,畸形,毁容,多余的手指,背景人很多",
    seed=0,
    height=H,
    width=W,
    num_frames=num_frames,
    tiled=True,
    # ===== InstanceV new args =====
    instance_prompts=[
        "一只棕黄色、毛发柔软的小狗,奔跑时耳朵竖起,表情欢快",
    ],
    instance_bboxes=instance_bboxes,
    # ===== SAUG (optional) =====
    saug_scale=0.6,  # the original author suggests trying values in 0.3-1.0
    saug_drop_prob=0.0,
)

save_video(video, "video_1_instancev.mp4", fps=15, quality=5)