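# NOTE: the following lines are Hugging Face file-viewer residue captured
# together with the script; they are kept as comments so the file parses.
# PencilHu's picture
# Upload folder using huggingface_hub
# 1146a67 verified
# Raw
# History Blame Contribute Delete
# 1.89 kB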
import torch
from diffsynth.utils.data import save_video, VideoData
from diffsynth.pipelines.wan_video_instanceV import WanVideoPipeline, ModelConfig
# Build the InstanceV-enabled Wan video pipeline in bfloat16 on the CUDA device.
# Every checkpoint file and the tokenizer are addressed inside one model repo.
_WAN_REPO_ID = "Wan-AI/Wan2.1-T2V-1.3B"

# File patterns of the weights to load from the repo.
# NOTE(review): judging by the file names these look like the diffusion model,
# the T5 text encoder and the VAE -- confirm against the model card.
_WEIGHT_FILE_PATTERNS = (
    "diffusion_pytorch_model*.safetensors",
    "models_t5_umt5-xxl-enc-bf16.pth",
    "Wan2.1_VAE.pth",
)

_weight_configs = [
    ModelConfig(model_id=_WAN_REPO_ID, origin_file_pattern=file_pattern)
    for file_pattern in _WEIGHT_FILE_PATTERNS
]
_tokenizer_config = ModelConfig(
    model_id=_WAN_REPO_ID,
    origin_file_pattern="google/umt5-xxl/",
)

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=_weight_configs,
    tokenizer_config=_tokenizer_config,
)
# Output height, width and frame count; all three are forwarded to the
# pipeline call further down in the script.
H = 480
W = 832
num_frames = 81

# Latent temporal length: 1 + (num_frames - 1) // 4, which is 21 for 81 frames.
# NOTE(review): the original author states that NoiseInitializer computes it
# the same way; that code is not visible from this script -- confirm.
f_lat = 1 + (num_frames - 1) // 4

# Example: a single instance (the puppy) with a rough bounding box that stays
# fixed for the whole clip. Corners are fractions of the frame size scaled by
# W (x values) and H (y values).
dog_box = (W * 0.20, H * 0.35, W * 0.80, H * 0.92)  # (x0, y0, x1, y1)

# One entry per latent frame; each entry is a fresh list holding the box of
# every instance in that frame (only the puppy here).
instance_bboxes = [[frame_box] for frame_box in (dog_box,) * f_lat]
# Generate the clip and write it to disk. The keyword arguments are grouped
# into dictionaries by purpose and unpacked in their original order.

# Base text-to-video arguments: prompts, seed, output size and tiling flag.
_base_args = dict(
    prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。中景侧面移动视角。",
    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,最差质量,低质量,JPEG压缩残留,畸形,毁容,多余的手指,背景人很多",
    seed=0,
    height=H,
    width=W,
    num_frames=num_frames,
    tiled=True,
)

# ===== InstanceV new args =====
# One prompt per instance, plus the per-latent-frame boxes built earlier.
_instance_args = dict(
    instance_prompts=[
        "一只棕黄色、毛发柔软的小狗,奔跑时耳朵竖起,表情欢快",
    ],
    instance_bboxes=instance_bboxes,
)

# ===== SAUG (optional) =====
_saug_args = dict(
    saug_scale=0.6,  # the author suggests trying values from about 0.3 to 1.0
    saug_drop_prob=0.0,
)

video = pipe(**_base_args, **_instance_args, **_saug_args)

# Save the generated frames as an MP4 at 15 fps with quality setting 5.
save_video(video, "video_1_instancev.mp4", fps=15, quality=5)