vinesnt committed on
Commit 405de0f · verified · 1 Parent(s): 078b7c3

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ kill_bill.jpeg filter=lfs diff=lfs merge=lfs -text
+ wan22_input_2.jpg filter=lfs diff=lfs merge=lfs -text
+ wan_i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,36 @@
  ---
- title: '1111'
- emoji: 📈
- colorFrom: indigo
- colorTo: purple
+ title: WAN 2.2 3-Step V2V Pipeline
+ emoji: 🎬
+ colorFrom: purple
+ colorTo: blue
  sdk: gradio
- sdk_version: 6.9.0
+ sdk_version: 5.44.1
+ python_version: "3.10"
  app_file: app.py
  pinned: false
- license: openrail
- short_description: 哈哈
+ short_description: I2V + T2V + 3-Step V2V (SAM2 → Composite → VACE)
+ models:
+ - facebook/sam2.1-hiera-large
+ - google/umt5-xxl
+ - Kijai/WanVideo_comfy
+ - linoyts/Wan2.2-T2V-A14B-Diffusers-BF16
+ - lkzd7/WAN2.2_LoraSet_NSFW
+ - r3gm/RIFE
+ - TestOrganizationPleaseIgnore/WAMU_v2_WAN2.2_I2V_LIGHTNING
+ - Wan-AI/Wan2.1-VACE-14B-diffusers
+ - Wan-AI/Wan2.2-T2V-A14B-Diffusers
+ - zerogpu-aoti/Wan2
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # WAN 2.2 Multi-Task Video Generation
+
+ ## Features
+ - **I2V**: Image-to-Video (Lightning 14B, 6-step, FP8+AoT)
+ - **T2V**: Text-to-Video (Lightning 14B, 4-step, Lightning LoRA)
+ - **V2V**: 3-Step Video-to-Video Pipeline
+   1. **SAM2 Segmentation**: Click points on first frame → auto-track through video → mask video
+   2. **Composite + GrowMask**: Original + mask → expanded mask + composite video (automatic)
+   3. **VACE Generation**: Composite + grown mask + reference image + prompt → final video
+
+ ## V2V Workflow
+ Based on ComfyUI workflows: `sam_optimized`, `sam2.1_optimized`, `vace_optimized`
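The three V2V steps listed above can be sketched as a plain-Python data flow; every function below is an illustrative placeholder, not the app's actual API:

```python
# Hypothetical sketch of the 3-step V2V flow; each function stands in for
# the real SAM2 / compositing / VACE calls in app.py.

def sam2_segment(frames, points):
    # Step 1: points clicked on the first frame yield one mask per frame.
    return [f"mask[{f}]" for f in frames]

def composite_and_grow(frames, masks, grow_px=5):
    # Step 2: expand each mask and paint masked regions white on the source.
    grown = [f"grow({m}, {grow_px}px)" for m in masks]
    composite = [f"white_where({m}, {f})" for f, m in zip(frames, masks)]
    return composite, grown

def vace_generate(composite, grown, ref_image, prompt):
    # Step 3: VACE repaints the white regions, guided by prompt + reference.
    return [f"vace({c}, {m})" for c, m in zip(composite, grown)]

frames = ["f0", "f1", "f2"]
masks = sam2_segment(frames, points=[(100, 120)])
composite, grown = composite_and_grow(frames, masks)
final = vace_generate(composite, grown, "ref.jpg", "a red jacket")
```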
aoti.py ADDED
@@ -0,0 +1,35 @@
+ """Load ZeroGPU ahead-of-time (AoT) compiled kernels into a model's repeated transformer blocks."""
+
+ from typing import cast
+
+ import torch
+ from huggingface_hub import hf_hub_download
+ from spaces.zero.torch.aoti import ZeroGPUCompiledModel
+ from spaces.zero.torch.aoti import ZeroGPUWeights
+ from torch._functorch._aot_autograd.subclass_parametrization import unwrap_tensor_subclass_parameters
+
+
+ def _shallow_clone_module(module: torch.nn.Module) -> torch.nn.Module:
+     """Shallow-clone a module tree so its state can be re-wrapped without touching the original."""
+     clone = object.__new__(module.__class__)
+     clone.__dict__ = module.__dict__.copy()
+     clone._parameters = module._parameters.copy()
+     clone._buffers = module._buffers.copy()
+     clone._modules = {k: _shallow_clone_module(v) for k, v in module._modules.items() if v is not None}
+     return clone
+
+
+ def aoti_blocks_load(module: torch.nn.Module, repo_id: str, variant: str | None = None):
+     """Download one AoT package per repeated block class and patch it in as that block's forward."""
+     repeated_blocks = cast(list[str], module._repeated_blocks)
+     aoti_files = {name: hf_hub_download(
+         repo_id=repo_id,
+         filename='package.pt2',
+         subfolder=name if variant is None else f'{name}.{variant}',
+     ) for name in repeated_blocks}
+     for block_name, aoti_file in aoti_files.items():
+         for block in module.modules():
+             if block.__class__.__name__ == block_name:
+                 block_ = _shallow_clone_module(block)
+                 unwrap_tensor_subclass_parameters(block_)
+                 weights = ZeroGPUWeights(block_.state_dict())
+                 block.forward = ZeroGPUCompiledModel(aoti_file, weights)
app.py ADDED
@@ -0,0 +1,995 @@
+ """
+ WAN 2.2 Multi-Task Video Generation - 3-Step V2V Pipeline
+ I2V: Lightning 14B (6 steps, FP8+AoT)
+ T2V: Lightning 14B (4 steps, Lightning LoRA + FP8)
+ V2V: 3-Step Pipeline (SAM2 → Composite → VACE)
+     Step 1: SAM2 video segmentation (click points → mask video)
+     Step 2: ImageComposite (original + mask → composite video)
+     Step 3: VACE generation (composite + grown mask + ref image + prompt → final)
+ LoRA: from lkzd7/WAN2.2_LoraSet_NSFW (I2V only)
+ """
+ import os
+
+ import spaces
+ import shutil
+ import subprocess
+ import copy
+ import random
+ import tempfile
+ import warnings
+ import time
+ import gc
+ import uuid
+ from tqdm import tqdm
+
+ import cv2
+ import numpy as np
+ import torch
+ from torch.nn import functional as F
+ from PIL import Image, ImageFilter
+
+ import gradio as gr
+ from diffusers import (
+     AutoencoderKLWan,
+     FlowMatchEulerDiscreteScheduler,
+     WanPipeline,
+     SASolverScheduler,
+     DEISMultistepScheduler,
+     DPMSolverMultistepInverseScheduler,
+     UniPCMultistepScheduler,
+     DPMSolverMultistepScheduler,
+     DPMSolverSinglestepScheduler,
+ )
+ from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
+ from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
+ from diffusers.pipelines.wan.pipeline_wan_vace import WanVACEPipeline
+ from diffusers.utils.export_utils import export_to_video
+ from diffusers.utils import load_video
+ from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig, Int8WeightOnlyConfig
+ import aoti
+ import lora_loader
+
+ # SAM2 for video mask generation
+ from sam2.sam2_video_predictor import SAM2VideoPredictor
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ warnings.filterwarnings("ignore")
+
+ def clear_vram():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+ # ============ RIFE ============
+ get_timestamp_js = """
+ function() {
+     const video = document.querySelector('#generated-video video');
+     if (video) { return video.currentTime; }
+     return 0;
+ }
+ """
+
+ def extract_frame(video_path, timestamp):
+     if not video_path:
+         return None
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         return None
+     fps = cap.get(cv2.CAP_PROP_FPS)
+     target_frame_num = int(float(timestamp) * fps)
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     if target_frame_num >= total_frames:
+         target_frame_num = total_frames - 1
+     cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame_num)
+     ret, frame = cap.read()
+     cap.release()
+     if ret:
+         return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+     return None
+
+ if not os.path.exists("RIFEv4.26_0921.zip"):
+     print("Downloading RIFE Model...")
+     subprocess.run(["wget", "-q", "https://huggingface.co/r3gm/RIFE/resolve/main/RIFEv4.26_0921.zip", "-O", "RIFEv4.26_0921.zip"], check=True)
+     subprocess.run(["unzip", "-o", "RIFEv4.26_0921.zip"], check=True)
+
+ from train_log.RIFE_HDv3 import Model
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ rife_model = Model()
+ rife_model.load_model("train_log", -1)
+ rife_model.eval()
+
+ @torch.no_grad()
+ def interpolate_bits(frames_np, multiplier=2, scale=1.0):
+     if isinstance(frames_np, list):
+         T = len(frames_np)
+         H, W, C = frames_np[0].shape
+     else:
+         T, H, W, C = frames_np.shape
+     if multiplier < 2:
+         return list(frames_np) if isinstance(frames_np, np.ndarray) else frames_np
+     n_interp = multiplier - 1
+     tmp = max(128, int(128 / scale))
+     ph = ((H - 1) // tmp + 1) * tmp
+     pw = ((W - 1) // tmp + 1) * tmp
+     padding = (0, pw - W, 0, ph - H)
+     def to_tensor(frame_np):
+         t = torch.from_numpy(frame_np).to(device)
+         t = t.permute(2, 0, 1).unsqueeze(0)
+         return F.pad(t, padding).half()
+     def from_tensor(tensor):
+         t = tensor[0, :, :H, :W]
+         return t.permute(1, 2, 0).float().cpu().numpy()
+     def make_inference(I0, I1, n):
+         if rife_model.version >= 3.9:
+             return [rife_model.inference(I0, I1, (i+1) * 1. / (n+1), scale) for i in range(n)]
+         else:
+             middle = rife_model.inference(I0, I1, scale)
+             if n == 1:
+                 return [middle]
+             first_half = make_inference(I0, middle, n//2)
+             second_half = make_inference(middle, I1, n//2)
+             return [*first_half, middle, *second_half] if n % 2 else [*first_half, *second_half]
+     output_frames = []
+     I1 = to_tensor(frames_np[0])
+     with tqdm(total=T-1, desc="Interpolating", unit="frame") as pbar:
+         for i in range(T - 1):
+             I0 = I1
+             output_frames.append(from_tensor(I0))
+             I1 = to_tensor(frames_np[i+1])
+             for mid in make_inference(I0, I1, n_interp):
+                 output_frames.append(from_tensor(mid))
+             if (i + 1) % 50 == 0:
+                 pbar.update(50)
+         pbar.update((T-1) % 50)
+     output_frames.append(from_tensor(I1))
+     del I0, I1
+     torch.cuda.empty_cache()
+     return output_frames
+
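For RIFE versions below 3.9, `make_inference` builds in-between frames by recursive bisection. The same recursion can be exercised on plain numbers, with linear blending standing in for the model call (a sketch, not RIFE itself):

```python
# Recursive midpoint insertion, as in the pre-3.9 RIFE fallback.
# `midpoint` is a stand-in for rife_model.inference(I0, I1).

def midpoint(a, b):
    return (a + b) / 2  # placeholder for the learned interpolator

def make_inference(i0, i1, n):
    # Produce n in-between values for the pair (i0, i1) by bisection.
    mid = midpoint(i0, i1)
    if n == 1:
        return [mid]
    first_half = make_inference(i0, mid, n // 2)
    second_half = make_inference(mid, i1, n // 2)
    return [*first_half, mid, *second_half] if n % 2 else [*first_half, *second_half]

between = make_inference(0.0, 8.0, 3)  # 3 frames between values 0 and 8
```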
+ # ============ Config ============
+ FIXED_FPS = 16
+ MAX_FRAMES_MODEL = 241  # ~15 s @ 16 fps; longer clips need more VRAM/time
+ MAX_SEED = np.iinfo(np.int32).max
+
+ SCHEDULER_MAP = {
+     "FlowMatchEulerDiscrete": FlowMatchEulerDiscreteScheduler,
+     "SASolver": SASolverScheduler,
+     "DEISMultistep": DEISMultistepScheduler,
+     "DPMSolverMultistepInverse": DPMSolverMultistepInverseScheduler,
+     "UniPCMultistep": UniPCMultistepScheduler,
+     "DPMSolverMultistep": DPMSolverMultistepScheduler,
+     "DPMSolverSinglestep": DPMSolverSinglestepScheduler,
+ }
+
+ default_negative_prompt = (
+     "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, "
+     "still image, overall gray, worst quality, low quality, JPEG artifacts, ugly, incomplete, "
+     "extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, "
+     "malformed limbs, fused fingers, still frame, messy background, three legs, "
+     "many people in background, walking backwards, watermark, text, signature"
+ )
+
+ # ============ Load I2V Pipeline (Lightning, AoT compiled) ============
+ print("Loading I2V Pipeline (Lightning 14B)...")
+ i2v_pipe = WanImageToVideoPipeline.from_pretrained(
+     "TestOrganizationPleaseIgnore/WAMU_v2_WAN2.2_I2V_LIGHTNING",
+     torch_dtype=torch.bfloat16,
+ ).to('cuda')
+ i2v_original_scheduler = copy.deepcopy(i2v_pipe.scheduler)
+
+ quantize_(i2v_pipe.text_encoder, Int8WeightOnlyConfig())
+ # FP8 needs compute capability >= 8.9 (Ada Lovelace / Hopper or newer)
+ major, minor = torch.cuda.get_device_capability()
+ supports_fp8 = (major > 8) or (major == 8 and minor >= 9)
+ if supports_fp8:
+     quantize_(i2v_pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
+     quantize_(i2v_pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
+     aoti.aoti_blocks_load(i2v_pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
+     aoti.aoti_blocks_load(i2v_pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
+ else:
+     quantize_(i2v_pipe.transformer, Int8WeightOnlyConfig())
+     quantize_(i2v_pipe.transformer_2, Int8WeightOnlyConfig())
+
+ # ============ T2V Pipeline (on-demand, 14B + Wan22 Lightning LoRA) ============
+ # Use T2V-A14B + Wan22 Lightning LoRA (separate HIGH/LOW for the dual transformers)
+ # Loaded on demand with CPU offload to avoid OOM alongside I2V
+ T2V_MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
+ T2V_LORA_REPO = "Kijai/WanVideo_comfy"
+ T2V_LORA_HIGH = "LoRAs/Wan22-Lightning/Wan22_A14B_T2V_HIGH_Lightning_4steps_lora_250928_rank128_fp16.safetensors"
+ T2V_LORA_LOW = "LoRAs/Wan22-Lightning/Wan22_A14B_T2V_LOW_Lightning_4steps_lora_250928_rank64_fp16.safetensors"
+ t2v_pipe = None
+ t2v_ready = False
+
+ def load_t2v_pipeline():
+     """Load T2V 14B + Lightning LoRA on demand, with CPU offload."""
+     global t2v_pipe, t2v_ready
+
+     if t2v_pipe is not None and t2v_ready:
+         print("T2V pipeline reused from memory")
+         return t2v_pipe
+
+     print("Loading T2V Pipeline (14B + Lightning LoRA) for the first time...")
+
+     # Move I2V components to CPU to make room
+     i2v_pipe.to('cpu')
+     clear_vram()
+
+     t2v_vae = AutoencoderKLWan.from_pretrained(T2V_MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+     t2v_pipe = WanPipeline.from_pretrained(
+         T2V_MODEL_ID,
+         transformer=WanTransformer3DModel.from_pretrained(
+             'linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
+             subfolder='transformer',
+             torch_dtype=torch.bfloat16,
+         ),
+         transformer_2=WanTransformer3DModel.from_pretrained(
+             'linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
+             subfolder='transformer_2',
+             torch_dtype=torch.bfloat16,
+         ),
+         vae=t2v_vae,
+         torch_dtype=torch.bfloat16,
+     )
+
+     # Load and fuse Lightning LoRAs (HIGH for transformer, LOW for transformer_2)
+     print("Fusing Lightning LoRA HIGH (transformer)...")
+     from safetensors.torch import load_file
+     from huggingface_hub import hf_hub_download
+
+     # Download LoRA files
+     high_path = hf_hub_download(T2V_LORA_REPO, T2V_LORA_HIGH)
+     low_path = hf_hub_download(T2V_LORA_REPO, T2V_LORA_LOW)
+
+     # Load HIGH LoRA into transformer
+     t2v_pipe.load_lora_weights(high_path, adapter_name="lightning_high")
+     t2v_pipe.set_adapters(["lightning_high"], adapter_weights=[1.0])
+     t2v_pipe.fuse_lora(adapter_names=["lightning_high"], lora_scale=1.0, components=["transformer"])
+     t2v_pipe.unload_lora_weights()
+
+     # Load LOW LoRA into transformer_2
+     print("Fusing Lightning LoRA LOW (transformer_2)...")
+     t2v_pipe.load_lora_weights(low_path, adapter_name="lightning_low", load_into_transformer_2=True)
+     t2v_pipe.set_adapters(["lightning_low"], adapter_weights=[1.0])
+     t2v_pipe.fuse_lora(adapter_names=["lightning_low"], lora_scale=1.0, components=["transformer_2"])
+     t2v_pipe.unload_lora_weights()
+
+     # Use model CPU offload — only one component on GPU at a time
+     t2v_pipe.enable_model_cpu_offload()
+
+     t2v_ready = True
+     print("T2V pipeline ready (14B + Lightning + CPU offload)")
+     return t2v_pipe
+
+ def unload_t2v_pipeline():
+     """Restore I2V to the GPU after T2V is done."""
+     clear_vram()
+     i2v_pipe.to('cuda')
+     print("I2V restored to GPU")
+
+ # Keep cache for on-demand T2V loading
+
+ # ============ SAM2 Video Segmentation ============
+ sam2_predictor = None
+
+ def get_sam2_predictor():
+     global sam2_predictor
+     if sam2_predictor is None:
+         print("Loading SAM2.1 hiera-large...")
+         sam2_predictor = SAM2VideoPredictor.from_pretrained("facebook/sam2.1-hiera-large")
+         print("SAM2 loaded")
+     return sam2_predictor
+
+ def extract_first_frame_from_video(video_path):
+     """Extract the first frame of a video as a PIL Image."""
+     cap = cv2.VideoCapture(video_path)
+     ret, frame = cap.read()
+     cap.release()
+     if ret:
+         return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     return None
+
+ def video_to_frames_dir(video_path, max_frames=None):
+     """Extract video frames into a temp directory for SAM2."""
+     tmp_dir = tempfile.mkdtemp(prefix="sam2_frames_")
+     cap = cv2.VideoCapture(video_path)
+     fps = cap.get(cv2.CAP_PROP_FPS) or 16
+     idx = 0
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+         if max_frames and idx >= max_frames:
+             break
+         cv2.imwrite(os.path.join(tmp_dir, f"{idx:05d}.jpg"), frame)
+         idx += 1
+     cap.release()
+     print(f"Extracted {idx} frames to {tmp_dir} (fps={fps:.1f})")
+     return tmp_dir, idx, fps
+
+ @spaces.GPU(duration=120)
+ def generate_mask_video(video_path, points_json, num_frames_limit=None):
+     """Generate a mask video with SAM2 from user-clicked points."""
+     import json
+
+     if not video_path:
+         raise gr.Error("请先上传视频 / Upload a video first")
+     if not points_json or points_json.strip() == "[]":
+         raise gr.Error("请在视频第一帧上点击要编辑的区域 / Click on the area to edit")
+
+     points_data = json.loads(points_json)
+     if not points_data:
+         raise gr.Error("没有标记点 / No points marked")
+
+     # Extract frames
+     frames_dir, total_frames, fps = video_to_frames_dir(video_path, max_frames=num_frames_limit)
+
+     predictor = get_sam2_predictor()
+
+     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+         state = predictor.init_state(video_path=frames_dir)
+
+         # Add points (all on frame 0)
+         pos_points = []
+         neg_points = []
+         for p in points_data:
+             if p.get("label", 1) == 1:
+                 pos_points.append([p["x"], p["y"]])
+             else:
+                 neg_points.append([p["x"], p["y"]])
+
+         all_points = pos_points + neg_points
+         all_labels = [1] * len(pos_points) + [0] * len(neg_points)
+
+         points_np = np.array(all_points, dtype=np.float32)
+         labels_np = np.array(all_labels, dtype=np.int32)
+
+         _, _, _ = predictor.add_new_points_or_box(
+             state,
+             frame_idx=0,
+             obj_id=1,
+             points=points_np,
+             labels=labels_np,
+         )
+
+         # Propagate through the video
+         all_masks = {}
+         for frame_idx, obj_ids, masks in predictor.propagate_in_video(state):
+             # masks shape: (num_objects, 1, H, W)
+             mask = (masks[0, 0] > 0.0).cpu().numpy().astype(np.uint8) * 255
+             all_masks[frame_idx] = mask
+
+     # Build the mask video
+     out_path = os.path.join(tempfile.mkdtemp(), "mask_video.mp4")
+     # Get the frame size from the first mask
+     first_mask = all_masks[0]
+     h, w = first_mask.shape
+     fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+     writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h), isColor=False)
+     for i in range(total_frames):
+         if i in all_masks:
+             writer.write(all_masks[i])
+         elif all_masks:
+             # Use the nearest available mask
+             nearest = min(all_masks.keys(), key=lambda k: abs(k - i))
+             writer.write(all_masks[nearest])
+     writer.release()
+
+     # Clean up the frames dir
+     shutil.rmtree(frames_dir, ignore_errors=True)
+
+     print(f"Mask video generated: {out_path} ({total_frames} frames, {w}x{h})")
+     return out_path
+
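When SAM2 emits no mask for some frame index, the writer in `generate_mask_video` falls back to the nearest tracked frame. That lookup is a plain distance-minimizing key search, shown here in isolation with toy values:

```python
# Nearest-key fallback for frames SAM2 skipped: pick the tracked frame
# whose index is closest to the missing one (ties resolve to the first key).

tracked = {0: "m0", 4: "m4", 9: "m9"}  # frame_idx -> mask (toy values)

def nearest_mask(masks, i):
    nearest = min(masks.keys(), key=lambda k: abs(k - i))
    return masks[nearest]

picked = [nearest_mask(tracked, i) for i in range(6)]
```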
+ # ============ Step 2: GrowMask + ImageComposite (from the sam2.1_optimized workflow) ============
+ def grow_mask_frame(mask_gray, expand_pixels=5, blur=True):
+     """Expand a mask by N pixels (matching the ComfyUI GrowMask node).
+     mask_gray: numpy uint8 H×W (255=mask, 0=bg)
+     Returns: expanded mask as numpy uint8 H×W
+     """
+     if expand_pixels <= 0:
+         return mask_gray
+     kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (expand_pixels*2+1, expand_pixels*2+1))
+     grown = cv2.dilate(mask_gray, kernel, iterations=1)
+     if blur:
+         grown = cv2.GaussianBlur(grown, (expand_pixels*2+1, expand_pixels*2+1), 0)
+         # Re-threshold back to binary: the blur smooths jagged dilation edges,
+         # and thresholding at 127 keeps the final mask strictly 0/255
+         _, grown = cv2.threshold(grown, 127, 255, cv2.THRESH_BINARY)
+     return grown
+
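The GrowMask expansion is morphological dilation. Its effect can be reproduced in pure NumPy with a shifted-maximum, which approximates `cv2.dilate` with a square kernel (a toy sketch; `np.roll` wraparound is irrelevant for the centered example below):

```python
import numpy as np

def grow_mask_np(mask, expand):
    # Dilate a binary uint8 mask by `expand` pixels: take the max over all
    # shifts within the (2*expand+1)-square neighborhood of each pixel.
    grown = mask.copy()
    for dy in range(-expand, expand + 1):
        for dx in range(-expand, expand + 1):
            shifted = np.roll(np.roll(mask, dy, axis=0), dx, axis=1)
            grown = np.maximum(grown, shifted)
    return grown

mask = np.zeros((7, 7), dtype=np.uint8)
mask[3, 3] = 255                      # single masked pixel
grown = grow_mask_np(mask, expand=1)  # expands to a 3x3 block
```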
+ def grow_mask_video_file(mask_video_path, expand_pixels=5):
+     """Apply GrowMask to every frame of a mask video. Returns the new video path."""
+     if expand_pixels <= 0:
+         return mask_video_path
+
+     cap = cv2.VideoCapture(mask_video_path)
+     fps = cap.get(cv2.CAP_PROP_FPS) or 16
+     w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     out_path = os.path.join(tempfile.mkdtemp(), "grown_mask.mp4")
+     fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+     writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h), isColor=False)
+
+     count = 0
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if len(frame.shape) == 3 else frame
+         grown = grow_mask_frame(gray, expand_pixels)
+         writer.write(grown)
+         count += 1
+
+     cap.release()
+     writer.release()
+     print(f"GrowMask applied: {count} frames, expand={expand_pixels}px → {out_path}")
+     return out_path
+
+ def composite_video_from_mask(source_video_path, mask_video_path):
+     """ImageComposite: replace the masked region with a white overlay (from the sam2.1_optimized workflow).
+     Creates a composite video where:
+     - masked regions (white in the mask) are painted solid white
+     - unmasked regions show the original video
+     This gives VACE the control_video input it needs.
+     Returns: composite video path
+     """
+     src_cap = cv2.VideoCapture(source_video_path)
+     mask_cap = cv2.VideoCapture(mask_video_path)
+
+     fps = src_cap.get(cv2.CAP_PROP_FPS) or 16
+     w = int(src_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     h = int(src_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     out_path = os.path.join(tempfile.mkdtemp(), "composite.mp4")
+     fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+     writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
+
+     count = 0
+     while True:
+         ret_s, src_frame = src_cap.read()
+         ret_m, mask_frame = mask_cap.read()
+         if not ret_s:
+             break
+         if not ret_m:
+             # If the mask video is shorter, fall back to an all-black mask
+             mask_gray = np.zeros((h, w), dtype=np.uint8)
+         else:
+             # Resize the mask to match the source if needed
+             if mask_frame.shape[:2] != (h, w):
+                 mask_frame = cv2.resize(mask_frame, (w, h), interpolation=cv2.INTER_NEAREST)
+             mask_gray = cv2.cvtColor(mask_frame, cv2.COLOR_BGR2GRAY) if len(mask_frame.shape) == 3 else mask_frame
+
+         # Composite: original where mask=0, white where mask=255
+         mask_bool = mask_gray > 127
+         composite = src_frame.copy()
+         composite[mask_bool] = 255  # white in the masked region
+
+         writer.write(composite)
+         count += 1
+
+     src_cap.release()
+     mask_cap.release()
+     writer.release()
+     print(f"Composite video: {count} frames → {out_path}")
+     return out_path
+
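The per-frame composite rule in `composite_video_from_mask` reduces to one vectorized NumPy statement: keep the source pixel where the mask is off, paint white where it is on. A standalone check:

```python
import numpy as np

def composite_frame(src, mask_gray):
    # White-out the masked region (mask > 127), keep the source elsewhere.
    out = src.copy()
    out[mask_gray > 127] = 255
    return out

src = np.full((4, 4, 3), 10, dtype=np.uint8)   # uniform dark frame
mask = np.zeros((4, 4), dtype=np.uint8)
mask[1:3, 1:3] = 255                            # 2x2 masked block
result = composite_frame(src, mask)
```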
+ # ============ V2V Pipeline (VACE 14B, on-demand) ============
+ VACE_MODEL_ID = "Wan-AI/Wan2.1-VACE-14B-diffusers"
+ v2v_pipe = None
+ v2v_ready = False
+
+ def load_v2v_pipeline():
+     """Load the VACE 14B pipeline on demand for mask-based video editing."""
+     global v2v_pipe, v2v_ready
+
+     # Move I2V to CPU to free the GPU
+     i2v_pipe.to('cpu')
+     clear_vram()
+
+     if v2v_pipe is not None and v2v_ready:
+         v2v_pipe.to('cuda')
+         print("VACE pipeline restored to GPU")
+         return v2v_pipe
+
+     print("Loading VACE 14B Pipeline for the first time (this downloads ~75GB)...")
+
+     v2v_vae = AutoencoderKLWan.from_pretrained(VACE_MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+     v2v_pipe = WanVACEPipeline.from_pretrained(
+         VACE_MODEL_ID,
+         vae=v2v_vae,
+         torch_dtype=torch.bfloat16,
+     )
+     v2v_pipe.scheduler = UniPCMultistepScheduler.from_config(v2v_pipe.scheduler.config, flow_shift=5.0)
+
+     # Quantize to fit into an A100 80GB
+     quantize_(v2v_pipe.text_encoder, Int8WeightOnlyConfig())
+     major, minor = torch.cuda.get_device_capability()
+     if (major > 8) or (major == 8 and minor >= 9):
+         quantize_(v2v_pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
+     else:
+         quantize_(v2v_pipe.transformer, Int8WeightOnlyConfig())
+
+     v2v_pipe.to('cuda')
+
+     v2v_ready = True
+     print("VACE 14B pipeline ready (quantized, on GPU)")
+     return v2v_pipe
+
+ def unload_v2v_pipeline():
+     """Move V2V to CPU and restore I2V to the GPU."""
+     global v2v_pipe
+     if v2v_pipe is not None:
+         v2v_pipe.to('cpu')
+     clear_vram()
+     i2v_pipe.to('cuda')
+     print("VACE → CPU, I2V → GPU")
+
+ def load_video_frames_and_masks(video_path, mask_path, num_frames, target_h, target_w):
+     """Load source video frames and mask video frames for VACE."""
+     # Load source video frames as PIL Images
+     src_frames = load_video(video_path)[:num_frames]
+     print(f"Loaded {len(src_frames)} source frames (original size: {src_frames[0].size if src_frames else 'N/A'})")
+
+     # Load mask video frames
+     mask_frames_raw = load_video(mask_path)[:num_frames]
+
+     # Convert masks to L mode (white=edit, black=keep) — don't resize, let the pipeline handle it
+     masks = []
+     for mf in mask_frames_raw:
+         gray = mf.convert("L")
+         masks.append(gray)
+     print(f"Loaded {len(masks)} mask frames")
+
+     # Pad or trim so both lists match
+     while len(masks) < len(src_frames):
+         masks.append(masks[-1] if masks else Image.new("L", src_frames[0].size, 0))
+     while len(src_frames) < len(masks):
+         src_frames.append(src_frames[-1] if src_frames else Image.new("RGB", (target_w, target_h), (128, 128, 128)))
+
+     frame_count = min(len(src_frames), len(masks))
+     src_frames = src_frames[:frame_count]
+     masks = masks[:frame_count]
+
+     return src_frames, masks
+
+ # ============ Utils ============
+ def resize_image(image, max_dim=832, min_dim=480, square_dim=640, multiple_of=16):
+     width, height = image.size
+     if width == height:
+         return image.resize((square_dim, square_dim), Image.LANCZOS)
+     aspect_ratio = width / height
+     max_ar = max_dim / min_dim
+     min_ar = min_dim / max_dim
+     if aspect_ratio > max_ar:
+         crop_width = int(round(height * max_ar))
+         left = (width - crop_width) // 2
+         image = image.crop((left, 0, left + crop_width, height))
+         target_w, target_h = max_dim, min_dim
+     elif aspect_ratio < min_ar:
+         crop_height = int(round(width / min_ar))
+         top = (height - crop_height) // 2
+         image = image.crop((0, top, width, top + crop_height))
+         target_w, target_h = min_dim, max_dim
+     else:
+         if width > height:
+             target_w = max_dim
+             target_h = int(round(target_w / aspect_ratio))
+         else:
+             target_h = max_dim
+             target_w = int(round(target_h * aspect_ratio))
+     final_w = max(min_dim, min(max_dim, round(target_w / multiple_of) * multiple_of))
+     final_h = max(min_dim, min(max_dim, round(target_h / multiple_of) * multiple_of))
+     return image.resize((final_w, final_h), Image.LANCZOS)
+
+ def resize_and_crop_to_match(target_image, reference_image):
+     ref_w, ref_h = reference_image.size
+     tgt_w, tgt_h = target_image.size
+     scale = max(ref_w / tgt_w, ref_h / tgt_h)
+     new_w, new_h = int(tgt_w * scale), int(tgt_h * scale)
+     resized = target_image.resize((new_w, new_h), Image.Resampling.LANCZOS)
+     left, top = (new_w - ref_w) // 2, (new_h - ref_h) // 2
+     return resized.crop((left, top, left + ref_w, top + ref_h))
+
+ def get_num_frames(duration_seconds):
+     raw = int(round(duration_seconds * FIXED_FPS))
+     raw = ((raw - 1) // 4) * 4 + 1
+     return int(np.clip(raw, 9, MAX_FRAMES_MODEL))
+
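WAN-family pipelines expect frame counts of the form 4k + 1 (the V2V branch later enforces `(n - 1) % 4 == 0` too), so `get_num_frames` snaps duration × fps down to that grid and clamps the result. A standalone check of the rule, with the constants mirrored from this file's config:

```python
# The 4k+1 frame-count rule: round duration*fps, snap down to the nearest
# 4k+1, then clamp to [9, MAX_FRAMES_MODEL]. Constants mirror the app config.
FIXED_FPS = 16
MAX_FRAMES_MODEL = 241

def get_num_frames(duration_seconds):
    raw = int(round(duration_seconds * FIXED_FPS))
    raw = ((raw - 1) // 4) * 4 + 1
    return max(9, min(raw, MAX_FRAMES_MODEL))

counts = [get_num_frames(s) for s in (0.25, 1, 5, 30)]
```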
+ def extract_video_path(input_video):
+     if input_video is None:
+         return None
+     if isinstance(input_video, str):
+         return input_video
+     if isinstance(input_video, dict):
+         # Gradio 5.x format: {'video': filepath, ...}, {'name': filepath, ...} or {'path': filepath}
+         return input_video.get("video", input_video.get("path", input_video.get("name", None)))
+     # Could be a Gradio VideoData object
+     if hasattr(input_video, 'video'):
+         return input_video.video
+     if hasattr(input_video, 'path'):
+         return input_video.path
+     if hasattr(input_video, 'name'):
+         return input_video.name
+     return str(input_video)
+
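Gradio may hand the video input over as a plain path string, a dict, or a `VideoData`-like object. The dict/str normalization in `extract_video_path` can be exercised in isolation (the object-attribute branches are omitted here for brevity):

```python
# Minimal re-statement of the dict/str normalization used for Gradio
# video inputs; object-attribute fallbacks are left out of this sketch.

def extract_video_path(v):
    if v is None:
        return None
    if isinstance(v, str):
        return v
    if isinstance(v, dict):
        # Try 'video', then 'path', then 'name'
        return v.get("video", v.get("path", v.get("name")))
    return str(v)

paths = [
    extract_video_path("/tmp/a.mp4"),
    extract_video_path({"video": "/tmp/b.mp4"}),
    extract_video_path({"path": "/tmp/c.mp4"}),
    extract_video_path(None),
]
```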
+ def extract_first_frame(video_input):
+     path = extract_video_path(video_input)
+     if not path or not os.path.exists(path):
+         return None
+     cap = cv2.VideoCapture(path)
+     ret, frame = cap.read()
+     cap.release()
+     if ret:
+         return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     return None
+
+ # ============ Inference ============
+ @spaces.GPU(duration=1200)
+ def run_inference(
+     task_type, input_image, input_video, mask_video, prompt, negative_prompt,
+     duration_seconds, steps, guidance_scale, guidance_scale_2,
+     current_seed, scheduler_name, flow_shift, frame_multiplier,
+     quality, last_image_input, lora_groups,
+     reference_image=None, grow_pixels=5,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     clear_vram()
+     num_frames = get_num_frames(duration_seconds)
+     task_id = str(uuid.uuid4())[:8]
+     print(f"Task: {task_id}, type={task_type}, duration={duration_seconds}s, frames={num_frames}")
+     start = time.time()
+
+     if "T2V" in task_type:
+         # ====== T2V: 14B + Lightning LoRA (4 steps, dual guidance) ======
+         t2v_steps = max(int(steps), 4)
+         print(f"T2V: steps={t2v_steps}, guidance={guidance_scale}/{guidance_scale_2}, frames={num_frames}")
+
+         pipe = load_t2v_pipeline()
+         result = pipe(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             height=480,
+             width=832,
+             num_frames=num_frames,
+             guidance_scale=float(guidance_scale),
+             guidance_scale_2=float(guidance_scale_2),
+             num_inference_steps=t2v_steps,
+             generator=torch.Generator(device="cpu").manual_seed(int(current_seed)),
+             output_type="np",
+         )
+         unload_t2v_pipeline()
+
+     else:
+         # ====== I2V / V2V ======
+         if "V2V" in task_type:
+             # ====== V2V: 3-Step Pipeline (SAM2 mask → Composite → VACE) ======
+             print(f"V2V 3-Step Pipeline: input_video type={type(input_video)}, value={input_video}")
+             video_path = extract_video_path(input_video)
+             if not video_path or not os.path.exists(video_path):
+                 raise gr.Error("Upload a source video for V2V / V2V请上传原视频")
+
+             # Get the mask video path
+             mask_path = extract_video_path(mask_video)
+             if not mask_path or not os.path.exists(mask_path):
+                 raise gr.Error("Upload a mask video for V2V / V2V请上传遮罩视频(黑白视频,白色=编辑区域)")
+
+             # Step 2a: GrowMask — expand mask boundaries (from the vace_optimized workflow)
+             grown_mask_path = grow_mask_video_file(mask_path, expand_pixels=int(grow_pixels))
+             print(f"V2V: GrowMask applied ({grow_pixels}px)")
+
+             # Step 2b: Composite — original video with mask overlay (from the sam2.1_optimized workflow)
+             composite_path = composite_video_from_mask(video_path, mask_path)
+             print("V2V: Composite video created")
+
+             # Step 3: VACE generation using the composite as control_video + grown mask
+             target_h, target_w = 480, 832
+
+             # Load the composite video as control frames for VACE
+             src_frames = load_video(composite_path)[:num_frames]
+             print(f"Loaded {len(src_frames)} composite frames")
+
+             # Load the grown mask frames
+             mask_frames_raw = load_video(grown_mask_path)[:num_frames]
+             masks = [mf.convert("L") for mf in mask_frames_raw]
+             print(f"Loaded {len(masks)} grown mask frames")
+
+             # Pad or trim to match
+             while len(masks) < len(src_frames):
+                 masks.append(masks[-1] if masks else Image.new("L", src_frames[0].size, 0))
+             while len(src_frames) < len(masks):
+                 src_frames.append(src_frames[-1] if src_frames else Image.new("RGB", (target_w, target_h), (128, 128, 128)))
+
+             # Ensure num_frames satisfies (n-1) % 4 == 0 for VACE
+             n = len(src_frames)
+             n = (n - 1) // 4 * 4 + 1
+             n = max(n, 5)
+             src_frames = src_frames[:n]
+             masks = masks[:n]
+
+             # Load the VACE pipeline
+             pipe = load_v2v_pipeline()
+             v2v_steps = max(int(steps), 20)
+             print(f"V2V VACE: steps={v2v_steps}, guidance={guidance_scale}, frames={len(src_frames)}, ref_image={'yes' if reference_image else 'no'}")
+
+             # Build the VACE kwargs
+             vace_kwargs = dict(
+                 prompt=prompt,
+                 negative_prompt=negative_prompt,
+                 video=src_frames,
+                 mask=masks,
+                 height=target_h,
+                 width=target_w,
719
+ num_frames=len(src_frames),
720
+ guidance_scale=max(float(guidance_scale), 5.0),
721
+ num_inference_steps=v2v_steps,
722
+ generator=torch.Generator(device="cuda").manual_seed(int(current_seed)),
723
+ output_type="np",
724
+ )
725
+
726
+ result = pipe(**vace_kwargs)
727
+ unload_v2v_pipeline()
728
+
729
+ # Cleanup temp files
730
+ for p in [grown_mask_path, composite_path]:
731
+ try:
732
+ if p and os.path.exists(p):
733
+ os.remove(p)
734
+ except:
735
+ pass
736
+
737
+ else:
738
+ # ====== I2V ======
739
+ if input_image is None:
740
+ raise gr.Error("Upload an image / 请上传图片")
741
+
742
+ scheduler_class = SCHEDULER_MAP.get(scheduler_name)
743
+ if scheduler_class and scheduler_class.__name__ != i2v_pipe.scheduler.config._class_name:
744
+ config = copy.deepcopy(i2v_original_scheduler.config)
745
+ if scheduler_class == FlowMatchEulerDiscreteScheduler:
746
+ config['shift'] = flow_shift
747
+ else:
748
+ config['flow_shift'] = flow_shift
749
+ i2v_pipe.scheduler = scheduler_class.from_config(config)
750
+
751
+ lora_loaded = False
752
+ if lora_groups:
753
+ try:
754
+ for idx, name in enumerate(lora_groups):
755
+ if name and name != "(None)":
756
+ lora_loader.load_lora_to_pipe(i2v_pipe, name, adapter_name=f"lora_{idx}")
757
+ lora_loaded = True
758
+ except Exception as e:
759
+ print(f"LoRA warning: {e}")
760
+
761
+ resized_image = resize_image(input_image)
762
+ processed_last = None
763
+ if last_image_input:
764
+ processed_last = resize_and_crop_to_match(last_image_input, resized_image)
765
+
766
+ print(f"I2V: size={resized_image.size}, steps={int(steps)}, guidance={guidance_scale}/{guidance_scale_2}")
767
+
768
+ result = i2v_pipe(
769
+ image=resized_image,
770
+ last_image=processed_last,
771
+ prompt=prompt,
772
+ negative_prompt=negative_prompt,
773
+ height=resized_image.height,
774
+ width=resized_image.width,
775
+ num_frames=num_frames,
776
+ guidance_scale=float(guidance_scale),
777
+ guidance_scale_2=float(guidance_scale_2),
778
+ num_inference_steps=int(steps),
779
+ generator=torch.Generator(device="cuda").manual_seed(int(current_seed)),
780
+ output_type="np",
781
+ )
782
+
783
+ if lora_loaded:
784
+ lora_loader.unload_lora(i2v_pipe)
785
+
786
+ raw_frames = result.frames[0]
787
+ elapsed = time.time() - start
788
+ print(f"Generation took {elapsed:.1f}s ({len(raw_frames)} frames)")
789
+
790
+ frame_factor = frame_multiplier // FIXED_FPS
791
+ if frame_factor > 1:
792
+ rife_model.device()
793
+ rife_model.flownet = rife_model.flownet.half()
794
+ final_frames = interpolate_bits(raw_frames, multiplier=int(frame_factor))
795
+ else:
796
+ final_frames = list(raw_frames)
797
+ final_fps = FIXED_FPS * max(1, frame_factor)
798
+
799
+ with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
800
+ video_path = tmpfile.name
801
+ export_to_video(final_frames, video_path, fps=final_fps, quality=quality)
802
+ return video_path, task_id
803
+
804
+ # ============ Generate ============
805
+ def generate_video(
806
+ task_type, input_image, input_video, mask_video, prompt,
807
+ lora_groups, duration_seconds, frame_multiplier,
808
+ steps, guidance_scale, guidance_scale_2,
809
+ negative_prompt, quality, seed, randomize_seed,
810
+ scheduler, flow_shift, last_image, display_result,
811
+ reference_image, grow_pixels,
812
+ progress=gr.Progress(track_tqdm=True),
813
+ ):
814
+ if not prompt or not prompt.strip():
815
+ raise gr.Error("Enter a prompt / 请输入提示词")
816
+ current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
817
+ video_path, task_id = run_inference(
818
+ task_type, input_image, input_video, mask_video, prompt, negative_prompt,
819
+ duration_seconds, steps, guidance_scale, guidance_scale_2,
820
+ current_seed, scheduler, flow_shift, frame_multiplier,
821
+ quality, last_image, lora_groups,
822
+ reference_image=reference_image, grow_pixels=grow_pixels,
823
+ )
824
+ print(f"Done: {task_id}")
825
+ return (video_path if display_result else None), video_path, current_seed
826
+
827
+ # ============ UI ============
828
+ CSS = """
829
+ #hidden-timestamp { opacity: 0; height: 0; width: 0; margin: 0; padding: 0; overflow: hidden; position: absolute; }
830
+ """
831
+
832
+ with gr.Blocks(theme=gr.themes.Soft(), css=CSS, delete_cache=(3600, 10800)) as demo:
833
+ gr.Markdown("## WAN 2.2 Multi-Task Video Generation / 多任务视频生成")
834
+ gr.Markdown("#### I2V (Lightning 6-step) · T2V (Lightning 14B 4-step) · V2V (3-Step: SAM2→Composite→VACE)")
835
+ gr.Markdown("---")
836
+
837
+ task_type = gr.Radio(
838
+ choices=[
839
+ "I2V (图生视频 / Image-to-Video)",
840
+ "T2V (文生视频 / Text-to-Video)",
841
+ "V2V (视频生视频 / Video-to-Video)",
842
+ ],
843
+ value="I2V (图生视频 / Image-to-Video)",
844
+ label="Task Type / 任务类型",
845
+ )
846
+
847
+ with gr.Row():
848
+ with gr.Column():
849
+ with gr.Group():
850
+ input_image = gr.Image(type="pil", label="Input Image / 输入图片 (I2V)", sources=["upload", "clipboard"])
851
+ with gr.Group():
852
+ input_video = gr.Video(label="Source Video / 原视频 (V2V)", sources=["upload"], visible=False, interactive=True)
853
+ with gr.Group():
854
+ mask_video = gr.Video(label="Mask Video / 遮罩视频 (V2V, 白色=编辑区域)", sources=["upload"], visible=False, interactive=True)
855
+ v2v_guide = gr.Markdown(
856
+ value="""### 📖 V2V 三步流水线 / 3-Step V2V Pipeline
857
+
858
+ **Step 1 — SAM2 分割**: 上传原视频 → 提取第一帧 → 点击标记区域 → 生成遮罩视频
859
+ **Step 2 — 自动合成**: 原视频 + 遮罩 → GrowMask扩展边界 + ImageComposite合成(自动完成)
860
+ **Step 3 — VACE 生成**: 合成视频 + 遮罩 + 参考图 + Prompt → 最终成品视频
861
+
862
+ 💡 也可跳过 Step 1,直接上传自己的遮罩视频(白色=编辑区域)
863
+ """,
864
+ visible=False,
865
+ )
866
+ with gr.Group(visible=False) as v2v_mask_tools:
867
+ first_frame_display = gr.Image(label="第一帧预览 / First Frame (点击标记区域)", type="pil", interactive=False)
868
+ points_store = gr.State(value=[])
869
+ points_display = gr.Textbox(label="标记点 / Points", value="无标记 / No points", interactive=False)
870
+ with gr.Row():
871
+ point_mode = gr.Radio(choices=["include (编辑)", "exclude (排除)"], value="include (编辑)", label="点击模式")
872
+ with gr.Row():
873
+ extract_frame_btn = gr.Button("📷 提取第一帧 / Extract First Frame", variant="secondary")
874
+ gen_mask_btn = gr.Button("🎭 生成遮罩 / Generate Mask (SAM2)", variant="primary")
875
+ clear_points_btn = gr.Button("🗑️ 清除标记 / Clear Points")
876
+ with gr.Accordion("🖼️ V2V 高级选项 / V2V Advanced", open=True):
877
+ reference_image = gr.Image(type="pil", label="参考图 / Reference Image (控制编辑区域的目标外观)", sources=["upload", "clipboard"])
878
+ grow_pixels_sl = gr.Slider(minimum=0, maximum=30, step=1, value=5, label="GrowMask / 遮罩扩展 (像素)", info="扩展遮罩边界,让编辑区域过渡更自然")
879
+
880
+ prompt_input = gr.Textbox(
881
+ label="Prompt / 提示词", value="",
882
+ placeholder="Describe the video... / 描述你想生成的视频...", lines=3,
883
+ )
884
+ duration_slider = gr.Slider(
885
+ minimum=0.5, maximum=15, step=0.5, value=3,
886
+ label="Duration / 时长 (seconds/秒)",
887
+ info="Max ~15s (241 frames @16fps) / 最大约15秒",
888
+ )
889
+ frame_multi = gr.Dropdown(choices=[16, 32, 64], value=16, label="Output FPS / 输出帧率", info="RIFE interpolation / RIFE插帧")
890
+
891
+ with gr.Accordion("⚙️ Advanced Settings / 高级设置", open=False):
892
+ last_image = gr.Image(type="pil", label="Last Frame / 末帧 (Optional)", sources=["upload", "clipboard"])
893
+ negative_prompt_input = gr.Textbox(label="Negative Prompt / 负面提示词", value=default_negative_prompt, lines=3)
894
+ with gr.Row():
895
+ steps_slider = gr.Slider(minimum=1, maximum=50, step=1, value=6, label="Steps / 步数", info="I2V: 4-8 | T2V: 4-8 | V2V: 25-50")
896
+ quality_sl = gr.Slider(minimum=1, maximum=10, step=1, value=6, label="Quality / 质量")
897
+ with gr.Row():
898
+ guidance_h = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1.0, label="Guidance High / 引导(高噪声)")
899
+ guidance_l = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1.0, label="Guidance Low / 引导(低噪声)")
900
+ with gr.Row():
901
+ scheduler_dd = gr.Dropdown(choices=list(SCHEDULER_MAP.keys()), value="UniPCMultistep", label="Scheduler / 调度器")
902
+ flow_shift_sl = gr.Slider(minimum=0.5, maximum=15.0, step=0.1, value=3.0, label="Flow Shift / 流偏移")
903
+ with gr.Row():
904
+ seed_sl = gr.Slider(minimum=0, maximum=MAX_SEED, step=1, value=42, label="Seed / 种子")
905
+ random_seed_cb = gr.Checkbox(label="Random / 随机", value=True)
906
+ lora_dd = gr.Dropdown(choices=lora_loader.get_lora_choices(), label="LoRA (I2V only / 仅I2V)", multiselect=True, info="From WAN2.2_LoraSet_NSFW")
907
+ display_cb = gr.Checkbox(label="Display / 显示", value=True)
908
+
909
+ generate_btn = gr.Button("🎬 Generate / 生成视频", variant="primary", size="lg")
910
+
911
+ with gr.Column():
912
+ video_output = gr.Video(label="Generated Video / 生成的视频", autoplay=True, sources=["upload"], show_download_button=True, show_share_button=True, interactive=False, elem_id="generated-video")
913
+ with gr.Row():
914
+ grab_frame_btn = gr.Button("📸 Use Frame / 使用帧", variant="secondary")
915
+ timestamp_box = gr.Number(value=0, label="Timestamp", visible=False, elem_id="hidden-timestamp")
916
+ file_output = gr.File(label="Download / 下载")
917
+
918
+ def update_task_ui(task):
919
+ is_v2v = "V2V" in task
920
+ is_t2v = "T2V" in task
921
+ if is_t2v:
922
+ return (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
923
+ gr.update(visible=False), gr.update(visible=False),
924
+ gr.update(value=4), gr.update(value=1.0), gr.update(value=1.0))
925
+ elif is_v2v:
926
+ return (gr.update(visible=False), gr.update(visible=True), gr.update(visible=True),
927
+ gr.update(visible=True), gr.update(visible=True),
928
+ gr.update(value=30), gr.update(value=5.0), gr.update(value=1.0))
929
+ else:
930
+ return (gr.update(visible=True), gr.update(visible=False), gr.update(visible=False),
931
+ gr.update(visible=False), gr.update(visible=False),
932
+ gr.update(value=6), gr.update(value=1.0), gr.update(value=1.0))
933
+
934
+ task_type.change(update_task_ui, inputs=[task_type], outputs=[input_image, input_video, mask_video, v2v_guide, v2v_mask_tools, steps_slider, guidance_h, guidance_l])
935
+
936
+ # V2V mask generation callbacks
937
+ def on_extract_first_frame(video):
938
+ vpath = extract_video_path(video)
939
+ if not vpath or not os.path.exists(vpath):
940
+ raise gr.Error("请先上传视频 / Upload video first")
941
+ frame = extract_first_frame_from_video(vpath)
942
+ if frame is None:
943
+ raise gr.Error("无法提取第一帧 / Failed to extract first frame")
944
+ return frame, [], "无标记 / No points"
945
+
946
+ def on_click_frame(img, points, mode, evt: gr.SelectData):
947
+ if img is None:
948
+ return img, points, "请先提取第一帧 / Extract first frame first"
949
+ x, y = evt.index
950
+ label = 1 if "include" in mode else 0
951
+ points.append({"x": x, "y": y, "label": label})
952
+ # Draw points on image
953
+ display_img = img.copy()
954
+ draw = __import__('PIL').ImageDraw.Draw(display_img)
955
+ for p in points:
956
+ color = (0, 255, 0) if p["label"] == 1 else (255, 0, 0)
957
+ r = 8
958
+ draw.ellipse([p["x"]-r, p["y"]-r, p["x"]+r, p["y"]+r], fill=color, outline="white", width=2)
959
+ info = f"{len([p for p in points if p['label']==1])} include, {len([p for p in points if p['label']==0])} exclude"
960
+ return display_img, points, info
961
+
962
+ def on_clear_points(original_video):
963
+ vpath = extract_video_path(original_video)
964
+ if vpath and os.path.exists(vpath):
965
+ frame = extract_first_frame_from_video(vpath)
966
+ return frame, [], "无标记 / No points"
967
+ return None, [], "无标记 / No points"
968
+
969
+ def on_generate_mask(video, points):
970
+ import json
971
+ vpath = extract_video_path(video)
972
+ if not vpath:
973
+ raise gr.Error("请先上传视频 / Upload video first")
974
+ if not points:
975
+ raise gr.Error("请先在第一帧上点击标记 / Click on first frame to mark areas")
976
+ mask_path = generate_mask_video(vpath, json.dumps(points))
977
+ return mask_path
978
+
979
+ extract_frame_btn.click(fn=on_extract_first_frame, inputs=[input_video], outputs=[first_frame_display, points_store, points_display])
980
+ first_frame_display.select(fn=on_click_frame, inputs=[first_frame_display, points_store, point_mode], outputs=[first_frame_display, points_store, points_display])
981
+ clear_points_btn.click(fn=on_clear_points, inputs=[input_video], outputs=[first_frame_display, points_store, points_display])
982
+ gen_mask_btn.click(fn=on_generate_mask, inputs=[input_video, points_store], outputs=[mask_video])
983
+ generate_btn.click(
984
+ fn=generate_video,
985
+ inputs=[task_type, input_image, input_video, mask_video, prompt_input, lora_dd, duration_slider, frame_multi,
986
+ steps_slider, guidance_h, guidance_l, negative_prompt_input, quality_sl, seed_sl, random_seed_cb,
987
+ scheduler_dd, flow_shift_sl, last_image, display_cb,
988
+ reference_image, grow_pixels_sl],
989
+ outputs=[video_output, file_output, seed_sl],
990
+ )
991
+ grab_frame_btn.click(fn=None, inputs=None, outputs=[timestamp_box], js=get_timestamp_js)
992
+ timestamp_box.change(fn=extract_frame, inputs=[video_output, timestamp_box], outputs=[input_image])
993
+
994
+ if __name__ == "__main__":
995
+ demo.queue().launch(mcp_server=True, show_error=True)
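The V2V branch above snaps the frame count so that `(n - 1) % 4 == 0`, which the VACE pipeline requires, and floors it at 5 frames. That rule in isolation, as a pure-Python sketch (the helper name `snap_frame_count` is ours, not from the app):

```python
def snap_frame_count(n: int) -> int:
    """Largest count <= n with (count - 1) % 4 == 0, floored at 5 frames."""
    snapped = (n - 1) // 4 * 4 + 1
    return max(snapped, 5)

print(snap_frame_count(49))  # 49 already satisfies the constraint
print(snap_frame_count(50))  # drops back to 49
print(snap_frame_count(3))   # floored to the 5-frame minimum
```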
kill_bill.jpeg ADDED

Git LFS Details

  • SHA256: d1db15fcc022a6c639d14d4b246c40729af2873ca81d4acf7b48d36d62b8d864
  • Pointer size: 131 Bytes
  • Size of remote file: 240 kB
lora_loader.py ADDED
@@ -0,0 +1,127 @@
+"""
+LoRA Loader for WAN 2.2 - references files from lkzd7/WAN2.2_LoraSet_NSFW
+"""
+import urllib.parse
+import re
+from huggingface_hub import hf_hub_download
+
+LORA_REPO = "lkzd7/WAN2.2_LoraSet_NSFW"
+HF_TOKEN = None
+
+LORA_FILES = [
+    "Blink_Squatting_Cowgirl_Position_I2V_HIGH.safetensors",
+    "Blink_Squatting_Cowgirl_Position_I2V_LOW.safetensors",
+    "PENISLORA_22_i2v_HIGH_e320.safetensors",
+    "PENISLORA_22_i2v_LOW_e496.safetensors",
+    "Pornmaster_wan 2.2_14b_I2V_bukkake_v1.4_high_noise.safetensors",
+    "Pornmaster_wan 2.2_14b_I2V_bukkake_v1.4_low_noise.safetensors",
+    "W22_Multiscene_Photoshoot_Softcore_i2v_HN.safetensors",
+    "W22_Multiscene_Photoshoot_Softcore_i2v_LN.safetensors",
+    "WAN-2.2-I2V-Double-Blowjob-HIGH-v1.safetensors",
+    "WAN-2.2-I2V-Double-Blowjob-LOW-v1.safetensors",
+    "WAN-2.2-I2V-HandjobBlowjobCombo-HIGH-v1.safetensors",
+    "WAN-2.2-I2V-HandjobBlowjobCombo-LOW-v1.safetensors",
+    "WAN-2.2-I2V-SensualTeasingBlowjob-HIGH-v1.safetensors",
+    "WAN-2.2-I2V-SensualTeasingBlowjob-LOW-v1.safetensors",
+    "iGOON_Blink_Blowjob_I2V_HIGH.safetensors",
+    "iGOON_Blink_Blowjob_I2V_LOW.safetensors",
+    "iGoon - Blink_Front_Doggystyle_I2V_HIGH.safetensors",
+    "iGoon - Blink_Front_Doggystyle_I2V_LOW.safetensors",
+    "iGoon - Blink_Missionary_I2V_HIGH.safetensors",
+    "iGoon - Blink_Missionary_I2V_LOW v2.safetensors",
+    "iGoon - Blink_Missionary_I2V_LOW.safetensors",
+    "iGoon%20-%20Blink_Back_Doggystyle_HIGH.safetensors",
+    "iGoon%20-%20Blink_Back_Doggystyle_LOW.safetensors",
+    "iGoon%20-%20Blink_Facial_I2V_HIGH.safetensors",
+    "iGoon%20-%20Blink_Facial_I2V_LOW.safetensors",
+    "iGoon_Blink_Missionary_I2V_HIGH v2.safetensors",
+    "iGoon_Blink_Titjob_I2V_HIGH.safetensors",
+    "iGoon_Blink_Titjob_I2V_LOW.safetensors",
+    "lips-bj_high_noise.safetensors",
+    "lips-bj_low_noise.safetensors",
+    "mql_casting_sex_doggy_kneel_diagonally_behind_vagina_wan22_i2v_v1_high_noise.safetensors",
+    "mql_casting_sex_doggy_kneel_diagonally_behind_vagina_wan22_i2v_v1_low_noise.safetensors",
+    "mql_casting_sex_reverse_cowgirl_lie_front_vagina_wan22_i2v_v1_high_noise.safetensors",
+    "mql_casting_sex_reverse_cowgirl_lie_front_vagina_wan22_i2v_v1_low_noise.safetensors",
+    "mql_casting_sex_spoon_wan22_i2v_v1_high_noise.safetensors",
+    "mql_casting_sex_spoon_wan22_i2v_v1_low_noise.safetensors",
+    "mql_doggy_a_wan22_t2v_v1_high_noise .safetensors",
+    "mql_doggy_a_wan22_t2v_v1_low_noise.safetensors",
+    "mql_massage_tits_wan22_i2v_v1_high_noise.safetensors",
+    "mql_massage_tits_wan22_i2v_v1_low_noise.safetensors",
+    "mql_panties_aside_wan22_i2v_v1_high_noise.safetensors",
+    "mql_panties_aside_wan22_i2v_v1_low_noise.safetensors",
+    "mqlspn_a_wan22_t2v_v1_high_noise.safetensors",
+    "mqlspn_a_wan22_t2v_v1_low_noise.safetensors",
+    "sfbehind_v2.1_high_noise.safetensors",
+    "sfbehind_v2.1_low_noise.safetensors",
+    "sid3l3g_transition_v2.0_H.safetensors",
+    "sid3l3g_transition_v2.0_L.safetensors",
+    "wan2.2_i2v_high_ulitmate_pussy_asshole.safetensors",
+    "wan2.2_i2v_low_ulitmate_pussy_asshole.safetensors",
+    "wan22-mouthfull-140epoc-high-k3nk.safetensors",
+    "wan22-mouthfull-152epoc-low-k3nk.safetensors",
+]
+
+# Group HIGH/LOW noise variants of the same LoRA under one key.
+LORA_PAIRS = {}
+for f in LORA_FILES:
+    name = urllib.parse.unquote(f).replace(".safetensors", "")
+    is_high = bool(re.search(r'(high|HN|_H\b)', name, re.IGNORECASE))
+    is_low = bool(re.search(r'(low|LN|_L\b)', name, re.IGNORECASE))
+    group = re.sub(r'[\s_-]*(high|low|noise|HN|LN)([\s_-]*noise)?[\s_-]*(v?\d+(\.\d+)?)?\s*$', '', name, flags=re.IGNORECASE).strip()
+    group = re.sub(r'[\s_]+$', '', group)
+    if group not in LORA_PAIRS:
+        LORA_PAIRS[group] = {"HIGH": None, "LOW": None}
+    if is_high:
+        LORA_PAIRS[group]["HIGH"] = f
+    elif is_low:
+        LORA_PAIRS[group]["LOW"] = f
+
+
+def get_lora_choices():
+    choices = []
+    for group in sorted(LORA_PAIRS.keys()):
+        p = LORA_PAIRS[group]
+        if p["HIGH"] and p["LOW"]:
+            choices.append(group)
+        elif p["HIGH"]:
+            choices.append(f"{group} (HIGH only)")
+        elif p["LOW"]:
+            choices.append(f"{group} (LOW only)")
+    return choices
+
+
+def download_lora(group_name):
+    if not group_name:
+        return None, None
+    clean_name = re.sub(r'\s*\(HIGH only\)|\s*\(LOW only\)', '', group_name)
+    if clean_name not in LORA_PAIRS:
+        return None, None
+    pair = LORA_PAIRS[clean_name]
+    high_path, low_path = None, None
+    if pair["HIGH"]:
+        high_path = hf_hub_download(LORA_REPO, pair["HIGH"], token=HF_TOKEN)
+    if pair["LOW"]:
+        low_path = hf_hub_download(LORA_REPO, pair["LOW"], token=HF_TOKEN)
+    return high_path, low_path
+
+
+def load_lora_to_pipe(pipe, group_name, adapter_name="lora"):
+    high_path, low_path = download_lora(group_name)
+    if high_path and low_path:
+        pipe.load_lora_weights(high_path, adapter_name=f"{adapter_name}_high")
+        pipe.load_lora_weights(low_path, adapter_name=f"{adapter_name}_low")
+        print(f"Loaded LoRA pair: {group_name}")
+        return True
+    elif high_path:
+        pipe.load_lora_weights(high_path, adapter_name=adapter_name)
+        print(f"Loaded LoRA: {group_name}")
+        return True
+    elif low_path:
+        # LOW-only entries are offered in get_lora_choices(), so load them too
+        pipe.load_lora_weights(low_path, adapter_name=adapter_name)
+        print(f"Loaded LoRA: {group_name}")
+        return True
+    return False
+
+
+def unload_lora(pipe):
+    try:
+        pipe.unload_lora_weights()
+    except Exception:
+        pass
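The HIGH/LOW pairing in `lora_loader.py` hinges on two regexes: one classifies a filename as a high- or low-noise variant, the other strips that suffix so both variants collapse to one group key. A quick standalone check of the stripping step on two entries from `LORA_FILES` (the `group_key` helper is our extraction of the loop body, not a function in the module):

```python
import re
import urllib.parse

def group_key(filename: str) -> str:
    """Reproduce the suffix-stripping used to build LORA_PAIRS keys."""
    name = urllib.parse.unquote(filename).replace(".safetensors", "")
    key = re.sub(r'[\s_-]*(high|low|noise|HN|LN)([\s_-]*noise)?[\s_-]*(v?\d+(\.\d+)?)?\s*$',
                 '', name, flags=re.IGNORECASE).strip()
    return re.sub(r'[\s_]+$', '', key)

high = "iGoon%20-%20Blink_Facial_I2V_HIGH.safetensors"
low = "iGoon%20-%20Blink_Facial_I2V_LOW.safetensors"
print(group_key(high))                      # iGoon - Blink_Facial_I2V
print(group_key(high) == group_key(low))    # True -> they pair up
```

The same key also unifies `_high_noise` / `_low_noise` style names such as the `lips-bj` pair.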
model/loss.py ADDED
@@ -0,0 +1,128 @@
+import torch
+import numpy as np
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+class EPE(nn.Module):
+    def __init__(self):
+        super(EPE, self).__init__()
+
+    def forward(self, flow, gt, loss_mask):
+        loss_map = (flow - gt.detach()) ** 2
+        loss_map = (loss_map.sum(1, True) + 1e-6) ** 0.5
+        return (loss_map * loss_mask)
+
+
+class Ternary(nn.Module):
+    def __init__(self):
+        super(Ternary, self).__init__()
+        patch_size = 7
+        out_channels = patch_size * patch_size
+        self.w = np.eye(out_channels).reshape(
+            (patch_size, patch_size, 1, out_channels))
+        self.w = np.transpose(self.w, (3, 2, 0, 1))
+        self.w = torch.tensor(self.w).float().to(device)
+
+    def transform(self, img):
+        patches = F.conv2d(img, self.w, padding=3, bias=None)
+        transf = patches - img
+        transf_norm = transf / torch.sqrt(0.81 + transf**2)
+        return transf_norm
+
+    def rgb2gray(self, rgb):
+        r, g, b = rgb[:, 0:1, :, :], rgb[:, 1:2, :, :], rgb[:, 2:3, :, :]
+        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
+        return gray
+
+    def hamming(self, t1, t2):
+        dist = (t1 - t2) ** 2
+        dist_norm = torch.mean(dist / (0.1 + dist), 1, True)
+        return dist_norm
+
+    def valid_mask(self, t, padding):
+        n, _, h, w = t.size()
+        inner = torch.ones(n, 1, h - 2 * padding, w - 2 * padding).type_as(t)
+        mask = F.pad(inner, [padding] * 4)
+        return mask
+
+    def forward(self, img0, img1):
+        img0 = self.transform(self.rgb2gray(img0))
+        img1 = self.transform(self.rgb2gray(img1))
+        return self.hamming(img0, img1) * self.valid_mask(img0, 1)
+
+
+class SOBEL(nn.Module):
+    def __init__(self):
+        super(SOBEL, self).__init__()
+        self.kernelX = torch.tensor([
+            [1, 0, -1],
+            [2, 0, -2],
+            [1, 0, -1],
+        ]).float()
+        self.kernelY = self.kernelX.clone().T
+        self.kernelX = self.kernelX.unsqueeze(0).unsqueeze(0).to(device)
+        self.kernelY = self.kernelY.unsqueeze(0).unsqueeze(0).to(device)
+
+    def forward(self, pred, gt):
+        N, C, H, W = pred.shape[0], pred.shape[1], pred.shape[2], pred.shape[3]
+        img_stack = torch.cat(
+            [pred.reshape(N*C, 1, H, W), gt.reshape(N*C, 1, H, W)], 0)
+        sobel_stack_x = F.conv2d(img_stack, self.kernelX, padding=1)
+        sobel_stack_y = F.conv2d(img_stack, self.kernelY, padding=1)
+        pred_X, gt_X = sobel_stack_x[:N*C], sobel_stack_x[N*C:]
+        pred_Y, gt_Y = sobel_stack_y[:N*C], sobel_stack_y[N*C:]
+
+        L1X, L1Y = torch.abs(pred_X-gt_X), torch.abs(pred_Y-gt_Y)
+        loss = (L1X+L1Y)
+        return loss
+
+
+class MeanShift(nn.Conv2d):
+    def __init__(self, data_mean, data_std, data_range=1, norm=True):
+        c = len(data_mean)
+        super(MeanShift, self).__init__(c, c, kernel_size=1)
+        std = torch.Tensor(data_std)
+        self.weight.data = torch.eye(c).view(c, c, 1, 1)
+        if norm:
+            self.weight.data.div_(std.view(c, 1, 1, 1))
+            self.bias.data = -1 * data_range * torch.Tensor(data_mean)
+            self.bias.data.div_(std)
+        else:
+            self.weight.data.mul_(std.view(c, 1, 1, 1))
+            self.bias.data = data_range * torch.Tensor(data_mean)
+        self.requires_grad = False
+
+
+class VGGPerceptualLoss(torch.nn.Module):
+    def __init__(self, rank=0):
+        super(VGGPerceptualLoss, self).__init__()
+        pretrained = True
+        self.vgg_pretrained_features = models.vgg19(pretrained=pretrained).features
+        self.normalize = MeanShift([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], norm=True).cuda()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, X, Y, indices=None):
+        X = self.normalize(X)
+        Y = self.normalize(Y)
+        indices = [2, 7, 12, 21, 30]
+        weights = [1.0/2.6, 1.0/4.8, 1.0/3.7, 1.0/5.6, 10/1.5]
+        k = 0
+        loss = 0
+        for i in range(indices[-1]):
+            X = self.vgg_pretrained_features[i](X)
+            Y = self.vgg_pretrained_features[i](Y)
+            if (i+1) in indices:
+                loss += weights[k] * (X - Y.detach()).abs().mean() * 0.1
+                k += 1
+        return loss
+
+
+if __name__ == '__main__':
+    img0 = torch.zeros(3, 3, 256, 256).float().to(device)
+    img1 = torch.tensor(np.random.normal(
+        0, 1, (3, 3, 256, 256))).float().to(device)
+    ternary_loss = Ternary()
+    print(ternary_loss(img0, img1).shape)
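`Ternary.rgb2gray` above converts with the ITU-R BT.601 luma weights (0.2989, 0.5870, 0.1140). The same per-pixel computation without torch, to make the weighting concrete (the `luma` helper is ours):

```python
def luma(r: float, g: float, b: float) -> float:
    """BT.601 grayscale value for one RGB pixel, matching Ternary.rgb2gray."""
    return 0.2989 * r + 0.5870 * g + 0.1140 * b

# The weights sum to 0.9999, so pure white maps to ~1.0 rather than exactly 1.0.
print(luma(1.0, 1.0, 1.0))
print(luma(0.0, 0.0, 0.0))  # black stays 0.0
```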
model/pytorch_msssim/__init__.py ADDED
@@ -0,0 +1,198 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from math import exp
4
+ import numpy as np
5
+
6
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7
+
8
+ def gaussian(window_size, sigma):
9
+ gauss = torch.Tensor([exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)])
10
+ return gauss/gauss.sum()
11
+
12
+
13
+ def create_window(window_size, channel=1):
14
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
15
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0).to(device)
16
+ window = _2D_window.expand(channel, 1, window_size, window_size).contiguous()
17
+ return window
18
+
19
+ def create_window_3d(window_size, channel=1):
20
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
21
+ _2D_window = _1D_window.mm(_1D_window.t())
22
+ _3D_window = _2D_window.unsqueeze(2) @ (_1D_window.t())
23
+ window = _3D_window.expand(1, channel, window_size, window_size, window_size).contiguous().to(device)
24
+ return window
25
+
26
+
27
+ def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
28
+ # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
29
+ if val_range is None:
30
+ if torch.max(img1) > 128:
31
+ max_val = 255
32
+ else:
33
+ max_val = 1
34
+
35
+ if torch.min(img1) < -0.5:
36
+ min_val = -1
37
+ else:
38
+ min_val = 0
39
+ L = max_val - min_val
40
+ else:
41
+ L = val_range
42
+
43
+ padd = 0
44
+ (_, channel, height, width) = img1.size()
45
+ if window is None:
46
+ real_size = min(window_size, height, width)
47
+ window = create_window(real_size, channel=channel).to(img1.device).type_as(img1)
48
+
49
+ mu1 = F.conv2d(F.pad(img1, (5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=channel)
50
+ mu2 = F.conv2d(F.pad(img2, (5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=channel)
51
+
52
+ mu1_sq = mu1.pow(2)
53
+ mu2_sq = mu2.pow(2)
54
+ mu1_mu2 = mu1 * mu2
55
+
56
+ sigma1_sq = F.conv2d(F.pad(img1 * img1, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu1_sq
57
+ sigma2_sq = F.conv2d(F.pad(img2 * img2, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu2_sq
58
+ sigma12 = F.conv2d(F.pad(img1 * img2, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu1_mu2
59
+
60
+ C1 = (0.01 * L) ** 2
61
+ C2 = (0.03 * L) ** 2
62
+
63
+ v1 = 2.0 * sigma12 + C2
64
+ v2 = sigma1_sq + sigma2_sq + C2
65
+ cs = torch.mean(v1 / v2) # contrast sensitivity
66
+
67
+ ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
68
+
69
+ if size_average:
70
+ ret = ssim_map.mean()
71
+ else:
72
+ ret = ssim_map.mean(1).mean(1).mean(1)
73
+
74
+ if full:
75
+ return ret, cs
76
+ return ret
77
+
78
+
79
+ def ssim_matlab(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
80
+ # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
81
+ if val_range is None:
82
+ if torch.max(img1) > 128:
83
+ max_val = 255
84
+ else:
85
+ max_val = 1
86
+
87
+ if torch.min(img1) < -0.5:
88
+ min_val = -1
89
+ else:
90
+ min_val = 0
91
+ L = max_val - min_val
92
+ else:
93
+ L = val_range
94
+
95
+ padd = 0
96
+ (_, _, height, width) = img1.size()
97
+ if window is None:
98
+ real_size = min(window_size, height, width)
99
+ window = create_window_3d(real_size, channel=1).to(img1.device).type_as(img1)
100
+ # Channel is set to 1 since we consider color images as volumetric images
101
+
102
+ img1 = img1.unsqueeze(1)
103
+ img2 = img2.unsqueeze(1)
104
+
105
+     mu1 = F.conv3d(F.pad(img1, (5, 5, 5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=1)
+     mu2 = F.conv3d(F.pad(img2, (5, 5, 5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=1)
+
+     mu1_sq = mu1.pow(2)
+     mu2_sq = mu2.pow(2)
+     mu1_mu2 = mu1 * mu2
+
+     sigma1_sq = F.conv3d(F.pad(img1 * img1, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu1_sq
+     sigma2_sq = F.conv3d(F.pad(img2 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu2_sq
+     sigma12 = F.conv3d(F.pad(img1 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu1_mu2
+
+     C1 = (0.01 * L) ** 2
+     C2 = (0.03 * L) ** 2
+
+     v1 = 2.0 * sigma12 + C2
+     v2 = sigma1_sq + sigma2_sq + C2
+     cs = torch.mean(v1 / v2)  # contrast sensitivity
+
+     ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
+
+     if size_average:
+         ret = ssim_map.mean()
+     else:
+         ret = ssim_map.mean(1).mean(1).mean(1)
+
+     if full:
+         return ret, cs
+     return ret
+
+
+ def msssim(img1, img2, window_size=11, size_average=True, val_range=None, normalize=False):
+     device = img1.device
+     weights = torch.FloatTensor([0.0448, 0.2856, 0.3001, 0.2363, 0.1333]).to(device).type_as(img1)
+     levels = weights.size()[0]
+     mssim = []
+     mcs = []
+     for _ in range(levels):
+         sim, cs = ssim(img1, img2, window_size=window_size, size_average=size_average, full=True, val_range=val_range)
+         mssim.append(sim)
+         mcs.append(cs)
+
+         img1 = F.avg_pool2d(img1, (2, 2))
+         img2 = F.avg_pool2d(img2, (2, 2))
+
+     mssim = torch.stack(mssim)
+     mcs = torch.stack(mcs)
+
+     # Normalize (avoids NaNs when training unstable models; not part of the original MS-SSIM definition)
+     if normalize:
+         mssim = (mssim + 1) / 2
+         mcs = (mcs + 1) / 2
+
+     pow1 = mcs ** weights
+     pow2 = mssim ** weights
+     # From the Matlab implementation: https://ece.uwaterloo.ca/~z70wang/research/iwssim/
+     output = torch.prod(pow1[:-1] * pow2[-1])
+     return output
+
+
+ # Classes to re-use window
+ class SSIM(torch.nn.Module):
+     def __init__(self, window_size=11, size_average=True, val_range=None):
+         super(SSIM, self).__init__()
+         self.window_size = window_size
+         self.size_average = size_average
+         self.val_range = val_range
+
+         # Assume 3 channels for SSIM
+         self.channel = 3
+         self.window = create_window(window_size, channel=self.channel)
+
+     def forward(self, img1, img2):
+         (_, channel, _, _) = img1.size()
+
+         if channel == self.channel and self.window.dtype == img1.dtype:
+             window = self.window
+         else:
+             window = create_window(self.window_size, channel).to(img1.device).type(img1.dtype)
+             self.window = window
+             self.channel = channel
+
+         _ssim = ssim(img1, img2, window=window, window_size=self.window_size, size_average=self.size_average)
+         dssim = (1 - _ssim) / 2
+         return dssim
+
+
+ class MSSSIM(torch.nn.Module):
+     def __init__(self, window_size=11, size_average=True, channel=3):
+         super(MSSSIM, self).__init__()
+         self.window_size = window_size
+         self.size_average = size_average
+         self.channel = channel
+
+     def forward(self, img1, img2):
+         return msssim(img1, img2, window_size=self.window_size, size_average=self.size_average)
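The `ssim_map` above is the product of a luminance term `(2·μ₁μ₂ + C1)/(μ₁² + μ₂² + C1)` and the contrast/structure term `v1/v2`. A windowless, plain-Python sketch of the same formula (a hypothetical helper, not part of this commit) shows that identical signals score 1:

```python
def ssim_global(x, y, L=1.0):
    # Windowless SSIM over two equal-length signals in [0, L], using the
    # same constants as above: C1 = (0.01*L)**2, C2 = (0.03*L)**2.
    n = len(x)
    mu_x, mu_y = sum(x) / n, sum(y) / n
    var_x = sum((a - mu_x) ** 2 for a in x) / n
    var_y = sum((b - mu_y) ** 2 for b in y) / n
    cov = sum((a - mu_x) * (b - mu_y) for a, b in zip(x, y)) / n
    C1, C2 = (0.01 * L) ** 2, (0.03 * L) ** 2
    num = (2 * mu_x * mu_y + C1) * (2 * cov + C2)
    den = (mu_x ** 2 + mu_y ** 2 + C1) * (var_x + var_y + C2)
    return num / den
```

Mismatched signals (e.g. a reversed copy) give a value strictly below 1, which is what the training loss `dssim = (1 - ssim) / 2` penalizes.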
model/warplayer.py ADDED
@@ -0,0 +1,24 @@
+ import torch
+ import torch.nn as nn
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ backwarp_tenGrid = {}
+
+
+ def warp(tenInput, tenFlow):
+     k = (str(tenFlow.device), str(tenFlow.size()))
+     if k not in backwarp_tenGrid:
+         tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=tenFlow.device).view(
+             1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
+         tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=tenFlow.device).view(
+             1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
+         backwarp_tenGrid[k] = torch.cat(
+             [tenHorizontal, tenVertical], 1).to(tenFlow.device)
+
+     tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
+                          tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)
+
+     grid = backwarp_tenGrid[k].type_as(tenFlow)
+
+     g = (grid + tenFlow).permute(0, 2, 3, 1)
+     return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
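`warp` converts the pixel-space flow into `grid_sample`'s normalized [-1, 1] coordinates by dividing each component by half of `(size - 1)` (the `align_corners=True` convention). That conversion, isolated as a tiny hypothetical helper:

```python
def normalize_flow(fx, fy, height, width):
    # A pixel displacement of (size - 1) / 2 corresponds to a shift of
    # 1.0 in grid_sample's normalized [-1, 1] coordinates (align_corners=True),
    # matching the division performed inside `warp`.
    return fx / ((width - 1) / 2.0), fy / ((height - 1) / 2.0)
```

For a 5-pixel-wide, 9-pixel-tall frame, a flow of (2, 4) pixels maps to exactly (1.0, 1.0) in normalized units, i.e. a shift across half the image.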
packages.txt ADDED
@@ -0,0 +1 @@
+ unzip
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ git+https://github.com/linoytsaban/diffusers.git@wan22-loras
+
+ transformers<5
+ accelerate
+ safetensors
+ sentencepiece
+ peft
+ ftfy
+ imageio
+ imageio-ffmpeg
+ opencv-python
+ torchao==0.11.0
+ sam2
+
+ numpy
+ torchvision
wan22_input_2.jpg ADDED

Git LFS Details

  • SHA256: e5f312a03278dc2009fc02e61b1cd3f743ee1abd12ae184deb6ea504f8676a8a
  • Pointer size: 131 Bytes
  • Size of remote file: 234 kB
wan_controlnet.py ADDED
@@ -0,0 +1,284 @@
+ from typing import Any, Dict, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.models.transformers.transformer_wan import (
+     WanTimeTextImageEmbedding,
+     WanRotaryPosEmbed,
+     WanTransformerBlock
+ )
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ def zero_module(module):
+     for p in module.parameters():
+         nn.init.zeros_(p)
+     return module
+
+
+ class WanControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+     r"""
+     A Controlnet Transformer model for video-like data used in the Wan model.
+
+     Args:
+         patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+             3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+         num_attention_heads (`int`, defaults to `40`):
+             The number of heads to use for multi-head attention.
+         attention_head_dim (`int`, defaults to `128`):
+             The number of channels in each attention head.
+         in_channels (`int`, defaults to `3`):
+             The number of channels in the controlnet input.
+         vae_channels (`int`, defaults to `16`):
+             The number of channels in the VAE input.
+         text_dim (`int`, defaults to `4096`):
+             Input dimension for text embeddings.
+         freq_dim (`int`, defaults to `256`):
+             Dimension for sinusoidal time embeddings.
+         ffn_dim (`int`, defaults to `13824`):
+             Intermediate dimension in the feed-forward network.
+         num_layers (`int`, defaults to `20`):
+             The number of transformer blocks to use.
+         cross_attn_norm (`bool`, defaults to `True`):
+             Enable cross-attention normalization.
+         qk_norm (`str`, *optional*, defaults to `"rms_norm_across_heads"`):
+             The type of query/key normalization to use.
+         eps (`float`, defaults to `1e-6`):
+             Epsilon value for normalization layers.
+         image_dim (`int`, *optional*, defaults to `None`):
+             Dimension of image embeddings (e.g. 1280 for the I2V model). If `None`, no image embedding is used.
+         added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+             The number of channels to use for the added key and value projections. If `None`, no projection is used.
+         downscale_coef (`int`, *optional*, defaults to `8`):
+             Coefficient for downscaling the controlnet input video.
+         out_proj_dim (`int`, *optional*, defaults to `128 * 12`):
+             Output projection dimension for the final linear layers.
+     """
+
+     _supports_gradient_checkpointing = True
+     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+     _no_split_modules = ["WanTransformerBlock"]
+     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+
+     @register_to_config
+     def __init__(
+         self,
+         patch_size: Tuple[int] = (1, 2, 2),
+         num_attention_heads: int = 40,
+         attention_head_dim: int = 128,
+         in_channels: int = 3,
+         vae_channels: int = 16,
+         text_dim: int = 4096,
+         freq_dim: int = 256,
+         ffn_dim: int = 13824,
+         num_layers: int = 20,
+         cross_attn_norm: bool = True,
+         qk_norm: Optional[str] = "rms_norm_across_heads",
+         eps: float = 1e-6,
+         image_dim: Optional[int] = None,
+         added_kv_proj_dim: Optional[int] = None,
+         rope_max_seq_len: int = 1024,
+         downscale_coef: int = 8,
+         out_proj_dim: int = 128 * 12,
+     ) -> None:
+         super().__init__()
+
+         start_channels = in_channels * (downscale_coef ** 2)
+         input_channels = [start_channels, start_channels // 2, start_channels // 4]
+
+         self.control_encoder = nn.ModuleList([
+             # Spatial compression with time awareness
+             nn.Sequential(
+                 nn.Conv3d(
+                     in_channels,
+                     input_channels[0],
+                     kernel_size=(3, downscale_coef + 1, downscale_coef + 1),
+                     stride=(1, downscale_coef, downscale_coef),
+                     padding=(1, downscale_coef // 2, downscale_coef // 2)
+                 ),
+                 nn.GELU(approximate="tanh"),
+                 nn.GroupNorm(2, input_channels[0]),
+             ),
+             # Spatio-temporal compression with spatial awareness
+             nn.Sequential(
+                 nn.Conv3d(input_channels[0], input_channels[1], kernel_size=3, stride=(2, 1, 1), padding=1),
+                 nn.GELU(approximate="tanh"),
+                 nn.GroupNorm(2, input_channels[1]),
+             ),
+             # Temporal compression with spatial awareness
+             nn.Sequential(
+                 nn.Conv3d(input_channels[1], input_channels[2], kernel_size=3, stride=(2, 1, 1), padding=1),
+                 nn.GELU(approximate="tanh"),
+                 nn.GroupNorm(2, input_channels[2]),
+             )
+         ])
+
+         inner_dim = num_attention_heads * attention_head_dim
+
+         # 1. Patch & position embedding
+         self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+         self.patch_embedding = nn.Conv3d(vae_channels + input_channels[2], inner_dim, kernel_size=patch_size, stride=patch_size)
+
+         # 2. Condition embeddings
+         # image_embedding_dim=1280 for the I2V model
+         self.condition_embedder = WanTimeTextImageEmbedding(
+             dim=inner_dim,
+             time_freq_dim=freq_dim,
+             time_proj_dim=inner_dim * 6,
+             text_embed_dim=text_dim,
+             image_embed_dim=image_dim,
+         )
+
+         # 3. Transformer blocks
+         self.blocks = nn.ModuleList(
+             [
+                 WanTransformerBlock(
+                     inner_dim, ffn_dim, num_attention_heads, qk_norm, cross_attn_norm, eps, added_kv_proj_dim
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+         # 4. Controlnet modules
+         self.controlnet_blocks = nn.ModuleList([])
+         for _ in range(len(self.blocks)):
+             controlnet_block = nn.Linear(inner_dim, out_proj_dim)
+             controlnet_block = zero_module(controlnet_block)
+             self.controlnet_blocks.append(controlnet_block)
+
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         timestep: torch.LongTensor,
+         encoder_hidden_states: torch.Tensor,
+         controlnet_states: torch.Tensor,
+         encoder_hidden_states_image: Optional[torch.Tensor] = None,
+         return_dict: bool = True,
+         attention_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+         if attention_kwargs is not None:
+             attention_kwargs = attention_kwargs.copy()
+             lora_scale = attention_kwargs.pop("scale", 1.0)
+         else:
+             lora_scale = 1.0
+
+         if USE_PEFT_BACKEND:
+             # weight the lora layers by setting `lora_scale` for each PEFT layer
+             scale_lora_layers(self, lora_scale)
+         else:
+             if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                 logger.warning(
+                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                 )
+
+         rotary_emb = self.rope(hidden_states)
+
+         # 0. Controlnet encoder
+         for control_encoder_block in self.control_encoder:
+             controlnet_states = control_encoder_block(controlnet_states)
+         hidden_states = torch.cat([hidden_states, controlnet_states], dim=1)
+
+         hidden_states = self.patch_embedding(hidden_states)
+         hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+         # timestep shape: (batch_size,), or (batch_size, seq_len) for Wan 2.2 ti2v
+         if timestep.ndim == 2:
+             ts_seq_len = timestep.shape[1]
+             timestep = timestep.flatten()  # batch_size * seq_len
+         else:
+             ts_seq_len = None
+
+         temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+             timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
+         )
+         if ts_seq_len is not None:
+             # batch_size, seq_len, 6, inner_dim
+             timestep_proj = timestep_proj.unflatten(2, (6, -1))
+         else:
+             # batch_size, 6, inner_dim
+             timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+         if encoder_hidden_states_image is not None:
+             encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+         # 4. Transformer blocks
+         controlnet_hidden_states = ()
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+             for block, controlnet_block in zip(self.blocks, self.controlnet_blocks):
+                 hidden_states = self._gradient_checkpointing_func(
+                     block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+                 )
+                 controlnet_hidden_states += (controlnet_block(hidden_states),)
+         else:
+             for block, controlnet_block in zip(self.blocks, self.controlnet_blocks):
+                 hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+                 controlnet_hidden_states += (controlnet_block(hidden_states),)
+
+         if USE_PEFT_BACKEND:
+             # remove `lora_scale` from each PEFT layer
+             unscale_lora_layers(self, lora_scale)
+
+         if not return_dict:
+             return (controlnet_hidden_states,)
+
+         return Transformer2DModelOutput(sample=controlnet_hidden_states)
+
+
+ if __name__ == "__main__":
+     parameters = {
+         "added_kv_proj_dim": None,
+         "attention_head_dim": 128,
+         "cross_attn_norm": True,
+         "eps": 1e-06,
+         "ffn_dim": 8960,
+         "freq_dim": 256,
+         "image_dim": None,
+         "in_channels": 3,
+         "num_attention_heads": 12,
+         "num_layers": 2,
+         "patch_size": [1, 2, 2],
+         "qk_norm": "rms_norm_across_heads",
+         "rope_max_seq_len": 1024,
+         "text_dim": 4096,
+         "downscale_coef": 8,
+         "out_proj_dim": 12 * 128,
+         "vae_channels": 16
+     }
+     controlnet = WanControlnet(**parameters)
+
+     hidden_states = torch.rand(1, 16, 13, 60, 90)
+     timestep = torch.tensor([1000]).repeat(17550).unsqueeze(0)  # torch.randint(low=0, high=1000, size=(1,), dtype=torch.long)
+     encoder_hidden_states = torch.rand(1, 512, 4096)
+     controlnet_states = torch.rand(1, 3, 49, 480, 720)
+
+     controlnet_hidden_states = controlnet(
+         hidden_states=hidden_states,
+         timestep=timestep,
+         encoder_hidden_states=encoder_hidden_states,
+         controlnet_states=controlnet_states,
+         return_dict=False
+     )
+     print("Output states count", len(controlnet_hidden_states[0]))
+     for out_hidden_states in controlnet_hidden_states[0]:
+         print(out_hidden_states.shape)
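The `zero_module` call above zero-initializes every `controlnet_blocks` projection, so at the start of training the controlnet residuals are exactly zero and the base transformer's behavior is untouched. A plain-Python illustration (hypothetical helpers, lists standing in for tensors) of why a zero-initialized projection contributes nothing:

```python
def zero_init_projection(dim_out, dim_in):
    # Analogue of zero_module(nn.Linear(...)): an all-zero weight matrix.
    return [[0.0] * dim_in for _ in range(dim_out)]

def project(weight, x):
    # Plain matrix-vector product, like the nn.Linear forward pass (no bias).
    return [sum(w * v for w, v in zip(row, x)) for row in weight]
```

Whatever the input, the projected residual is a zero vector until gradients move the weights away from zero.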
wan_i2v_input.JPG ADDED

Git LFS Details

  • SHA256: 077e3d965090c9028c69c00931675f42e1acc815c6eb450ab291b3b72d211a8e
  • Pointer size: 131 Bytes
  • Size of remote file: 251 kB
wan_t2v_controlnet_pipeline.py ADDED
@@ -0,0 +1,798 @@
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import html
+ import inspect
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+
+ import ftfy
+ import regex as re
+ import torch
+ import numpy as np
+ from PIL import Image
+ from torchvision import transforms
+ from transformers import AutoTokenizer, UMT5EncoderModel
+
+ from diffusers import WanTransformer3DModel
+ from diffusers.image_processor import PipelineImageInput
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ from diffusers.loaders import WanLoraLoaderMixin
+ from diffusers.models import AutoencoderKLWan
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+ from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.video_processor import VideoProcessor
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+ from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
+
+ from wan_transformer import CustomWanTransformer3DModel
+ from wan_controlnet import WanControlnet
+ from wan_teacache import TeaCache
+
+ if is_torch_xla_available():
+     import torch_xla.core.xla_model as xm
+
+     XLA_AVAILABLE = True
+ else:
+     XLA_AVAILABLE = False
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ def resize_for_crop(image, crop_h, crop_w):
+     img_h, img_w = image.shape[-2:]
+     if img_h >= crop_h and img_w >= crop_w:
+         coef = max(crop_h / img_h, crop_w / img_w)
+     elif img_h <= crop_h and img_w <= crop_w:
+         coef = max(crop_h / img_h, crop_w / img_w)
+     else:
+         coef = crop_h / img_h if crop_h > img_h else crop_w / img_w
+     out_h, out_w = int(img_h * coef), int(img_w * coef)
+     resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
+     return resized_image
+
+
+ def prepare_frames(input_images, video_size, do_resize=True, do_crop=True):
+     input_images = np.stack([np.array(x) for x in input_images])
+     images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1
+     if do_resize:
+         images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor]
+     if do_crop:
+         images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor]
+     if isinstance(images_tensor, list):
+         images_tensor = torch.stack(images_tensor)
+     return images_tensor.unsqueeze(0)
+
+
+ def prepare_controlnet_frames(controlnet_frames, height, width, dtype, device):
+     prepared_frames = prepare_frames(controlnet_frames, (height, width))
+     controlnet_encoded_frames = prepared_frames.to(dtype=dtype, device=device)
+     return controlnet_encoded_frames.permute(0, 2, 1, 3, 4).contiguous()
+
+
+ def basic_clean(text):
+     text = ftfy.fix_text(text)
+     text = html.unescape(html.unescape(text))
+     return text.strip()
+
+
+ def whitespace_clean(text):
+     text = re.sub(r"\s+", " ", text)
+     text = text.strip()
+     return text
+
+
+ def prompt_clean(text):
+     text = whitespace_clean(basic_clean(text))
+     return text
+
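When the frame is larger (or smaller) than the crop window in both dimensions, `resize_for_crop` above picks the larger of the two height/width ratios, which guarantees the resized frame covers the crop window in both dimensions before `center_crop` runs. That selection rule, sketched as a hypothetical helper:

```python
def cover_scale(img_h, img_w, crop_h, crop_w):
    # The larger of the two ratios makes BOTH resized dimensions >= the
    # crop size, so center-cropping afterwards never pads.
    return max(crop_h / img_h, crop_w / img_w)

# A 600x800 frame cropped to 300x500: width is the binding constraint,
# so the scale is 500/800 = 0.625 and the resized frame is 375x500.
scale = cover_scale(600, 800, 300, 500)
out_h, out_w = int(600 * scale), int(800 * scale)
```

Here 375 ≥ 300 and 500 ≥ 500, so the subsequent center crop only trims the excess 75 rows of height.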
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+ def retrieve_timesteps(
+     scheduler,
+     num_inference_steps: Optional[int] = None,
+     device: Optional[Union[str, torch.device]] = None,
+     timesteps: Optional[List[int]] = None,
+     sigmas: Optional[List[float]] = None,
+     **kwargs,
+ ):
+     r"""
+     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+     Args:
+         scheduler (`SchedulerMixin`):
+             The scheduler to get timesteps from.
+         num_inference_steps (`int`):
+             The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+             must be `None`.
+         device (`str` or `torch.device`, *optional*):
+             The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+         timesteps (`List[int]`, *optional*):
+             Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+             `num_inference_steps` and `sigmas` must be `None`.
+         sigmas (`List[float]`, *optional*):
+             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+             `num_inference_steps` and `timesteps` must be `None`.
+
+     Returns:
+         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+         second element is the number of inference steps.
+     """
+     if timesteps is not None and sigmas is not None:
+         raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+     if timesteps is not None:
+         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+         if not accepts_timesteps:
+             raise ValueError(
+                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                 f" timestep schedules. Please check whether you are using the correct scheduler."
+             )
+         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+         num_inference_steps = len(timesteps)
+     elif sigmas is not None:
+         accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+         if not accept_sigmas:
+             raise ValueError(
+                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                 f" sigmas schedules. Please check whether you are using the correct scheduler."
+             )
+         scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+         num_inference_steps = len(timesteps)
+     else:
+         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+     return timesteps, num_inference_steps
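`retrieve_timesteps` enforces that custom `timesteps` and custom `sigmas` are mutually exclusive, and that whichever one is supplied overrides `num_inference_steps`. The control flow, reduced to a scheduler-free sketch (a hypothetical helper, not part of the commit):

```python
def pick_schedule(num_inference_steps=None, timesteps=None, sigmas=None):
    # Same mutual exclusion as retrieve_timesteps above: `timesteps` and
    # `sigmas` cannot both be given; either one overrides the step count.
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
    if timesteps is not None:
        return list(timesteps), len(timesteps)
    if sigmas is not None:
        return list(sigmas), len(sigmas)
    return list(range(num_inference_steps)), num_inference_steps
```

Passing `timesteps=[999, 500, 0]` returns a 3-step schedule regardless of any `num_inference_steps` default, mirroring how the real function re-derives the step count from the scheduler after `set_timesteps`.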
158
+
159
+
160
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
161
+ def retrieve_latents(
162
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
163
+ ):
164
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
165
+ return encoder_output.latent_dist.sample(generator)
166
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
167
+ return encoder_output.latent_dist.mode()
168
+ elif hasattr(encoder_output, "latents"):
169
+ return encoder_output.latents
170
+ else:
171
+ raise AttributeError("Could not access latents of provided encoder_output")
172
+
173
+
174
+ class WanTextToVideoControlnetPipeline(DiffusionPipeline, WanLoraLoaderMixin):
175
+ r"""
176
+ Pipeline for text-to-video generation using Wan.
177
+
178
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
179
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
180
+
181
+ Args:
182
+ tokenizer ([`T5Tokenizer`]):
183
+ Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
184
+ specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
185
+ text_encoder ([`T5EncoderModel`]):
186
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
187
+ the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
188
+ transformer ([`WanTransformer3DModel`]):
189
+ Conditional Transformer to denoise the input latents.
190
+ scheduler ([`UniPCMultistepScheduler`]):
191
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
192
+ vae ([`AutoencoderKLWan`]):
193
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
194
+ """
195
+
196
+ model_cpu_offload_seq = "text_encoder->transformer->transformer_2->vae->controlnet"
197
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
198
+ _optional_components = ["transformer_2"]
199
+
200
+ def __init__(
201
+ self,
202
+ tokenizer: AutoTokenizer,
203
+ text_encoder: UMT5EncoderModel,
204
+ transformer: CustomWanTransformer3DModel,
205
+ vae: AutoencoderKLWan,
206
+ controlnet: WanControlnet,
207
+ scheduler: FlowMatchEulerDiscreteScheduler,
208
+ transformer_2: WanTransformer3DModel = None,
209
+ boundary_ratio: Optional[float] = None,
210
+ expand_timesteps: bool = False,
211
+ ):
212
+ super().__init__()
213
+
214
+ self.register_modules(
215
+ vae=vae,
216
+ text_encoder=text_encoder,
217
+ tokenizer=tokenizer,
218
+ transformer=transformer,
219
+ controlnet=controlnet,
220
+ scheduler=scheduler,
221
+ transformer_2=transformer_2,
222
+ )
223
+ self.register_to_config(boundary_ratio=boundary_ratio)
224
+ self.register_to_config(expand_timesteps=expand_timesteps)
225
+ self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
226
+ self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
227
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
228
+
229
+ def _get_t5_prompt_embeds(
230
+ self,
231
+ prompt: Union[str, List[str]] = None,
232
+ num_videos_per_prompt: int = 1,
233
+ max_sequence_length: int = 226,
234
+ device: Optional[torch.device] = None,
235
+ dtype: Optional[torch.dtype] = None,
236
+ ):
237
+ device = device or self._execution_device
238
+ dtype = dtype or self.text_encoder.dtype
239
+
240
+ prompt = [prompt] if isinstance(prompt, str) else prompt
241
+ prompt = [prompt_clean(u) for u in prompt]
242
+ batch_size = len(prompt)
243
+
244
+ text_inputs = self.tokenizer(
245
+ prompt,
246
+ padding="max_length",
247
+ max_length=max_sequence_length,
248
+ truncation=True,
249
+ add_special_tokens=True,
250
+ return_attention_mask=True,
251
+ return_tensors="pt",
252
+ )
253
+ text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
254
+ seq_lens = mask.gt(0).sum(dim=1).long()
255
+
256
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
257
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
258
+ prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
259
+ prompt_embeds = torch.stack(
260
+ [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
261
+ )
262
+
263
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
264
+ _, seq_len, _ = prompt_embeds.shape
265
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
266
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
267
+
268
+ return prompt_embeds
269
+
270
+ def encode_prompt(
271
+ self,
272
+ prompt: Union[str, List[str]],
273
+ negative_prompt: Optional[Union[str, List[str]]] = None,
274
+ do_classifier_free_guidance: bool = True,
275
+ num_videos_per_prompt: int = 1,
276
+ prompt_embeds: Optional[torch.Tensor] = None,
277
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
278
+ max_sequence_length: int = 226,
279
+ device: Optional[torch.device] = None,
280
+ dtype: Optional[torch.dtype] = None,
281
+ ):
282
+ r"""
283
+ Encodes the prompt into text encoder hidden states.
284
+
285
+ Args:
286
+ prompt (`str` or `List[str]`, *optional*):
287
+ prompt to be encoded
288
+ negative_prompt (`str` or `List[str]`, *optional*):
289
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
290
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
291
+ less than `1`).
292
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
293
+ Whether to use classifier free guidance or not.
294
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
295
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
296
+ prompt_embeds (`torch.Tensor`, *optional*):
297
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
298
+ provided, text embeddings will be generated from `prompt` input argument.
299
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
300
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
301
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
302
+ argument.
303
+ device: (`torch.device`, *optional*):
304
+ torch device
305
+ dtype: (`torch.dtype`, *optional*):
306
+ torch dtype
307
+ """
308
+ device = device or self._execution_device
309
+
310
+ prompt = [prompt] if isinstance(prompt, str) else prompt
311
+ if prompt is not None:
312
+ batch_size = len(prompt)
313
+ else:
314
+ batch_size = prompt_embeds.shape[0]
315
+
316
+ if prompt_embeds is None:
317
+ prompt_embeds = self._get_t5_prompt_embeds(
318
+ prompt=prompt,
319
+ num_videos_per_prompt=num_videos_per_prompt,
320
+ max_sequence_length=max_sequence_length,
321
+ device=device,
322
+ dtype=dtype,
323
+ )
324
+
325
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
326
+ negative_prompt = negative_prompt or ""
327
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
328
+
329
+ if prompt is not None and type(prompt) is not type(negative_prompt):
330
+ raise TypeError(
331
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
332
+ f" {type(prompt)}."
333
+ )
334
+ elif batch_size != len(negative_prompt):
335
+ raise ValueError(
336
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
337
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
338
+ " the batch size of `prompt`."
339
+ )
340
+
341
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
342
+ prompt=negative_prompt,
343
+ num_videos_per_prompt=num_videos_per_prompt,
344
+ max_sequence_length=max_sequence_length,
345
+ device=device,
346
+ dtype=dtype,
347
+ )
348
+
349
+ return prompt_embeds, negative_prompt_embeds
350
+
+ def check_inputs(
+ self,
+ prompt,
+ negative_prompt,
+ height,
+ width,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ callback_on_step_end_tensor_inputs=None,
+ guidance_scale_2=None,
+ ):
+ if height % 16 != 0 or width % 16 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+ if callback_on_step_end_tensor_inputs is not None and not all(
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+ ):
+ raise ValueError(
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ elif negative_prompt is not None and (
+ not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+ ):
+ raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+ if self.config.boundary_ratio is None and guidance_scale_2 is not None:
+ raise ValueError("`guidance_scale_2` is only supported when the pipeline's `boundary_ratio` is not None.")
+
+ def prepare_latents(
+ self,
+ batch_size: int,
+ num_channels_latents: int = 16,
+ height: int = 480,
+ width: int = 832,
+ num_frames: int = 81,
+ dtype: Optional[torch.dtype] = None,
+ device: Optional[torch.device] = None,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ if latents is not None:
+ return latents.to(device=device, dtype=dtype)
+
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+ shape = (
+ batch_size,
+ num_channels_latents,
+ num_latent_frames,
+ int(height) // self.vae_scale_factor_spatial,
+ int(width) // self.vae_scale_factor_spatial,
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ return latents
+
+ @property
+ def guidance_scale(self):
+ return self._guidance_scale
+
+ @property
+ def do_classifier_free_guidance(self):
+ return self._guidance_scale > 1.0
+
+ @property
+ def num_timesteps(self):
+ return self._num_timesteps
+
+ @property
+ def current_timestep(self):
+ return self._current_timestep
+
+ @property
+ def interrupt(self):
+ return self._interrupt
+
+ @property
+ def attention_kwargs(self):
+ return self._attention_kwargs
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ controlnet_frames: List[Image.Image] = None,
+ prompt: Union[str, List[str]] = None,
+ negative_prompt: Union[str, List[str]] = None,
+ height: int = 480,
+ width: int = 832,
+ num_frames: int = 81,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 5.0,
+ guidance_scale_2: Optional[float] = None,
+ num_videos_per_prompt: Optional[int] = 1,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.Tensor] = None,
+ controlnet_latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ output_type: Optional[str] = "np",
+ return_dict: bool = True,
+ attention_kwargs: Optional[Dict[str, Any]] = None,
+ callback_on_step_end: Optional[
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+ ] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ max_sequence_length: int = 512,
+
+ controlnet_weight: float = 1.0,
+ controlnet_guidance_start: float = 0.0,
+ controlnet_guidance_end: float = 1.0,
+ controlnet_stride: int = 3,
+
+ teacache_state: Optional[TeaCache] = None,
+ teacache_treshold: float = 0.0,
+ ):
+ r"""
488
+ The call function to the pipeline for generation.
489
+
490
+ Args:
491
+ prompt (`str` or `List[str]`, *optional*):
492
+ The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
493
+ negative_prompt (`str` or `List[str]`, *optional*):
494
+ The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
495
+ instead. Ignored when not using guidance (`guidance_scale` < `1`).
496
+ height (`int`, defaults to `480`):
497
+ The height in pixels of the generated image.
498
+ width (`int`, defaults to `832`):
499
+ The width in pixels of the generated image.
500
+ num_frames (`int`, defaults to `81`):
501
+ The number of frames in the generated video.
502
+ num_inference_steps (`int`, defaults to `50`):
503
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
504
+ expense of slower inference.
505
+ guidance_scale (`float`, defaults to `5.0`):
506
+ Guidance scale as defined in [Classifier-Free Diffusion
507
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
508
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
509
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
510
+ the text `prompt`, usually at the expense of lower image quality.
511
+ guidance_scale_2 (`float`, *optional*, defaults to `None`):
512
+ Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
513
+ `boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
514
+ and the pipeline's `boundary_ratio` are not None.
515
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
516
+ The number of images to generate per prompt.
517
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
518
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
519
+ generation deterministic.
520
+ latents (`torch.Tensor`, *optional*):
521
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
522
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
523
+ tensor is generated by sampling using the supplied random `generator`.
524
+ prompt_embeds (`torch.Tensor`, *optional*):
525
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
526
+ provided, text embeddings are generated from the `prompt` input argument.
527
+ output_type (`str`, *optional*, defaults to `"np"`):
528
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
529
+ return_dict (`bool`, *optional*, defaults to `True`):
530
+ Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
531
+ attention_kwargs (`dict`, *optional*):
532
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
533
+ `self.processor` in
534
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
535
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
536
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
537
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
538
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
539
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
540
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
541
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
542
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
543
+ `._callback_tensor_inputs` attribute of your pipeline class.
544
+ max_sequence_length (`int`, defaults to `512`):
545
+ The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
546
+ truncated. If the prompt is shorter, it will be padded to this length.
547
+ controlnet_weight (`float`, defaults to `0.8`):
548
+ Wigight for controlnet modules.
549
+ controlnet_guidance_start (`float`, defaults to `0.0`):
550
+ When start do control.
551
+ controlnet_guidance_end (`float`, defaults to `0.8`):
552
+ When finish do control.
553
+ controlnet_stride (`int`, defaults to `3`):
554
+ Stride for controlnet blocks.
555
+ Examples:
556
+
557
+ Returns:
558
+ [`~WanPipelineOutput`] or `tuple`:
559
+ If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
560
+ the first element is a list with the generated images and the second element is a list of `bool`s
561
+ indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
562
+ """
563
+ self.teacache = teacache_state or None
+ if (self.teacache is None) and (teacache_treshold > 0.0):
+ self.teacache = TeaCache(
+ num_inference_steps=num_inference_steps,
+ model_name="DEFAULT",
+ treshold=teacache_treshold
+ )
+
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ negative_prompt,
+ height,
+ width,
+ prompt_embeds,
+ negative_prompt_embeds,
+ callback_on_step_end_tensor_inputs,
+ guidance_scale_2,
+ )
+
+ if num_frames % self.vae_scale_factor_temporal != 1:
+ logger.warning(
+ f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+ )
+ num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+ num_frames = max(num_frames, 1)
+
+ if self.config.boundary_ratio is not None and guidance_scale_2 is None:
+ guidance_scale_2 = guidance_scale
+
+ self._guidance_scale = guidance_scale
+ self._guidance_scale_2 = guidance_scale_2
+ self._attention_kwargs = attention_kwargs
+ self._current_timestep = None
+ self._interrupt = False
+
+ device = self._execution_device
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # 3. Encode input prompt
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ num_videos_per_prompt=num_videos_per_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ max_sequence_length=max_sequence_length,
+ device=device,
+ )
+
+ transformer_dtype = self.transformer.dtype
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
+ if negative_prompt_embeds is not None:
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+ # 4. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps = self.scheduler.timesteps
+
+ # 5. Prepare latent variables
+ num_channels_latents = self.transformer.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ num_frames,
+ torch.float32,
+ device,
+ generator,
+ latents,
+ )
+
+ mask = torch.ones(latents.shape, dtype=torch.float32, device=device)
+
+ # 6. Encode controlnet frames
+ if (controlnet_latents is None) and (controlnet_frames is not None):
+ duplicate_frames_count = num_frames - len(controlnet_frames)
+ print(f'Using controlnet frames: {len(controlnet_frames)}. Extended frames count: {duplicate_frames_count}')
+ if duplicate_frames_count > 0:
+ # Simply duplicate the first frame
+ # controlnet_frames = [controlnet_frames[0]] * duplicate_frames_count + controlnet_frames
+ # Or pad with reversed duplicate frames?
+ reversed_controlnet_frames = list(reversed(controlnet_frames))
+ controlnet_sum_frames = controlnet_frames + reversed_controlnet_frames
+ reversed_chunks_count = num_frames // len(controlnet_sum_frames)
+ controlnet_frames = [*controlnet_sum_frames]
+ for _ in range(reversed_chunks_count):
+ controlnet_frames += controlnet_sum_frames
+
+ # If the controlnet frame count is greater than the num_frames parameter
+ controlnet_frames = controlnet_frames[:num_frames]
+
+ controlnet_latents = prepare_controlnet_frames(
+ controlnet_frames,
+ height,
+ width,
+ dtype=self.controlnet.dtype,
+ device=self.controlnet.device
+ )
+
+ # 7. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ self._num_timesteps = len(timesteps)
+
+ if self.config.boundary_ratio is not None:
+ boundary_timestep = self.config.boundary_ratio * self.scheduler.config.num_train_timesteps
+ else:
+ boundary_timestep = None
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ if self.interrupt:
+ continue
+
+ self._current_timestep = t
+
+ if boundary_timestep is None or t >= boundary_timestep:
+ # wan2.1 or high-noise stage in wan2.2
+ current_model = self.transformer
+ current_guidance_scale = guidance_scale
+ else:
+ # low-noise stage in wan2.2
+ current_model = self.transformer_2
+ current_guidance_scale = guidance_scale_2
+
+ latent_model_input = latents.to(transformer_dtype)
+ if self.config.expand_timesteps:
+ # seq_len: num_latent_frames * latent_height//2 * latent_width//2
+ temp_ts = (mask[0][0][:, ::2, ::2] * t).flatten()
+ # batch_size, seq_len
+ timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
+ else:
+ timestep = t.expand(latents.shape[0])
+
+ controlnet_states = None
+ current_sampling_percent = i / len(timesteps)
+ if (controlnet_latents is not None) and (controlnet_guidance_start <= current_sampling_percent < controlnet_guidance_end):
+ controlnet_states = self.controlnet(
+ hidden_states=latent_model_input,
+ timestep=timestep,
+ encoder_hidden_states=prompt_embeds,
+ attention_kwargs=attention_kwargs,
+ controlnet_states=controlnet_latents,
+ return_dict=False,
+ )[0]
+ if isinstance(controlnet_states, (tuple, list)):
+ controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
+ else:
+ controlnet_states = controlnet_states.to(dtype=self.transformer.dtype)
+
+ with current_model.cache_context("cond"):
+ noise_pred = current_model(
+ hidden_states=latent_model_input,
+ timestep=timestep,
+ encoder_hidden_states=prompt_embeds,
+ controlnet_states=controlnet_states,
+ controlnet_weight=controlnet_weight,
+ controlnet_stride=controlnet_stride,
+ teacache=self.teacache,
+ attention_kwargs=attention_kwargs,
+ return_dict=False,
+ )[0]
+
+ if self.do_classifier_free_guidance:
+ with current_model.cache_context("uncond"):
+ noise_uncond = current_model(
+ hidden_states=latent_model_input,
+ timestep=timestep,
+ encoder_hidden_states=negative_prompt_embeds,
+ controlnet_states=controlnet_states,
+ controlnet_weight=controlnet_weight,
+ controlnet_stride=controlnet_stride,
+ teacache=self.teacache,
+ attention_kwargs=attention_kwargs,
+ return_dict=False,
+ )[0]
+ noise_pred = noise_uncond + current_guidance_scale * (noise_pred - noise_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+
+ if XLA_AVAILABLE:
+ xm.mark_step()
+
+ self._current_timestep = None
+ self.teacache = None
+
+ if not output_type == "latent":
+ latents = latents.to(self.vae.dtype)
+ latents_mean = (
+ torch.tensor(self.vae.config.latents_mean)
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
+ .to(latents.device, latents.dtype)
+ )
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+ latents.device, latents.dtype
+ )
+ latents = latents / latents_std + latents_mean
+ video = self.vae.decode(latents, return_dict=False)[0]
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
+ else:
+ video = latents
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (video,)
+
+ return WanPipelineOutput(frames=video)
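The classifier-free guidance combination used in the denoising loop above (`noise_uncond + guidance_scale * (noise_pred - noise_uncond)`) can be sketched in plain Python; the values below are hypothetical, chosen only to make the arithmetic visible:

```python
# Classifier-free guidance: blend the conditional and unconditional
# predictions, noise = uncond + w * (cond - uncond). With w = 1 this
# reduces to the conditional prediction; larger w pushes further
# toward the prompt-conditioned direction.
def apply_cfg(noise_cond, noise_uncond, guidance_scale):
    return [u + guidance_scale * (c - u) for c, u in zip(noise_cond, noise_uncond)]

# Hypothetical per-element predictions:
combined = apply_cfg([1.0, 2.0], [0.0, 0.0], guidance_scale=5.0)
# -> [5.0, 10.0]
```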
wan_teacache.py ADDED
@@ -0,0 +1,78 @@
+ import torch
+ import numpy as np
+
+
+ coefficients = {
+ "DEFAULT": [-1.12343328e+02, 1.50680483e+02, -5.15023303e+01, 6.24892431e+00, 6.85022158e-02],
+ }
+
+
+ class TeaCache:
+ def __init__(self, num_inference_steps, model_name, treshold=0.3, start_step_treshold=0.1, end_step_treshold=0.9):
+ self.input_bank = []
+ self.current_step = 0
+ self.accumulated_distance = 0.0
+ self.num_inference_steps = num_inference_steps * 2
+ self.start_step_teacache = int(num_inference_steps * start_step_treshold) * 2
+ self.end_step_teacache = int(num_inference_steps * end_step_treshold) * 2
+ self.treshold = treshold # [0.3, 0.5, 0.7, 0.9]
+ self.coefficients = coefficients[model_name]
+ self.step_name = "even"
+ self.init_memory()
+
+ def init_memory(self):
+ self.accumulated_distance = {
+ "even": 0.0,
+ "odd": 0.0,
+ }
+ self.flow_direction = {
+ "even": None,
+ "odd": None,
+ }
+ self.previous_modulated_input = {
+ "even": None,
+ "odd": None,
+ }
+ # print("TEACACHE MEMORY HAS BEEN CREATED")
+
+ def check_for_using_cached_value(self, modulated_input):
+ use_tea_cache = (self.treshold > 0.0) and (self.start_step_teacache <= self.current_step < self.end_step_teacache)
+ self.step_name = "even" if self.current_step % 2 == 0 else "odd"
+
+ use_cached_value = False
+ if use_tea_cache:
+ rescale_func = np.poly1d(self.coefficients)
+ current_distance = rescale_func(
+ self.calculate_distance(modulated_input, self.previous_modulated_input[self.step_name])
+ )
+ self.accumulated_distance[self.step_name] += current_distance
+
+ if self.accumulated_distance[self.step_name] < self.treshold:
+ use_cached_value = True
+ else:
+ use_cached_value = False
+ self.accumulated_distance[self.step_name] = 0.0
+
+ if self.step_name == "even":
+ self.input_bank.append(modulated_input.cpu())
+
+ self.previous_modulated_input[self.step_name] = modulated_input.clone()
+ # if use_tea_cache:
+ # print(f"[ STEP:{self.current_step} | USE CACHED VALUE: {use_cached_value} | ACCUMULATED DISTANCE: {self.accumulated_distance} | CURRENT DISTANCE: {current_distance} ]")
+ return use_cached_value
+
+ def use_cache(self, hidden_states):
+ return hidden_states + self.flow_direction[self.step_name].to(device=hidden_states.device)
+
+ def calculate_distance(self, previous_tensor, current_tensor):
+ relative_l1_distance = torch.abs(
+ previous_tensor - current_tensor
+ ).mean() / torch.abs(previous_tensor).mean()
+ return relative_l1_distance.to(torch.float32).cpu().item()
+
+ def update(self, flow_direction):
+ self.flow_direction[self.step_name] = flow_direction
+ self.current_step += 1
+ if self.current_step == self.num_inference_steps:
+ self.current_step = 0
+ self.init_memory()
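The relative L1 metric that `TeaCache.calculate_distance` accumulates can be illustrated without torch; this is a minimal plain-Python sketch of the same formula, mean(|prev - curr|) / mean(|prev|), on hypothetical lists of floats:

```python
# Relative L1 distance between two equal-length sequences:
# mean absolute difference, normalized by the mean magnitude of
# the previous sequence. Small values mean the modulated input
# barely changed, so the cached residual can be reused.
def relative_l1_distance(previous, current):
    n = len(previous)
    mean_abs_diff = sum(abs(p - c) for p, c in zip(previous, current)) / n
    mean_abs_prev = sum(abs(p) for p in previous) / n
    return mean_abs_diff / mean_abs_prev

d = relative_l1_distance([2.0, 2.0], [1.0, 1.0])
# -> 0.5 (average change of 1.0 against an average magnitude of 2.0)
```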
wan_transformer.py ADDED
@@ -0,0 +1,135 @@
+ from typing import Any, Dict, Optional, Union
+
+ import torch
+ from diffusers import WanTransformer3DModel
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+ from wan_teacache import TeaCache
+
+
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+ class CustomWanTransformer3DModel(WanTransformer3DModel):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: torch.LongTensor,
+ encoder_hidden_states: torch.Tensor,
+ encoder_hidden_states_image: Optional[torch.Tensor] = None,
+ return_dict: bool = True,
+ attention_kwargs: Optional[Dict[str, Any]] = None,
+
+ controlnet_states: torch.Tensor = None,
+ controlnet_weight: Optional[float] = 1.0,
+ controlnet_stride: Optional[int] = 1,
+ teacache: Optional[TeaCache] = None,
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+ if attention_kwargs is not None:
+ attention_kwargs = attention_kwargs.copy()
+ lora_scale = attention_kwargs.pop("scale", 1.0)
+ else:
+ lora_scale = 1.0
+
+ if USE_PEFT_BACKEND:
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
+ scale_lora_layers(self, lora_scale)
+ else:
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+ logger.warning(
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+ )
+
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
+ p_t, p_h, p_w = self.config.patch_size
+ post_patch_num_frames = num_frames // p_t
+ post_patch_height = height // p_h
+ post_patch_width = width // p_w
+
+ rotary_emb = self.rope(hidden_states)
+
+ hidden_states = self.patch_embedding(hidden_states)
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+ # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
+ if timestep.ndim == 2:
+ ts_seq_len = timestep.shape[1]
+ timestep = timestep.flatten() # batch_size * seq_len
+ else:
+ ts_seq_len = None
+
+ temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+ timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
+ )
+ if ts_seq_len is not None:
+ # batch_size, seq_len, 6, inner_dim
+ timestep_proj = timestep_proj.unflatten(2, (6, -1))
+ else:
+ # batch_size, 6, inner_dim
+ timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+ if encoder_hidden_states_image is not None:
+ encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+ use_cached_value = False
+ original_hidden_states = None
+ if (teacache is not None) and (teacache.treshold > 0.0):
+ original_hidden_states = hidden_states.clone()
+ use_cached_value = teacache.check_for_using_cached_value(temb)
+
+ if use_cached_value:
+ hidden_states = teacache.use_cache(hidden_states)
+ else:
+ # 4. Transformer blocks
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
+ for i, block in enumerate(self.blocks):
+ hidden_states = self._gradient_checkpointing_func(
+ block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+ )
+
+ if (controlnet_states is not None) and (i % controlnet_stride == 0) and (i // controlnet_stride < len(controlnet_states)):
+ hidden_states = hidden_states + controlnet_states[i // controlnet_stride] * controlnet_weight
+ else:
+ for i, block in enumerate(self.blocks):
+ hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+
+ if (controlnet_states is not None) and (i % controlnet_stride == 0) and (i // controlnet_stride < len(controlnet_states)):
+ hidden_states = hidden_states + controlnet_states[i // controlnet_stride] * controlnet_weight
+
+ if (teacache is not None) and (teacache.treshold > 0.0):
+ teacache.update(hidden_states - original_hidden_states)
+
+ # 5. Output norm, projection & unpatchify
+ if temb.ndim == 3:
+ # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
+ shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
+ shift = shift.squeeze(2)
+ scale = scale.squeeze(2)
+ else:
+ # batch_size, inner_dim
+ shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
+
+ # Move the shift and scale tensors to the same device as hidden_states.
+ # When using multi-GPU inference via accelerate these will be on the
+ # first device rather than the last device, which hidden_states ends up
+ # on.
+ shift = shift.to(hidden_states.device)
+ scale = scale.to(hidden_states.device)
+
+ hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+ hidden_states = self.proj_out(hidden_states)
+
+ hidden_states = hidden_states.reshape(
+ batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+ )
+ hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+ output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+ if USE_PEFT_BACKEND:
+ # remove `lora_scale` from each PEFT layer
+ unscale_lora_layers(self, lora_scale)
+
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
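The stride-based injection condition in the block loop above (`i % controlnet_stride == 0` and `i // controlnet_stride < len(controlnet_states)`) maps ControlNet residuals onto transformer blocks. A minimal sketch of which (block, residual) pairs that condition selects, with hypothetical block and residual counts:

```python
# For num_blocks transformer blocks and num_residuals ControlNet outputs,
# block i receives residual i // stride when i is a multiple of stride
# and the residual index is still in range -- the same condition as in
# CustomWanTransformer3DModel.forward.
def injected_blocks(num_blocks, num_residuals, stride):
    return [
        (i, i // stride)
        for i in range(num_blocks)
        if i % stride == 0 and i // stride < num_residuals
    ]

pairs = injected_blocks(num_blocks=12, num_residuals=3, stride=3)
# -> [(0, 0), (3, 1), (6, 2)]: block 9 gets nothing because only 3 residuals exist
```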
workflows/sam2.1_optimized.json ADDED
The diff for this file is too large to render. See raw diff
 
workflows/sam_optimized.json ADDED
The diff for this file is too large to render. See raw diff
 
workflows/vace_optimized.json ADDED
@@ -0,0 +1,1043 @@
+ {
+ "id": "960108a5-bf9d-497f-a6e5-4c5c3e41c056",
+ "revision": 0,
+ "last_node_id": 37,
+ "last_link_id": 93,
+ "nodes": [
+ {
+ "id": 11,
+ "type": "ModelSamplingSD3",
+ "pos": [
+ 442.7779541015625,
+ 942.9921264648438
+ ],
+ "size": [
+ 210,
+ 58
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 91
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ModelSamplingSD3"
+ },
+ "widgets_values": [
+ 2.0000000000000004
+ ]
+ },
+ {
+ "id": 32,
+ "type": "VHS_LoadVideo",
+ "pos": [
+ 120.05851745605469,
+ 397.98248291015625
+ ],
+ "size": [
+ 253.279296875,
+ 310
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "meta_batch",
+ "shape": 7,
+ "type": "VHS_BatchManager",
+ "link": null
+ },
+ {
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": null
+ },
+ {
+ "name": "frame_load_cap",
+ "type": "INT",
+ "widget": {
+ "name": "frame_load_cap"
+ },
+ "link": 76
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 86
+ ]
+ },
+ {
+ "name": "frame_count",
+ "type": "INT",
+ "links": [
+ 78
+ ]
+ },
+ {
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "name": "video_info",
+ "type": "VHS_VIDEOINFO",
+ "links": null
+ }
+ ],
+ "title": "Upload the mask-composited video",
+ "properties": {
+ "Node name for S&R": "VHS_LoadVideo"
+ },
+ "widgets_values": {
+ "video": "sam2.1_00182.mp4",
+ "force_rate": 16,
+ "custom_width": 0,
+ "custom_height": 0,
+ "frame_load_cap": 0,
+ "skip_first_frames": 0,
+ "select_every_nth": 1,
+ "format": "Wan",
+ "choose video to upload": "image",
+ "videopreview": {
+ "hidden": false,
+ "paused": false,
+ "params": {
+ "filename": "sam2.1_00182.mp4",
+ "type": "input",
+ "format": "video/mp4",
+ "force_rate": 16,
+ "custom_width": 0,
+ "custom_height": 0,
+ "frame_load_cap": 0,
+ "skip_first_frames": 0,
+ "select_every_nth": 1
+ }
+ }
+ }
+ },
+ {
+ "id": 33,
+ "type": "VHS_LoadVideo",
+ "pos": [
+ 112.58995056152344,
+ 753.9783325195312
+ ],
+ "size": [
+ 253.279296875,
+ 310
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "meta_batch",
+ "shape": 7,
+ "type": "VHS_BatchManager",
+ "link": null
+ },
+ {
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 85
+ ]
+ },
+ {
+ "name": "frame_count",
+ "type": "INT",
+ "links": [
+ 76
+ ]
+ },
+ {
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "name": "video_info",
+ "type": "VHS_VIDEOINFO",
+ "links": null
+ }
+ ],
+ "title": "Upload the mask video (the black-and-white one)",
+ "properties": {
+ "Node name for S&R": "VHS_LoadVideo"
196
+ },
197
+ "widgets_values": {
198
+ "video": "sam2.1_00181.mp4",
199
+ "force_rate": 0,
200
+ "custom_width": 0,
201
+ "custom_height": 0,
202
+ "frame_load_cap": 0,
203
+ "skip_first_frames": 0,
204
+ "select_every_nth": 1,
205
+ "format": "Wan",
206
+ "choose video to upload": "image",
207
+ "videopreview": {
208
+ "hidden": false,
209
+ "paused": false,
210
+ "params": {
211
+ "filename": "sam2.1_00181.mp4",
212
+ "type": "input",
213
+ "format": "video/mp4",
214
+ "force_rate": 0,
215
+ "custom_width": 0,
216
+ "custom_height": 0,
217
+ "frame_load_cap": 0,
218
+ "skip_first_frames": 0,
219
+ "select_every_nth": 1
220
+ }
221
+ }
222
+ }
223
+ },
224
+ {
225
+ "id": 35,
226
+ "type": "GrowMask",
227
+ "pos": [
228
+ 722.2931518554688,
229
+ 1093.416015625
230
+ ],
231
+ "size": [
232
+ 270,
233
+ 82
234
+ ],
235
+ "flags": {},
236
+ "order": 10,
237
+ "mode": 0,
238
+ "inputs": [
239
+ {
240
+ "name": "mask",
241
+ "type": "MASK",
242
+ "link": 79
243
+ }
244
+ ],
245
+ "outputs": [
246
+ {
247
+ "name": "MASK",
248
+ "type": "MASK",
249
+ "links": [
250
+ 80
251
+ ]
252
+ }
253
+ ],
254
+ "properties": {
255
+ "Node name for S&R": "GrowMask"
256
+ },
257
+ "widgets_values": [
258
+ 5,
259
+ true
260
+ ]
261
+ },
262
+ {
263
+ "id": 6,
264
+ "type": "CLIPLoader",
265
+ "pos": [
266
+ 111.71733093261719,
267
+ 1112.0469970703125
268
+ ],
269
+ "size": [
270
+ 210,
271
+ 106
272
+ ],
273
+ "flags": {},
274
+ "order": 1,
275
+ "mode": 0,
276
+ "inputs": [],
277
+ "outputs": [
278
+ {
279
+ "name": "CLIP",
280
+ "type": "CLIP",
281
+ "slot_index": 0,
282
+ "links": [
283
+ 92,
284
+ 93
285
+ ]
286
+ }
287
+ ],
288
+ "properties": {
289
+ "Node name for S&R": "CLIPLoader"
290
+ },
291
+ "widgets_values": [
292
+ "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
293
+ "wan",
294
+ "cpu"
295
+ ]
296
+ },
297
+ {
298
+ "id": 8,
299
+ "type": "UNETLoader",
300
+ "pos": [
301
+ 153.8439178466797,
302
+ 269.8687438964844
303
+ ],
304
+ "size": [
305
+ 210,
306
+ 82
307
+ ],
308
+ "flags": {},
309
+ "order": 2,
310
+ "mode": 0,
311
+ "inputs": [],
312
+ "outputs": [
313
+ {
314
+ "name": "MODEL",
315
+ "type": "MODEL",
316
+ "slot_index": 0,
317
+ "links": [
318
+ 91
319
+ ]
320
+ }
321
+ ],
322
+ "properties": {
323
+ "Node name for S&R": "UNETLoader"
324
+ },
325
+ "widgets_values": [
326
+ "Wan2.1_T2V_14B_FusionX_VACE-FP8_e4m3fn.safetensors",
327
+ "default"
328
+ ]
329
+ },
330
+ {
331
+ "id": 15,
332
+ "type": "CLIPTextEncode",
333
+ "pos": [
334
+ 451.8983154296875,
335
+ 311.4078674316406
336
+ ],
337
+ "size": [
338
+ 494.83367919921875,
339
+ 119.57742309570312
340
+ ],
341
+ "flags": {},
342
+ "order": 8,
343
+ "mode": 0,
344
+ "inputs": [
345
+ {
346
+ "name": "clip",
347
+ "type": "CLIP",
348
+ "link": 93
349
+ }
350
+ ],
351
+ "outputs": [
352
+ {
353
+ "name": "CONDITIONING",
354
+ "type": "CONDITIONING",
355
+ "slot_index": 0,
356
+ "links": [
357
+ 18
358
+ ]
359
+ }
360
+ ],
361
+ "title": "CLIP Text Encode (Positive Prompt)",
362
+ "properties": {
363
+ "Node name for S&R": "CLIPTextEncode"
364
+ },
365
+ "widgets_values": [
366
+ "赤身裸体的亚洲女子,主观视角,她抖动着胸部,乳房剧烈晃动。bustygrid. a completely naked asian woman with pale skin and huge breasts. she has straight dark hair. she is completely naked, and wearing pointed stilleto heels. bare legs, bare calf, sky-high heeled pumps. remove all clothes. "
367
+ ],
368
+ "color": "#232",
369
+ "bgcolor": "#353"
370
+ },
371
+ {
372
+ "id": 2,
373
+ "type": "CLIPTextEncode",
374
+ "pos": [
375
+ 453.97589111328125,
376
+ 487.16363525390625
377
+ ],
378
+ "size": [
379
+ 486.9105529785156,
380
+ 107.89899444580078
381
+ ],
382
+ "flags": {
383
+ "collapsed": false
384
+ },
385
+ "order": 7,
386
+ "mode": 0,
387
+ "inputs": [
388
+ {
389
+ "name": "clip",
390
+ "type": "CLIP",
391
+ "link": 92
392
+ }
393
+ ],
394
+ "outputs": [
395
+ {
396
+ "name": "CONDITIONING",
397
+ "type": "CONDITIONING",
398
+ "slot_index": 0,
399
+ "links": [
400
+ 19
401
+ ]
402
+ }
403
+ ],
404
+ "title": "CLIP Text Encode (Negative Prompt)",
405
+ "properties": {
406
+ "Node name for S&R": "CLIPTextEncode"
407
+ },
408
+ "widgets_values": [
409
+ "白种人,黑种人,阴部遮挡,内裤,六根手指,低像素,模糊,像素点,多余的手臂,肢体扭曲,手指模糊,脸部改变,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
410
+ ],
411
+ "color": "#322",
412
+ "bgcolor": "#533"
413
+ },
414
+ {
415
+ "id": 3,
416
+ "type": "VAELoader",
417
+ "pos": [
418
+ 433.6892395019531,
419
+ 643.1557006835938
420
+ ],
421
+ "size": [
422
+ 210,
423
+ 58
424
+ ],
425
+ "flags": {
426
+ "collapsed": false
427
+ },
428
+ "order": 3,
429
+ "mode": 0,
430
+ "inputs": [],
431
+ "outputs": [
432
+ {
433
+ "name": "VAE",
434
+ "type": "VAE",
435
+ "links": [
436
+ 16,
437
+ 20
438
+ ]
439
+ }
440
+ ],
441
+ "properties": {
442
+ "Node name for S&R": "VAELoader"
443
+ },
444
+ "widgets_values": [
445
+ "Wan2.1_VAE.safetensors"
446
+ ]
447
+ },
448
+ {
449
+ "id": 17,
450
+ "type": "WanVaceToVideo",
451
+ "pos": [
452
+ 706.262939453125,
453
+ 658.4074096679688
454
+ ],
455
+ "size": [
456
+ 224.32986450195312,
457
+ 254
458
+ ],
459
+ "flags": {},
460
+ "order": 11,
461
+ "mode": 0,
462
+ "inputs": [
463
+ {
464
+ "name": "positive",
465
+ "type": "CONDITIONING",
466
+ "link": 18
467
+ },
468
+ {
469
+ "name": "negative",
470
+ "type": "CONDITIONING",
471
+ "link": 19
472
+ },
473
+ {
474
+ "name": "vae",
475
+ "type": "VAE",
476
+ "link": 20
477
+ },
478
+ {
479
+ "name": "control_video",
480
+ "shape": 7,
481
+ "type": "IMAGE",
482
+ "link": 86
483
+ },
484
+ {
485
+ "name": "control_masks",
486
+ "shape": 7,
487
+ "type": "MASK",
488
+ "link": 80
489
+ },
490
+ {
491
+ "name": "reference_image",
492
+ "shape": 7,
493
+ "type": "IMAGE",
494
+ "link": 22
495
+ },
496
+ {
497
+ "name": "length",
498
+ "type": "INT",
499
+ "widget": {
500
+ "name": "length"
501
+ },
502
+ "link": 78
503
+ }
504
+ ],
505
+ "outputs": [
506
+ {
507
+ "name": "positive",
508
+ "type": "CONDITIONING",
509
+ "links": [
510
+ 12
511
+ ]
512
+ },
513
+ {
514
+ "name": "negative",
515
+ "type": "CONDITIONING",
516
+ "links": [
517
+ 13
518
+ ]
519
+ },
520
+ {
521
+ "name": "latent",
522
+ "type": "LATENT",
523
+ "links": [
524
+ 14
525
+ ]
526
+ },
527
+ {
528
+ "name": "trim_latent",
529
+ "type": "INT",
530
+ "links": [
531
+ 10
532
+ ]
533
+ }
534
+ ],
535
+ "properties": {
536
+ "Node name for S&R": "WanVaceToVideo"
537
+ },
538
+ "widgets_values": [
539
+ 480,
540
+ 320,
541
+ 49,
542
+ 1,
543
+ 1.0000000000000002
544
+ ]
545
+ },
546
+ {
547
+ "id": 12,
548
+ "type": "TrimVideoLatent",
549
+ "pos": [
550
+ 746.625,
551
+ 985.3895874023438
552
+ ],
553
+ "size": [
554
+ 226.2460174560547,
555
+ 58
556
+ ],
557
+ "flags": {
558
+ "collapsed": false
559
+ },
560
+ "order": 13,
561
+ "mode": 0,
562
+ "inputs": [
563
+ {
564
+ "name": "samples",
565
+ "type": "LATENT",
566
+ "link": 9
567
+ },
568
+ {
569
+ "name": "trim_amount",
570
+ "type": "INT",
571
+ "widget": {
572
+ "name": "trim_amount"
573
+ },
574
+ "link": 10
575
+ }
576
+ ],
577
+ "outputs": [
578
+ {
579
+ "name": "LATENT",
580
+ "type": "LATENT",
581
+ "links": [
582
+ 15
583
+ ]
584
+ }
585
+ ],
586
+ "properties": {
587
+ "Node name for S&R": "TrimVideoLatent"
588
+ },
589
+ "widgets_values": [
590
+ 0
591
+ ]
592
+ },
593
+ {
594
+ "id": 13,
595
+ "type": "KSampler",
596
+ "pos": [
597
+ 985.894775390625,
598
+ 349.17340087890625
599
+ ],
600
+ "size": [
601
+ 210,
602
+ 605.3333129882812
603
+ ],
604
+ "flags": {},
605
+ "order": 12,
606
+ "mode": 0,
607
+ "inputs": [
608
+ {
609
+ "name": "model",
610
+ "type": "MODEL",
611
+ "link": 58
612
+ },
613
+ {
614
+ "name": "positive",
615
+ "type": "CONDITIONING",
616
+ "link": 12
617
+ },
618
+ {
619
+ "name": "negative",
620
+ "type": "CONDITIONING",
621
+ "link": 13
622
+ },
623
+ {
624
+ "name": "latent_image",
625
+ "type": "LATENT",
626
+ "link": 14
627
+ }
628
+ ],
629
+ "outputs": [
630
+ {
631
+ "name": "LATENT",
632
+ "type": "LATENT",
633
+ "slot_index": 0,
634
+ "links": [
635
+ 9
636
+ ]
637
+ }
638
+ ],
639
+ "properties": {
640
+ "Node name for S&R": "KSampler"
641
+ },
642
+ "widgets_values": [
643
+ 109768395777514,
644
+ "randomize",
645
+ 10,
646
+ 1,
647
+ "uni_pc_bh2",
648
+ "simple",
649
+ 1
650
+ ]
651
+ },
652
+ {
653
+ "id": 14,
654
+ "type": "VAEDecode",
655
+ "pos": [
656
+ 973.5802612304688,
657
+ 1001.729736328125
658
+ ],
659
+ "size": [
660
+ 208.16270446777344,
661
+ 46
662
+ ],
663
+ "flags": {
664
+ "collapsed": false
665
+ },
666
+ "order": 14,
667
+ "mode": 0,
668
+ "inputs": [
669
+ {
670
+ "name": "samples",
671
+ "type": "LATENT",
672
+ "link": 15
673
+ },
674
+ {
675
+ "name": "vae",
676
+ "type": "VAE",
677
+ "link": 16
678
+ }
679
+ ],
680
+ "outputs": [
681
+ {
682
+ "name": "IMAGE",
683
+ "type": "IMAGE",
684
+ "slot_index": 0,
685
+ "links": [
686
+ 3
687
+ ]
688
+ }
689
+ ],
690
+ "properties": {
691
+ "Node name for S&R": "VAEDecode"
692
+ },
693
+ "widgets_values": []
694
+ },
695
+ {
696
+ "id": 4,
697
+ "type": "VHS_VideoCombine",
698
+ "pos": [
699
+ 1219.9688720703125,
700
+ 358.5111389160156
701
+ ],
702
+ "size": [
703
+ 239.620361328125,
704
+ 310
705
+ ],
706
+ "flags": {},
707
+ "order": 15,
708
+ "mode": 0,
709
+ "inputs": [
710
+ {
711
+ "name": "images",
712
+ "type": "IMAGE",
713
+ "link": 3
714
+ },
715
+ {
716
+ "name": "audio",
717
+ "shape": 7,
718
+ "type": "AUDIO",
719
+ "link": null
720
+ },
721
+ {
722
+ "name": "meta_batch",
723
+ "shape": 7,
724
+ "type": "VHS_BatchManager",
725
+ "link": null
726
+ },
727
+ {
728
+ "name": "vae",
729
+ "shape": 7,
730
+ "type": "VAE",
731
+ "link": null
732
+ }
733
+ ],
734
+ "outputs": [
735
+ {
736
+ "name": "Filenames",
737
+ "type": "VHS_FILENAMES",
738
+ "links": null
739
+ }
740
+ ],
741
+ "properties": {
742
+ "Node name for S&R": "VHS_VideoCombine"
743
+ },
744
+ "widgets_values": {
745
+ "frame_rate": 16,
746
+ "loop_count": 0,
747
+ "filename_prefix": "wan2.1",
748
+ "format": "video/h265-mp4",
749
+ "pix_fmt": "yuv420p10le",
750
+ "crf": 5,
751
+ "save_metadata": true,
752
+ "pingpong": false,
753
+ "save_output": true,
754
+ "videopreview": {
755
+ "hidden": false,
756
+ "paused": false,
757
+ "params": {
758
+ "filename": "wan2.1_00518.mp4",
759
+ "subfolder": "",
760
+ "type": "output",
761
+ "format": "video/h265-mp4",
762
+ "frame_rate": 16,
763
+ "workflow": "wan2.1_00518.png",
764
+ "fullpath": "E:\\comfyui3\\ComfyUI\\output\\wan2.1_00518.mp4"
765
+ }
766
+ }
767
+ }
768
+ },
769
+ {
770
+ "id": 25,
771
+ "type": "ImageToMask",
772
+ "pos": [
773
+ 403.78155517578125,
774
+ 1100.6531982421875
775
+ ],
776
+ "size": [
777
+ 270,
778
+ 58
779
+ ],
780
+ "flags": {},
781
+ "order": 5,
782
+ "mode": 0,
783
+ "inputs": [
784
+ {
785
+ "name": "image",
786
+ "type": "IMAGE",
787
+ "link": 85
788
+ }
789
+ ],
790
+ "outputs": [
791
+ {
792
+ "name": "MASK",
793
+ "type": "MASK",
794
+ "links": [
795
+ 79
796
+ ]
797
+ }
798
+ ],
799
+ "properties": {
800
+ "Node name for S&R": "ImageToMask"
801
+ },
802
+ "widgets_values": [
803
+ "red"
804
+ ]
805
+ },
806
+ {
807
+ "id": 5,
808
+ "type": "LoadImage",
809
+ "pos": [
810
+ -272.46954345703125,
811
+ 357.37689208984375
812
+ ],
813
+ "size": [
814
+ 335.15673828125,
815
+ 709.6021728515625
816
+ ],
817
+ "flags": {},
818
+ "order": 4,
819
+ "mode": 0,
820
+ "inputs": [],
821
+ "outputs": [
822
+ {
823
+ "name": "IMAGE",
824
+ "type": "IMAGE",
825
+ "links": [
826
+ 22
827
+ ]
828
+ },
829
+ {
830
+ "name": "MASK",
831
+ "type": "MASK",
832
+ "links": null
833
+ }
834
+ ],
835
+ "properties": {
836
+ "Node name for S&R": "LoadImage"
837
+ },
838
+ "widgets_values": [
839
+ "ComfUI_287879_.png",
840
+ "image"
841
+ ]
842
+ }
843
+ ],
844
+ "links": [
845
+ [
846
+ 3,
847
+ 14,
848
+ 0,
849
+ 4,
850
+ 0,
851
+ "IMAGE"
852
+ ],
853
+ [
854
+ 9,
855
+ 13,
856
+ 0,
857
+ 12,
858
+ 0,
859
+ "LATENT"
860
+ ],
861
+ [
862
+ 10,
863
+ 17,
864
+ 3,
865
+ 12,
866
+ 1,
867
+ "INT"
868
+ ],
869
+ [
870
+ 12,
871
+ 17,
872
+ 0,
873
+ 13,
874
+ 1,
875
+ "CONDITIONING"
876
+ ],
877
+ [
878
+ 13,
879
+ 17,
880
+ 1,
881
+ 13,
882
+ 2,
883
+ "CONDITIONING"
884
+ ],
885
+ [
886
+ 14,
887
+ 17,
888
+ 2,
889
+ 13,
890
+ 3,
891
+ "LATENT"
892
+ ],
893
+ [
894
+ 15,
895
+ 12,
896
+ 0,
897
+ 14,
898
+ 0,
899
+ "LATENT"
900
+ ],
901
+ [
902
+ 16,
903
+ 3,
904
+ 0,
905
+ 14,
906
+ 1,
907
+ "VAE"
908
+ ],
909
+ [
910
+ 18,
911
+ 15,
912
+ 0,
913
+ 17,
914
+ 0,
915
+ "CONDITIONING"
916
+ ],
917
+ [
918
+ 19,
919
+ 2,
920
+ 0,
921
+ 17,
922
+ 1,
923
+ "CONDITIONING"
924
+ ],
925
+ [
926
+ 20,
927
+ 3,
928
+ 0,
929
+ 17,
930
+ 2,
931
+ "VAE"
932
+ ],
933
+ [
934
+ 22,
935
+ 5,
936
+ 0,
937
+ 17,
938
+ 5,
939
+ "IMAGE"
940
+ ],
941
+ [
942
+ 58,
943
+ 11,
944
+ 0,
945
+ 13,
946
+ 0,
947
+ "MODEL"
948
+ ],
949
+ [
950
+ 76,
951
+ 33,
952
+ 1,
953
+ 32,
954
+ 2,
955
+ "INT"
956
+ ],
957
+ [
958
+ 78,
959
+ 32,
960
+ 1,
961
+ 17,
962
+ 6,
963
+ "INT"
964
+ ],
965
+ [
966
+ 79,
967
+ 25,
968
+ 0,
969
+ 35,
970
+ 0,
971
+ "MASK"
972
+ ],
973
+ [
974
+ 80,
975
+ 35,
976
+ 0,
977
+ 17,
978
+ 4,
979
+ "MASK"
980
+ ],
981
+ [
982
+ 85,
983
+ 33,
984
+ 0,
985
+ 25,
986
+ 0,
987
+ "IMAGE"
988
+ ],
989
+ [
990
+ 86,
991
+ 32,
992
+ 0,
993
+ 17,
994
+ 3,
995
+ "IMAGE"
996
+ ],
997
+ [
998
+ 91,
999
+ 8,
1000
+ 0,
1001
+ 11,
1002
+ 0,
1003
+ "MODEL"
1004
+ ],
1005
+ [
1006
+ 92,
1007
+ 6,
1008
+ 0,
1009
+ 2,
1010
+ 0,
1011
+ "CLIP"
1012
+ ],
1013
+ [
1014
+ 93,
1015
+ 6,
1016
+ 0,
1017
+ 15,
1018
+ 0,
1019
+ "CLIP"
1020
+ ]
1021
+ ],
1022
+ "groups": [],
1023
+ "config": {},
1024
+ "extra": {
1025
+ "ds": {
1026
+ "scale": 1.0152559799477145,
1027
+ "offset": [
1028
+ 564.1931902142793,
1029
+ -170.45932466624348
1030
+ ]
1031
+ },
1032
+ "frontendVersion": "1.25.11",
1033
+ "node_versions": {
1034
+ "comfy-core": "0.3.56",
1035
+ "ComfyUI-VideoHelperSuite": "972c87da577b47211c4e9aeed30dc38c7bae607f"
1036
+ },
1037
+ "VHS_latentpreview": true,
1038
+ "VHS_latentpreviewrate": 0,
1039
+ "VHS_MetadataImage": true,
1040
+ "VHS_KeepIntermediate": true
1041
+ },
1042
+ "version": 0.4
1043
+ }