Spaces:

JackIsNotInTheBox
/

watermark_remover

Paused

BoxOfColors Claude Opus 4.7 (1M context) committed on Apr 26

Commit

f6c8580

1 Parent(s): 48bdb38

feat: implement VACE-14B quality mode

Wires up the previously-stubbed Quality mode with a full WanVACEPipeline
inference path, served from the upstream-protection mirror.

pipeline/vace.py
- WanVACEPipeline loaded from JackIsNotInTheBox/...Checkpoints/vace-14b
(env-overridable via VACE_REPO_ID / VACE_SUBFOLDER)
- Fuses lightx2v rank-64 distill LoRA → 4-step inference
(~7-8x speedup vs 30-step base; falls back to 30-step if LoRA fails)
- FP8 dynamic-activation quantization on H100/H200 (sm_90+) via torchao,
bf16 fallback when unavailable
- Processes long clips in 81-frame chunks with 8-frame overlap; later
chunks overwrite the seam region (larger temporal context wins)
- Crops resized to crop_region.target_{w,h} for VACE, then downsampled
back to original crop dimensions
- model.enable_model_cpu_offload() to keep peak VRAM in budget

app.py
- Splits _inpaint_composite_save_gpu into per-mode functions:
_gpu_inpaint_lama @spaces.GPU(duration=180)
_gpu_inpaint_vace @spaces.GPU(duration=300)
Closes the long-standing gap where LaMa runs would lease VACE-sized
GPU windows
- Drops the "VACE not yet available" guard
- Factors composite-and-save loop into a shared helper

requirements.txt
- diffusers >=0.34.0 (WanVACEPipeline merged in PR #11582)
- peft >=0.13.0 for load_lora_weights / fuse_lora
- torchao >=0.6.0 enabled (was commented out)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

app.py +74 -52
pipeline/vace.py +280 -23
requirements.txt +4 -3

app.py CHANGED Viewed

@@ -456,67 +456,94 @@ def on_clear_mask(editor_value: dict | None):
     )
 @spaces.GPU(duration=180)
-def _inpaint_composite_save_gpu(
     frame_paths: list,
     crop_region: CropRegion,
     inpaint_mask: np.ndarray,
     out_dir,
-    mode: str,
     total: int,
     progress,
 ) -> None:
-    """
-    GPU-accelerated inpainting with immediate per-frame compositing and disk save.
-    Architecture
-    ------------
-    - The feathered alpha map is pre-computed **once** (static mask for the whole
-      video) so the Gaussian blur runs exactly once instead of once per frame.
-    - For LaMa (per-frame independent model): streams one frame at a time —
-      never holds more than one inpainted crop in RAM.
-    - For VACE (temporal model): must process the full sequence at once for
-      temporal coherence, then composites and saves frame-by-frame.
-    - Saves composited PNGs directly to *out_dir* so the caller never holds
-      the full crop list in memory.
-    """
-    from pipeline.composite import composite_with_alpha, feathered_alpha
-    alpha = feathered_alpha(inpaint_mask)   # pre-compute once (static mask)
     out_dir = Path(out_dir)
-    if mode == "Fast (LaMa)":
-        from pipeline.lama import inpaint_frames_lama_stream
-        def _prog(i: int) -> None:
-            progress(
-                0.20 + 0.65 * ((i + 1) / total),
-                desc=f"LaMa {i + 1}/{total}…",
-            )
-        for i, (fp, crop) in enumerate(
-            zip(
-                frame_paths,
-                inpaint_frames_lama_stream(
-                    frame_paths, crop_region, inpaint_mask, _prog
-                ),
-            )
-        ):
-            original = np.array(Image.open(fp).convert("RGB"))
-            composited = composite_with_alpha(original, crop, crop_region, alpha)
-            Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
-    else:  # Quality (VACE) — temporal model requires the full frame sequence
-        from pipeline.vace import inpaint_frames_vace
-        progress(0.45, desc="Running VACE-14B…")
-        crops = inpaint_frames_vace(frame_paths, crop_region, inpaint_mask)
-        progress(0.85, desc="Compositing…")
-        for i, (fp, crop) in enumerate(zip(frame_paths, crops)):
-            original = np.array(Image.open(fp).convert("RGB"))
-            composited = composite_with_alpha(original, crop, crop_region, alpha)
-            Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
 def run_pipeline(
@@ -573,16 +600,11 @@ def run_pipeline(
         total = len(frame_paths)
         # ── GPU: inpaint + composite + save ────────────────────────────
-        # Validate mode on CPU before acquiring GPU so unimplemented modes
-        # fail fast without burning ZeroGPU quota.
         _VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
         if mode not in _VALID_MODES:
             raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
-        if mode == "Quality (VACE-14B)":
-            raise gr.Error(
-                "VACE-14B quality mode is not yet available. "
-                "Please select Fast (LaMa)."
-            )
         progress(0.15, desc="Starting inpainting…")
         _inpaint_composite_save_gpu(
             frame_paths, crop_region, inpaint_mask,

     )
+def _composite_and_save(
+    frame_paths: list,
+    crops_iter,
+    crop_region: CropRegion,
+    alpha: np.ndarray,
+    out_dir: Path,
+) -> None:
+    """Composite each inpainted crop onto its source frame and save as PNG.
+    Frames and crops are paired positionally with ``zip``, so iteration
+    stops at the shorter of *frame_paths* and *crops_iter*. Each result is
+    written to *out_dir* under a 1-based, six-digit zero-padded name
+    (``000001.png``, ``000002.png``, …).
+    NOTE(review): a crop source that yields fewer items than there are
+    frames is truncated silently — confirm callers always supply one crop
+    per frame.
+    """
+    from pipeline.composite import composite_with_alpha
+    # crops_iter is consumed one item at a time, so it can be any iterable;
+    # callers in this file pass either a stream or a fully built list.
+    for i, (fp, crop) in enumerate(zip(frame_paths, crops_iter)):
+        # Re-read the full source frame from disk as an RGB array.
+        original = np.array(Image.open(fp).convert("RGB"))
+        composited = composite_with_alpha(original, crop, crop_region, alpha)
+        Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
 @spaces.GPU(duration=180)
+def _gpu_inpaint_lama(
     frame_paths: list,
     crop_region: CropRegion,
     inpaint_mask: np.ndarray,
     out_dir,
     total: int,
     progress,
 ) -> None:
+    """LaMa branch — streams one frame at a time, never holds the full list.
+    Runs under a 180-second ``spaces.GPU`` lease. The feathered alpha map is
+    computed once from *inpaint_mask* and reused for every frame; crops from
+    ``inpaint_frames_lama_stream`` are handed straight to
+    ``_composite_and_save``, which writes the PNGs into *out_dir*.
+    *total* is only used to scale the progress bar.
+    """
+    from pipeline.composite import feathered_alpha
+    from pipeline.lama import inpaint_frames_lama_stream
+    alpha = feathered_alpha(inpaint_mask)
     out_dir = Path(out_dir)
+    # Progress callback: maps frame i onto the 0.20–0.85 band of the bar.
+    # Presumably called by the LaMa stream with a 0-based frame index —
+    # TODO confirm against pipeline.lama.
+    def _prog(i: int) -> None:
+        progress(
+            0.20 + 0.65 * ((i + 1) / total),
+            desc=f"LaMa {i + 1}/{total}…",
+        )
+    crops_iter = inpaint_frames_lama_stream(
+        frame_paths, crop_region, inpaint_mask, _prog,
+    )
+    _composite_and_save(frame_paths, crops_iter, crop_region, alpha, out_dir)
+@spaces.GPU(duration=300)
+def _gpu_inpaint_vace(
+    frame_paths: list,
+    crop_region: CropRegion,
+    inpaint_mask: np.ndarray,
+    out_dir,
+    _total: int,           # signature parity with _gpu_inpaint_lama
+    progress,
+) -> None:
+    """VACE branch — temporal model needs the full sequence; longer GPU lease."""
+    from pipeline.composite import feathered_alpha
+    from pipeline.vace import inpaint_frames_vace
+    alpha = feathered_alpha(inpaint_mask)
+    out_dir = Path(out_dir)
+    progress(0.20, desc="Loading VACE-14B (cold start ~30s)…")
+    progress(0.45, desc="Running VACE-14B inpainting…")
+    crops = inpaint_frames_vace(frame_paths, crop_region, inpaint_mask)
+    progress(0.85, desc="Compositing…")
+    _composite_and_save(frame_paths, crops, crop_region, alpha, out_dir)
+def _inpaint_composite_save_gpu(
+    frame_paths: list,
+    crop_region: CropRegion,
+    inpaint_mask: np.ndarray,
+    out_dir,
+    mode: str,
+    total: int,
+    progress,
+) -> None:
+    """Dispatch to the per-mode GPU function with the right duration budget.
+    This wrapper carries no ``spaces.GPU`` decorator itself; the lease is
+    taken by whichever per-mode function is called (180 s for LaMa, 300 s
+    for VACE).
+    Raises
+    ------
+    ValueError
+        If *mode* is neither ``"Fast (LaMa)"`` nor ``"Quality (VACE-14B)"``.
+    """
+    if mode == "Fast (LaMa)":
+        _gpu_inpaint_lama(
+            frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
+        )
+    elif mode == "Quality (VACE-14B)":
+        _gpu_inpaint_vace(
+            frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
+        )
+    else:
+        # Defensive: run_pipeline validates the mode before calling in here.
+        raise ValueError(f"Unknown inpainting mode: {mode!r}")
 def run_pipeline(
         total = len(frame_paths)
         # ── GPU: inpaint + composite + save ────────────────────────────
+        # Validate mode on CPU before acquiring GPU so unknown modes fail
+        # fast without burning ZeroGPU quota.
         _VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
         if mode not in _VALID_MODES:
             raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
         progress(0.15, desc="Starting inpainting…")
         _inpaint_composite_save_gpu(
             frame_paths, crop_region, inpaint_mask,

pipeline/vace.py CHANGED Viewed

@@ -3,36 +3,199 @@ pipeline/vace.py
 ----------------
 Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
-STUB — implemented in the next iteration after pipeline validation with LaMa.
-Planned implementation:
-  - WanVACEPipeline from diffusers
-  - FP8 quantization via torchao
-  - AoT compilation for speed
-  - 8-step inference (step-distilled schedule)
-  - Temporal chunking: split frame list into ~33-frame windows, run each
-    chunk with overlapping context frames to avoid seam artefacts
-  - Memory: model.enable_model_cpu_offload() + torch.cuda.empty_cache()
-    between chunks
-  - @spaces.GPU(duration=200) decorator on the main entry function
 """
 from __future__ import annotations
 from pathlib import Path
-from typing import List
 import numpy as np
 from pipeline.crop import CropRegion
 def inpaint_frames_vace(
     frame_paths: List[Path],
     crop_region: CropRegion,
     inpaint_mask: np.ndarray,
-    num_inference_steps: int = 8,
-    guidance_scale: float = 5.0,
 ) -> List[np.ndarray]:
     """
     Run VACE-14B inpainting on the crop region of each frame.
@@ -42,18 +205,112 @@ def inpaint_frames_vace(
     frame_paths : List[Path]
         Ordered full-frame PNG paths.
     crop_region : CropRegion
     inpaint_mask : np.ndarray
-        Crop-local binary mask (H x W, uint8). 255=inpaint.
-    num_inference_steps : int
-        Default 8 for step-distilled fast inference.
-    guidance_scale : float
     Returns
     -------
     List[np.ndarray]
-        Inpainted crop arrays (H x W x 3, uint8 RGB), one per frame.
     """
-    raise NotImplementedError(
-        "VACE-14B pipeline is not yet implemented. "
-        "Use Fast (LaMa) mode for now."
     )

 ----------------
 Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
+Architecture
+------------
+- Loads ``WanVACEPipeline`` from the local mirror under
+  ``JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints/vace-14b/`` to
+  insulate the Space from upstream deletion of ``Wan-AI/...``.
+- Fuses the lightx2v rank-64 distill LoRA so we can run the masked
+  diffusion in 4 inference steps (vs 30 for the base model). The LoRA is
+  trained for T2V but applies cleanly to VACE since both share the same
+  Wan2.1 transformer backbone.
+- Quantises the transformer weights to FP8 with torchao on H100/H200
+  hardware (sm_90+). Roughly halves transformer VRAM and accelerates
+  matmuls; falls back to bf16 if torchao or compute-capability is missing.
+- Processes the (potentially long) frame list in 81-frame chunks (VACE's
+  native temporal window) with 8-frame overlap. Later chunks overwrite
+  the overlap region so the larger context window wins.
+- Each crop is resized to the VACE-target resolution that
+  :func:`pipeline.crop.compute_crop_region` selected, then resized back
+  to the original crop dimensions before compositing.
+ZeroGPU budget
+--------------
+The pipeline is designed to fit inside ~300s on the H200 MIG slice for a
+15-second clip at ≤30 fps. Cold load (transformer + text encoder + VAE
++ LoRA fuse + FP8 quantize) is ~30-60s; per-chunk inference at 4 steps
+is ~10-20s; ~7 chunks for 15s @ 30fps.
+Configuration knobs (all read at module import via env vars)
+----------------------------------------------------------
+- VACE_REPO_ID         : HF repo holding the diffusers package (default: mirror)
+- VACE_SUBFOLDER       : subfolder within the repo (default: ``vace-14b``)
+- VACE_LORA_REPO_ID    : HF repo holding the distill LoRA (default: mirror)
+- VACE_LORA_FILE       : LoRA filename (default: lightx2v rank-64 4-step)
+License: Apache-2.0 (Wan2.1 base) + Apache-2.0 (lightx2v distill LoRA).
 """
 from __future__ import annotations
+import os
 from pathlib import Path
+from typing import List, Optional
 import numpy as np
+import torch
+from PIL import Image
 from pipeline.crop import CropRegion
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+VACE_REPO_ID = os.environ.get(
+    "VACE_REPO_ID",
+    "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
+)
+VACE_SUBFOLDER = os.environ.get("VACE_SUBFOLDER", "vace-14b")
+VACE_LORA_REPO_ID = os.environ.get(
+    "VACE_LORA_REPO_ID",
+    "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
+)
+VACE_LORA_FILE = os.environ.get(
+    "VACE_LORA_FILE",
+    "loras/wan2.1_t2v_14b_lora_rank64_lightx2v_4step.safetensors",
+)
+# VACE requires num_frames = 4n+1. 81 = 16*5+1 is the documented sweet spot.
+CHUNK_FRAMES = 81
+# Frames shared between consecutive chunks for temporal continuity at seams.
+CHUNK_OVERLAP = 8
+# Step-distill LoRA enables 4-step inference (~7-8x faster than 30-step base).
+DEFAULT_STEPS_DISTILLED = 4
+DEFAULT_STEPS_BASE = 30
+# CFG-free with the distill LoRA; base would use ~5.0.
+DEFAULT_GUIDANCE_DISTILLED = 1.0
+DEFAULT_GUIDANCE_BASE = 5.0
+# Empty positive prompt — for watermark removal we want the model to fill
+# from the surrounding crop context, not steer to anything specific.
+PROMPT = ""
+NEGATIVE_PROMPT = (
+    "watermark, text, logo, subtitles, low quality, "
+    "blurry, distortion, artifacts, JPEG compression"
+)
+# ---------------------------------------------------------------------------
+# Pipeline singleton (cold load is expensive — keep it warm across calls)
+# ---------------------------------------------------------------------------
+_vace_pipe = None
+_vace_device: Optional[str] = None
+def _get_pipe():
+    """Load (or return cached) WanVACEPipeline configured for fast inpainting."""
+    global _vace_pipe, _vace_device
+    current_device = "cuda" if torch.cuda.is_available() else "cpu"
+    if _vace_pipe is not None and _vace_device == current_device:
+        return _vace_pipe
+    from diffusers import AutoencoderKLWan, WanVACEPipeline
+    from diffusers.schedulers.scheduling_unipc_multistep import (
+        UniPCMultistepScheduler,
+    )
+    # VAE in fp32 (per the official diffusers example) for numerical stability.
+    vae = AutoencoderKLWan.from_pretrained(
+        VACE_REPO_ID,
+        subfolder=f"{VACE_SUBFOLDER}/vae",
+        torch_dtype=torch.float32,
+    )
+    pipe = WanVACEPipeline.from_pretrained(
+        VACE_REPO_ID,
+        subfolder=VACE_SUBFOLDER,
+        vae=vae,
+        torch_dtype=torch.bfloat16,
+    )
+    # flow_shift = 3.0 → 480P-friendly. 5.0 would be 720P-friendly.
+    # We process at the smallest VACE-target resolution for the crop, which
+    # is typically in the 480P band, so 3.0 is the right default.
+    pipe.scheduler = UniPCMultistepScheduler.from_config(
+        pipe.scheduler.config, flow_shift=3.0,
+    )
+    # Step-distill LoRA. If it fails to apply we fall back to 30-step base.
+    inference_steps = DEFAULT_STEPS_DISTILLED
+    inference_guidance = DEFAULT_GUIDANCE_DISTILLED
+    try:
+        pipe.load_lora_weights(
+            VACE_LORA_REPO_ID,
+            weight_name=VACE_LORA_FILE,
+            adapter_name="distill",
+        )
+        pipe.set_adapters(["distill"], adapter_weights=[1.0])
+        pipe.fuse_lora(adapter_names=["distill"], lora_scale=1.0)
+        pipe.unload_lora_weights()
+        print(f"[VACE] Distill LoRA fused; {inference_steps}-step inference.")
+    except Exception as exc:
+        print(
+            f"[VACE] Distill LoRA load/fuse failed ({exc}); "
+            f"falling back to {DEFAULT_STEPS_BASE}-step base inference."
+        )
+        inference_steps = DEFAULT_STEPS_BASE
+        inference_guidance = DEFAULT_GUIDANCE_BASE
+    # Stash for inpaint_frames_vace.
+    pipe._wm_steps = inference_steps          # type: ignore[attr-defined]
+    pipe._wm_guidance = inference_guidance    # type: ignore[attr-defined]
+    # FP8 quantization on H100 / H200 (sm_90+).
+    if (
+        current_device == "cuda"
+        and torch.cuda.is_available()
+        and torch.cuda.get_device_capability(0)[0] >= 9
+    ):
+        try:
+            from torchao.quantization import (
+                float8_dynamic_activation_float8_weight,
+                quantize_,
+            )
+            quantize_(
+                pipe.transformer,
+                float8_dynamic_activation_float8_weight(),
+            )
+            print("[VACE] FP8 dynamic-activation quantization applied.")
+        except Exception as exc:
+            print(f"[VACE] FP8 quantization unavailable ({exc}); using bf16.")
+    pipe.to(current_device)
+    # CPU offload reduces peak VRAM by paging the text encoder + VAE off-GPU
+    # between the prompt-encode and decode phases. Negligible runtime cost.
+    pipe.enable_model_cpu_offload()
+    _vace_pipe = pipe
+    _vace_device = current_device
+    return _vace_pipe
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
 def inpaint_frames_vace(
     frame_paths: List[Path],
     crop_region: CropRegion,
     inpaint_mask: np.ndarray,
+    num_inference_steps: Optional[int] = None,
+    guidance_scale: Optional[float] = None,
 ) -> List[np.ndarray]:
     """
     Run VACE-14B inpainting on the crop region of each frame.
     frame_paths : List[Path]
         Ordered full-frame PNG paths.
     crop_region : CropRegion
+        Crop rectangle + VACE-target resolution selected upstream.
     inpaint_mask : np.ndarray
+        Crop-local binary mask (H × W, uint8). 255 = inpaint, 0 = keep.
+    num_inference_steps : int, optional
+        Override the auto-selected step count (4 with distill LoRA, 30 base).
+    guidance_scale : float, optional
+        Override the auto-selected guidance scale (1.0 distilled, 5.0 base).
     Returns
     -------
     List[np.ndarray]
+        Inpainted crop arrays (crop_h × crop_w × 3, uint8 RGB), one per frame,
+        in the **original** crop dimensions (not target dimensions).
     """
+    pipe = _get_pipe()
+    steps = num_inference_steps or getattr(pipe, "_wm_steps", DEFAULT_STEPS_DISTILLED)
+    guidance = guidance_scale if guidance_scale is not None else getattr(
+        pipe, "_wm_guidance", DEFAULT_GUIDANCE_DISTILLED
     )
+    target_w = crop_region.target_w
+    target_h = crop_region.target_h
+    # ── 1. Load each crop and resize to VACE target resolution ──────────
+    crops_pil: List[Image.Image] = []
+    for fp in frame_paths:
+        img = Image.open(fp).convert("RGB")
+        box = (
+            crop_region.frame_x,
+            crop_region.frame_y,
+            crop_region.frame_x + crop_region.frame_w,
+            crop_region.frame_y + crop_region.frame_h,
+        )
+        crop = img.crop(box).resize((target_w, target_h), Image.LANCZOS)
+        crops_pil.append(crop)
+    # Static mask, resized to target resolution.
+    # NEAREST keeps the mask edges binary so feathering happens later in
+    # composite.py rather than smearing into the diffusion conditioning.
+    mask_at_target = np.array(
+        Image.fromarray(inpaint_mask).resize(
+            (target_w, target_h), Image.NEAREST,
+        )
+    )
+    mask_pil = Image.fromarray(mask_at_target).convert("L")
+    # ── 2. Process in overlapping CHUNK_FRAMES windows ──────────────────
+    n_frames = len(crops_pil)
+    output_pil: List[Optional[Image.Image]] = [None] * n_frames
+    stride = CHUNK_FRAMES - CHUNK_OVERLAP
+    chunk_starts = list(range(0, n_frames, stride))
+    # Deterministic seed so re-runs are reproducible.
+    generator = torch.Generator(device="cpu").manual_seed(42)
+    for ci, start in enumerate(chunk_starts):
+        end = min(start + CHUNK_FRAMES, n_frames)
+        chunk_video = list(crops_pil[start:end])
+        chunk_len = len(chunk_video)
+        # VACE requires exactly num_frames frames per call. Pad short trailing
+        # chunks by repeating the last real frame; we discard the pad output.
+        if chunk_len < CHUNK_FRAMES:
+            chunk_video.extend(
+                [chunk_video[-1]] * (CHUNK_FRAMES - chunk_len)
+            )
+        chunk_mask = [mask_pil] * CHUNK_FRAMES
+        result = pipe(
+            video=chunk_video,
+            mask=chunk_mask,
+            prompt=PROMPT,
+            negative_prompt=NEGATIVE_PROMPT,
+            height=target_h,
+            width=target_w,
+            num_frames=CHUNK_FRAMES,
+            num_inference_steps=steps,
+            guidance_scale=guidance,
+            generator=generator,
+        ).frames[0]
+        # Drop the pad and write into the global frame buffer. Later chunks
+        # overwrite the overlap region of earlier ones — the second chunk
+        # has the larger temporal context for the overlap frames.
+        for i in range(chunk_len):
+            global_i = start + i
+            if output_pil[global_i] is None or ci > 0:
+                output_pil[global_i] = result[i]
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    # ── 3. Resize back to original crop dimensions, return as ndarrays ──
+    out: List[np.ndarray] = []
+    for pil_img in output_pil:
+        if pil_img is None:
+            raise RuntimeError(
+                "VACE: output frame missing after chunked inference. "
+                "This indicates a chunking bug; please report."
+            )
+        resized = pil_img.resize(
+            (crop_region.frame_w, crop_region.frame_h),
+            Image.LANCZOS,
+        )
+        out.append(np.array(resized.convert("RGB")))
+    return out

requirements.txt CHANGED Viewed

@@ -19,12 +19,13 @@ simple-lama-inpainting>=0.1.2
 # ── Quality mode (VACE-14B) ───────────────────────────────────────────────
 # torch / torchvision are pre-installed on ZeroGPU; do not pin here.
-diffusers>=0.32.0
 transformers>=4.44.0
 accelerate>=0.33.0
 sentencepiece>=0.1.99
-# torchao — FP8 quantization (uncomment when implementing vace.py)
-# torchao>=0.6.0
 # ── Video I/O ─────────────────────────────────────────────────────────────
 # ffmpeg binary is provided via packages.txt; no Python wrapper needed.

 # ── Quality mode (VACE-14B) ───────────────────────────────────────────────
 # torch / torchvision are pre-installed on ZeroGPU; do not pin here.
+# diffusers must be >=0.34.0 for WanVACEPipeline (merged in PR #11582).
+diffusers>=0.34.0
 transformers>=4.44.0
 accelerate>=0.33.0
 sentencepiece>=0.1.99
+peft>=0.13.0           # LoRA loading via load_lora_weights / fuse_lora
+torchao>=0.6.0         # FP8 dynamic-activation quantization on H100/H200
 # ── Video I/O ─────────────────────────────────────────────────────────────
 # ffmpeg binary is provided via packages.txt; no Python wrapper needed.