feat: implement VACE-14B quality mode
Browse filesWires up the previously-stubbed Quality mode with a full WanVACEPipeline
inference path, served from the upstream-protection mirror.
pipeline/vace.py
- WanVACEPipeline loaded from JackIsNotInTheBox/...Checkpoints/vace-14b
(env-overridable via VACE_REPO_ID / VACE_SUBFOLDER)
- Fuses lightx2v rank-64 distill LoRA β 4-step inference
(~7-8x speedup vs 30-step base; falls back to 30-step if LoRA fails)
- FP8 dynamic-activation quantization on H100/H200 (sm_90+) via torchao,
bf16 fallback when unavailable
- Processes long clips in 81-frame chunks with 8-frame overlap; later
chunks overwrite the seam region (larger temporal context wins)
- Crops resized to crop_region.target_{w,h} for VACE, then downsampled
back to original crop dimensions
- model.enable_model_cpu_offload() to keep peak VRAM in budget
app.py
- Splits _inpaint_composite_save_gpu into per-mode functions:
_gpu_inpaint_lama @spaces.GPU(duration=180)
_gpu_inpaint_vace @spaces.GPU(duration=300)
Closes the long-standing gap where LaMa runs would lease VACE-sized
GPU windows
- Drops the "VACE not yet available" guard
- Factors composite-and-save loop into a shared helper
requirements.txt
- diffusers >=0.34.0 (WanVACEPipeline merged in PR #11582)
- peft >=0.13.0 for load_lora_weights / fuse_lora
- torchao >=0.6.0 enabled (was commented out)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- app.py +74 -52
- pipeline/vace.py +280 -23
- requirements.txt +4 -3
|
@@ -456,67 +456,94 @@ def on_clear_mask(editor_value: dict | None):
|
|
| 456 |
)
|
| 457 |
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
@spaces.GPU(duration=180)
|
| 460 |
-
def
|
| 461 |
frame_paths: list,
|
| 462 |
crop_region: CropRegion,
|
| 463 |
inpaint_mask: np.ndarray,
|
| 464 |
out_dir,
|
| 465 |
-
mode: str,
|
| 466 |
total: int,
|
| 467 |
progress,
|
| 468 |
) -> None:
|
| 469 |
-
"""
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
Architecture
|
| 473 |
-
------------
|
| 474 |
-
- The feathered alpha map is pre-computed **once** (static mask for the whole
|
| 475 |
-
video) so the Gaussian blur runs exactly once instead of once per frame.
|
| 476 |
-
- For LaMa (per-frame independent model): streams one frame at a time β
|
| 477 |
-
never holds more than one inpainted crop in RAM.
|
| 478 |
-
- For VACE (temporal model): must process the full sequence at once for
|
| 479 |
-
temporal coherence, then composites and saves frame-by-frame.
|
| 480 |
-
- Saves composited PNGs directly to *out_dir* so the caller never holds
|
| 481 |
-
the full crop list in memory.
|
| 482 |
-
"""
|
| 483 |
-
from pipeline.composite import composite_with_alpha, feathered_alpha
|
| 484 |
|
| 485 |
-
alpha = feathered_alpha(inpaint_mask)
|
| 486 |
out_dir = Path(out_dir)
|
| 487 |
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
)
|
| 496 |
|
| 497 |
-
for i, (fp, crop) in enumerate(
|
| 498 |
-
zip(
|
| 499 |
-
frame_paths,
|
| 500 |
-
inpaint_frames_lama_stream(
|
| 501 |
-
frame_paths, crop_region, inpaint_mask, _prog
|
| 502 |
-
),
|
| 503 |
-
)
|
| 504 |
-
):
|
| 505 |
-
original = np.array(Image.open(fp).convert("RGB"))
|
| 506 |
-
composited = composite_with_alpha(original, crop, crop_region, alpha)
|
| 507 |
-
Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
|
| 508 |
|
| 509 |
-
|
| 510 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
-
|
| 513 |
-
|
|
|
|
| 514 |
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
|
| 522 |
def run_pipeline(
|
|
@@ -573,16 +600,11 @@ def run_pipeline(
|
|
| 573 |
total = len(frame_paths)
|
| 574 |
|
| 575 |
# ββ GPU: inpaint + composite + save ββββββββββββββββββββββββββββ
|
| 576 |
-
# Validate mode on CPU before acquiring GPU so
|
| 577 |
-
#
|
| 578 |
_VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
|
| 579 |
if mode not in _VALID_MODES:
|
| 580 |
raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
|
| 581 |
-
if mode == "Quality (VACE-14B)":
|
| 582 |
-
raise gr.Error(
|
| 583 |
-
"VACE-14B quality mode is not yet available. "
|
| 584 |
-
"Please select Fast (LaMa)."
|
| 585 |
-
)
|
| 586 |
progress(0.15, desc="Starting inpaintingβ¦")
|
| 587 |
_inpaint_composite_save_gpu(
|
| 588 |
frame_paths, crop_region, inpaint_mask,
|
|
|
|
| 456 |
)
|
| 457 |
|
| 458 |
|
| 459 |
+
def _composite_and_save(
|
| 460 |
+
frame_paths: list,
|
| 461 |
+
crops_iter,
|
| 462 |
+
crop_region: CropRegion,
|
| 463 |
+
alpha: np.ndarray,
|
| 464 |
+
out_dir: Path,
|
| 465 |
+
) -> None:
|
| 466 |
+
"""Composite each inpainted crop onto its source frame and save as PNG."""
|
| 467 |
+
from pipeline.composite import composite_with_alpha
|
| 468 |
+
|
| 469 |
+
for i, (fp, crop) in enumerate(zip(frame_paths, crops_iter)):
|
| 470 |
+
original = np.array(Image.open(fp).convert("RGB"))
|
| 471 |
+
composited = composite_with_alpha(original, crop, crop_region, alpha)
|
| 472 |
+
Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
|
| 473 |
+
|
| 474 |
+
|
| 475 |
@spaces.GPU(duration=180)
|
| 476 |
+
def _gpu_inpaint_lama(
|
| 477 |
frame_paths: list,
|
| 478 |
crop_region: CropRegion,
|
| 479 |
inpaint_mask: np.ndarray,
|
| 480 |
out_dir,
|
|
|
|
| 481 |
total: int,
|
| 482 |
progress,
|
| 483 |
) -> None:
|
| 484 |
+
"""LaMa branch β streams one frame at a time, never holds the full list."""
|
| 485 |
+
from pipeline.composite import feathered_alpha
|
| 486 |
+
from pipeline.lama import inpaint_frames_lama_stream
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
+
alpha = feathered_alpha(inpaint_mask)
|
| 489 |
out_dir = Path(out_dir)
|
| 490 |
|
| 491 |
+
def _prog(i: int) -> None:
|
| 492 |
+
progress(
|
| 493 |
+
0.20 + 0.65 * ((i + 1) / total),
|
| 494 |
+
desc=f"LaMa {i + 1}/{total}β¦",
|
| 495 |
+
)
|
| 496 |
|
| 497 |
+
crops_iter = inpaint_frames_lama_stream(
|
| 498 |
+
frame_paths, crop_region, inpaint_mask, _prog,
|
| 499 |
+
)
|
| 500 |
+
_composite_and_save(frame_paths, crops_iter, crop_region, alpha, out_dir)
|
|
|
|
| 501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
+
@spaces.GPU(duration=300)
|
| 504 |
+
def _gpu_inpaint_vace(
|
| 505 |
+
frame_paths: list,
|
| 506 |
+
crop_region: CropRegion,
|
| 507 |
+
inpaint_mask: np.ndarray,
|
| 508 |
+
out_dir,
|
| 509 |
+
_total: int, # signature parity with _gpu_inpaint_lama
|
| 510 |
+
progress,
|
| 511 |
+
) -> None:
|
| 512 |
+
"""VACE branch β temporal model needs the full sequence; longer GPU lease."""
|
| 513 |
+
from pipeline.composite import feathered_alpha
|
| 514 |
+
from pipeline.vace import inpaint_frames_vace
|
| 515 |
+
|
| 516 |
+
alpha = feathered_alpha(inpaint_mask)
|
| 517 |
+
out_dir = Path(out_dir)
|
| 518 |
|
| 519 |
+
progress(0.20, desc="Loading VACE-14B (cold start ~30s)β¦")
|
| 520 |
+
progress(0.45, desc="Running VACE-14B inpaintingβ¦")
|
| 521 |
+
crops = inpaint_frames_vace(frame_paths, crop_region, inpaint_mask)
|
| 522 |
|
| 523 |
+
progress(0.85, desc="Compositingβ¦")
|
| 524 |
+
_composite_and_save(frame_paths, crops, crop_region, alpha, out_dir)
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def _inpaint_composite_save_gpu(
|
| 528 |
+
frame_paths: list,
|
| 529 |
+
crop_region: CropRegion,
|
| 530 |
+
inpaint_mask: np.ndarray,
|
| 531 |
+
out_dir,
|
| 532 |
+
mode: str,
|
| 533 |
+
total: int,
|
| 534 |
+
progress,
|
| 535 |
+
) -> None:
|
| 536 |
+
"""Dispatch to the per-mode GPU function with the right duration budget."""
|
| 537 |
+
if mode == "Fast (LaMa)":
|
| 538 |
+
_gpu_inpaint_lama(
|
| 539 |
+
frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
|
| 540 |
+
)
|
| 541 |
+
elif mode == "Quality (VACE-14B)":
|
| 542 |
+
_gpu_inpaint_vace(
|
| 543 |
+
frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
|
| 544 |
+
)
|
| 545 |
+
else:
|
| 546 |
+
raise ValueError(f"Unknown inpainting mode: {mode!r}")
|
| 547 |
|
| 548 |
|
| 549 |
def run_pipeline(
|
|
|
|
| 600 |
total = len(frame_paths)
|
| 601 |
|
| 602 |
# ββ GPU: inpaint + composite + save ββββββββββββββββββββββββββββ
|
| 603 |
+
# Validate mode on CPU before acquiring GPU so unknown modes fail
|
| 604 |
+
# fast without burning ZeroGPU quota.
|
| 605 |
_VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
|
| 606 |
if mode not in _VALID_MODES:
|
| 607 |
raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
progress(0.15, desc="Starting inpaintingβ¦")
|
| 609 |
_inpaint_composite_save_gpu(
|
| 610 |
frame_paths, crop_region, inpaint_mask,
|
|
@@ -3,36 +3,199 @@ pipeline/vace.py
|
|
| 3 |
----------------
|
| 4 |
Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
from __future__ import annotations
|
| 21 |
|
|
|
|
| 22 |
from pathlib import Path
|
| 23 |
-
from typing import List
|
| 24 |
|
| 25 |
import numpy as np
|
|
|
|
|
|
|
| 26 |
|
| 27 |
from pipeline.crop import CropRegion
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def inpaint_frames_vace(
|
| 31 |
frame_paths: List[Path],
|
| 32 |
crop_region: CropRegion,
|
| 33 |
inpaint_mask: np.ndarray,
|
| 34 |
-
num_inference_steps: int =
|
| 35 |
-
guidance_scale: float =
|
| 36 |
) -> List[np.ndarray]:
|
| 37 |
"""
|
| 38 |
Run VACE-14B inpainting on the crop region of each frame.
|
|
@@ -42,18 +205,112 @@ def inpaint_frames_vace(
|
|
| 42 |
frame_paths : List[Path]
|
| 43 |
Ordered full-frame PNG paths.
|
| 44 |
crop_region : CropRegion
|
|
|
|
| 45 |
inpaint_mask : np.ndarray
|
| 46 |
-
Crop-local binary mask (H
|
| 47 |
-
num_inference_steps : int
|
| 48 |
-
|
| 49 |
-
guidance_scale : float
|
|
|
|
| 50 |
|
| 51 |
Returns
|
| 52 |
-------
|
| 53 |
List[np.ndarray]
|
| 54 |
-
Inpainted crop arrays (
|
|
|
|
| 55 |
"""
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
| 59 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
----------------
|
| 4 |
Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
|
| 5 |
|
| 6 |
+
Architecture
|
| 7 |
+
------------
|
| 8 |
+
- Loads ``WanVACEPipeline`` from the local mirror under
|
| 9 |
+
``JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints/vace-14b/`` to
|
| 10 |
+
insulate the Space from upstream deletion of ``Wan-AI/...``.
|
| 11 |
+
- Fuses the lightx2v rank-64 distill LoRA so we can run the masked
|
| 12 |
+
diffusion in 4 inference steps (vs 30 for the base model). The LoRA is
|
| 13 |
+
trained for T2V but applies cleanly to VACE since both share the same
|
| 14 |
+
Wan2.1 transformer backbone.
|
| 15 |
+
- Quantises the transformer weights to FP8 with torchao on H100/H200
|
| 16 |
+
hardware (sm_90+). Roughly halves transformer VRAM and accelerates
|
| 17 |
+
matmuls; falls back to bf16 if torchao or compute-capability is missing.
|
| 18 |
+
- Processes the (potentially long) frame list in 81-frame chunks (VACE's
|
| 19 |
+
native temporal window) with 8-frame overlap. Later chunks overwrite
|
| 20 |
+
the overlap region so the larger context window wins.
|
| 21 |
+
- Each crop is resized to the VACE-target resolution that
|
| 22 |
+
:func:`pipeline.crop.compute_crop_region` selected, then resized back
|
| 23 |
+
to the original crop dimensions before compositing.
|
| 24 |
+
|
| 25 |
+
ZeroGPU budget
|
| 26 |
+
--------------
|
| 27 |
+
The pipeline is designed to fit inside ~300s on the H200 MIG slice for a
|
| 28 |
+
15-second clip at β€30 fps. Cold load (transformer + text encoder + VAE
|
| 29 |
+
+ LoRA fuse + FP8 quantize) is ~30-60s; per-chunk inference at 4 steps
|
| 30 |
+
is ~10-20s; ~7 chunks for 15s @ 30fps.
|
| 31 |
+
|
| 32 |
+
Configuration knobs (all read at module import via env vars)
|
| 33 |
+
----------------------------------------------------------
|
| 34 |
+
- VACE_REPO_ID : HF repo holding the diffusers package (default: mirror)
|
| 35 |
+
- VACE_SUBFOLDER : subfolder within the repo (default: ``vace-14b``)
|
| 36 |
+
- VACE_LORA_REPO_ID : HF repo holding the distill LoRA (default: mirror)
|
| 37 |
+
- VACE_LORA_FILE : LoRA filename (default: lightx2v rank-64 4-step)
|
| 38 |
+
|
| 39 |
+
License: Apache-2.0 (Wan2.1 base) + Apache-2.0 (lightx2v distill LoRA).
|
| 40 |
"""
|
| 41 |
|
| 42 |
from __future__ import annotations
|
| 43 |
|
| 44 |
+
import os
|
| 45 |
from pathlib import Path
|
| 46 |
+
from typing import List, Optional
|
| 47 |
|
| 48 |
import numpy as np
|
| 49 |
+
import torch
|
| 50 |
+
from PIL import Image
|
| 51 |
|
| 52 |
from pipeline.crop import CropRegion
|
| 53 |
|
| 54 |
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# Configuration
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
VACE_REPO_ID = os.environ.get(
|
| 60 |
+
"VACE_REPO_ID",
|
| 61 |
+
"JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
|
| 62 |
+
)
|
| 63 |
+
VACE_SUBFOLDER = os.environ.get("VACE_SUBFOLDER", "vace-14b")
|
| 64 |
+
VACE_LORA_REPO_ID = os.environ.get(
|
| 65 |
+
"VACE_LORA_REPO_ID",
|
| 66 |
+
"JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
|
| 67 |
+
)
|
| 68 |
+
VACE_LORA_FILE = os.environ.get(
|
| 69 |
+
"VACE_LORA_FILE",
|
| 70 |
+
"loras/wan2.1_t2v_14b_lora_rank64_lightx2v_4step.safetensors",
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# VACE requires num_frames = 4n+1. 81 = 16*5+1 is the documented sweet spot.
|
| 74 |
+
CHUNK_FRAMES = 81
|
| 75 |
+
# Frames shared between consecutive chunks for temporal continuity at seams.
|
| 76 |
+
CHUNK_OVERLAP = 8
|
| 77 |
+
|
| 78 |
+
# Step-distill LoRA enables 4-step inference (~7-8x faster than 30-step base).
|
| 79 |
+
DEFAULT_STEPS_DISTILLED = 4
|
| 80 |
+
DEFAULT_STEPS_BASE = 30
|
| 81 |
+
# CFG-free with the distill LoRA; base would use ~5.0.
|
| 82 |
+
DEFAULT_GUIDANCE_DISTILLED = 1.0
|
| 83 |
+
DEFAULT_GUIDANCE_BASE = 5.0
|
| 84 |
+
|
| 85 |
+
# Empty positive prompt β for watermark removal we want the model to fill
|
| 86 |
+
# from the surrounding crop context, not steer to anything specific.
|
| 87 |
+
PROMPT = ""
|
| 88 |
+
NEGATIVE_PROMPT = (
|
| 89 |
+
"watermark, text, logo, subtitles, low quality, "
|
| 90 |
+
"blurry, distortion, artifacts, JPEG compression"
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
# Pipeline singleton (cold load is expensive β keep it warm across calls)
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
_vace_pipe = None
|
| 99 |
+
_vace_device: Optional[str] = None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _get_pipe():
|
| 103 |
+
"""Load (or return cached) WanVACEPipeline configured for fast inpainting."""
|
| 104 |
+
global _vace_pipe, _vace_device
|
| 105 |
+
|
| 106 |
+
current_device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 107 |
+
if _vace_pipe is not None and _vace_device == current_device:
|
| 108 |
+
return _vace_pipe
|
| 109 |
+
|
| 110 |
+
from diffusers import AutoencoderKLWan, WanVACEPipeline
|
| 111 |
+
from diffusers.schedulers.scheduling_unipc_multistep import (
|
| 112 |
+
UniPCMultistepScheduler,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
# VAE in fp32 (per the official diffusers example) for numerical stability.
|
| 116 |
+
vae = AutoencoderKLWan.from_pretrained(
|
| 117 |
+
VACE_REPO_ID,
|
| 118 |
+
subfolder=f"{VACE_SUBFOLDER}/vae",
|
| 119 |
+
torch_dtype=torch.float32,
|
| 120 |
+
)
|
| 121 |
+
pipe = WanVACEPipeline.from_pretrained(
|
| 122 |
+
VACE_REPO_ID,
|
| 123 |
+
subfolder=VACE_SUBFOLDER,
|
| 124 |
+
vae=vae,
|
| 125 |
+
torch_dtype=torch.bfloat16,
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
# flow_shift = 3.0 β 480P-friendly. 5.0 would be 720P-friendly.
|
| 129 |
+
# We process at the smallest VACE-target resolution for the crop, which
|
| 130 |
+
# is typically in the 480P band, so 3.0 is the right default.
|
| 131 |
+
pipe.scheduler = UniPCMultistepScheduler.from_config(
|
| 132 |
+
pipe.scheduler.config, flow_shift=3.0,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# Step-distill LoRA. If it fails to apply we fall back to 30-step base.
|
| 136 |
+
inference_steps = DEFAULT_STEPS_DISTILLED
|
| 137 |
+
inference_guidance = DEFAULT_GUIDANCE_DISTILLED
|
| 138 |
+
try:
|
| 139 |
+
pipe.load_lora_weights(
|
| 140 |
+
VACE_LORA_REPO_ID,
|
| 141 |
+
weight_name=VACE_LORA_FILE,
|
| 142 |
+
adapter_name="distill",
|
| 143 |
+
)
|
| 144 |
+
pipe.set_adapters(["distill"], adapter_weights=[1.0])
|
| 145 |
+
pipe.fuse_lora(adapter_names=["distill"], lora_scale=1.0)
|
| 146 |
+
pipe.unload_lora_weights()
|
| 147 |
+
print(f"[VACE] Distill LoRA fused; {inference_steps}-step inference.")
|
| 148 |
+
except Exception as exc:
|
| 149 |
+
print(
|
| 150 |
+
f"[VACE] Distill LoRA load/fuse failed ({exc}); "
|
| 151 |
+
f"falling back to {DEFAULT_STEPS_BASE}-step base inference."
|
| 152 |
+
)
|
| 153 |
+
inference_steps = DEFAULT_STEPS_BASE
|
| 154 |
+
inference_guidance = DEFAULT_GUIDANCE_BASE
|
| 155 |
+
# Stash for inpaint_frames_vace.
|
| 156 |
+
pipe._wm_steps = inference_steps # type: ignore[attr-defined]
|
| 157 |
+
pipe._wm_guidance = inference_guidance # type: ignore[attr-defined]
|
| 158 |
+
|
| 159 |
+
# FP8 quantization on H100 / H200 (sm_90+).
|
| 160 |
+
if (
|
| 161 |
+
current_device == "cuda"
|
| 162 |
+
and torch.cuda.is_available()
|
| 163 |
+
and torch.cuda.get_device_capability(0)[0] >= 9
|
| 164 |
+
):
|
| 165 |
+
try:
|
| 166 |
+
from torchao.quantization import (
|
| 167 |
+
float8_dynamic_activation_float8_weight,
|
| 168 |
+
quantize_,
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
quantize_(
|
| 172 |
+
pipe.transformer,
|
| 173 |
+
float8_dynamic_activation_float8_weight(),
|
| 174 |
+
)
|
| 175 |
+
print("[VACE] FP8 dynamic-activation quantization applied.")
|
| 176 |
+
except Exception as exc:
|
| 177 |
+
print(f"[VACE] FP8 quantization unavailable ({exc}); using bf16.")
|
| 178 |
+
|
| 179 |
+
pipe.to(current_device)
|
| 180 |
+
# CPU offload reduces peak VRAM by paging the text encoder + VAE off-GPU
|
| 181 |
+
# between the prompt-encode and decode phases. Negligible runtime cost.
|
| 182 |
+
pipe.enable_model_cpu_offload()
|
| 183 |
+
|
| 184 |
+
_vace_pipe = pipe
|
| 185 |
+
_vace_device = current_device
|
| 186 |
+
return _vace_pipe
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ---------------------------------------------------------------------------
|
| 190 |
+
# Public API
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
|
| 193 |
def inpaint_frames_vace(
|
| 194 |
frame_paths: List[Path],
|
| 195 |
crop_region: CropRegion,
|
| 196 |
inpaint_mask: np.ndarray,
|
| 197 |
+
num_inference_steps: Optional[int] = None,
|
| 198 |
+
guidance_scale: Optional[float] = None,
|
| 199 |
) -> List[np.ndarray]:
|
| 200 |
"""
|
| 201 |
Run VACE-14B inpainting on the crop region of each frame.
|
|
|
|
| 205 |
frame_paths : List[Path]
|
| 206 |
Ordered full-frame PNG paths.
|
| 207 |
crop_region : CropRegion
|
| 208 |
+
Crop rectangle + VACE-target resolution selected upstream.
|
| 209 |
inpaint_mask : np.ndarray
|
| 210 |
+
Crop-local binary mask (H Γ W, uint8). 255 = inpaint, 0 = keep.
|
| 211 |
+
num_inference_steps : int, optional
|
| 212 |
+
Override the auto-selected step count (4 with distill LoRA, 30 base).
|
| 213 |
+
guidance_scale : float, optional
|
| 214 |
+
Override the auto-selected guidance scale (1.0 distilled, 5.0 base).
|
| 215 |
|
| 216 |
Returns
|
| 217 |
-------
|
| 218 |
List[np.ndarray]
|
| 219 |
+
Inpainted crop arrays (crop_h Γ crop_w Γ 3, uint8 RGB), one per frame,
|
| 220 |
+
in the **original** crop dimensions (not target dimensions).
|
| 221 |
"""
|
| 222 |
+
pipe = _get_pipe()
|
| 223 |
+
steps = num_inference_steps or getattr(pipe, "_wm_steps", DEFAULT_STEPS_DISTILLED)
|
| 224 |
+
guidance = guidance_scale if guidance_scale is not None else getattr(
|
| 225 |
+
pipe, "_wm_guidance", DEFAULT_GUIDANCE_DISTILLED
|
| 226 |
)
|
| 227 |
+
|
| 228 |
+
target_w = crop_region.target_w
|
| 229 |
+
target_h = crop_region.target_h
|
| 230 |
+
|
| 231 |
+
# ββ 1. Load each crop and resize to VACE target resolution ββββββββββ
|
| 232 |
+
crops_pil: List[Image.Image] = []
|
| 233 |
+
for fp in frame_paths:
|
| 234 |
+
img = Image.open(fp).convert("RGB")
|
| 235 |
+
box = (
|
| 236 |
+
crop_region.frame_x,
|
| 237 |
+
crop_region.frame_y,
|
| 238 |
+
crop_region.frame_x + crop_region.frame_w,
|
| 239 |
+
crop_region.frame_y + crop_region.frame_h,
|
| 240 |
+
)
|
| 241 |
+
crop = img.crop(box).resize((target_w, target_h), Image.LANCZOS)
|
| 242 |
+
crops_pil.append(crop)
|
| 243 |
+
|
| 244 |
+
# Static mask, resized to target resolution.
|
| 245 |
+
# NEAREST keeps the mask edges binary so feathering happens later in
|
| 246 |
+
# composite.py rather than smearing into the diffusion conditioning.
|
| 247 |
+
mask_at_target = np.array(
|
| 248 |
+
Image.fromarray(inpaint_mask).resize(
|
| 249 |
+
(target_w, target_h), Image.NEAREST,
|
| 250 |
+
)
|
| 251 |
+
)
|
| 252 |
+
mask_pil = Image.fromarray(mask_at_target).convert("L")
|
| 253 |
+
|
| 254 |
+
# ββ 2. Process in overlapping CHUNK_FRAMES windows ββββββββββββββββββ
|
| 255 |
+
n_frames = len(crops_pil)
|
| 256 |
+
output_pil: List[Optional[Image.Image]] = [None] * n_frames
|
| 257 |
+
|
| 258 |
+
stride = CHUNK_FRAMES - CHUNK_OVERLAP
|
| 259 |
+
chunk_starts = list(range(0, n_frames, stride))
|
| 260 |
+
|
| 261 |
+
# Deterministic seed so re-runs are reproducible.
|
| 262 |
+
generator = torch.Generator(device="cpu").manual_seed(42)
|
| 263 |
+
|
| 264 |
+
for ci, start in enumerate(chunk_starts):
|
| 265 |
+
end = min(start + CHUNK_FRAMES, n_frames)
|
| 266 |
+
chunk_video = list(crops_pil[start:end])
|
| 267 |
+
chunk_len = len(chunk_video)
|
| 268 |
+
|
| 269 |
+
# VACE requires exactly num_frames frames per call. Pad short trailing
|
| 270 |
+
# chunks by repeating the last real frame; we discard the pad output.
|
| 271 |
+
if chunk_len < CHUNK_FRAMES:
|
| 272 |
+
chunk_video.extend(
|
| 273 |
+
[chunk_video[-1]] * (CHUNK_FRAMES - chunk_len)
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
chunk_mask = [mask_pil] * CHUNK_FRAMES
|
| 277 |
+
|
| 278 |
+
result = pipe(
|
| 279 |
+
video=chunk_video,
|
| 280 |
+
mask=chunk_mask,
|
| 281 |
+
prompt=PROMPT,
|
| 282 |
+
negative_prompt=NEGATIVE_PROMPT,
|
| 283 |
+
height=target_h,
|
| 284 |
+
width=target_w,
|
| 285 |
+
num_frames=CHUNK_FRAMES,
|
| 286 |
+
num_inference_steps=steps,
|
| 287 |
+
guidance_scale=guidance,
|
| 288 |
+
generator=generator,
|
| 289 |
+
).frames[0]
|
| 290 |
+
|
| 291 |
+
# Drop the pad and write into the global frame buffer. Later chunks
|
| 292 |
+
# overwrite the overlap region of earlier ones β the second chunk
|
| 293 |
+
# has the larger temporal context for the overlap frames.
|
| 294 |
+
for i in range(chunk_len):
|
| 295 |
+
global_i = start + i
|
| 296 |
+
if output_pil[global_i] is None or ci > 0:
|
| 297 |
+
output_pil[global_i] = result[i]
|
| 298 |
+
|
| 299 |
+
if torch.cuda.is_available():
|
| 300 |
+
torch.cuda.empty_cache()
|
| 301 |
+
|
| 302 |
+
# ββ 3. Resize back to original crop dimensions, return as ndarrays ββ
|
| 303 |
+
out: List[np.ndarray] = []
|
| 304 |
+
for pil_img in output_pil:
|
| 305 |
+
if pil_img is None:
|
| 306 |
+
raise RuntimeError(
|
| 307 |
+
"VACE: output frame missing after chunked inference. "
|
| 308 |
+
"This indicates a chunking bug; please report."
|
| 309 |
+
)
|
| 310 |
+
resized = pil_img.resize(
|
| 311 |
+
(crop_region.frame_w, crop_region.frame_h),
|
| 312 |
+
Image.LANCZOS,
|
| 313 |
+
)
|
| 314 |
+
out.append(np.array(resized.convert("RGB")))
|
| 315 |
+
|
| 316 |
+
return out
|
|
@@ -19,12 +19,13 @@ simple-lama-inpainting>=0.1.2
|
|
| 19 |
|
| 20 |
# ββ Quality mode (VACE-14B) βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
# torch / torchvision are pre-installed on ZeroGPU; do not pin here.
|
| 22 |
-
diffusers>=0.
|
|
|
|
| 23 |
transformers>=4.44.0
|
| 24 |
accelerate>=0.33.0
|
| 25 |
sentencepiece>=0.1.99
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
|
| 29 |
# ββ Video I/O βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
# ffmpeg binary is provided via packages.txt; no Python wrapper needed.
|
|
|
|
| 19 |
|
| 20 |
# ββ Quality mode (VACE-14B) βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
# torch / torchvision are pre-installed on ZeroGPU; do not pin here.
|
| 22 |
+
# diffusers must be >=0.34.0 for WanVACEPipeline (merged in PR #11582).
|
| 23 |
+
diffusers>=0.34.0
|
| 24 |
transformers>=4.44.0
|
| 25 |
accelerate>=0.33.0
|
| 26 |
sentencepiece>=0.1.99
|
| 27 |
+
peft>=0.13.0 # LoRA loading via load_lora_weights / fuse_lora
|
| 28 |
+
torchao>=0.6.0 # FP8 dynamic-activation quantization on H100/H200
|
| 29 |
|
| 30 |
# ββ Video I/O βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
# ffmpeg binary is provided via packages.txt; no Python wrapper needed.
|