BoxOfColors Claude Opus 4.7 (1M context) committed on
Commit
f6c8580
·
1 Parent(s): 48bdb38

feat: implement VACE-14B quality mode

Browse files

Wires up the previously-stubbed Quality mode with a full WanVACEPipeline
inference path, served from the upstream-protection mirror.

pipeline/vace.py
- WanVACEPipeline loaded from JackIsNotInTheBox/...Checkpoints/vace-14b
(env-overridable via VACE_REPO_ID / VACE_SUBFOLDER)
- Fuses lightx2v rank-64 distill LoRA → 4-step inference
(~7-8x speedup vs 30-step base; falls back to 30-step if LoRA fails)
- FP8 dynamic-activation quantization on H100/H200 (sm_90+) via torchao,
bf16 fallback when unavailable
- Processes long clips in 81-frame chunks with 8-frame overlap; later
chunks overwrite the seam region (larger temporal context wins)
- Crops resized to crop_region.target_{w,h} for VACE, then downsampled
back to original crop dimensions
- model.enable_model_cpu_offload() to keep peak VRAM in budget

app.py
- Splits _inpaint_composite_save_gpu into per-mode functions:
_gpu_inpaint_lama @spaces.GPU(duration=180)
_gpu_inpaint_vace @spaces.GPU(duration=300)
Closes the long-standing gap where LaMa runs would lease VACE-sized
GPU windows
- Drops the "VACE not yet available" guard
- Factors composite-and-save loop into a shared helper

requirements.txt
- diffusers >=0.34.0 (WanVACEPipeline merged in PR #11582)
- peft >=0.13.0 for load_lora_weights / fuse_lora
- torchao >=0.6.0 enabled (was commented out)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show
  1. app.py +74 -52
  2. pipeline/vace.py +280 -23
  3. requirements.txt +4 -3
app.py CHANGED
@@ -456,67 +456,94 @@ def on_clear_mask(editor_value: dict | None):
456
  )
457
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  @spaces.GPU(duration=180)
460
- def _inpaint_composite_save_gpu(
461
  frame_paths: list,
462
  crop_region: CropRegion,
463
  inpaint_mask: np.ndarray,
464
  out_dir,
465
- mode: str,
466
  total: int,
467
  progress,
468
  ) -> None:
469
- """
470
- GPU-accelerated inpainting with immediate per-frame compositing and disk save.
471
-
472
- Architecture
473
- ------------
474
- - The feathered alpha map is pre-computed **once** (static mask for the whole
475
- video) so the Gaussian blur runs exactly once instead of once per frame.
476
- - For LaMa (per-frame independent model): streams one frame at a time —
477
- never holds more than one inpainted crop in RAM.
478
- - For VACE (temporal model): must process the full sequence at once for
479
- temporal coherence, then composites and saves frame-by-frame.
480
- - Saves composited PNGs directly to *out_dir* so the caller never holds
481
- the full crop list in memory.
482
- """
483
- from pipeline.composite import composite_with_alpha, feathered_alpha
484
 
485
- alpha = feathered_alpha(inpaint_mask) # pre-compute once (static mask)
486
  out_dir = Path(out_dir)
487
 
488
- if mode == "Fast (LaMa)":
489
- from pipeline.lama import inpaint_frames_lama_stream
 
 
 
490
 
491
- def _prog(i: int) -> None:
492
- progress(
493
- 0.20 + 0.65 * ((i + 1) / total),
494
- desc=f"LaMa {i + 1}/{total}…",
495
- )
496
 
497
- for i, (fp, crop) in enumerate(
498
- zip(
499
- frame_paths,
500
- inpaint_frames_lama_stream(
501
- frame_paths, crop_region, inpaint_mask, _prog
502
- ),
503
- )
504
- ):
505
- original = np.array(Image.open(fp).convert("RGB"))
506
- composited = composite_with_alpha(original, crop, crop_region, alpha)
507
- Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
508
 
509
- else: # Quality (VACE) β€” temporal model requires the full frame sequence
510
- from pipeline.vace import inpaint_frames_vace
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
- progress(0.45, desc="Running VACE-14B…")
513
- crops = inpaint_frames_vace(frame_paths, crop_region, inpaint_mask)
 
514
 
515
- progress(0.85, desc="Compositing…")
516
- for i, (fp, crop) in enumerate(zip(frame_paths, crops)):
517
- original = np.array(Image.open(fp).convert("RGB"))
518
- composited = composite_with_alpha(original, crop, crop_region, alpha)
519
- Image.fromarray(composited).save(out_dir / f"{i + 1:06d}.png")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
 
521
 
522
  def run_pipeline(
@@ -573,16 +600,11 @@ def run_pipeline(
573
  total = len(frame_paths)
574
 
575
  # ── GPU: inpaint + composite + save ────────────────────────────
576
- # Validate mode on CPU before acquiring GPU so unimplemented modes
577
- # fail fast without burning ZeroGPU quota.
578
  _VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
579
  if mode not in _VALID_MODES:
580
  raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
581
- if mode == "Quality (VACE-14B)":
582
- raise gr.Error(
583
- "VACE-14B quality mode is not yet available. "
584
- "Please select Fast (LaMa)."
585
- )
586
  progress(0.15, desc="Starting inpainting…")
587
  _inpaint_composite_save_gpu(
588
  frame_paths, crop_region, inpaint_mask,
 
456
  )
457
 
458
 
459
def _composite_and_save(
    frame_paths: list,
    crops_iter,
    crop_region: CropRegion,
    alpha: np.ndarray,
    out_dir: Path,
) -> None:
    """Blend every inpainted crop into its source frame and write it as a PNG.

    Frames and crops are consumed in lockstep, so ``crops_iter`` may be a lazy
    iterator; iteration stops at the shorter of the two inputs.  Output files
    are named by 1-based frame number, zero-padded to six digits
    (``000001.png``, ``000002.png``, ...) inside ``out_dir``.

    Parameters
    ----------
    frame_paths : list
        Ordered paths of the full source frames.
    crops_iter : iterable
        Inpainted crops, one per frame, in the same order as ``frame_paths``.
    crop_region : CropRegion
        Passed through to ``composite_with_alpha`` to place the crop.
    alpha : np.ndarray
        Pre-computed blend map passed through to ``composite_with_alpha``.
    out_dir : Path
        Destination directory for the composited PNGs.
    """
    from pipeline.composite import composite_with_alpha

    paired = zip(frame_paths, crops_iter)
    for frame_no, (src_path, patch) in enumerate(paired, start=1):
        source_rgb = Image.open(src_path).convert("RGB")
        blended = composite_with_alpha(
            np.array(source_rgb), patch, crop_region, alpha,
        )
        destination = out_dir / f"{frame_no:06d}.png"
        Image.fromarray(blended).save(destination)
473
+
474
+
475
  @spaces.GPU(duration=180)
476
def _gpu_inpaint_lama(
    frame_paths: list,
    crop_region: CropRegion,
    inpaint_mask: np.ndarray,
    out_dir,
    total: int,
    progress,
) -> None:
    """Fast mode: run LaMa over the frames and composite/save each result.

    The crops produced by ``inpaint_frames_lama_stream`` are handed straight
    to ``_composite_and_save`` as an iterable, so this function itself never
    builds a list of inpainted crops.

    Parameters
    ----------
    frame_paths : list
        Ordered paths of the full source frames.
    crop_region : CropRegion
        Region of each frame that is inpainted.
    inpaint_mask : np.ndarray
        Mask handed to both the feathering step and LaMa.
    out_dir
        Output directory (anything accepted by ``Path``).
    total : int
        Frame count used to scale the progress bar.
    progress
        Progress callback invoked as ``progress(fraction, desc=...)``.
    """
    from pipeline.composite import feathered_alpha
    from pipeline.lama import inpaint_frames_lama_stream

    def _report(i: int) -> None:
        # LaMa reports a 0-based frame index; map it onto the 20%..85% band.
        done = i + 1
        progress(
            0.20 + 0.65 * (done / total),
            desc=f"LaMa {done}/{total}…",
        )

    # The mask is static for the whole clip, so feather it exactly once.
    blend_alpha = feathered_alpha(inpaint_mask)
    destination = Path(out_dir)

    crop_stream = inpaint_frames_lama_stream(
        frame_paths, crop_region, inpaint_mask, _report,
    )
    _composite_and_save(
        frame_paths, crop_stream, crop_region, blend_alpha, destination,
    )
 
501
 
 
 
 
 
 
 
 
 
 
 
 
502
 
503
@spaces.GPU(duration=300)
def _gpu_inpaint_vace(
    frame_paths: list,
    crop_region: CropRegion,
    inpaint_mask: np.ndarray,
    out_dir,
    _total: int,  # signature parity with _gpu_inpaint_lama
    progress,
) -> None:
    """Quality mode: run VACE-14B over the whole sequence, then composite/save.

    Unlike the LaMa path, the full list of inpainted crops is produced first
    (``inpaint_frames_vace`` returns a list) and composited afterwards.

    Parameters
    ----------
    frame_paths : list
        Ordered paths of the full source frames.
    crop_region : CropRegion
        Region of each frame that is inpainted.
    inpaint_mask : np.ndarray
        Mask handed to both the feathering step and VACE.
    out_dir
        Output directory (anything accepted by ``Path``).
    _total : int
        Unused; kept so both per-mode functions share one call signature.
    progress
        Progress callback invoked as ``progress(fraction, desc=...)``.
    """
    from pipeline.composite import feathered_alpha
    from pipeline.vace import _get_pipe, inpaint_frames_vace

    alpha = feathered_alpha(inpaint_mask)
    out_dir = Path(out_dir)

    # Warm the cached pipeline *between* the two progress updates.  Without
    # this the "Loading" message is replaced by "Running" immediately and the
    # whole cold start is reported as inference.  inpaint_frames_vace() reuses
    # the same cached pipeline, so nothing is loaded twice.
    progress(0.20, desc="Loading VACE-14B (cold start ~30s)…")
    _get_pipe()

    progress(0.45, desc="Running VACE-14B inpainting…")
    crops = inpaint_frames_vace(frame_paths, crop_region, inpaint_mask)

    progress(0.85, desc="Compositing…")
    _composite_and_save(frame_paths, crops, crop_region, alpha, out_dir)
525
+
526
+
527
def _inpaint_composite_save_gpu(
    frame_paths: list,
    crop_region: CropRegion,
    inpaint_mask: np.ndarray,
    out_dir,
    mode: str,
    total: int,
    progress,
) -> None:
    """Route the request to the GPU function that matches ``mode``.

    Each mode has its own ``@spaces.GPU`` function so that it is leased a GPU
    window sized for that model.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"Fast (LaMa)"`` nor ``"Quality (VACE-14B)"``.
    """
    if mode == "Fast (LaMa)":
        _gpu_inpaint_lama(
            frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
        )
        return

    if mode == "Quality (VACE-14B)":
        _gpu_inpaint_vace(
            frame_paths, crop_region, inpaint_mask, out_dir, total, progress,
        )
        return

    raise ValueError(f"Unknown inpainting mode: {mode!r}")
547
 
548
 
549
  def run_pipeline(
 
600
  total = len(frame_paths)
601
 
602
  # ── GPU: inpaint + composite + save ────────────────────────────
603
+ # Validate mode on CPU before acquiring GPU so unknown modes fail
604
+ # fast without burning ZeroGPU quota.
605
  _VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
606
  if mode not in _VALID_MODES:
607
  raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
 
 
 
 
 
608
  progress(0.15, desc="Starting inpainting…")
609
  _inpaint_composite_save_gpu(
610
  frame_paths, crop_region, inpaint_mask,
pipeline/vace.py CHANGED
@@ -3,36 +3,199 @@ pipeline/vace.py
3
  ----------------
4
  Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
5
 
6
- STUB — implemented in the next iteration after pipeline validation with LaMa.
7
-
8
- Planned implementation:
9
- - WanVACEPipeline from diffusers
10
- - FP8 quantization via torchao
11
- - AoT compilation for speed
12
- - 8-step inference (step-distilled schedule)
13
- - Temporal chunking: split frame list into ~33-frame windows, run each
14
- chunk with overlapping context frames to avoid seam artefacts
15
- - Memory: model.enable_model_cpu_offload() + torch.cuda.empty_cache()
16
- between chunks
17
- - @spaces.GPU(duration=200) decorator on the main entry function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  """
19
 
20
  from __future__ import annotations
21
 
 
22
  from pathlib import Path
23
- from typing import List
24
 
25
  import numpy as np
 
 
26
 
27
  from pipeline.crop import CropRegion
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def inpaint_frames_vace(
31
  frame_paths: List[Path],
32
  crop_region: CropRegion,
33
  inpaint_mask: np.ndarray,
34
- num_inference_steps: int = 8,
35
- guidance_scale: float = 5.0,
36
  ) -> List[np.ndarray]:
37
  """
38
  Run VACE-14B inpainting on the crop region of each frame.
@@ -42,18 +205,112 @@ def inpaint_frames_vace(
42
  frame_paths : List[Path]
43
  Ordered full-frame PNG paths.
44
  crop_region : CropRegion
 
45
  inpaint_mask : np.ndarray
46
- Crop-local binary mask (H x W, uint8). 255=inpaint.
47
- num_inference_steps : int
48
- Default 8 for step-distilled fast inference.
49
- guidance_scale : float
 
50
 
51
  Returns
52
  -------
53
  List[np.ndarray]
54
- Inpainted crop arrays (H x W x 3, uint8 RGB), one per frame.
 
55
  """
56
- raise NotImplementedError(
57
- "VACE-14B pipeline is not yet implemented. "
58
- "Use Fast (LaMa) mode for now."
 
59
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ----------------
4
  Quality mode: VACE-14B video inpainting via Wan2.1-VACE-14B-diffusers.
5
 
6
+ Architecture
7
+ ------------
8
+ - Loads ``WanVACEPipeline`` from the local mirror under
9
+ ``JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints/vace-14b/`` to
10
+ insulate the Space from upstream deletion of ``Wan-AI/...``.
11
+ - Fuses the lightx2v rank-64 distill LoRA so we can run the masked
12
+ diffusion in 4 inference steps (vs 30 for the base model). The LoRA is
13
+ trained for T2V but applies cleanly to VACE since both share the same
14
+ Wan2.1 transformer backbone.
15
+ - Quantises the transformer weights to FP8 with torchao on H100/H200
16
+ hardware (sm_90+). Roughly halves transformer VRAM and accelerates
17
+ matmuls; falls back to bf16 if torchao or compute-capability is missing.
18
+ - Processes the (potentially long) frame list in 81-frame chunks (VACE's
19
+ native temporal window) with 8-frame overlap. Later chunks overwrite
20
+ the overlap region so the larger context window wins.
21
+ - Each crop is resized to the VACE-target resolution that
22
+ :func:`pipeline.crop.compute_crop_region` selected, then resized back
23
+ to the original crop dimensions before compositing.
24
+
25
+ ZeroGPU budget
26
+ --------------
27
+ The pipeline is designed to fit inside ~300s on the H200 MIG slice for a
28
+ 15-second clip at ≤30 fps. Cold load (transformer + text encoder + VAE
29
+ + LoRA fuse + FP8 quantize) is ~30-60s; per-chunk inference at 4 steps
30
+ is ~10-20s; ~7 chunks for 15s @ 30fps.
31
+
32
+ Configuration knobs (all read at module import via env vars)
33
+ ----------------------------------------------------------
34
+ - VACE_REPO_ID : HF repo holding the diffusers package (default: mirror)
35
+ - VACE_SUBFOLDER : subfolder within the repo (default: ``vace-14b``)
36
+ - VACE_LORA_REPO_ID : HF repo holding the distill LoRA (default: mirror)
37
+ - VACE_LORA_FILE : LoRA filename (default: lightx2v rank-64 4-step)
38
+
39
+ License: Apache-2.0 (Wan2.1 base) + Apache-2.0 (lightx2v distill LoRA).
40
  """
41
 
42
  from __future__ import annotations
43
 
44
+ import os
45
  from pathlib import Path
46
+ from typing import List, Optional
47
 
48
  import numpy as np
49
+ import torch
50
+ from PIL import Image
51
 
52
  from pipeline.crop import CropRegion
53
 
54
 
55
+ # ---------------------------------------------------------------------------
56
+ # Configuration
57
+ # ---------------------------------------------------------------------------
58
+
59
+ VACE_REPO_ID = os.environ.get(
60
+ "VACE_REPO_ID",
61
+ "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
62
+ )
63
+ VACE_SUBFOLDER = os.environ.get("VACE_SUBFOLDER", "vace-14b")
64
+ VACE_LORA_REPO_ID = os.environ.get(
65
+ "VACE_LORA_REPO_ID",
66
+ "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints",
67
+ )
68
+ VACE_LORA_FILE = os.environ.get(
69
+ "VACE_LORA_FILE",
70
+ "loras/wan2.1_t2v_14b_lora_rank64_lightx2v_4step.safetensors",
71
+ )
72
+
73
+ # VACE requires num_frames = 4n+1. 81 = 4*20+1 is the documented sweet spot.
74
+ CHUNK_FRAMES = 81
75
+ # Frames shared between consecutive chunks for temporal continuity at seams.
76
+ CHUNK_OVERLAP = 8
77
+
78
+ # Step-distill LoRA enables 4-step inference (~7-8x faster than 30-step base).
79
+ DEFAULT_STEPS_DISTILLED = 4
80
+ DEFAULT_STEPS_BASE = 30
81
+ # CFG-free with the distill LoRA; base would use ~5.0.
82
+ DEFAULT_GUIDANCE_DISTILLED = 1.0
83
+ DEFAULT_GUIDANCE_BASE = 5.0
84
+
85
+ # Empty positive prompt — for watermark removal we want the model to fill
86
+ # from the surrounding crop context, not steer to anything specific.
87
+ PROMPT = ""
88
+ NEGATIVE_PROMPT = (
89
+ "watermark, text, logo, subtitles, low quality, "
90
+ "blurry, distortion, artifacts, JPEG compression"
91
+ )
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Pipeline singleton (cold load is expensive — keep it warm across calls)
96
+ # ---------------------------------------------------------------------------
97
+
98
+ _vace_pipe = None
99
+ _vace_device: Optional[str] = None
100
+
101
+
102
def _get_pipe():
    """Load (or return cached) WanVACEPipeline configured for fast inpainting.

    The pipeline is cached in the module-level ``_vace_pipe`` together with
    the device it was built for (``_vace_device``).  It is rebuilt only when
    CUDA availability differs from the cached build.

    Build steps
    -----------
    1. Load the VAE in fp32 and the rest of the pipeline in bf16 from
       ``VACE_REPO_ID`` / ``VACE_SUBFOLDER``.
    2. Replace the scheduler with UniPC using ``flow_shift=3.0``.
    3. Try to fuse the step-distill LoRA.  On failure the pipeline falls back
       to the base step count / guidance scale.
    4. On CUDA devices with compute capability >= 9, try FP8 quantization of
       the transformer via torchao; on failure the bf16 weights are kept.
    5. Place the pipeline: model CPU offload on CUDA, plain ``.to("cpu")``
       otherwise.

    Returns
    -------
    WanVACEPipeline
        The ready pipeline, carrying two extra attributes read by
        ``inpaint_frames_vace``: ``_wm_steps`` and ``_wm_guidance``.
    """
    global _vace_pipe, _vace_device

    current_device = "cuda" if torch.cuda.is_available() else "cpu"
    if _vace_pipe is not None and _vace_device == current_device:
        return _vace_pipe

    from diffusers import AutoencoderKLWan, WanVACEPipeline
    from diffusers.schedulers.scheduling_unipc_multistep import (
        UniPCMultistepScheduler,
    )

    # VAE in fp32 (per the official diffusers example) for numerical stability.
    vae = AutoencoderKLWan.from_pretrained(
        VACE_REPO_ID,
        subfolder=f"{VACE_SUBFOLDER}/vae",
        torch_dtype=torch.float32,
    )
    pipe = WanVACEPipeline.from_pretrained(
        VACE_REPO_ID,
        subfolder=VACE_SUBFOLDER,
        vae=vae,
        torch_dtype=torch.bfloat16,
    )

    # flow_shift = 3.0 → 480P-friendly.  5.0 would be 720P-friendly.
    # We process at the smallest VACE-target resolution for the crop, which
    # is typically in the 480P band, so 3.0 is the right default.
    pipe.scheduler = UniPCMultistepScheduler.from_config(
        pipe.scheduler.config, flow_shift=3.0,
    )

    # Step-distill LoRA.  If it fails to apply we fall back to 30-step base.
    inference_steps = DEFAULT_STEPS_DISTILLED
    inference_guidance = DEFAULT_GUIDANCE_DISTILLED
    try:
        pipe.load_lora_weights(
            VACE_LORA_REPO_ID,
            weight_name=VACE_LORA_FILE,
            adapter_name="distill",
        )
        pipe.set_adapters(["distill"], adapter_weights=[1.0])
        pipe.fuse_lora(adapter_names=["distill"], lora_scale=1.0)
        pipe.unload_lora_weights()
        print(f"[VACE] Distill LoRA fused; {inference_steps}-step inference.")
    except Exception as exc:
        print(
            f"[VACE] Distill LoRA load/fuse failed ({exc}); "
            f"falling back to {DEFAULT_STEPS_BASE}-step base inference."
        )
        # Best effort: a half-applied adapter (loaded but not fused) must not
        # stay attached while we run with the *base* steps and guidance.
        try:
            pipe.unload_lora_weights()
        except Exception as cleanup_exc:
            print(f"[VACE] LoRA cleanup after failure skipped ({cleanup_exc}).")
        inference_steps = DEFAULT_STEPS_BASE
        inference_guidance = DEFAULT_GUIDANCE_BASE
    # Stash for inpaint_frames_vace.
    pipe._wm_steps = inference_steps  # type: ignore[attr-defined]
    pipe._wm_guidance = inference_guidance  # type: ignore[attr-defined]

    # FP8 quantization on H100 / H200 (sm_90+).  ``current_device == "cuda"``
    # already implies CUDA is available, so no second availability check.
    if current_device == "cuda" and torch.cuda.get_device_capability(0)[0] >= 9:
        try:
            from torchao.quantization import (
                float8_dynamic_activation_float8_weight,
                quantize_,
            )

            quantize_(
                pipe.transformer,
                float8_dynamic_activation_float8_weight(),
            )
            print("[VACE] FP8 dynamic-activation quantization applied.")
        except Exception as exc:
            print(f"[VACE] FP8 quantization unavailable ({exc}); using bf16.")

    if current_device == "cuda":
        # Model CPU offload manages device placement itself.  Moving the whole
        # pipeline to the GPU first would upload every sub-model only to move
        # it straight back, spiking peak VRAM for no benefit.
        pipe.enable_model_cpu_offload()
    else:
        # No accelerator to offload from; keep everything on the CPU.
        pipe.to(current_device)

    _vace_pipe = pipe
    _vace_device = current_device
    return _vace_pipe
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Public API
191
+ # ---------------------------------------------------------------------------
192
+
193
def _plan_chunk_starts(n_frames: int, chunk_frames: int, overlap: int) -> List[int]:
    """Return the start index of every window needed to cover ``n_frames``.

    Windows advance by ``chunk_frames - overlap``.  Planning stops as soon as
    a window reaches the final frame, so no trailing window is scheduled whose
    real frames are all already covered by the previous one.
    """
    if n_frames <= 0:
        return []
    stride = chunk_frames - overlap
    if stride <= 0:
        raise ValueError("overlap must be smaller than chunk_frames")
    starts = [0]
    while starts[-1] + chunk_frames < n_frames:
        starts.append(starts[-1] + stride)
    return starts


def inpaint_frames_vace(
    frame_paths: List[Path],
    crop_region: CropRegion,
    inpaint_mask: np.ndarray,
    num_inference_steps: Optional[int] = None,
    guidance_scale: Optional[float] = None,
) -> List[np.ndarray]:
    """
    Run VACE-14B inpainting on the crop region of each frame.

    Parameters
    ----------
    frame_paths : List[Path]
        Ordered full-frame PNG paths.
    crop_region : CropRegion
        Crop rectangle + VACE-target resolution selected upstream.
    inpaint_mask : np.ndarray
        Crop-local binary mask (H × W, uint8). 255 = inpaint, 0 = keep.
    num_inference_steps : int, optional
        Override the auto-selected step count (4 with distill LoRA, 30 base).
    guidance_scale : float, optional
        Override the auto-selected guidance scale (1.0 distilled, 5.0 base).

    Returns
    -------
    List[np.ndarray]
        Inpainted crop arrays (crop_h × crop_w × 3, uint8 RGB), one per frame,
        in the **original** crop dimensions (not target dimensions).  An empty
        ``frame_paths`` yields an empty list without loading the model.

    Raises
    ------
    RuntimeError
        If a frame has no output after chunked inference (chunking bug).
    """
    # Nothing to do: avoid a pointless cold load of the 14B pipeline.
    if not frame_paths:
        return []

    pipe = _get_pipe()
    steps = num_inference_steps or getattr(pipe, "_wm_steps", DEFAULT_STEPS_DISTILLED)
    guidance = guidance_scale if guidance_scale is not None else getattr(
        pipe, "_wm_guidance", DEFAULT_GUIDANCE_DISTILLED
    )

    target_size = (crop_region.target_w, crop_region.target_h)
    crop_size = (crop_region.frame_w, crop_region.frame_h)
    # The crop rectangle is identical for every frame, so build it once.
    box = (
        crop_region.frame_x,
        crop_region.frame_y,
        crop_region.frame_x + crop_region.frame_w,
        crop_region.frame_y + crop_region.frame_h,
    )

    # ── 1. Load each crop and resize to VACE target resolution ──────────
    crops_pil: List[Image.Image] = []
    for fp in frame_paths:
        with Image.open(fp) as img:
            crops_pil.append(
                img.convert("RGB").crop(box).resize(target_size, Image.LANCZOS)
            )

    # Static mask, resized to target resolution.
    # NEAREST keeps the mask edges binary so feathering happens later in
    # composite.py rather than smearing into the diffusion conditioning.
    mask_pil = (
        Image.fromarray(inpaint_mask)
        .resize(target_size, Image.NEAREST)
        .convert("L")
    )
    chunk_mask = [mask_pil] * CHUNK_FRAMES

    # ── 2. Process in overlapping CHUNK_FRAMES windows ──────────────────
    n_frames = len(crops_pil)
    output_pil: List[Optional[Image.Image]] = [None] * n_frames

    # Deterministic seed so re-runs are reproducible.
    generator = torch.Generator(device="cpu").manual_seed(42)

    for start in _plan_chunk_starts(n_frames, CHUNK_FRAMES, CHUNK_OVERLAP):
        chunk_video = crops_pil[start:start + CHUNK_FRAMES]
        chunk_len = len(chunk_video)

        # VACE requires exactly num_frames frames per call.  Pad a short
        # trailing chunk by repeating the last real frame; the pad output is
        # discarded below.
        if chunk_len < CHUNK_FRAMES:
            chunk_video.extend(
                [chunk_video[-1]] * (CHUNK_FRAMES - chunk_len)
            )

        # Ask for PIL frames explicitly: the resize/convert step below needs
        # PIL images, and the pipeline's default output type is not relied on.
        result = pipe(
            video=chunk_video,
            mask=chunk_mask,
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            height=crop_region.target_h,
            width=crop_region.target_w,
            num_frames=CHUNK_FRAMES,
            num_inference_steps=steps,
            guidance_scale=guidance,
            generator=generator,
            output_type="pil",
        ).frames[0]

        # Drop the pad and write into the global frame buffer.  A later chunk
        # overwrites the overlap region written by the previous one.
        output_pil[start:start + chunk_len] = list(result[:chunk_len])

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # ── 3. Resize back to original crop dimensions, return as ndarrays ──
    out: List[np.ndarray] = []
    for pil_img in output_pil:
        if pil_img is None:
            raise RuntimeError(
                "VACE: output frame missing after chunked inference. "
                "This indicates a chunking bug; please report."
            )
        resized = pil_img.resize(crop_size, Image.LANCZOS)
        out.append(np.array(resized.convert("RGB")))

    return out
requirements.txt CHANGED
@@ -19,12 +19,13 @@ simple-lama-inpainting>=0.1.2
19
 
20
  # ── Quality mode (VACE-14B) ───────────────────────────────────────────────
21
  # torch / torchvision are pre-installed on ZeroGPU; do not pin here.
22
- diffusers>=0.32.0
 
23
  transformers>=4.44.0
24
  accelerate>=0.33.0
25
  sentencepiece>=0.1.99
26
- # torchao β€” FP8 quantization (uncomment when implementing vace.py)
27
- # torchao>=0.6.0
28
 
29
  # ── Video I/O ─────────────────────────────────────────────────────────────
30
  # ffmpeg binary is provided via packages.txt; no Python wrapper needed.
 
19
 
20
  # ── Quality mode (VACE-14B) ───────────────────────────────────────────────
21
  # torch / torchvision are pre-installed on ZeroGPU; do not pin here.
22
+ # diffusers must be >=0.34.0 for WanVACEPipeline (merged in PR #11582).
23
+ diffusers>=0.34.0
24
  transformers>=4.44.0
25
  accelerate>=0.33.0
26
  sentencepiece>=0.1.99
27
+ peft>=0.13.0 # LoRA loading via load_lora_weights / fuse_lora
28
+ torchao>=0.6.0 # FP8 dynamic-activation quantization on H100/H200
29
 
30
  # ── Video I/O ─────────────────────────────────────────────────────────────
31
  # ffmpeg binary is provided via packages.txt; no Python wrapper needed.