ViTeX-Bench
/

ViTeX-Edit-14B

"""
ViTeX-14B inference example.
Loads:
  - Wan-AI/Wan2.1-VACE-14B (base model)
  - ViTeX-Bench/ViTeX-14B   (this fine-tuned VACE module)
Runs a single video text-edit job and writes the result as an H.264 MP4.
Requires:
  - The DiffSynth-Studio-TextVACE fork (provides GlyphEncoder + ConditionCrossAttention)
  - torch >= 2.7.0+cu128 (NCCL >= 2.25.1 recommended on H100)
  - One NVIDIA GPU with >= 80 GB VRAM (H100 / A100 80 GB)
  - imageio-ffmpeg, opencv-python
Usage:
  python inference_example.py \
      --vace_video   path/to/source.mp4 \
      --vace_mask    path/to/mask.mp4 \
      --glyph_video  path/to/target_glyph.mp4 \
      --prompt       "Change the sign to read 'HILTON'" \
      --output       out.mp4
"""
import argparse
import glob
import os
import subprocess
import tempfile

import numpy as np
import torch
from PIL import Image
from huggingface_hub import snapshot_download

from diffsynth.core import load_state_dict
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
# Default generation settings. They serve as the defaults of the matching
# command-line flags and of load_video_frames().
HEIGHT, WIDTH = 720, 1280  # frame size in pixels
NUM_FRAMES = 121           # frames per clip
NUM_INFERENCE_STEPS = 50
CFG_SCALE = 5.0
SEED = 42
def load_video_frames(path, target_frames=NUM_FRAMES, resize=(HEIGHT, WIDTH)):
    """Decode a video file into a list of RGB PIL images of a fixed length.

    Args:
        path: Video file to decode with OpenCV (``str`` or path-like).
        target_frames: Number of frames to return. A longer clip is
            subsampled at evenly spaced indices; a shorter clip is padded by
            repeating its last frame. A falsy value returns every decoded
            frame unchanged.
        resize: ``(height, width)`` that every frame is resized to with
            Lanczos resampling, or a falsy value to keep the decoded size.

    Returns:
        A list of ``PIL.Image.Image`` objects converted from BGR to RGB.

    Raises:
        ValueError: If the file cannot be opened or yields no frames.
    """
    import cv2  # imported lazily: OpenCV is only needed while decoding

    cap = cv2.VideoCapture(os.fspath(path))
    frames = []
    try:
        # Distinguish "cannot open" from "opened but empty" so the error
        # message points at the actual problem.
        if not cap.isOpened():
            raise ValueError(f"cannot open video: {path}")
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            if resize:
                # PIL expects (width, height); ``resize`` is (height, width).
                img = img.resize((resize[1], resize[0]), Image.LANCZOS)
            frames.append(img)
    finally:
        # Release the capture handle even if decoding or resizing raised.
        cap.release()

    if not frames:
        raise ValueError(f"empty video: {path}")

    count = len(frames)
    if target_frames and count > target_frames:
        picks = np.linspace(0, count - 1, target_frames, dtype=int)
        frames = [frames[i] for i in picks]
    elif target_frames and count < target_frames:
        # Padding repeats a reference to the same last image object.
        frames.extend([frames[-1]] * (target_frames - count))
    return frames
def save_video(frames, path, fps=24):
    """Encode a sequence of PIL images as an H.264 MP4 via ffmpeg.

    Frames are streamed to the ffmpeg binary located by ``imageio_ffmpeg``
    as raw ``rgb24`` data and encoded with libx264 (preset ``fast``,
    CRF 18, ``yuv420p``). An existing file at ``path`` is overwritten.

    Args:
        frames: Non-empty sequence of PIL images, all of the same size.
        path: Output file path handed to ffmpeg.
        fps: Frame rate declared for the raw input stream.

    Raises:
        ValueError: If ``frames`` is empty or the frame sizes differ.
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            includes whatever ffmpeg wrote to stderr.
    """
    # Validate before touching ffmpeg so bad input fails with a clear error.
    if not frames:
        raise ValueError("no frames to save")
    width, height = frames[0].size
    for index, frame in enumerate(frames):
        if frame.size != (width, height):
            raise ValueError(
                f"frame {index} has size {frame.size}, expected {(width, height)}"
            )

    import imageio_ffmpeg  # imported lazily: only needed when encoding

    cmd = [
        imageio_ffmpeg.get_ffmpeg_exe(), "-y",
        "-loglevel", "error",
        "-f", "rawvideo", "-vcodec", "rawvideo",
        "-s", f"{width}x{height}", "-pix_fmt", "rgb24",
        "-r", str(fps),
        "-i", "-",
        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
        "-pix_fmt", "yuv420p",
        path,
    ]
    # stderr goes to a temporary file rather than a pipe: nothing drains a
    # pipe while frames are being written, so a chatty ffmpeg could block.
    with tempfile.TemporaryFile() as err_log:
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=err_log)
        try:
            with proc.stdin as sink:
                for frame in frames:
                    # The stream is declared rgb24, so force 3-channel RGB.
                    sink.write(np.asarray(frame.convert("RGB")).tobytes())
        except BrokenPipeError:
            # ffmpeg exited before reading every frame; the exit status
            # checked below carries the real error.
            pass
        finally:
            returncode = proc.wait()
        if returncode != 0:
            err_log.seek(0)
            detail = err_log.read().decode("utf-8", errors="replace").strip()
            raise RuntimeError(
                f"ffmpeg exited with code {returncode} while writing {path}: {detail}"
            )
def build_pipeline(base_dir, ckpt_path, device="cuda:0"):
    """Build the base pipeline and load the fine-tuned weights into ``pipe.vace``.

    Args:
        base_dir: Directory holding the base model files: the
            ``diffusion_pytorch_model-*.safetensors`` shards,
            ``models_t5_umt5-xxl-enc-bf16.pth``, ``Wan2.1_VAE.pth`` and the
            ``google/umt5-xxl`` tokenizer directory.
        ckpt_path: Checkpoint whose state dict is loaded into ``pipe.vace``.
        device: Device string forwarded to ``WanVideoPipeline.from_pretrained``.

    Returns:
        The pipeline, created in bfloat16, with the checkpoint applied.

    Raises:
        FileNotFoundError: If no diffusion shard matches, or if one of the
            required model files does not exist.
    """
    # Check every required file before the expensive model load so a wrong
    # directory fails immediately with the offending path in the message.
    shard_pattern = os.path.join(base_dir, "diffusion_pytorch_model-*.safetensors")
    diffusion_shards = sorted(glob.glob(shard_pattern))
    if not diffusion_shards:
        raise FileNotFoundError(f"no diffusion shards matching: {shard_pattern}")
    text_encoder_path = os.path.join(base_dir, "models_t5_umt5-xxl-enc-bf16.pth")
    vae_path = os.path.join(base_dir, "Wan2.1_VAE.pth")
    for required in (text_encoder_path, vae_path, ckpt_path):
        if not os.path.exists(required):
            raise FileNotFoundError(f"required model file not found: {required}")

    pipe = WanVideoPipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device=device,
        model_configs=[
            ModelConfig(path=diffusion_shards),
            ModelConfig(path=text_encoder_path),
            ModelConfig(path=vae_path),
        ],
        tokenizer_config=ModelConfig(path=os.path.join(base_dir, "google/umt5-xxl")),
        redirect_common_files=False,
    )
    print(f"Loading ViTeX-14B weights from {ckpt_path}")
    state = load_state_dict(ckpt_path)
    # strict=False tolerates key mismatches; the counts are printed so that a
    # checkpoint that does not line up with the module is at least visible.
    res = pipe.vace.load_state_dict(state, strict=False)
    print(f"  loaded {len(state)} keys (missing {len(res.missing_keys)}, unexpected {len(res.unexpected_keys)})")
    del state  # drop the extra reference to the checkpoint tensors
    return pipe
def _parse_args(argv=None):
    """Parse the command-line flags and reject input paths that do not exist."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--vace_video",  required=True, help="Source RGB video (the one to edit).")
    parser.add_argument("--vace_mask",   required=True, help="Per-frame binary mask: 1=replace, 0=keep.")
    parser.add_argument("--glyph_video", required=True, help="Pre-rendered target glyphs placed in the mask region.")
    parser.add_argument("--prompt",      default="", help="Optional text prompt describing the edit.")
    parser.add_argument("--output",      default="output.mp4")
    parser.add_argument("--height", type=int, default=HEIGHT)
    parser.add_argument("--width",  type=int, default=WIDTH)
    parser.add_argument("--num_frames", type=int, default=NUM_FRAMES)
    parser.add_argument("--num_inference_steps", type=int, default=NUM_INFERENCE_STEPS)
    parser.add_argument("--cfg_scale", type=float, default=CFG_SCALE)
    parser.add_argument("--seed", type=int, default=SEED)
    parser.add_argument("--device", default="cuda:0")
    args = parser.parse_args(argv)
    for flag in ("vace_video", "vace_mask", "glyph_video"):
        value = getattr(args, flag)
        if not os.path.isfile(value):
            # parser.error prints the usage text and exits with status 2.
            parser.error(f"--{flag}: file not found: {value}")
    return args


def main(argv=None):
    """Run one video text-edit job from the command line.

    Inputs are validated and decoded before anything is downloaded or loaded
    onto the GPU, so a bad path or unreadable video fails within seconds
    instead of after the model download and load.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = _parse_args(argv)

    # 1. Load inputs (cheap, and the most likely step to fail on user error)
    target_size = (args.height, args.width)
    vace_video = load_video_frames(args.vace_video,  args.num_frames, target_size)
    vace_mask  = load_video_frames(args.vace_mask,   args.num_frames, target_size)
    glyph      = load_video_frames(args.glyph_video, args.num_frames, target_size)

    # Create the output directory up front so the encode at the very end
    # cannot fail on a missing folder after inference has already run.
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)

    # 2. Download base + this model
    print("Downloading Wan-AI/Wan2.1-VACE-14B (base, ~60 GB)...")
    base_dir  = snapshot_download("Wan-AI/Wan2.1-VACE-14B")
    print("Downloading ViTeX-Bench/ViTeX-14B (this model, ~8 GB)...")
    vitex_dir = snapshot_download("ViTeX-Bench/ViTeX-14B")
    ckpt_path = os.path.join(vitex_dir, "vitex_14b.safetensors")

    # 3. Build pipeline
    pipe = build_pipeline(base_dir, ckpt_path, device=args.device)

    # 4. Run
    print(f"Running pipeline (seed={args.seed}, cfg={args.cfg_scale}, steps={args.num_inference_steps})...")
    out_frames = pipe(
        prompt=args.prompt,
        negative_prompt="",
        vace_video=vace_video,
        vace_video_mask=vace_mask,
        glyph_video=glyph,
        seed=args.seed,
        height=args.height,
        width=args.width,
        num_frames=args.num_frames,
        cfg_scale=args.cfg_scale,
        num_inference_steps=args.num_inference_steps,
        tiled=True,
    )
    save_video(out_frames, args.output)
    print(f"saved: {args.output}")


if __name__ == "__main__":
    main()