Spaces:

EPFL-VILAB
/

VideoFlexTok

Sleeping

App Files Files Community

andreiatanov committed on Apr 14

Commit

8603681

0 Parent(s):

VideoFlexTok demo

Browse files

Files changed (10) hide show

.gitattributes +36 -0
.gitignore +5 -0
README.md +14 -0
app.py +278 -0
examples/apple.mp4 +3 -0
examples/arch.mp4 +3 -0
examples/cat.mp4 +3 -0
examples/porsche.mp4 +3 -0
examples/sculpture.mp4 +3 -0
requirements.txt +25 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.venv310
+__pycache__
+.DS_Store
+ml-videoflextok/
+gradio_cached_examples/

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: VideoFlexTok
+emoji: 🎞️
+colorFrom: pink
+colorTo: indigo
+sdk: gradio
+sdk_version: 6.5.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: 'VideoFlexTok: flexible-length coarse-to-fine video tokenizer'
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,278 @@

+import importlib
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+# Install videoflextok without its deps to avoid huggingface_hub==0.25.2 conflicting
+# with gradio's >=0.33.5 requirement. Compatible dep versions are in requirements.txt.
+def _install_videoflextok():
+    try:
+        import videoflextok  # noqa: F401
+        return
+    except ImportError:
+        pass
+    print("[VideoFlexTok] Installing videoflextok (--no-deps) ...")
+    subprocess.run(
+        [sys.executable, "-m", "pip", "install", "--quiet", "--no-deps",
+         "git+https://github.com/apple/ml-videoflextok.git"],
+        check=True,
+    )
+    importlib.invalidate_caches()
+_install_videoflextok()
+import spaces
+import gradio as gr
+import imageio.v3 as iio
+import numpy as np
+import torch
+from videoflextok.utils.demo import denormalize, read_mp4
+from videoflextok.utils.misc import detect_bf16_support, get_bf16_context
+from videoflextok.wrappers import VideoFlexTokFromHub
+# --- Constants ---------------------------------------------------------------------
+MODEL_ID = "EPFL-VILAB/videoflextok_d18_d28"
+APP_DIR = Path(__file__).resolve().parent
+EXAMPLES_DIR = APP_DIR / "examples"
+EXAMPLE_VIDEOS = sorted(EXAMPLES_DIR.glob("*.mp4"))
+NUM_KEEP_TOKENS = [2**i for i in range(9)]  # 1, 2, 4, 8, 16, 32, 64, 128, 256
+APP_CSS = """
+#col-container {
+  margin: 0 auto;
+  max-width: 1500px;
+}
+#col-input-container {
+  margin: 0 auto;
+  max-width: 420px;
+}
+#run-button {
+  margin: 0 auto;
+}
+"""
+# --- Device setup ------------------------------------------------------------------
+torch.set_grad_enabled(False)
+if torch.cuda.is_available():
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ENABLE_BF16 = DEVICE.type == "cuda" and detect_bf16_support()
+# --- Model loading -----------------------------------------------------------------
+def _patch_for_hf_spaces(model):
+    """Patch TorchDynamo and model for HF Spaces / ZeroGPU compatibility.
+    This PyTorch version's TorchDynamo cannot represent torch.device as a ConstantVariable,
+    causing torch.compile(flex_attention) to crash. The fix was merged into newer PyTorch;
+    here we backport it by adding torch.device to common_constant_types, so the Triton
+    kernel is used correctly instead of falling back to the dense O(n²) math implementation.
+    We also disable block mask compilation (compile_block_mask=False) since create_block_mask
+    uses a separate internal torch.compile call that would hit the same bug.
+    """
+    # Patch TorchDynamo to accept torch.device as a ConstantVariable.
+    # common_constant_types may be closed over in is_base_literal, so patch the method directly.
+    import torch._dynamo.variables.constant as _dynamo_const
+    _orig_is_base_literal = _dynamo_const.ConstantVariable.is_base_literal
+    @staticmethod
+    def _patched_is_base_literal(value):
+        return isinstance(value, torch.device) or _orig_is_base_literal(value)
+    _dynamo_const.ConstantVariable.is_base_literal = _patched_is_base_literal
+    from videoflextok.model.preprocessors.flex_seq_packing import (
+        BlockWiseSequencePacker,
+        BlockWiseSequenceInterleavePacker,
+        BlockWiseSequencePackerWithCrossAttention,
+    )
+    for module in model.modules():
+        if isinstance(module, (
+            BlockWiseSequencePacker,
+            BlockWiseSequenceInterleavePacker,
+            BlockWiseSequencePackerWithCrossAttention,
+        )):
+            module.compile_block_mask = False
+_model = None
+try:
+    print(f"[VideoFlexTok] Loading {MODEL_ID} ...")
+    _model = VideoFlexTokFromHub.from_pretrained(MODEL_ID)
+    _model = _model.to(torch.bfloat16).to(DEVICE).eval()
+    _patch_for_hf_spaces(_model)
+    print("[VideoFlexTok] Model ready.")
+except Exception as exc:
+    print(f"[VideoFlexTok] FATAL: model load failed: {exc}")
+# --- Inference ---------------------------------------------------------------------
+def _stack_reconstructed_videos(videos, output_path: str, fps: int):
+    """Compose 9 reconstructions + original into a 2×5 grid video and write to output_path."""
+    def to_uint8_frames(video_tensor):
+        if video_tensor.ndim == 5:
+            video_tensor = video_tensor[0]
+        frames = denormalize(video_tensor).permute(1, 2, 3, 0).contiguous().numpy()
+        return (np.clip(frames, 0.0, 1.0) * 255).round().astype(np.uint8)
+    def add_border(frames: np.ndarray, border_px: int, color: int) -> np.ndarray:
+        return np.pad(
+            frames,
+            ((0, 0), (border_px, border_px), (border_px, border_px), (0, 0)),
+            mode="constant", constant_values=color,
+        )
+    def compose_row(row_frames: list[np.ndarray], t: int, gap_px: int) -> np.ndarray:
+        gap_col = np.full((row_frames[0].shape[1], gap_px, 3), 255, dtype=np.uint8)
+        items = []
+        for i, frames in enumerate(row_frames):
+            items.append(frames[t])
+            if i < len(row_frames) - 1:
+                items.append(gap_col)
+        return np.concatenate(items, axis=1)
+    border_px, gap_px = 8, 8
+    reconstructed = [add_border(to_uint8_frames(v), border_px, 255) for v in videos[:9]]
+    original = add_border(to_uint8_frames(videos[9]), border_px, 0)
+    all_panels = reconstructed + [original]
+    total_frames = min(p.shape[0] for p in all_panels)
+    all_panels = [p[:total_frames] for p in all_panels]
+    row1 = all_panels[:5]   # k = 1, 2, 4, 8, 16
+    row2 = all_panels[5:]   # k = 32, 64, 128, 256, Original
+    composed = []
+    for t in range(total_frames):
+        row1_img = compose_row(row1, t, gap_px)
+        row2_img = compose_row(row2, t, gap_px)
+        row_gap = np.full((gap_px, row1_img.shape[1], 3), 255, dtype=np.uint8)
+        composed.append(np.concatenate([row1_img, row_gap, row2_img], axis=0))
+    iio.imwrite(
+        output_path, np.stack(composed, axis=0),
+        fps=fps, plugin="FFMPEG", codec="libx264", pixelformat="yuv420p",
+    )
+def reconstruct_video(video_path: str, input_fps: int, timesteps: int, guidance_scale: float, seed: int):
+    if not video_path or not Path(video_path).exists():
+        raise gr.Error("Upload a video first.")
+    if _model is None:
+        raise gr.Error("Model failed to load at startup — check Space logs.")
+    try:
+        preprocess_args = dict(_model.video_preprocess_args)
+        # Public package uses 'overlap_size'; model config key is 'overlap_size_frames'
+        if "overlap_size_frames" in preprocess_args and "overlap_size" not in preprocess_args:
+            preprocess_args["overlap_size"] = preprocess_args.pop("overlap_size_frames")
+        video_tensor = read_mp4(str(video_path), fps=int(input_fps), **preprocess_args)
+    except Exception as exc:
+        raise gr.Error(f"Failed to decode video: {exc}") from exc
+    try:
+        with get_bf16_context(ENABLE_BF16, device_type=DEVICE.type):
+            print(f"[VideoFlexTok] Tokenizing {video_tensor.shape} ...")
+            token_ids = _model.tokenize(video_tensor[None].to(DEVICE))
+            print(f"[VideoFlexTok] Decoding {len(NUM_KEEP_TOKENS)} reconstructions ...")
+            reconstructed = _model.detokenize(
+                [token_ids[0]] * len(NUM_KEEP_TOKENS),
+                num_keep_tokens_list=NUM_KEEP_TOKENS,
+                timesteps=int(timesteps),
+                guidance_scale=float(guidance_scale),
+                perform_norm_guidance=True,
+                generator=torch.Generator(device=DEVICE.type).manual_seed(int(seed)),
+                eta=0.0, momentum=0.0, norm_threshold=0.6, verbose=False,
+            )
+        reconstructed = [v.cpu().float() for v in reconstructed]
+        print("[VideoFlexTok] Inference complete.")
+    except Exception as exc:
+        raise gr.Error(f"Model inference failed: {exc}") from exc
+    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+    tmp.close()
+    _stack_reconstructed_videos(reconstructed + [video_tensor], output_path=tmp.name, fps=int(input_fps))
+    info = f"Extracted {video_tensor.shape[1]} frames at {input_fps} FPS"
+    return tmp.name, info
+if spaces is not None and hasattr(spaces, "GPU"):
+    reconstruct_video = spaces.GPU(duration=60)(reconstruct_video)
+# --- UI ----------------------------------------------------------------------------
+with gr.Blocks(title="VideoFlexTok Demo", theme=gr.themes.Base(), css=APP_CSS) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("# VideoFlexTok: Flexible-Length Coarse-to-Fine Video Tokenization")
+        with gr.Row():
+            with gr.Column(scale=1, elem_id="col-input-container"):
+                gr.Markdown(f"""
+[`Website`](https://videoflextok.epfl.ch) | [`GitHub`](https://github.com/apple/ml-videoflextok) | [`Model`](https://huggingface.co/{MODEL_ID})
+Research demo for **VideoFlexTok: Flexible-Length Coarse-to-Fine Video Tokenization** (arXiv 2026).
+Autoencodes your video with `{MODEL_ID}` and shows coarse-to-fine reconstructions.
+VideoFlexTok tokenizes video into `T × 256` tokens ordered coarse-to-fine; this demo shows
+reconstructions from `T × k` tokens for k ∈ `{NUM_KEEP_TOKENS}`. Bottom-right is the original.
+""")
+                input_video = gr.Video(
+                    label="Input video", sources=["upload"], format="mp4",
+                )
+                run_button = gr.Button("Autoencode with VideoFlexTok", elem_id="run-button")
+                if EXAMPLE_VIDEOS:
+                    gr.Examples(
+                        examples=[str(p) for p in EXAMPLE_VIDEOS],
+                        inputs=[input_video],
+                        outputs=[input_video],
+                        fn=lambda p: p,
+                        cache_examples=True,
+                        label="Example videos",
+                    )
+                with gr.Accordion("Advanced Settings", open=False):
+                    gr.Markdown("Adjust target FPS to control how many frames are extracted.")
+                    input_fps = gr.Slider(minimum=1, maximum=16, value=8, step=1, label="Target FPS")
+                    timesteps = gr.Slider(minimum=1, maximum=60, value=20, step=1, label="Denoising steps")
+                    guidance_scale = gr.Slider(minimum=1.0, maximum=30.0, value=25.0, step=0.5, label="Guidance scale")
+                    seed = gr.Number(value=42, precision=0, label="Seed")
+            with gr.Column(scale=4):
+                output_video = gr.Video(label="Reconstructions")
+                status = gr.Markdown()
+    run_button.click(
+        fn=reconstruct_video,
+        inputs=[input_video, input_fps, timesteps, guidance_scale, seed],
+        outputs=[output_video, status],
+    )
+    if DEVICE.type != "cuda":
+        gr.Markdown("Running on CPU — inference will be slow.")
+# --- Launch ------------------------------------------------------------------------
+demo.queue(max_size=16)
+if __name__ == "__main__":
+    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
+    launch_kwargs = {"server_name": server_name, "ssr_mode": False}
+    if port := os.environ.get("GRADIO_SERVER_PORT"):
+        launch_kwargs["server_port"] = int(port)
+    launch_kwargs["allowed_paths"] = [str(APP_DIR), tempfile.gettempdir()]
+    demo.launch(**launch_kwargs)

examples/apple.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c2f7782fdb34cfa29bd36a92ebf47a4cf006f278c28891d3feb944a526b6a26
+size 71661

examples/arch.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:662e89863b7479fa5323e0a209c67b249b0ad064ff337285ffa10b25a91570a7
+size 63973

examples/cat.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54f7eece320681198e6e817f6c7170a08b22e778435322421f15b68271c95734
+size 58276

examples/porsche.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4738089a93af048948eda8deb0b53c47baf5a898021471508b131784f1bc39f3
+size 293070

examples/sculpture.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3746cdde8a5096398280efa4012b6978751a8a1471245baabaa5605984056fc4
+size 66136

requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+gradio==6.5.1
+imageio-ffmpeg==0.6.0
+imageio
+# videoflextok is installed without its deps at Space startup (see app.py).
+# Its pyproject.toml pins huggingface_hub==0.25.2, which conflicts with gradio's huggingface_hub>=0.33.5 requirement,
+# so we install --no-deps and provide compatible versions here instead.
+# git+https://github.com/apple/ml-videoflextok.git
+# Pin torch to 2.8.x — the version videoflextok was developed and tested on.
+# The HF Spaces base image ships 2.9.1 which has TorchDynamo regressions.
+torch==2.8.0
+torchvision==0.23.0
+# videoflextok dependencies (compatible versions)
+diffusers>=0.28.0
+einops>=0.7.0
+huggingface_hub>=0.33.5,<0.40
+hydra-core>=1.3.2
+omegaconf>=2.3.0
+PyYAML>=6.0
+mup
+safetensors>=0.4.0
+tqdm>=4.64.1
+eva-decord==0.6.1