nycu-cplab commited on
Commit
fb1055c
·
1 Parent(s): 9d941d0
Files changed (2) hide show
  1. app.py +443 -286
  2. app_cache.py +675 -0
app.py CHANGED
@@ -1,105 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import spaces
2
  import subprocess
3
  import sys, os
4
  from pathlib import Path
5
  import math
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- ''' loading modules '''
 
 
8
  ROOT = Path(__file__).resolve().parent
9
  SAM2 = ROOT / "sam2-src"
10
  CKPT = SAM2 / "checkpoints" / "sam2.1_hiera_large.pt"
11
- ASMK = ROOT / "asmk"
12
 
13
- ''' download sam2 checkpoints '''
14
  if not CKPT.exists():
15
  subprocess.check_call(["bash", "download_ckpts.sh"], cwd=SAM2 / "checkpoints")
16
 
17
- ''' install sam2 '''
18
  try:
19
- import sam2.build_sam
20
  except ModuleNotFoundError:
21
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src"], cwd=ROOT)
22
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src[notebooks]"], cwd=ROOT)
23
 
24
- ''' install asmk '''
25
  try:
26
  import asmk.index # noqa: F401
27
- except Exception as e:
28
- subprocess.check_call(
29
- ["cythonize", "*.pyx"], cwd='./asmk-src/cython'
30
- )
31
- subprocess.check_call(
32
- [sys.executable, "-m", "pip", "install", './asmk-src', "--no-build-isolation"]
33
- )
34
 
35
- ''' download some checkpoints '''
36
- if not os.path.exists('./private'):
37
  from huggingface_hub import snapshot_download
38
- local_dir = snapshot_download(
39
  repo_id="nycu-cplab/3AM",
40
  local_dir="./private",
41
  repo_type="model",
42
  )
43
- import importlib, site
44
  for sp in site.getsitepackages():
45
  site.addsitedir(sp)
46
  importlib.invalidate_caches()
47
 
48
- import gradio as gr
49
- import torch
50
- torch.no_grad().__enter__()
51
- import numpy as np
52
- from PIL import Image, ImageDraw
53
- import cv2
54
- import copy
55
- import json
56
- import logging
57
- import sys
58
- # --- Logging Configuration ---
59
  logging.basicConfig(
60
  level=logging.INFO,
61
  format="%(asctime)s [%(levelname)s] %(message)s",
62
- handlers=[
63
- logging.StreamHandler(sys.stdout)
64
- ]
65
  )
66
- logger = logging.getLogger(__name__)
 
67
 
68
- # Import functions from your engine.py
69
- from engine import (
 
 
70
  get_predictors,
71
  get_views,
72
  prepare_sam2_inputs,
73
  must3r_features_and_output,
74
  get_single_frame_mask,
75
- get_tracked_masks
76
  )
77
 
78
- # --- Global Configuration & Model Loading ---
79
 
 
 
 
80
  PREDICTOR_ORIGINAL = None
81
  PREDICTOR = None
82
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
83
 
84
  def load_models():
85
  global PREDICTOR_ORIGINAL, PREDICTOR
86
  if PREDICTOR is None or PREDICTOR_ORIGINAL is None:
87
  logger.info(f"Initializing models on device: {DEVICE}...")
88
- try:
89
- PREDICTOR_ORIGINAL, PREDICTOR = get_predictors(device=DEVICE)
90
- logger.info("Models loaded successfully.")
91
- except Exception as e:
92
- logger.error(f"Failed to load models: {e}")
93
- raise e
94
  return PREDICTOR_ORIGINAL, PREDICTOR
95
 
96
- # --- Helper Functions ---
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def video_to_frames(video_path, interval=1):
99
- """
100
- Extract frames from video path to a list of PIL Images.
101
- Respects the frame interval (e.g., interval=5 takes every 5th frame).
102
- """
103
  logger.info(f"Extracting frames from video: {video_path} with interval {interval}")
104
  cap = cv2.VideoCapture(video_path)
105
  frames = []
@@ -108,68 +136,50 @@ def video_to_frames(video_path, interval=1):
108
  ret, frame = cap.read()
109
  if not ret:
110
  break
111
-
112
- # Only keep frame if it matches the interval
113
  if count % interval == 0:
114
- # Convert BGR to RGB
115
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
116
  frames.append(Image.fromarray(frame_rgb))
117
-
118
  count += 1
119
-
120
  cap.release()
121
  logger.info(f"Extracted {len(frames)} frames (sampled from {count} total frames).")
122
  return frames
123
 
 
124
  def draw_points(image_pil, points, labels):
125
- """Draws visual markers for clicks on the image."""
126
  img_draw = image_pil.copy()
127
  draw = ImageDraw.Draw(img_draw)
128
-
129
- # Radius of points
130
  r = 5
131
-
132
  for pt, lbl in zip(points, labels):
133
  x, y = pt
134
- if lbl == 1: # Positive
135
  color = "green"
136
- elif lbl == 0: # Negative
137
  color = "red"
138
- elif lbl == 2: # Box Top-Left
139
  color = "blue"
140
- elif lbl == 3: # Box Bottom-Right
141
  color = "cyan"
142
  else:
143
  color = "yellow"
144
-
145
- draw.ellipse((x-r, y-r, x+r, y+r), fill=color, outline="white")
146
-
147
  return img_draw
148
 
 
149
  def overlay_mask(image_pil, mask, color=(255, 0, 0), alpha=0.5):
150
- """Overlay a binary mask on a PIL image."""
151
  if mask is None:
152
  return image_pil
153
-
154
- # Ensure mask is bool or 0/1
155
  mask = mask > 0
156
-
157
  img_np = np.array(image_pil)
158
  h, w = img_np.shape[:2]
159
-
160
- # Resize mask to image size if necessary
161
  if mask.shape[0] != h or mask.shape[1] != w:
162
- logger.debug(f"Resizing mask from {mask.shape} to {(h, w)}")
163
  mask = cv2.resize(mask.astype(np.uint8), (w, h), interpolation=cv2.INTER_NEAREST).astype(bool)
164
-
165
  overlay = img_np.copy()
166
  overlay[mask] = np.array(color, dtype=np.uint8)
167
-
168
  combined = cv2.addWeighted(overlay, alpha, img_np, 1 - alpha, 0)
169
  return Image.fromarray(combined)
170
 
 
171
  def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4", fps=24):
172
- """Combine original frames and tracking masks into a video."""
173
  logger.info(f"Creating video output at {output_path} with {len(frames)} frames.")
174
  if not frames:
175
  logger.warning("No frames to create video.")
@@ -178,9 +188,9 @@ def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4
178
  if not (fps > 0.0):
179
  fps = 24.0
180
  h, w = np.array(frames[0]).shape[:2]
181
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
182
  out = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
183
-
184
  for idx, frame in enumerate(frames):
185
  mask = masks_dict.get(idx)
186
  if mask is not None:
@@ -188,26 +198,23 @@ def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4
188
  frame_np = np.array(pil_out)
189
  else:
190
  frame_np = np.array(frame)
191
-
192
  frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
193
  out.write(frame_bgr)
194
-
195
  out.release()
196
  logger.info("Video creation complete.")
197
  return output_path
198
 
199
- # --- GPU Wrapped Functions ---
200
 
 
 
 
201
  def estimate_video_fps(video_path: str) -> float:
202
  cap = cv2.VideoCapture(video_path)
203
  fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
204
  cap.release()
205
- # Robust fallback if metadata is missing
206
  return fps if fps > 0.0 else 24.0
207
 
208
- MAX_GPU_SECONDS = 600 # e.g., 10 minutes
209
- def clamp_duration(sec: int) -> int:
210
- return int(min(MAX_GPU_SECONDS, max(1, sec)))
211
 
212
  def estimate_total_frames(video_path: str) -> int:
213
  cap = cv2.VideoCapture(video_path)
@@ -215,125 +222,250 @@ def estimate_total_frames(video_path: str) -> int:
215
  cap.release()
216
  return max(1, n)
217
 
 
 
 
 
 
 
 
 
218
  def get_duration_must3r_features(video_path, interval):
219
- # interval is applied to the entire pipeline, so actual processed frames ~= ceil(total / interval)
220
  total = estimate_total_frames(video_path)
221
  interval = max(1, int(interval))
222
  processed = math.ceil(total / interval)
223
-
224
- # Tune this coefficient based on your observed runtime on ZeroGPU
225
  sec_per_frame = 2
226
  return clamp_duration(int(processed * sec_per_frame))
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  @spaces.GPU(duration=get_duration_must3r_features)
229
  def process_video_and_features(video_path, interval):
230
- """Load video, subsample frames, get views, MUSt3R features, SAM2 inputs."""
231
  logger.info(f"Starting GPU process: Video feature extraction (Interval: {interval})")
232
  load_models()
233
-
234
- # Pass interval to subsample frames immediately
235
- pil_imgs = video_to_frames(video_path, interval=interval)
236
  if not pil_imgs:
237
  raise ValueError("Could not extract frames from video.")
238
 
239
- logger.info("Step 1/3: Getting views...")
240
  views, resize_funcs = get_views(pil_imgs)
241
- # Ensure consistent resizing
242
- pil_imgs_resized = [resize_funcs[0].transforms[0](p) for p in pil_imgs]
243
-
244
- logger.info("Step 2/3: Extracting MUSt3R features...")
245
  must3r_feats, must3r_outputs = must3r_features_and_output(views, device=DEVICE)
246
- logger.debug(f"MUSt3R features extracted. Output keys: {must3r_outputs.keys()}")
247
-
248
- logger.info("Step 3/3: Preparing SAM2 inputs...")
249
  sam2_input_images, images_tensor = prepare_sam2_inputs(views, pil_imgs, resize_funcs)
250
- logger.debug(f"SAM2 input shape: {sam2_input_images.shape}")
251
-
252
- logger.info("Feature extraction complete.")
253
  return pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor
254
 
 
255
  @spaces.GPU
256
  def generate_frame_mask(image_tensor, points, labels, original_size):
257
- """Generate mask for a single frame based on clicks."""
258
  logger.info(f"Generating mask for single frame. Points: {len(points)}")
259
  load_models()
260
-
 
 
 
261
  pts_tensor = torch.tensor(points, dtype=torch.float32).unsqueeze(0).to(DEVICE)
262
  lbl_tensor = torch.tensor(labels, dtype=torch.int32).unsqueeze(0).to(DEVICE)
263
-
264
  w, h = original_size
265
- # Normalize points
266
  pts_tensor[..., 0] /= (w / 1024.0)
267
  pts_tensor[..., 1] /= (h / 1024.0)
268
 
269
- try:
270
- mask = get_single_frame_mask(
271
- image=image_tensor,
272
- predictor_original=PREDICTOR_ORIGINAL,
273
- points=pts_tensor,
274
- labels=lbl_tensor,
275
- device=DEVICE
276
- )
277
- logger.info("Mask generation successful.")
278
- mask_np = mask.squeeze().cpu().numpy()
279
- return mask_np
280
- except Exception as e:
281
- logger.error(f"Error during mask generation: {e}")
282
- raise e
283
-
284
- def get_duration_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
285
- # sam2_input_images is already subsampled, so this is the true number of frames to track
286
- try:
287
- n = int(getattr(sam2_input_images, "shape")[0])
288
- except Exception:
289
- n = 100 # fallback if something unexpected is passed
290
 
291
- sec_per_frame = 2
292
- return clamp_duration(int(n * sec_per_frame))
293
 
294
  @spaces.GPU(duration=get_duration_tracking)
295
  def run_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
296
- """Track the mask across the video."""
297
  logger.info(f"Starting tracking from frame index {start_idx}...")
298
  load_models()
299
-
 
 
 
 
 
300
  mask_tensor = torch.tensor(first_frame_mask).to(DEVICE) > 0
301
-
302
- try:
303
- tracked_masks = get_tracked_masks(
304
- sam2_input_images=sam2_input_images,
305
- must3r_feats=must3r_feats,
306
- must3r_outputs=must3r_outputs,
307
- start_idx=start_idx,
308
- first_frame_mask=mask_tensor,
309
- predictor=PREDICTOR,
310
- predictor_original=PREDICTOR_ORIGINAL,
311
- device=DEVICE
312
- )
313
- logger.info(f"Tracking complete. Generated masks for {len(tracked_masks)} frames.")
314
- return tracked_masks
315
- except Exception as e:
316
- logger.error(f"Error during tracking: {e}")
317
- raise e
318
 
319
- # --- Gradio Callbacks ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- def on_video_upload(video_path, interval):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  logger.info(f"User uploaded video: {video_path}, Interval: {interval}")
323
  if video_path is None:
324
  return None, None, gr.Slider(value=0, maximum=0), None
325
-
326
- try:
327
- pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor = process_video_and_features(video_path, int(interval))
328
- except Exception as e:
329
- logger.error(f"Failed to process video: {e}")
330
- raise gr.Error(f"Processing failed: {str(e)}")
331
-
332
  fps_in = estimate_video_fps(video_path)
333
  interval_i = max(1, int(interval))
334
  fps_out = max(1.0, fps_in / interval_i)
335
 
336
- # Initialize state
337
  state = {
338
  "pil_imgs": pil_imgs,
339
  "views": views,
@@ -349,160 +481,165 @@ def on_video_upload(video_path, interval):
349
  "video_path": video_path,
350
  "interval": interval_i,
351
  "fps_in": fps_in,
352
- "fps_out": fps_out
 
 
353
  }
354
-
355
  first_frame = pil_imgs[0]
356
- new_slider = gr.Slider(value=0, maximum=len(pil_imgs)-1, step=1, interactive=True)
357
  return first_frame, state, new_slider, gr.Image(value=first_frame)
358
 
 
359
  def on_slider_change(state, frame_idx):
360
  if not state:
361
  return None
362
-
363
  if frame_idx >= len(state["pil_imgs"]):
364
  frame_idx = len(state["pil_imgs"]) - 1
365
-
366
  state["frame_idx"] = frame_idx
367
  state["current_points"] = []
368
  state["current_labels"] = []
369
  state["current_mask"] = None
370
-
371
- frame = state["pil_imgs"][frame_idx]
372
- return frame
373
 
374
  def on_image_click(state, evt: gr.SelectData, mode):
375
- """
376
- Registers the click, updates state, and draws the point/box corner.
377
- Does NOT generate the mask.
378
- """
379
  if not state:
380
  return None
381
-
382
  x, y = evt.index
383
- logger.info(f"User clicked at ({x}, {y}) with mode: {mode}")
384
-
385
  label_map = {
386
  "Positive Point": 1,
387
  "Negative Point": 0,
388
  "Box Top-Left": 2,
389
- "Box Bottom-Right": 3
390
  }
391
  label = label_map[mode]
392
-
393
- # Update State
394
  state["current_points"].append([x, y])
395
  state["current_labels"].append(label)
396
-
397
- # Visual Feedback Only (Draw points)
398
  frame_pil = state["pil_imgs"][state["frame_idx"]]
399
  vis_img = draw_points(frame_pil, state["current_points"], state["current_labels"])
400
-
401
- # Keep old mask visible if it exists, but don't update it yet
402
  if state["current_mask"] is not None:
403
  vis_img = overlay_mask(vis_img, state["current_mask"])
404
-
405
  return vis_img
406
 
 
407
  def on_generate_mask_click(state):
408
- """
409
- Called when 'Generate Mask' button is clicked.
410
- Validates inputs (box completion) and triggers GPU mask generation.
411
- """
412
  if not state:
413
  return None
414
-
415
- logger.info("Generate Mask button clicked.")
416
-
417
  if not state["current_points"]:
418
  raise gr.Error("No points or boxes annotated.")
419
 
420
- # --- BOX VALIDATION LOGIC ---
421
  num_tl = state["current_labels"].count(2)
422
  num_br = state["current_labels"].count(3)
423
-
424
  if num_tl != num_br or num_tl > 1:
425
- logger.warning(f"Box mismatch: TL={num_tl}, BR={num_br}")
426
- raise gr.Error(f"Incomplete box detected! You have {num_tl} top-left(s) and {num_br} bottom-right(s). They must match and be <= 1.")
427
 
428
- # Proceed to inference
429
  frame_idx = state["frame_idx"]
430
  full_tensor = state["sam2_input_images"]
431
- frame_tensor = full_tensor[frame_idx].unsqueeze(0)
432
- original_size = state["pil_imgs"][frame_idx].size
433
-
434
- try:
435
- mask = generate_frame_mask(
436
- frame_tensor,
437
- state["current_points"],
438
- state["current_labels"],
439
- original_size
440
- )
441
- except Exception as e:
442
- logger.error(f"Mask generation failed: {e}")
443
- raise gr.Error("Failed to generate mask.")
444
-
445
  state["current_mask"] = mask
446
-
447
- # Visualization: Draw Mask AND Points
448
  frame_pil = state["pil_imgs"][frame_idx]
449
  vis_img = overlay_mask(frame_pil, mask)
450
  vis_img = draw_points(vis_img, state["current_points"], state["current_labels"])
451
-
452
  return vis_img
453
 
 
 
 
 
 
 
 
 
 
 
 
454
  def on_track_click(state):
455
- logger.info("Track button clicked.")
456
  if not state or state["current_mask"] is None:
457
- logger.warning("Track attempted without mask/state.")
458
  raise gr.Error("Please annotate a frame and generate a mask first.")
459
-
460
- # Double check box consistency just in case
461
  num_tl = state["current_labels"].count(2)
462
  num_br = state["current_labels"].count(3)
463
  if num_tl != num_br:
464
  raise gr.Error("Incomplete box annotations.")
465
-
466
  start_idx = state["frame_idx"]
467
  first_frame_mask = state["current_mask"]
468
-
469
- try:
470
- tracked_masks_dict = run_tracking(
471
- state["sam2_input_images"],
472
- state["must3r_feats"],
473
- state["must3r_outputs"],
474
- start_idx,
475
- first_frame_mask
476
- )
477
-
478
- output_path = create_video_from_masks(
479
- state["pil_imgs"],
480
- tracked_masks_dict,
481
- fps=state.get("fps_out", 24.0),
482
- )
483
- return output_path
484
- except Exception as e:
485
- logger.error(f"Tracking failed in UI callback: {e}")
486
- raise gr.Error(f"Tracking failed: {str(e)}")
487
 
488
- def reset_annotations(state):
489
- if not state:
490
- return None
491
- logger.info("Resetting annotations for current frame.")
492
- state["current_points"] = []
493
- state["current_labels"] = []
494
- state["current_mask"] = None
495
- frame_idx = state["frame_idx"]
496
- return state["pil_imgs"][frame_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
 
498
- # --- App Layout ---
499
 
 
 
 
 
 
 
 
 
 
500
  description = """
501
  <div style="text-align: center;">
502
- <h1>3AM: 3egment Anything with Geometric Consistency in Videos</h1>
503
- <p>Upload a video, geometric features are extracted automatically. Select a frame, click to annotate objects, and track them in 3D-consistent space.</p>
504
  </div>
505
  """
 
506
  with gr.Blocks(title="3AM: 3egment Anything") as app:
507
  gr.HTML(description)
508
 
@@ -513,11 +650,12 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
513
  1) Upload video
514
  2) Adjust frame interval → Load frames
515
  3) Annotate & generate mask
516
- 4) Track through the video
517
  """
518
  )
519
 
520
  app_state = gr.State()
 
521
 
522
  with gr.Row():
523
  with gr.Column(scale=1):
@@ -525,7 +663,7 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
525
  video_input = gr.Video(
526
  label="Upload Video",
527
  sources=["upload"],
528
- height=512
529
  )
530
 
531
  gr.Markdown("## Step 2 — Set interval, then load frames")
@@ -535,18 +673,15 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
535
  maximum=30,
536
  step=1,
537
  value=1,
538
- info="Default ≈ total_frames / 100"
539
  )
540
 
541
- load_btn = gr.Button(
542
- "Load Frames",
543
- variant="primary"
544
- )
545
 
546
  process_status = gr.Textbox(
547
  label="Status",
548
  value="1) Upload a video.",
549
- interactive=False
550
  )
551
 
552
  with gr.Column(scale=2):
@@ -554,7 +689,7 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
554
  img_display = gr.Image(
555
  label="Annotate Frame",
556
  interactive=True,
557
- height=512
558
  )
559
 
560
  frame_slider = gr.Slider(
@@ -562,7 +697,7 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
562
  minimum=0,
563
  maximum=100,
564
  step=1,
565
- value=0
566
  )
567
 
568
  with gr.Row():
@@ -574,17 +709,17 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
574
  "Box Bottom-Right",
575
  ],
576
  value="Positive Point",
577
- label="Annotation Mode"
578
  )
579
  with gr.Column():
580
  gen_mask_btn = gr.Button(
581
  "Generate Mask",
582
  variant="primary",
583
- interactive=False
584
  )
585
  reset_btn = gr.Button(
586
  "Reset Annotations",
587
- interactive=False
588
  )
589
 
590
  gr.Markdown("## Step 4 — Track through the video")
@@ -593,37 +728,43 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
593
  "Start Tracking",
594
  variant="primary",
595
  scale=1,
596
- interactive=False
597
  )
598
 
599
  with gr.Row():
600
  video_output = gr.Video(
601
  label="Tracking Output",
602
  autoplay=True,
603
- height=512
604
  )
605
 
606
- # ------------------------------------------------
607
- # Events
608
- # ------------------------------------------------
609
-
610
- # Upload: only read metadata & set default interval
611
- def on_video_uploaded(video_path):
612
- n_frames = estimate_total_frames(video_path)
613
- default_interval = max(1, n_frames // 100)
614
- return (
615
- gr.update(value=default_interval, maximum=min(30, n_frames)),
616
- f"Video uploaded ({n_frames} frames). "
617
- "2) Adjust interval, then click 'Load Frames'."
618
- )
 
 
 
619
 
 
 
 
 
620
  video_input.upload(
621
  fn=on_video_uploaded,
622
  inputs=video_input,
623
- outputs=[interval_slider, process_status]
624
  )
625
 
626
- # Load frames: heavy compute happens here
627
  load_btn.click(
628
  fn=lambda: (
629
  "Loading frames...",
@@ -631,11 +772,11 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
631
  gr.update(interactive=False),
632
  gr.update(interactive=False),
633
  ),
634
- outputs=[process_status, gen_mask_btn, reset_btn, track_btn]
635
  ).then(
636
- fn=on_video_upload,
637
  inputs=[video_input, interval_slider],
638
- outputs=[img_display, app_state, frame_slider, img_display]
639
  ).then(
640
  fn=lambda: (
641
  "Ready. 3) Annotate and generate mask.",
@@ -643,46 +784,62 @@ with gr.Blocks(title="3AM: 3egment Anything") as app:
643
  gr.update(interactive=True),
644
  gr.update(interactive=True),
645
  ),
646
- outputs=[process_status, gen_mask_btn, reset_btn, track_btn]
647
  )
648
 
649
  frame_slider.change(
650
  fn=on_slider_change,
651
  inputs=[app_state, frame_slider],
652
- outputs=[img_display]
653
  )
654
 
655
  img_display.select(
656
  fn=on_image_click,
657
  inputs=[app_state, mode_radio],
658
- outputs=[img_display]
659
  )
660
 
661
  gen_mask_btn.click(
662
  fn=on_generate_mask_click,
663
  inputs=[app_state],
664
- outputs=[img_display]
665
  )
666
 
667
  reset_btn.click(
668
  fn=reset_annotations,
669
  inputs=[app_state],
670
- outputs=[img_display]
671
  )
672
 
673
  track_btn.click(
674
  fn=lambda: "Tracking in progress...",
675
- outputs=process_status
676
  ).then(
677
  fn=on_track_click,
678
  inputs=[app_state],
679
- outputs=[video_output]
680
  ).then(
681
  fn=lambda: "Tracking complete!",
682
- outputs=process_status
683
  )
684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  if __name__ == "__main__":
687
  logger.info("Starting Gradio app...")
688
- app.launch()
 
1
+ # app_user.py
2
+ # User-facing app:
3
+ # - Same workflow as original app.py (upload -> set interval -> Load Frames -> annotate -> Generate Mask -> Track)
4
+ # - Adds an Examples table at the bottom
5
+ # - Loads examples from ./private/cache/*
6
+ # - Each row shows the first-frame thumbnail
7
+ # - Clicking a row instantly loads the cached example (state + precomputed output mp4)
8
+ #
9
+ # Expected cache structure per example directory:
10
+ # ./private/cache/<cache_id>/
11
+ # meta.pkl
12
+ # frames/000000.jpg (thumbnail) + more frames
13
+ # state_tensors.pt (must3r_feats, must3r_outputs, sam2_input_images, images_tensor) saved on CPU
14
+ # output_tracking.mp4
15
+ #
16
+ # Notes:
17
+ # - tracked_masks_dict is not required.
18
+ # - views/resize_funcs are recomputed on load (cheap vs must3r/tracking).
19
+
20
  import spaces
21
  import subprocess
22
  import sys, os
23
  from pathlib import Path
24
  import math
25
+ import pickle
26
+ from typing import Any, Dict, List, Tuple, Optional
27
+
28
+ import importlib, site
29
+
30
+ import gradio as gr
31
+ import torch
32
+ import numpy as np
33
+ from PIL import Image, ImageDraw
34
+ import cv2
35
+ import logging
36
+
37
 
38
+ # ============================================================
39
+ # Bootstrap (same style as your original app.py)
40
+ # ============================================================
41
  ROOT = Path(__file__).resolve().parent
42
  SAM2 = ROOT / "sam2-src"
43
  CKPT = SAM2 / "checkpoints" / "sam2.1_hiera_large.pt"
 
44
 
 
45
  if not CKPT.exists():
46
  subprocess.check_call(["bash", "download_ckpts.sh"], cwd=SAM2 / "checkpoints")
47
 
 
48
  try:
49
+ import sam2.build_sam # noqa: F401
50
  except ModuleNotFoundError:
51
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src"], cwd=ROOT)
52
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src[notebooks]"], cwd=ROOT)
53
 
 
54
  try:
55
  import asmk.index # noqa: F401
56
+ except Exception:
57
+ subprocess.check_call(["cythonize", "*.pyx"], cwd="./asmk-src/cython")
58
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "./asmk-src", "--no-build-isolation"])
 
 
 
 
59
 
60
+ if not os.path.exists("./private"):
 
61
  from huggingface_hub import snapshot_download
62
+ snapshot_download(
63
  repo_id="nycu-cplab/3AM",
64
  local_dir="./private",
65
  repo_type="model",
66
  )
67
+
68
  for sp in site.getsitepackages():
69
  site.addsitedir(sp)
70
  importlib.invalidate_caches()
71
 
72
+
73
+ # ============================================================
74
+ # Logging
75
+ # ============================================================
 
 
 
 
 
 
 
76
  logging.basicConfig(
77
  level=logging.INFO,
78
  format="%(asctime)s [%(levelname)s] %(message)s",
79
+ handlers=[logging.StreamHandler(sys.stdout)],
 
 
80
  )
81
+ logger = logging.getLogger("app_user")
82
+
83
 
84
+ # ============================================================
85
+ # Engine imports
86
+ # ============================================================
87
+ from engine import ( # noqa: E402
88
  get_predictors,
89
  get_views,
90
  prepare_sam2_inputs,
91
  must3r_features_and_output,
92
  get_single_frame_mask,
93
+ get_tracked_masks,
94
  )
95
 
 
96
 
97
+ # ============================================================
98
+ # Globals
99
+ # ============================================================
100
  PREDICTOR_ORIGINAL = None
101
  PREDICTOR = None
102
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
103
+ torch.no_grad().__enter__()
104
+
105
 
106
  def load_models():
107
  global PREDICTOR_ORIGINAL, PREDICTOR
108
  if PREDICTOR is None or PREDICTOR_ORIGINAL is None:
109
  logger.info(f"Initializing models on device: {DEVICE}...")
110
+ PREDICTOR_ORIGINAL, PREDICTOR = get_predictors(device=DEVICE)
111
+ logger.info("Models loaded successfully.")
 
 
 
 
112
  return PREDICTOR_ORIGINAL, PREDICTOR
113
 
 
114
 
115
+ def to_device_nested(x: Any, device: str) -> Any:
116
+ if torch.is_tensor(x):
117
+ return x.to(device)
118
+ if isinstance(x, dict):
119
+ return {k: to_device_nested(v, device) for k, v in x.items()}
120
+ if isinstance(x, list):
121
+ return [to_device_nested(v, device) for v in x]
122
+ if isinstance(x, tuple):
123
+ return tuple(to_device_nested(v, device) for v in x)
124
+ return x
125
+
126
+
127
+ # ============================================================
128
+ # Helper Functions
129
+ # ============================================================
130
  def video_to_frames(video_path, interval=1):
 
 
 
 
131
  logger.info(f"Extracting frames from video: {video_path} with interval {interval}")
132
  cap = cv2.VideoCapture(video_path)
133
  frames = []
 
136
  ret, frame = cap.read()
137
  if not ret:
138
  break
 
 
139
  if count % interval == 0:
 
140
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
141
  frames.append(Image.fromarray(frame_rgb))
 
142
  count += 1
 
143
  cap.release()
144
  logger.info(f"Extracted {len(frames)} frames (sampled from {count} total frames).")
145
  return frames
146
 
147
+
148
  def draw_points(image_pil, points, labels):
 
149
  img_draw = image_pil.copy()
150
  draw = ImageDraw.Draw(img_draw)
 
 
151
  r = 5
 
152
  for pt, lbl in zip(points, labels):
153
  x, y = pt
154
+ if lbl == 1:
155
  color = "green"
156
+ elif lbl == 0:
157
  color = "red"
158
+ elif lbl == 2:
159
  color = "blue"
160
+ elif lbl == 3:
161
  color = "cyan"
162
  else:
163
  color = "yellow"
164
+ draw.ellipse((x - r, y - r, x + r, y + r), fill=color, outline="white")
 
 
165
  return img_draw
166
 
167
+
168
  def overlay_mask(image_pil, mask, color=(255, 0, 0), alpha=0.5):
 
169
  if mask is None:
170
  return image_pil
 
 
171
  mask = mask > 0
 
172
  img_np = np.array(image_pil)
173
  h, w = img_np.shape[:2]
 
 
174
  if mask.shape[0] != h or mask.shape[1] != w:
 
175
  mask = cv2.resize(mask.astype(np.uint8), (w, h), interpolation=cv2.INTER_NEAREST).astype(bool)
 
176
  overlay = img_np.copy()
177
  overlay[mask] = np.array(color, dtype=np.uint8)
 
178
  combined = cv2.addWeighted(overlay, alpha, img_np, 1 - alpha, 0)
179
  return Image.fromarray(combined)
180
 
181
+
182
  def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4", fps=24):
 
183
  logger.info(f"Creating video output at {output_path} with {len(frames)} frames.")
184
  if not frames:
185
  logger.warning("No frames to create video.")
 
188
  if not (fps > 0.0):
189
  fps = 24.0
190
  h, w = np.array(frames[0]).shape[:2]
191
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
192
  out = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
193
+
194
  for idx, frame in enumerate(frames):
195
  mask = masks_dict.get(idx)
196
  if mask is not None:
 
198
  frame_np = np.array(pil_out)
199
  else:
200
  frame_np = np.array(frame)
 
201
  frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
202
  out.write(frame_bgr)
203
+
204
  out.release()
205
  logger.info("Video creation complete.")
206
  return output_path
207
 
 
208
 
209
+ # ============================================================
210
+ # Runtime estimation
211
+ # ============================================================
212
  def estimate_video_fps(video_path: str) -> float:
213
  cap = cv2.VideoCapture(video_path)
214
  fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
215
  cap.release()
 
216
  return fps if fps > 0.0 else 24.0
217
 
 
 
 
218
 
219
  def estimate_total_frames(video_path: str) -> int:
220
  cap = cv2.VideoCapture(video_path)
 
222
  cap.release()
223
  return max(1, n)
224
 
225
+
226
+ MAX_GPU_SECONDS = 600
227
+
228
+
229
+ def clamp_duration(sec: int) -> int:
230
+ return int(min(MAX_GPU_SECONDS, max(1, sec)))
231
+
232
+
233
  def get_duration_must3r_features(video_path, interval):
 
234
  total = estimate_total_frames(video_path)
235
  interval = max(1, int(interval))
236
  processed = math.ceil(total / interval)
 
 
237
  sec_per_frame = 2
238
  return clamp_duration(int(processed * sec_per_frame))
239
 
240
+
241
+ def get_duration_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
242
+ try:
243
+ n = int(getattr(sam2_input_images, "shape")[0])
244
+ except Exception:
245
+ n = 100
246
+ sec_per_frame = 2
247
+ return clamp_duration(int(n * sec_per_frame))
248
+
249
+
250
+ # ============================================================
251
+ # GPU Wrapped Functions
252
+ # ============================================================
253
  @spaces.GPU(duration=get_duration_must3r_features)
254
  def process_video_and_features(video_path, interval):
 
255
  logger.info(f"Starting GPU process: Video feature extraction (Interval: {interval})")
256
  load_models()
257
+
258
+ pil_imgs = video_to_frames(video_path, interval=max(1, int(interval)))
 
259
  if not pil_imgs:
260
  raise ValueError("Could not extract frames from video.")
261
 
 
262
  views, resize_funcs = get_views(pil_imgs)
263
+
 
 
 
264
  must3r_feats, must3r_outputs = must3r_features_and_output(views, device=DEVICE)
265
+
 
 
266
  sam2_input_images, images_tensor = prepare_sam2_inputs(views, pil_imgs, resize_funcs)
267
+
 
 
268
  return pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor
269
 
270
+
271
@spaces.GPU
def generate_frame_mask(image_tensor, points, labels, original_size):
    """GPU stage 2: run SAM2 on one frame prompted by the user's clicks.

    Args:
        image_tensor: Single-frame batch already preprocessed for SAM2.
        points: [[x, y], ...] click coordinates in original-frame pixels.
        labels: Per-point codes (1/0 positive/negative, 2/3 box corners).
        original_size: (width, height) of the displayed frame; used to map
            clicks into the model's 1024x1024 input space.

    Returns:
        The predicted mask as a squeezed NumPy array on CPU.
    """
    logger.info(f"Generating mask for single frame. Points: {len(points)}")
    load_models()

    # Ensure tensors are on GPU (cached examples are stored on CPU).
    image_tensor = image_tensor.to(DEVICE)

    pts_tensor = torch.tensor(points, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    lbl_tensor = torch.tensor(labels, dtype=torch.int32).unsqueeze(0).to(DEVICE)

    # Rescale click coordinates from original resolution to the 1024x1024 input.
    w, h = original_size
    pts_tensor[..., 0] /= (w / 1024.0)
    pts_tensor[..., 1] /= (h / 1024.0)

    mask = get_single_frame_mask(
        image=image_tensor,
        predictor_original=PREDICTOR_ORIGINAL,
        points=pts_tensor,
        labels=lbl_tensor,
        device=DEVICE,
    )
    return mask.squeeze().cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
 
 
295
 
296
@spaces.GPU(duration=get_duration_tracking)
def run_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
    """GPU stage 3: propagate the approved first-frame mask through the clip.

    Args:
        sam2_input_images: Stacked preprocessed frames (may arrive on CPU).
        must3r_feats / must3r_outputs: Nested feature containers from stage 1.
        start_idx: Index of the annotated frame tracking starts from.
        first_frame_mask: User-approved mask (array-like; binarized below).

    Returns:
        Mapping of frame index -> mask as produced by get_tracked_masks.
    """
    logger.info(f"Starting tracking from frame index {start_idx}...")
    load_models()

    # Ensure everything is on GPU (cached examples load from CPU).
    # NOTE(review): to_device_nested is defined elsewhere in this file -- confirm it
    # handles every container type present in the MUSt3R outputs.
    sam2_input_images = sam2_input_images.to(DEVICE)
    must3r_feats = to_device_nested(must3r_feats, DEVICE)
    must3r_outputs = to_device_nested(must3r_outputs, DEVICE)

    # Binarize the prompt mask on the GPU.
    mask_tensor = torch.tensor(first_frame_mask).to(DEVICE) > 0

    tracked_masks = get_tracked_masks(
        sam2_input_images=sam2_input_images,
        must3r_feats=must3r_feats,
        must3r_outputs=must3r_outputs,
        start_idx=start_idx,
        first_frame_mask=mask_tensor,
        predictor=PREDICTOR,
        predictor_original=PREDICTOR_ORIGINAL,
        device=DEVICE,
    )
    logger.info(f"Tracking complete. Generated masks for {len(tracked_masks)} frames.")
    return tracked_masks
320
+
321
+
322
+ # ============================================================
323
+ # Cache loader (Examples)
324
+ # ============================================================
325
+ CACHE_ROOT = Path("./private/cache")
326
+
327
+
328
def _read_meta(meta_path: Path) -> Dict[str, Any]:
    """Deserialize an example's meta.pkl.

    NOTE: pickle is only acceptable here because the cache is generated locally
    by app_cache.py, never from untrusted input.
    """
    with meta_path.open("rb") as fh:
        return pickle.load(fh)
331
+
332
+
333
def _load_frames_from_dir(frames_dir: Path) -> List[Image.Image]:
    """Load cached frames (frames/*.jpg) in filename order as RGB PIL images."""
    return [
        Image.open(path).convert("RGB")
        for path in sorted(frames_dir.glob("*.jpg"))
    ]
338
+
339
+
340
def list_example_dirs() -> List[Path]:
    """Return cache subdirectories holding a complete example, sorted by name."""
    if not CACHE_ROOT.exists():
        return []
    required = ("meta.pkl", "state_tensors.pt", "output_tracking.mp4")
    return [
        entry
        for entry in sorted(CACHE_ROOT.iterdir())
        if entry.is_dir() and all((entry / name).exists() for name in required)
    ]
350
+
351
+
352
def build_examples_table():
    """Scan the cache and build the examples table rows plus a lookup index.

    Each row: [thumbnail_path, video_name, interval, num_frames, cache_id].

    Returns:
        (rows, cache_index) where cache_index maps cache_id -> paths/metadata.
    """
    rows, cache_index = [], {}

    for example_dir in list_example_dirs():
        cache_id = example_dir.name
        meta = _read_meta(example_dir / "meta.pkl")
        frames_dir = example_dir / "frames"

        # Prefer the canonical first frame as thumbnail; fall back to any jpg.
        thumb = frames_dir / "000000.jpg"
        if not thumb.exists():
            candidates = sorted(frames_dir.glob("*.jpg"))
            if not candidates:
                continue  # no frames cached -> nothing to show
            thumb = candidates[0]

        rows.append([
            str(thumb),                       # image cell
            meta.get("video_name", cache_id),
            int(meta.get("interval", 1)),
            int(meta.get("num_frames", 0)),
            cache_id,                         # hidden but kept
        ])

        cache_index[cache_id] = {
            "dir": example_dir,
            "meta": meta,
            "video_mp4": str(example_dir / "output_tracking.mp4"),
            "frames_dir": frames_dir,
            "tensors": str(example_dir / "state_tensors.pt"),
        }

    return rows, cache_index
393
+
394
+
395
+
396
def load_cache_into_state(cache_id: str, cache_index: Dict[str, Dict[str, Any]]) -> Tuple[Dict[str, Any], Image.Image, gr.Slider, str, int]:
    """Rebuild the full session state dict from a precomputed example cache.

    Args:
        cache_id: Key into cache_index (directory name under the cache root).
        cache_index: Mapping produced by build_examples_table.

    Returns:
        (state, first_frame, slider_component, output_mp4_path, interval).

    Raises:
        gr.Error: On unknown cache_id or an empty frames directory.
    """
    if cache_id not in cache_index:
        raise gr.Error(f"Unknown cache_id: {cache_id}")

    info = cache_index[cache_id]
    meta = info["meta"]

    pil_imgs = _load_frames_from_dir(info["frames_dir"])
    if not pil_imgs:
        raise gr.Error("Example frames not found or empty.")

    # Heavy tensors were cached on CPU; run_tracking moves them to GPU later.
    tensors = torch.load(info["tensors"], map_location="cpu")

    # Recompute lightweight parts (views/resize_funcs are not pickled).
    views, resize_funcs = get_views(pil_imgs)

    fps_in = float(meta.get("fps_in", 24.0))
    fps_out = float(meta.get("fps_out", 24.0))
    interval = int(meta.get("interval", 1))

    state = {
        "pil_imgs": pil_imgs,
        "views": views,
        "resize_funcs": resize_funcs,
        "must3r_feats": tensors["must3r_feats"],
        "must3r_outputs": tensors["must3r_outputs"],
        "sam2_input_images": tensors["sam2_input_images"],
        "images_tensor": tensors["images_tensor"],
        "current_points": [],
        "current_labels": [],
        "current_mask": None,
        "frame_idx": 0,
        "video_path": meta.get("video_name", "example"),
        "interval": interval,
        "fps_in": fps_in,
        "fps_out": fps_out,
        # precomputed output
        "output_video_path": info["video_mp4"],
        "loaded_from_cache": True,
        "cache_id": cache_id,
    }

    first_frame = pil_imgs[0]
    slider = gr.Slider(value=0, maximum=len(pil_imgs) - 1, step=1, interactive=True)

    return state, first_frame, slider, info["video_mp4"], interval
442
+
443
+
444
+ # ============================================================
445
+ # UI callbacks (same semantics as your original app.py)
446
+ # ============================================================
447
def on_video_uploaded(video_path):
    """Suggest a default sampling interval (~100 frames) right after upload."""
    total = estimate_total_frames(video_path)
    suggested = max(1, total // 100)
    status = f"Video uploaded ({total} frames). 2) Adjust interval, then click 'Load Frames'."
    return gr.update(value=suggested, maximum=min(30, total)), status
454
+
455
+
456
+ def on_video_upload_and_load(video_path, interval):
457
  logger.info(f"User uploaded video: {video_path}, Interval: {interval}")
458
  if video_path is None:
459
  return None, None, gr.Slider(value=0, maximum=0), None
460
+
461
+ pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor = process_video_and_features(
462
+ video_path, int(interval)
463
+ )
464
+
 
 
465
  fps_in = estimate_video_fps(video_path)
466
  interval_i = max(1, int(interval))
467
  fps_out = max(1.0, fps_in / interval_i)
468
 
 
469
  state = {
470
  "pil_imgs": pil_imgs,
471
  "views": views,
 
481
  "video_path": video_path,
482
  "interval": interval_i,
483
  "fps_in": fps_in,
484
+ "fps_out": fps_out,
485
+ "output_video_path": None,
486
+ "loaded_from_cache": False,
487
  }
488
+
489
  first_frame = pil_imgs[0]
490
+ new_slider = gr.Slider(value=0, maximum=len(pil_imgs) - 1, step=1, interactive=True)
491
  return first_frame, state, new_slider, gr.Image(value=first_frame)
492
 
493
+
494
def on_slider_change(state, frame_idx):
    """Select a frame: clamp the index, clear stale annotations, return the frame."""
    if not state:
        return None
    idx = min(int(frame_idx), len(state["pil_imgs"]) - 1)
    state.update(
        frame_idx=idx,
        current_points=[],
        current_labels=[],
        current_mask=None,
    )
    return state["pil_imgs"][idx]
507
+
508
 
509
def on_image_click(state, evt: gr.SelectData, mode):
    """Record a click (point or box corner) and redraw the annotated frame."""
    if not state:
        return None

    x, y = evt.index
    # Radio-button label -> SAM2 prompt label code.
    mode_to_label = {
        "Positive Point": 1,
        "Negative Point": 0,
        "Box Top-Left": 2,
        "Box Bottom-Right": 3,
    }
    state["current_points"].append([x, y])
    state["current_labels"].append(mode_to_label[mode])

    base = state["pil_imgs"][state["frame_idx"]]
    annotated = draw_points(base, state["current_points"], state["current_labels"])
    if state["current_mask"] is None:
        return annotated
    return overlay_mask(annotated, state["current_mask"])
530
 
531
+
532
def on_generate_mask_click(state):
    """Run SAM2 on the current frame with the accumulated prompts; preview the mask."""
    if not state:
        return None
    if not state["current_points"]:
        raise gr.Error("No points or boxes annotated.")

    # A box requires exactly one top-left (2) paired with one bottom-right (3).
    num_tl = state["current_labels"].count(2)
    num_br = state["current_labels"].count(3)
    if num_tl != num_br or num_tl > 1:
        raise gr.Error(f"Incomplete box detected! TL={num_tl}, BR={num_br}. Must match and be <= 1.")

    idx = state["frame_idx"]
    frame_batch = state["sam2_input_images"][idx].unsqueeze(0)
    mask = generate_frame_mask(
        frame_batch,
        state["current_points"],
        state["current_labels"],
        state["pil_imgs"][idx].size,
    )
    state["current_mask"] = mask

    preview = overlay_mask(state["pil_imgs"][idx], mask)
    return draw_points(preview, state["current_points"], state["current_labels"])
561
 
562
+
563
def reset_annotations(state):
    """Discard all clicks and the preview mask; return the clean current frame."""
    if not state:
        return None
    state.update(current_points=[], current_labels=[], current_mask=None)
    return state["pil_imgs"][state["frame_idx"]]
571
+
572
+
573
def on_track_click(state):
    """Propagate the approved mask through the whole clip and render the result.

    Returns:
        Path of the rendered mp4 (also stored in state["output_video_path"]).

    Raises:
        gr.Error: When no mask has been generated yet or box clicks are unpaired.
    """
    if not state or state["current_mask"] is None:
        raise gr.Error("Please annotate a frame and generate a mask first.")

    # Boxes must come in complete TL/BR pairs before tracking.
    num_tl = state["current_labels"].count(2)
    num_br = state["current_labels"].count(3)
    if num_tl != num_br:
        raise gr.Error("Incomplete box annotations.")

    start_idx = state["frame_idx"]
    first_frame_mask = state["current_mask"]

    # Heavy GPU stage: mask propagation over all frames.
    tracked_masks_dict = run_tracking(
        state["sam2_input_images"],
        state["must3r_feats"],
        state["must3r_outputs"],
        start_idx,
        first_frame_mask,
    )

    # CPU stage: burn the masks into the frames and encode the output clip.
    output_path = create_video_from_masks(
        state["pil_imgs"],
        tracked_masks_dict,
        fps=state.get("fps_out", 24.0),
    )
    state["output_video_path"] = output_path
    return output_path
600
+
601
+
602
+ # ============================================================
603
+ # Examples UI: row click handler
604
+ # ============================================================
605
def on_example_row_click(evt: gr.SelectData, cache_index_state):
    """Load a precomputed example when a row of the examples table is clicked.

    Returns updates for: annotated image, app state, frame slider, output
    video, interval slider, status text, and the three action buttons.
    """
    # row = [thumb_path, video_name, interval, frames, cache_id]
    # FIX: depending on the Gradio version, a Dataframe select event exposes
    # either the full clicked row (evt.row_value) or only the clicked cell
    # (evt.value). Prefer row_value so cache_id (column 4) is reachable no
    # matter which cell the user clicked.
    row = getattr(evt, "row_value", None) or evt.value

    cache_id = row[4]
    state, first_frame, slider, mp4_path, interval = load_cache_into_state(
        cache_id, cache_index_state
    )

    return (
        first_frame,
        state,
        slider,
        mp4_path,
        gr.update(value=interval),
        "Ready. Example loaded.",
        gr.update(interactive=True),
        gr.update(interactive=True),
        gr.update(interactive=True),
    )
625
 
 
626
 
627
+ # ============================================================
628
+ # Build examples at startup
629
+ # ============================================================
630
+ examples_rows, cache_index = build_examples_table()
631
+
632
+
633
+ # ============================================================
634
+ # App Layout (match original, add Examples at bottom)
635
+ # ============================================================
636
  description = """
637
  <div style="text-align: center;">
638
+ <h1>3AM: 3egment Anything</h1>
639
+ <p>Upload a video, extract geometric features, annotate a frame, and track the object.</p>
640
  </div>
641
  """
642
+
643
  with gr.Blocks(title="3AM: 3egment Anything") as app:
644
  gr.HTML(description)
645
 
 
650
  1) Upload video
651
  2) Adjust frame interval → Load frames
652
  3) Annotate & generate mask
653
+ 4) Track through the video
654
  """
655
  )
656
 
657
  app_state = gr.State()
658
+ cache_index_state = gr.State(cache_index)
659
 
660
  with gr.Row():
661
  with gr.Column(scale=1):
 
663
  video_input = gr.Video(
664
  label="Upload Video",
665
  sources=["upload"],
666
+ height=512,
667
  )
668
 
669
  gr.Markdown("## Step 2 — Set interval, then load frames")
 
673
  maximum=30,
674
  step=1,
675
  value=1,
676
+ info="Default ≈ total_frames / 100",
677
  )
678
 
679
+ load_btn = gr.Button("Load Frames", variant="primary")
 
 
 
680
 
681
  process_status = gr.Textbox(
682
  label="Status",
683
  value="1) Upload a video.",
684
+ interactive=False,
685
  )
686
 
687
  with gr.Column(scale=2):
 
689
  img_display = gr.Image(
690
  label="Annotate Frame",
691
  interactive=True,
692
+ height=512,
693
  )
694
 
695
  frame_slider = gr.Slider(
 
697
  minimum=0,
698
  maximum=100,
699
  step=1,
700
+ value=0,
701
  )
702
 
703
  with gr.Row():
 
709
  "Box Bottom-Right",
710
  ],
711
  value="Positive Point",
712
+ label="Annotation Mode",
713
  )
714
  with gr.Column():
715
  gen_mask_btn = gr.Button(
716
  "Generate Mask",
717
  variant="primary",
718
+ interactive=False,
719
  )
720
  reset_btn = gr.Button(
721
  "Reset Annotations",
722
+ interactive=False,
723
  )
724
 
725
  gr.Markdown("## Step 4 — Track through the video")
 
728
  "Start Tracking",
729
  variant="primary",
730
  scale=1,
731
+ interactive=False,
732
  )
733
 
734
  with gr.Row():
735
  video_output = gr.Video(
736
  label="Tracking Output",
737
  autoplay=True,
738
+ height=512,
739
  )
740
 
741
+ # -------------------------
742
+ # Examples table at bottom
743
+ # -------------------------
744
+ gr.Markdown("## Examples (click a row to load)")
745
+
746
+ examples_df = gr.Dataframe(
747
+ headers=["Example", "Video", "Interval", "Frames", "cache_id"],
748
+ datatype=["image", "str", "number", "number", "str"],
749
+ value=examples_rows,
750
+ row_count=len(examples_rows),
751
+ col_count=(5, "fixed"),
752
+ interactive=False,
753
+ wrap=True,
754
+ visible=True,
755
+ )
756
+ examples_df.style({"display": "none"}, columns=["cache_id"])
757
 
758
+
759
+ # ============================================================
760
+ # Events (original + examples)
761
+ # ============================================================
762
  video_input.upload(
763
  fn=on_video_uploaded,
764
  inputs=video_input,
765
+ outputs=[interval_slider, process_status],
766
  )
767
 
 
768
  load_btn.click(
769
  fn=lambda: (
770
  "Loading frames...",
 
772
  gr.update(interactive=False),
773
  gr.update(interactive=False),
774
  ),
775
+ outputs=[process_status, gen_mask_btn, reset_btn, track_btn],
776
  ).then(
777
+ fn=on_video_upload_and_load,
778
  inputs=[video_input, interval_slider],
779
+ outputs=[img_display, app_state, frame_slider, img_display],
780
  ).then(
781
  fn=lambda: (
782
  "Ready. 3) Annotate and generate mask.",
 
784
  gr.update(interactive=True),
785
  gr.update(interactive=True),
786
  ),
787
+ outputs=[process_status, gen_mask_btn, reset_btn, track_btn],
788
  )
789
 
790
  frame_slider.change(
791
  fn=on_slider_change,
792
  inputs=[app_state, frame_slider],
793
+ outputs=[img_display],
794
  )
795
 
796
  img_display.select(
797
  fn=on_image_click,
798
  inputs=[app_state, mode_radio],
799
+ outputs=[img_display],
800
  )
801
 
802
  gen_mask_btn.click(
803
  fn=on_generate_mask_click,
804
  inputs=[app_state],
805
+ outputs=[img_display],
806
  )
807
 
808
  reset_btn.click(
809
  fn=reset_annotations,
810
  inputs=[app_state],
811
+ outputs=[img_display],
812
  )
813
 
814
  track_btn.click(
815
  fn=lambda: "Tracking in progress...",
816
+ outputs=process_status,
817
  ).then(
818
  fn=on_track_click,
819
  inputs=[app_state],
820
+ outputs=[video_output],
821
  ).then(
822
  fn=lambda: "Tracking complete!",
823
+ outputs=process_status,
824
  )
825
 
826
+ examples_df.select(
827
+ fn=on_example_row_click,
828
+ inputs=[cache_index_state],
829
+ outputs=[
830
+ img_display,
831
+ app_state,
832
+ frame_slider,
833
+ video_output,
834
+ interval_slider,
835
+ process_status,
836
+ gen_mask_btn,
837
+ reset_btn,
838
+ track_btn,
839
+ ],
840
+ )
841
+
842
 
843
  if __name__ == "__main__":
844
  logger.info("Starting Gradio app...")
845
+ app.launch()
app_cache.py ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_cache.py
2
+ # Purpose:
3
+ # - Same UI flow (upload -> load frames -> annotate -> generate mask -> track)
4
+ # - After tracking, enable "Save Cache"
5
+ # - You can create multiple caches by repeating the workflow
6
+ #
7
+ # Cache contents per example:
8
+ # cache/<key>/
9
+ # meta.pkl
10
+ # frames/*.jpg
11
+ # state_tensors.pt (must3r_feats, must3r_outputs, sam2_input_images, images_tensor) on CPU
12
+ # output_tracking.mp4
13
+ #
14
+ # Notes:
15
+ # - We do NOT pickle views/resize_funcs (recomputed on load).
16
+ # - We store frames as JPEG to avoid pickling PIL and to be deterministic/reloadable.
17
+
18
+ import spaces
19
+ import subprocess
20
+ import sys, os
21
+ from pathlib import Path
22
+ import math
23
+ import hashlib
24
+ import pickle
25
+ from datetime import datetime
26
+ from typing import Any, Dict, List, Tuple
27
+
28
+ import importlib, site
29
+
30
+ import gradio as gr
31
+ import torch
32
+ import numpy as np
33
+ from PIL import Image, ImageDraw
34
+ import cv2
35
+ import logging
36
+
37
+ # ----------------------------
38
+ # Project bootstrap
39
+ # ----------------------------
40
+ ROOT = Path(__file__).resolve().parent
41
+ SAM2 = ROOT / "sam2-src"
42
+ CKPT = SAM2 / "checkpoints" / "sam2.1_hiera_large.pt"
43
+
44
+ # download sam2 checkpoints
45
+ if not CKPT.exists():
46
+ subprocess.check_call(["bash", "download_ckpts.sh"], cwd=SAM2 / "checkpoints")
47
+
48
+ # install sam2
49
+ try:
50
+ import sam2.build_sam # noqa
51
+ except ModuleNotFoundError:
52
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src"], cwd=ROOT)
53
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./sam2-src[notebooks]"], cwd=ROOT)
54
+
55
+ # install asmk
56
+ try:
57
+ import asmk.index # noqa: F401
58
+ except Exception:
59
+ subprocess.check_call(["cythonize", "*.pyx"], cwd="./asmk-src/cython")
60
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "./asmk-src", "--no-build-isolation"])
61
+
62
+ # download private checkpoints
63
+ if not os.path.exists("./private"):
64
+ from huggingface_hub import snapshot_download
65
+ snapshot_download(
66
+ repo_id="nycu-cplab/3AM",
67
+ local_dir="./private",
68
+ repo_type="model",
69
+ )
70
+
71
+ for sp in site.getsitepackages():
72
+ site.addsitedir(sp)
73
+ importlib.invalidate_caches()
74
+
75
+ # ----------------------------
76
+ # Logging
77
+ # ----------------------------
78
+ logging.basicConfig(
79
+ level=logging.INFO,
80
+ format="%(asctime)s [%(levelname)s] %(message)s",
81
+ handlers=[logging.StreamHandler(sys.stdout)],
82
+ )
83
+ logger = logging.getLogger("app_cache")
84
+
85
+ # ----------------------------
86
+ # Engine imports
87
+ # ----------------------------
88
+ from engine import (
89
+ get_predictors,
90
+ get_views,
91
+ prepare_sam2_inputs,
92
+ must3r_features_and_output,
93
+ get_single_frame_mask,
94
+ get_tracked_masks,
95
+ )
96
+
97
+ # ----------------------------
98
+ # Globals
99
+ # ----------------------------
100
+ PREDICTOR_ORIGINAL = None
101
+ PREDICTOR = None
102
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
103
+
104
def load_models():
    """Lazily build the SAM2 predictor pair once and memoize it in module globals."""
    global PREDICTOR_ORIGINAL, PREDICTOR
    if PREDICTOR_ORIGINAL is not None and PREDICTOR is not None:
        return PREDICTOR_ORIGINAL, PREDICTOR
    logger.info(f"Initializing models on device: {DEVICE}...")
    PREDICTOR_ORIGINAL, PREDICTOR = get_predictors(device=DEVICE)
    logger.info("Models loaded successfully.")
    return PREDICTOR_ORIGINAL, PREDICTOR
111
+
112
+ # Ensure no_grad globally (as you had)
113
+ torch.no_grad().__enter__()
114
+
115
+ # ----------------------------
116
+ # Video / visualization helpers
117
+ # ----------------------------
118
def video_to_frames(video_path, interval=1):
    """Decode a video and return every `interval`-th frame as an RGB PIL image.

    Args:
        video_path: Path readable by OpenCV.
        interval: Sampling stride; values < 1 are treated as 1.

    Returns:
        List of PIL.Image frames in temporal order.
    """
    logger.info(f"Extracting frames from video: {video_path} with interval={interval}")
    # FIX: interval=0 (or negative) would raise ZeroDivisionError in `count % interval`.
    interval = max(1, int(interval))
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % interval == 0:
                # OpenCV decodes BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))
            count += 1
    finally:
        # FIX: release the capture even if decoding raises mid-loop.
        cap.release()
    logger.info(f"Extracted {len(frames)} frames (sampled from {count} total).")
    return frames
134
+
135
def draw_points(image_pil, points, labels):
    """Return a copy of the image with one colored dot per annotation point."""
    # Label -> dot color: 1=positive, 0=negative, 2=box top-left, 3=box bottom-right.
    palette = {1: "green", 0: "red", 2: "blue", 3: "cyan"}
    radius = 5
    canvas = image_pil.copy()
    pen = ImageDraw.Draw(canvas)
    for (px, py), lbl in zip(points, labels):
        pen.ellipse(
            (px - radius, py - radius, px + radius, py + radius),
            fill=palette.get(lbl, "yellow"),
            outline="white",
        )
    return canvas
153
+
154
def overlay_mask(image_pil, mask, color=(255, 0, 0), alpha=0.5):
    """Alpha-blend a binary mask (painted in `color`) over the image; no-op if mask is None."""
    if mask is None:
        return image_pil
    base = np.array(image_pil)
    height, width = base.shape[:2]
    binary = mask > 0
    if binary.shape[:2] != (height, width):
        # Nearest-neighbor keeps the mask hard-edged when rescaling to image size.
        binary = cv2.resize(
            binary.astype(np.uint8), (width, height), interpolation=cv2.INTER_NEAREST
        ).astype(bool)
    painted = base.copy()
    painted[binary] = np.array(color, dtype=np.uint8)
    blended = cv2.addWeighted(painted, alpha, base, 1 - alpha, 0)
    return Image.fromarray(blended)
166
+
167
def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4", fps=24):
    """Render frames to an mp4, overlaying each frame's mask in red when present.

    Returns output_path, or None when `frames` is empty.
    """
    logger.info(f"Creating video output at {output_path} with {len(frames)} frames.")
    if not frames:
        return None
    fps = float(fps)
    if not (fps > 0.0):
        fps = 24.0  # guard against 0/NaN reported by a broken container header
    height, width = np.array(frames[0]).shape[:2]
    writer = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
    )

    for idx, frame in enumerate(frames):
        mask = masks_dict.get(idx)
        if mask is None:
            rgb = np.array(frame)
        else:
            rgb = np.array(overlay_mask(frame, mask, color=(255, 0, 0), alpha=0.6))
        writer.write(cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))

    writer.release()
    return output_path
190
+
191
+ # ----------------------------
192
+ # Runtime estimation helpers
193
+ # ----------------------------
194
def estimate_video_fps(video_path: str) -> float:
    """Return the container-reported FPS, defaulting to 24.0 when unreadable."""
    capture = cv2.VideoCapture(video_path)
    try:
        reported = float(capture.get(cv2.CAP_PROP_FPS)) or 0.0
    finally:
        capture.release()
    return reported if reported > 0.0 else 24.0
199
+
200
def estimate_total_frames(video_path: str) -> int:
    """Return CAP_PROP_FRAME_COUNT for the video, clamped to at least 1."""
    capture = cv2.VideoCapture(video_path)
    total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
    capture.release()
    return total if total > 1 else 1
205
+
206
# Upper bound (seconds) on any single GPU allocation request.
MAX_GPU_SECONDS = 600

def clamp_duration(sec: int) -> int:
    """Clamp a GPU-time estimate into [1, MAX_GPU_SECONDS] seconds."""
    # The middle element of the sorted triple is the clamped value.
    return int(sorted((1, sec, MAX_GPU_SECONDS))[1])
210
+
211
def get_duration_must3r_features(video_path, interval):
    """GPU-time estimate for feature extraction: ~2 s per sampled frame."""
    stride = max(1, int(interval))
    frames_to_process = math.ceil(estimate_total_frames(video_path) / stride)
    return clamp_duration(int(frames_to_process * 2))
217
+
218
def get_duration_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
    """GPU-time estimate for tracking: ~2 s per frame (100-frame fallback)."""
    try:
        frame_count = int(sam2_input_images.shape[0])
    except Exception:
        frame_count = 100  # shape unavailable -> assume a mid-sized clip
    return clamp_duration(int(frame_count * 2))
225
+
226
+ # ----------------------------
227
+ # GPU functions
228
+ # ----------------------------
229
@spaces.GPU(duration=get_duration_must3r_features)
def process_video_and_features(video_path, interval):
    """GPU stage 1: decode frames and precompute MUSt3R features + SAM2 inputs.

    Returns:
        Tuple (pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs,
        sam2_input_images, images_tensor) stored by on_video_upload.

    Raises:
        ValueError: If no frames could be decoded.
    """
    logger.info(f"GPU: feature extraction interval={interval}")
    load_models()

    pil_imgs = video_to_frames(video_path, interval=max(1, int(interval)))
    if not pil_imgs:
        raise ValueError("Could not extract frames.")

    # NOTE(review): semantics of views/resize_funcs inferred from usage -- see engine.py.
    views, resize_funcs = get_views(pil_imgs)

    # Geometric features consumed by the tracking stage.
    must3r_feats, must3r_outputs = must3r_features_and_output(views, device=DEVICE)

    # Pre-resized image tensors for SAM2 prompting.
    sam2_input_images, images_tensor = prepare_sam2_inputs(views, pil_imgs, resize_funcs)

    return pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor
245
+
246
@spaces.GPU
def generate_frame_mask(image_tensor, points, labels, original_size):
    """GPU stage 2: SAM2 mask for one frame from user click prompts.

    Args:
        image_tensor: Single-frame batch preprocessed for SAM2.
        points: [[x, y], ...] clicks in original-frame pixels.
        labels: Per-point codes (1/0 pos/neg, 2/3 box corners).
        original_size: (width, height) used to rescale clicks to 1024x1024.

    Returns:
        The predicted mask as a squeezed NumPy array on CPU.
    """
    logger.info(f"GPU: generate mask points={len(points)}")
    load_models()

    pts_tensor = torch.tensor(points, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    lbl_tensor = torch.tensor(labels, dtype=torch.int32).unsqueeze(0).to(DEVICE)

    # Rescale clicks from original frame pixels to SAM2's 1024x1024 input space.
    w, h = original_size
    pts_tensor[..., 0] /= (w / 1024.0)
    pts_tensor[..., 1] /= (h / 1024.0)

    # NOTE(review): unlike app.py's twin, image_tensor is not moved to DEVICE here;
    # in this file it comes straight from prepare_sam2_inputs -- confirm its device.
    mask = get_single_frame_mask(
        image=image_tensor,
        predictor_original=PREDICTOR_ORIGINAL,
        points=pts_tensor,
        labels=lbl_tensor,
        device=DEVICE,
    )
    return mask.squeeze().cpu().numpy()
266
+
267
@spaces.GPU(duration=get_duration_tracking)
def run_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask):
    """GPU stage 3: propagate the annotated mask through all frames.

    Returns:
        Mapping of frame index -> mask as produced by get_tracked_masks
        (exact container type defined in engine.py).
    """
    logger.info(f"GPU: tracking start_idx={start_idx}")
    load_models()

    # Binarize the user-approved mask on the GPU before propagation.
    mask_tensor = torch.tensor(first_frame_mask).to(DEVICE) > 0

    tracked_masks = get_tracked_masks(
        sam2_input_images=sam2_input_images,
        must3r_feats=must3r_feats,
        must3r_outputs=must3r_outputs,
        start_idx=start_idx,
        first_frame_mask=mask_tensor,
        predictor=PREDICTOR,
        predictor_original=PREDICTOR_ORIGINAL,
        device=DEVICE,
    )
    return tracked_masks
285
+
286
+ # ----------------------------
287
+ # Cache utilities
288
+ # ----------------------------
289
+ CACHE_DIR = Path("./cache")
290
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
291
+
292
def _make_cache_key(video_path: str, interval: int, start_idx: int) -> str:
    """Derive a 16-hex cache id from video name, run parameters, and a UTC timestamp.

    The second-resolution timestamp makes repeated runs produce distinct entries.
    """
    from datetime import timezone  # local import to avoid touching the module import block

    name = Path(video_path).name if video_path else "video"
    # FIX: datetime.utcnow() is deprecated since Python 3.12; use an aware UTC clock.
    # strftime output is identical, so existing cache keys remain compatible.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    s = f"{name}|interval={interval}|start={start_idx}|{stamp}"
    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:16]
297
+
298
def _cache_paths(key: str) -> Dict[str, Path]:
    """Return the canonical file layout for cache entry `key`, creating its base dir."""
    base = CACHE_DIR / key
    base.mkdir(parents=True, exist_ok=True)
    filenames = {
        "meta": "meta.pkl",
        "frames_dir": "frames",
        "tensors": "state_tensors.pt",
        "video": "output_tracking.mp4",
    }
    paths = {"base": base}
    paths.update((alias, base / fname) for alias, fname in filenames.items())
    return paths
308
+
309
def _save_frames_as_jpg(pil_imgs: List[Image.Image], frames_dir: Path, quality: int = 95) -> None:
    """Write frames as zero-padded 000000.jpg, 000001.jpg, ... into frames_dir."""
    frames_dir.mkdir(parents=True, exist_ok=True)
    for index, frame in enumerate(pil_imgs):
        target = frames_dir / f"{index:06d}.jpg"
        # subsampling=0 keeps full chroma resolution for sharp thumbnails.
        frame.save(target, "JPEG", quality=quality, subsampling=0)
313
+
314
def _to_cpu(obj: Any) -> Any:
    """Recursively detach tensors and move them to CPU inside nested containers.

    Handles dicts, lists, and tuples (including NamedTuples); any other object
    is returned unchanged.
    """
    if torch.is_tensor(obj):
        return obj.detach().to("cpu")
    if isinstance(obj, dict):
        return {k: _to_cpu(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        out = [_to_cpu(v) for v in obj]
        if isinstance(obj, tuple):
            # FIX: NamedTuple constructors take their fields positionally, so
            # type(obj)(out) would raise; splat the fields instead.
            return type(obj)(*out) if hasattr(obj, "_fields") else tuple(out)
        return out
    return obj
323
+
324
def _pack_masks_uint8_cpu(tracked_masks_dict: Dict[int, Any]) -> Dict[int, torch.Tensor]:
    """Normalize tracked masks to {int frame_idx: binary uint8 CPU tensor}."""

    def _as_tensor(value):
        # Accept ndarray, tensor, or anything torch.tensor() understands.
        if isinstance(value, np.ndarray):
            return torch.from_numpy(value)
        return value if torch.is_tensor(value) else torch.tensor(value)

    return {
        int(frame_idx): (_as_tensor(mask) > 0).to(torch.uint8).cpu()
        for frame_idx, mask in tracked_masks_dict.items()
    }
335
+
336
def save_full_cache_from_state(state: Dict[str, Any]) -> str:
    """Persist a completed run (frames, tensors, tracking video, meta) under ./cache.

    Returns:
        The 16-hex cache key of the newly written entry.

    Raises:
        ValueError: When the state dict is empty or missing required fields.
        FileNotFoundError: When the rendered tracking video is absent.
    """
    if not state:
        raise ValueError("Empty state.")
    # Everything needed to later reload this example without re-running GPU stages.
    required = [
        "pil_imgs",
        "must3r_feats",
        "must3r_outputs",
        "sam2_input_images",
        "images_tensor",
        "output_video_path",
        "video_path",
        "interval",
        "fps_in",
        "fps_out",
        "last_tracking_start_idx",
    ]
    missing = [k for k in required if k not in state or state[k] is None]
    if missing:
        raise ValueError(f"State missing fields: {missing}")

    key = _make_cache_key(
        str(state["video_path"]),
        int(state["interval"]),
        int(state["last_tracking_start_idx"]),
    )
    paths = _cache_paths(key)

    # 1) Frames as JPEGs (avoids pickling PIL objects; deterministic reload).
    _save_frames_as_jpg(state["pil_imgs"], paths["frames_dir"])

    # 2) Heavy tensors, forced to CPU so the cache loads without a GPU.
    torch.save(
        {
            "must3r_feats": _to_cpu(state["must3r_feats"]),
            "must3r_outputs": _to_cpu(state["must3r_outputs"]),
            "sam2_input_images": _to_cpu(state["sam2_input_images"]),
            "images_tensor": _to_cpu(state["images_tensor"]),
        },
        paths["tensors"],
    )

    # 3) Copy the rendered tracking video in, unless it is already in place.
    src = Path(state["output_video_path"])
    if not src.exists():
        raise FileNotFoundError(f"Output video not found: {src}")
    dst = paths["video"]
    if src.resolve() != dst.resolve():
        dst.write_bytes(src.read_bytes())

    # 4) Lightweight metadata consumed by the examples table.
    # NOTE(review): on_video_upload initializes last_points/last_labels to None,
    # and list(None) would raise here -- confirm they are set before saving.
    meta = {
        "video_name": Path(str(state["video_path"])).name,
        "interval": int(state["interval"]),
        "fps_in": float(state["fps_in"]),
        "fps_out": float(state["fps_out"]),
        "num_frames": int(len(state["pil_imgs"])),
        "start_idx": int(state["last_tracking_start_idx"]),
        "points": list(state.get("last_points", [])),
        "labels": list(state.get("last_labels", [])),
        "cache_key": key,
    }
    with open(paths["meta"], "wb") as f:
        pickle.dump(meta, f)

    return key
397
+
398
+ # ----------------------------
399
+ # UI callbacks
400
+ # ----------------------------
401
def on_video_upload(video_path, interval):
    """Full load pipeline: run GPU feature extraction and build the session state.

    Returns:
        (first_frame, state, slider_update, image_update); all None/zeroed
        when no video was provided.
    """
    if video_path is None:
        return None, None, gr.Slider(value=0, maximum=0), None

    pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor = process_video_and_features(
        video_path, int(interval)
    )

    # Output FPS compensates for frame subsampling so playback speed is preserved.
    fps_in = estimate_video_fps(video_path)
    interval_i = max(1, int(interval))
    fps_out = max(1.0, fps_in / interval_i)

    state = {
        "pil_imgs": pil_imgs,
        "views": views,
        "resize_funcs": resize_funcs,
        "must3r_feats": must3r_feats,
        "must3r_outputs": must3r_outputs,
        "sam2_input_images": sam2_input_images,
        "images_tensor": images_tensor,
        "current_points": [],
        "current_labels": [],
        "current_mask": None,
        "frame_idx": 0,
        "video_path": video_path,
        "interval": interval_i,
        "fps_in": fps_in,
        "fps_out": fps_out,
        # tracking outputs (filled later)
        "output_video_path": None,
        "last_tracking_start_idx": None,
        "last_points": None,
        "last_labels": None,
    }

    first_frame = pil_imgs[0]
    new_slider = gr.Slider(value=0, maximum=len(pil_imgs) - 1, step=1, interactive=True)
    return first_frame, state, new_slider, gr.Image(value=first_frame)
439
+
440
def on_slider_change(state, frame_idx):
    """Jump to the selected frame and discard any in-progress annotations.

    Returns the newly selected frame image, or ``None`` when no session
    state exists yet.
    """
    if not state:
        return None
    # Clamp to the last available frame in case the slider maximum is stale.
    idx = min(int(frame_idx), len(state["pil_imgs"]) - 1)
    state["frame_idx"] = idx
    # Annotations are per-frame, so switching frames resets them.
    state["current_points"] = []
    state["current_labels"] = []
    state["current_mask"] = None
    return state["pil_imgs"][idx]
def on_image_click(state, evt: gr.SelectData, mode):
    """Record one clicked point (or box corner) and redraw the frame.

    The click's pixel coordinates come from ``evt.index``; the annotation
    mode string selects the SAM2-style label that is stored alongside it.
    """
    if not state:
        return None
    x, y = evt.index

    # Map the radio-button mode to its numeric label.
    mode_to_label = {
        "Positive Point": 1,
        "Negative Point": 0,
        "Box Top-Left": 2,
        "Box Bottom-Right": 3,
    }
    # Resolve the label before mutating state so an unknown mode leaves
    # points/labels consistent.
    label = mode_to_label[mode]
    state["current_points"].append([x, y])
    state["current_labels"].append(label)

    # Redraw: points first, then the mask overlay if one exists.
    base = state["pil_imgs"][state["frame_idx"]]
    vis = draw_points(base, state["current_points"], state["current_labels"])
    if state["current_mask"] is not None:
        vis = overlay_mask(vis, state["current_mask"])
    return vis
def on_generate_mask_click(state):
    """Run single-frame mask generation from the current annotations.

    Validates that box corners (if any) come in exactly one complete
    top-left/bottom-right pair, then calls ``generate_frame_mask`` on the
    selected frame's SAM2 input tensor. Stores the mask in state and
    returns the frame with mask overlay plus annotation markers.

    Raises:
        gr.Error: when nothing is annotated or the box corners are
            mismatched / more than one box was drawn.
    """
    if not state:
        return None
    if not state["current_points"]:
        raise gr.Error("No points or boxes annotated.")

    labels = state["current_labels"]
    num_tl = labels.count(2)
    num_br = labels.count(3)
    # At most one box, and its two corners must both be present.
    if num_tl != num_br or num_tl > 1:
        raise gr.Error(f"Incomplete box: TL={num_tl}, BR={num_br}. Must match and be <= 1.")

    idx = state["frame_idx"]
    # Select the one frame from the stacked SAM2 inputs, keeping a batch dim.
    frame_tensor = state["sam2_input_images"][idx].unsqueeze(0)
    original_size = state["pil_imgs"][idx].size

    mask = generate_frame_mask(
        frame_tensor,
        state["current_points"],
        labels,
        original_size,
    )
    state["current_mask"] = mask

    # Overlay the mask first, then draw the clicked points on top.
    vis = overlay_mask(state["pil_imgs"][idx], mask)
    return draw_points(vis, state["current_points"], labels)
def reset_annotations(state):
    """Drop every click and the current mask; return the clean frame.

    Returns ``None`` when no session state exists yet.
    """
    if not state:
        return None
    state.update(current_points=[], current_labels=[], current_mask=None)
    return state["pil_imgs"][state["frame_idx"]]
def on_track_click(state):
    """Propagate the current mask through the whole clip and render a video.

    Requires a mask produced by ``on_generate_mask_click``. Runs tracking
    from the currently selected frame, writes the overlay video, and records
    the inputs used (start frame, points, labels) so they can be cached.

    Raises:
        gr.Error: when no mask exists yet or box corners are mismatched.
    """
    if not state or state["current_mask"] is None:
        raise gr.Error("Generate a mask first.")

    labels = state["current_labels"]
    if labels.count(2) != labels.count(3):
        raise gr.Error("Incomplete box annotations.")

    anchor_idx = int(state["frame_idx"])
    seed_mask = state["current_mask"]

    masks_by_frame = run_tracking(
        state["sam2_input_images"],
        state["must3r_feats"],
        state["must3r_outputs"],
        anchor_idx,
        seed_mask,
    )

    rendered_path = create_video_from_masks(
        state["pil_imgs"],
        masks_by_frame,
        fps=state.get("fps_out", 24.0),
    )

    # Remember what produced this result so "Save Cache" can persist it.
    state["output_video_path"] = rendered_path
    state["last_tracking_start_idx"] = anchor_idx
    state["last_points"] = list(state.get("current_points", []))
    state["last_labels"] = list(state.get("current_labels", []))

    return rendered_path, state
def on_save_cache_click(state):
    """Persist the full pipeline state and report the resulting cache key."""
    cache_key = save_full_cache_from_state(state)
    return f"Saved cache key: {cache_key}"
# ----------------------------
# UI layout
# ----------------------------
# Static header HTML rendered at the top of the Gradio app via gr.HTML().
# NOTE: "3egment" is the project's own stylization of the 3AM name.
description = """
<div style="text-align: center;">
<h1>3AM: 3egment Anything with Geometric Consistency in Videos</h1>
<p>Cache-builder UI: run full pipeline, then save caches for user examples.</p>
</div>
"""
# Gradio application: layout (Step 1–4 panels) followed by event wiring.
# Buttons start disabled and are progressively enabled as each step
# completes (load -> annotate -> track -> save cache).
with gr.Blocks(title="3AM Cache Builder") as app:
    gr.HTML(description)

    # Per-session dict built by on_video_upload; shared by all callbacks.
    app_state = gr.State()

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Step 1 — Upload video")
            video_input = gr.Video(label="Upload Video", sources=["upload"], height=512)

            gr.Markdown("## Step 2 — Set interval, then load frames")
            interval_slider = gr.Slider(
                label="Frame Interval",
                minimum=1,
                maximum=30,
                step=1,
                value=1,
            )

            load_btn = gr.Button("Load Frames", variant="primary")

            process_status = gr.Textbox(label="Status", value="1) Upload a video.", interactive=False)

        with gr.Column(scale=2):
            gr.Markdown("## Step 3 — Annotate frame & generate mask")
            img_display = gr.Image(label="Annotate Frame", interactive=True, height=512)

            frame_slider = gr.Slider(label="Select Frame", minimum=0, maximum=100, step=1, value=0)

            with gr.Row():
                mode_radio = gr.Radio(
                    choices=["Positive Point", "Negative Point", "Box Top-Left", "Box Bottom-Right"],
                    value="Positive Point",
                    label="Annotation Mode",
                )
                with gr.Column():
                    gen_mask_btn = gr.Button("Generate Mask", variant="primary", interactive=False)
                    reset_btn = gr.Button("Reset Annotations", interactive=False)

            gr.Markdown("## Step 4 — Track & Save Cache")
            with gr.Row():
                track_btn = gr.Button("Start Tracking", variant="primary", interactive=False)
                save_cache_btn = gr.Button("Save Cache", variant="secondary", interactive=False)

            with gr.Row():
                video_output = gr.Video(label="Tracking Output", autoplay=True, height=512)

            cache_status = gr.Textbox(label="Cache", value="", interactive=False)

    # ------------------------
    # Events
    # ------------------------
    def on_video_uploaded(video_path):
        """Suggest a default sampling interval (~100 frames) after upload."""
        n_frames = estimate_total_frames(video_path)
        default_interval = max(1, n_frames // 100)
        return (
            gr.update(value=default_interval, maximum=min(30, n_frames)),
            f"Video uploaded ({n_frames} frames). 2) Adjust interval, then click 'Load Frames'.",
        )

    video_input.upload(fn=on_video_uploaded, inputs=video_input, outputs=[interval_slider, process_status])

    # Load Frames: disable everything, run the heavy pipeline, then re-enable.
    # NOTE: img_display appears twice in the middle step's outputs — once for
    # the first-frame preview and once for the gr.Image update.
    load_btn.click(
        fn=lambda: (
            "Loading frames...",
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),  # save_cache_btn
            gr.update(value=""),
        ),
        outputs=[process_status, gen_mask_btn, reset_btn, track_btn, save_cache_btn, cache_status],
    ).then(
        fn=on_video_upload,
        inputs=[video_input, interval_slider],
        outputs=[img_display, app_state, frame_slider, img_display],
    ).then(
        fn=lambda: (
            "Ready. 3) Annotate and generate mask.",
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
        ),
        outputs=[process_status, gen_mask_btn, reset_btn, track_btn],
    )

    frame_slider.change(fn=on_slider_change, inputs=[app_state, frame_slider], outputs=[img_display])

    img_display.select(fn=on_image_click, inputs=[app_state, mode_radio], outputs=[img_display])

    gen_mask_btn.click(fn=on_generate_mask_click, inputs=[app_state], outputs=[img_display])

    reset_btn.click(fn=reset_annotations, inputs=[app_state], outputs=[img_display])

    # Start Tracking: lock the buttons while tracking runs, then unlock and
    # enable cache saving once the output video is ready.
    track_btn.click(
        fn=lambda: (
            "Tracking in progress...",
            gr.update(interactive=False),
            gr.update(interactive=False),
        ),
        outputs=[process_status, track_btn, save_cache_btn],
    ).then(
        fn=on_track_click,
        inputs=[app_state],
        outputs=[video_output, app_state],
    ).then(
        fn=lambda: (
            "Tracking complete. You can save cache.",
            gr.update(interactive=True),  # track_btn
            gr.update(interactive=True),  # save_cache_btn
        ),
        outputs=[process_status, track_btn, save_cache_btn],
    )

    save_cache_btn.click(fn=on_save_cache_click, inputs=[app_state], outputs=[cache_status])
if __name__ == "__main__":
    # Launch the cache-builder Gradio app when run as a script.
    app.launch()