abreza commited on
Commit
ae7b7e0
·
1 Parent(s): c8dc4de
Files changed (6) hide show
  1. app.py +98 -1226
  2. src/config.py +0 -44
  3. src/model_manager.py +0 -62
  4. src/spatial_pipeline.py +0 -277
  5. src/ttm_pipeline.py +0 -303
  6. src/utils.py +0 -57
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import sys
2
  import gradio as gr
3
  import os
4
  import numpy as np
@@ -8,109 +7,27 @@ import shutil
8
  from pathlib import Path
9
  from einops import rearrange
10
  from typing import Union
11
-
12
- # Force unbuffered output for HF Spaces logs
13
- os.environ['PYTHONUNBUFFERED'] = '1'
14
-
15
- # Configure logging FIRST before any other imports
16
- import logging
17
- logging.basicConfig(
18
- level=logging.INFO,
19
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
- handlers=[
21
- logging.StreamHandler(sys.stdout)
22
- ]
23
- )
24
- logger = logging.getLogger(__name__)
25
- logger.info("=" * 50)
26
- logger.info("Starting application initialization...")
27
- logger.info("=" * 50)
28
- sys.stdout.flush()
29
-
30
  try:
31
  import spaces
32
- logger.info("✅ HF Spaces module imported successfully")
33
  except ImportError:
34
- logger.warning("⚠️ HF Spaces module not available, using mock")
35
- class spaces:
36
- @staticmethod
37
- def GPU(func=None, duration=None):
38
- def decorator(f):
39
- return f
40
- return decorator if func is None else func
41
- sys.stdout.flush()
42
-
43
- logger.info("Importing torch...")
44
- sys.stdout.flush()
45
  import torch
46
- logger.info(f"✅ Torch imported. Version: {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
47
- sys.stdout.flush()
48
-
49
- import torch.nn.functional as F
50
  import torchvision.transforms as T
 
51
  from concurrent.futures import ThreadPoolExecutor
52
  import atexit
53
  import uuid
54
-
55
- logger.info("Importing decord...")
56
- sys.stdout.flush()
57
  import decord
58
- logger.info("✅ Decord imported successfully")
59
- sys.stdout.flush()
60
 
61
- from PIL import Image
 
 
 
62
 
63
- logger.info("Importing SpaTrack models...")
64
- sys.stdout.flush()
65
- try:
66
- from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
67
- from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
68
- from models.SpaTrackV2.models.predictor import Predictor
69
- from models.SpaTrackV2.models.utils import get_points_on_a_grid
70
- logger.info("✅ SpaTrack models imported successfully")
71
- except Exception as e:
72
- logger.error(f"❌ Failed to import SpaTrack models: {e}")
73
- raise
74
- sys.stdout.flush()
75
-
76
- # TTM imports (optional - will be loaded on demand)
77
- logger.info("Checking TTM (diffusers) availability...")
78
- sys.stdout.flush()
79
- TTM_COG_AVAILABLE = False
80
- TTM_WAN_AVAILABLE = False
81
- try:
82
- from diffusers import CogVideoXImageToVideoPipeline
83
- from diffusers.utils import export_to_video, load_image
84
- from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
85
- from diffusers.utils.torch_utils import randn_tensor
86
- from diffusers.video_processor import VideoProcessor
87
- TTM_COG_AVAILABLE = True
88
- logger.info("✅ CogVideoX TTM available")
89
- except ImportError as e:
90
- logger.info(f"ℹ️ CogVideoX TTM not available: {e}")
91
- sys.stdout.flush()
92
-
93
- try:
94
- from diffusers import AutoencoderKLWan, WanTransformer3DModel
95
- from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
96
- from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline, retrieve_latents
97
- from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
98
- if not TTM_COG_AVAILABLE:
99
- from diffusers.utils import export_to_video, load_image
100
- from diffusers.utils.torch_utils import randn_tensor
101
- from diffusers.video_processor import VideoProcessor
102
- TTM_WAN_AVAILABLE = True
103
- logger.info("✅ Wan TTM available")
104
- except ImportError as e:
105
- logger.info(f"ℹ️ Wan TTM not available: {e}")
106
- sys.stdout.flush()
107
-
108
- TTM_AVAILABLE = TTM_COG_AVAILABLE or TTM_WAN_AVAILABLE
109
- if not TTM_AVAILABLE:
110
- logger.warning("⚠️ Diffusers not available. TTM features will be disabled.")
111
- else:
112
- logger.info(f"TTM Status - CogVideoX: {TTM_COG_AVAILABLE}, Wan: {TTM_WAN_AVAILABLE}")
113
- sys.stdout.flush()
114
 
115
  # Constants
116
  MAX_FRAMES = 80
@@ -129,126 +46,9 @@ CAMERA_MOVEMENTS = [
129
  "move_down"
130
  ]
131
 
132
- # TTM Constants
133
- TTM_COG_MODEL_ID = "THUDM/CogVideoX-5b-I2V"
134
- TTM_WAN_MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
135
- TTM_DTYPE = torch.bfloat16
136
- TTM_DEFAULT_NUM_FRAMES = 49
137
- TTM_DEFAULT_NUM_INFERENCE_STEPS = 50
138
-
139
- # TTM Model choices
140
- TTM_MODELS = []
141
- if TTM_COG_AVAILABLE:
142
- TTM_MODELS.append("CogVideoX-5B")
143
- if TTM_WAN_AVAILABLE:
144
- TTM_MODELS.append("Wan2.2-14B (Recommended)")
145
-
146
- # Global model instances (lazy loaded for HF Spaces GPU compatibility)
147
- vggt4track_model = None
148
- tracker_model = None
149
- ttm_cog_pipeline = None
150
- ttm_wan_pipeline = None
151
- MODELS_LOADED = False
152
-
153
-
154
- def load_video_to_tensor(video_path: str) -> torch.Tensor:
155
- """Returns a video tensor from a video file. shape [1, C, T, H, W], [0, 1] range."""
156
- cap = cv2.VideoCapture(video_path)
157
- frames = []
158
- while True:
159
- ret, frame = cap.read()
160
- if not ret:
161
- break
162
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
163
- frames.append(frame)
164
- cap.release()
165
-
166
- frames = np.array(frames)
167
- video_tensor = torch.tensor(frames)
168
- video_tensor = video_tensor.permute(0, 3, 1, 2).float() / 255.0
169
- video_tensor = video_tensor.unsqueeze(0).permute(0, 2, 1, 3, 4)
170
- return video_tensor
171
-
172
-
173
- def get_ttm_cog_pipeline():
174
- """Lazy load CogVideoX TTM pipeline to save memory."""
175
- global ttm_cog_pipeline
176
- if ttm_cog_pipeline is None and TTM_COG_AVAILABLE:
177
- logger.info("Loading TTM CogVideoX pipeline...")
178
- ttm_cog_pipeline = CogVideoXImageToVideoPipeline.from_pretrained(
179
- TTM_COG_MODEL_ID,
180
- torch_dtype=TTM_DTYPE,
181
- low_cpu_mem_usage=True,
182
- )
183
- ttm_cog_pipeline.vae.enable_tiling()
184
- ttm_cog_pipeline.vae.enable_slicing()
185
- logger.info("TTM CogVideoX pipeline loaded successfully!")
186
- return ttm_cog_pipeline
187
-
188
-
189
- def get_ttm_wan_pipeline():
190
- """Lazy load Wan TTM pipeline to save memory."""
191
- global ttm_wan_pipeline
192
- if ttm_wan_pipeline is None and TTM_WAN_AVAILABLE:
193
- logger.info("Loading TTM Wan 2.2 pipeline...")
194
- ttm_wan_pipeline = WanImageToVideoPipeline.from_pretrained(
195
- TTM_WAN_MODEL_ID,
196
- torch_dtype=TTM_DTYPE,
197
- )
198
- ttm_wan_pipeline.vae.enable_tiling()
199
- ttm_wan_pipeline.vae.enable_slicing()
200
- logger.info("TTM Wan 2.2 pipeline loaded successfully!")
201
- return ttm_wan_pipeline
202
-
203
-
204
- logger.info("Setting up thread pool and utility functions...")
205
- sys.stdout.flush()
206
-
207
  # Thread pool for delayed deletion
208
  thread_pool_executor = ThreadPoolExecutor(max_workers=2)
209
 
210
-
211
- def load_models():
212
- """Load models lazily when GPU is available (inside @spaces.GPU decorated function)."""
213
- global vggt4track_model, tracker_model, MODELS_LOADED
214
-
215
- if MODELS_LOADED:
216
- logger.info("Models already loaded, skipping...")
217
- return
218
-
219
- logger.info("🚀 Starting model loading...")
220
- sys.stdout.flush()
221
-
222
- try:
223
- logger.info("Loading VGGT4Track model from 'Yuxihenry/SpatialTrackerV2_Front'...")
224
- sys.stdout.flush()
225
- vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
226
- vggt4track_model.eval()
227
- logger.info("✅ VGGT4Track model loaded, moving to CUDA...")
228
- sys.stdout.flush()
229
- vggt4track_model = vggt4track_model.to("cuda")
230
- logger.info("✅ VGGT4Track model on CUDA")
231
- sys.stdout.flush()
232
-
233
- logger.info("Loading Predictor model from 'Yuxihenry/SpatialTrackerV2-Offline'...")
234
- sys.stdout.flush()
235
- tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline")
236
- tracker_model.eval()
237
- logger.info("✅ Predictor model loaded")
238
- sys.stdout.flush()
239
-
240
- MODELS_LOADED = True
241
- logger.info("✅ All models loaded successfully!")
242
- sys.stdout.flush()
243
-
244
- except Exception as e:
245
- logger.error(f"❌ Failed to load models: {e}")
246
- import traceback
247
- traceback.print_exc()
248
- sys.stdout.flush()
249
- raise
250
-
251
-
252
  def delete_later(path: Union[str, os.PathLike], delay: int = 600):
253
  """Delete file or directory after specified delay"""
254
  def _delete():
@@ -267,7 +67,6 @@ def delete_later(path: Union[str, os.PathLike], delay: int = 600):
267
  thread_pool_executor.submit(_wait_and_delete)
268
  atexit.register(_delete)
269
 
270
-
271
  def create_user_temp_dir():
272
  """Create a unique temporary directory for each user session"""
273
  session_id = str(uuid.uuid4())[:8]
@@ -276,16 +75,17 @@ def create_user_temp_dir():
276
  delete_later(temp_dir, delay=600)
277
  return temp_dir
278
 
 
 
 
 
 
279
 
280
- # Note: Models are loaded lazily inside @spaces.GPU decorated functions
281
- # This is required for HF Spaces ZeroGPU compatibility
282
- logger.info("Models will be loaded lazily when GPU is available")
283
- sys.stdout.flush()
284
 
285
- logger.info("Setting up Gradio static paths...")
286
  gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
287
- logger.info("✅ Static paths configured")
288
- sys.stdout.flush()
289
 
290
 
291
  def generate_camera_trajectory(num_frames: int, movement_type: str,
@@ -311,8 +111,7 @@ def generate_camera_trajectory(num_frames: int, movement_type: str,
311
  if movement_type == "static":
312
  pass # Keep identity
313
  elif movement_type == "move_forward":
314
- # Move along -Z (forward in OpenGL convention)
315
- ext[2, 3] = -speed * t
316
  elif movement_type == "move_backward":
317
  ext[2, 3] = speed * t # Move along +Z
318
  elif movement_type == "move_left":
@@ -369,8 +168,7 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
369
  base_dir = os.path.dirname(output_path)
370
  motion_signal_path = os.path.join(base_dir, "motion_signal.mp4")
371
  mask_path = os.path.join(base_dir, "mask.mp4")
372
- out_motion_signal = cv2.VideoWriter(
373
- motion_signal_path, fourcc, fps, (W, H))
374
  out_mask = cv2.VideoWriter(mask_path, fourcc, fps, (W, H))
375
 
376
  # Create meshgrid for pixel coordinates
@@ -450,21 +248,17 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
450
  if hole_mask.sum() == 0:
451
  break
452
  dilated = cv2.dilate(motion_signal_frame, kernel, iterations=1)
453
- motion_signal_frame = np.where(
454
- hole_mask[:, :, None] > 0, dilated, motion_signal_frame)
455
- hole_mask = (motion_signal_frame.sum(
456
- axis=-1) == 0).astype(np.uint8)
457
 
458
  # Write TTM outputs if enabled
459
  if generate_ttm_inputs:
460
  # Motion signal: warped frame with NN inpainting
461
- motion_signal_bgr = cv2.cvtColor(
462
- motion_signal_frame, cv2.COLOR_RGB2BGR)
463
  out_motion_signal.write(motion_signal_bgr)
464
 
465
  # Mask: binary mask of valid (projected) pixels - white where valid, black where holes
466
- mask_frame = np.stack(
467
- [valid_mask, valid_mask, valid_mask], axis=-1)
468
  out_mask.write(mask_frame)
469
 
470
  # For the rendered output, use the same inpainted result
@@ -484,7 +278,7 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
484
  }
485
 
486
 
487
- @spaces.GPU(duration=180)
488
  def run_spatial_tracker(video_tensor: torch.Tensor):
489
  """
490
  GPU-intensive spatial tracking function.
@@ -495,23 +289,9 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
495
  Returns:
496
  Dictionary containing tracking results
497
  """
498
- global vggt4track_model, tracker_model
499
-
500
- logger.info("run_spatial_tracker: Starting GPU execution...")
501
- sys.stdout.flush()
502
-
503
- # Load models if not already loaded (lazy loading for HF Spaces)
504
- load_models()
505
-
506
- logger.info("run_spatial_tracker: Preprocessing video input...")
507
- sys.stdout.flush()
508
-
509
  # Run VGGT to get depth and camera poses
510
  video_input = preprocess_image(video_tensor)[None].cuda()
511
 
512
- logger.info("run_spatial_tracker: Running VGGT inference...")
513
- sys.stdout.flush()
514
-
515
  with torch.no_grad():
516
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
517
  predictions = vggt4track_model(video_input / 255)
@@ -520,9 +300,6 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
520
  depth_map = predictions["points_map"][..., 2]
521
  depth_conf = predictions["unc_metric"]
522
 
523
- logger.info("run_spatial_tracker: VGGT inference complete")
524
- sys.stdout.flush()
525
-
526
  depth_tensor = depth_map.squeeze().cpu().numpy()
527
  extrs = extrinsic.squeeze().cpu().numpy()
528
  intrs = intrinsic.squeeze().cpu().numpy()
@@ -530,20 +307,13 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
530
  unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
531
 
532
  # Setup tracker
533
- logger.info("run_spatial_tracker: Setting up tracker...")
534
- sys.stdout.flush()
535
-
536
  tracker_model.spatrack.track_num = 512
537
  tracker_model.to("cuda")
538
 
539
  # Get grid points for tracking
540
  frame_H, frame_W = video_tensor_gpu.shape[2:]
541
  grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
542
- query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[
543
- 0].numpy()
544
-
545
- logger.info("run_spatial_tracker: Running 3D tracker...")
546
- sys.stdout.flush()
547
 
548
  # Run tracker
549
  with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
@@ -571,11 +341,8 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
571
  conf_depth = T.Resize((new_h, new_w))(conf_depth)
572
  intrs_out[:, :2, :] = intrs_out[:, :2, :] * scale
573
 
574
- logger.info("run_spatial_tracker: Moving results to CPU...")
575
- sys.stdout.flush()
576
-
577
  # Move results to CPU and return
578
- result = {
579
  'video_out': video_out.cpu(),
580
  'point_map': point_map.cpu(),
581
  'conf_depth': conf_depth.cpu(),
@@ -583,11 +350,6 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
583
  'c2w_traj': c2w_traj.cpu(),
584
  }
585
 
586
- logger.info("run_spatial_tracker: Complete!")
587
- sys.stdout.flush()
588
-
589
- return result
590
-
591
 
592
  def process_video(video_path: str, camera_movement: str, generate_ttm: bool = True, progress=gr.Progress()):
593
  """Main processing function
@@ -643,8 +405,7 @@ def process_video(video_path: str, camera_movement: str, generate_ttm: bool = Tr
643
  c2w_traj = tracking_results['c2w_traj']
644
 
645
  # Get RGB frames and depth
646
- rgb_frames = rearrange(
647
- video_out.numpy(), "T C H W -> T H W C").astype(np.uint8)
648
  depth_frames = point_map[:, 2].numpy()
649
  depth_conf_np = conf_depth.numpy()
650
 
@@ -655,8 +416,7 @@ def process_video(video_path: str, camera_movement: str, generate_ttm: bool = Tr
655
  intrs_np = intrs_out.numpy()
656
  extrs_np = torch.inverse(c2w_traj).numpy() # world-to-camera
657
 
658
- progress(
659
- 0.7, desc=f"Generating {camera_movement} camera trajectory...")
660
 
661
  # Calculate scene scale from depth
662
  valid_depth = depth_frames[depth_frames > 0]
@@ -711,995 +471,107 @@ def process_video(video_path: str, camera_movement: str, generate_ttm: bool = Tr
711
  return None, None, None, None, f"❌ Error: {str(e)}"
712
 
713
 
714
- # TTM CogVideoX Pipeline Helper Classes and Functions
715
- class CogVideoXTTMHelper:
716
- """Helper class for TTM-style video generation using CogVideoX pipeline."""
717
-
718
- def __init__(self, pipeline):
719
- self.pipeline = pipeline
720
- self.vae = pipeline.vae
721
- self.transformer = pipeline.transformer
722
- self.scheduler = pipeline.scheduler
723
- self.vae_scale_factor_spatial = 2 ** (
724
- len(self.vae.config.block_out_channels) - 1)
725
- self.vae_scale_factor_temporal = self.vae.config.temporal_compression_ratio
726
- self.vae_scaling_factor_image = self.vae.config.scaling_factor
727
- self.video_processor = pipeline.video_processor
728
-
729
- @torch.no_grad()
730
- def encode_frames(self, frames: torch.Tensor) -> torch.Tensor:
731
- """Encode video frames into latent space. Input shape (B, C, F, H, W), expected range [-1, 1]."""
732
- latents = self.vae.encode(frames)[0].sample()
733
- latents = latents * self.vae_scaling_factor_image
734
- # (B, C, F, H, W) -> (B, F, C, H, W)
735
- return latents.permute(0, 2, 1, 3, 4).contiguous()
736
-
737
- def convert_rgb_mask_to_latent_mask(self, mask: torch.Tensor) -> torch.Tensor:
738
- """Convert a per-frame mask [T, 1, H, W] to latent resolution [1, T_latent, 1, H', W']."""
739
- k = self.vae_scale_factor_temporal
740
-
741
- mask0 = mask[0:1]
742
- mask1 = mask[1::k]
743
- sampled = torch.cat([mask0, mask1], dim=0)
744
- pooled = sampled.permute(1, 0, 2, 3).unsqueeze(0)
745
-
746
- s = self.vae_scale_factor_spatial
747
- H_latent = pooled.shape[-2] // s
748
- W_latent = pooled.shape[-1] // s
749
- pooled = F.interpolate(pooled, size=(
750
- pooled.shape[2], H_latent, W_latent), mode="nearest")
751
-
752
- latent_mask = pooled.permute(0, 2, 1, 3, 4)
753
- return latent_mask
754
-
755
-
756
- # TTM Wan Pipeline Helper Class
757
- class WanTTMHelper:
758
- """Helper class for TTM-style video generation using Wan pipeline."""
759
-
760
- def __init__(self, pipeline):
761
- self.pipeline = pipeline
762
- self.vae = pipeline.vae
763
- self.transformer = pipeline.transformer
764
- self.scheduler = pipeline.scheduler
765
- self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal
766
- self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial
767
- self.video_processor = pipeline.video_processor
768
-
769
- def convert_rgb_mask_to_latent_mask(self, mask: torch.Tensor) -> torch.Tensor:
770
- """Convert a per-frame mask [T, 1, H, W] to latent resolution [1, T_latent, 1, H', W']."""
771
- k = self.vae_scale_factor_temporal
772
-
773
- mask0 = mask[0:1]
774
- mask1 = mask[1::k]
775
- sampled = torch.cat([mask0, mask1], dim=0)
776
- pooled = sampled.permute(1, 0, 2, 3).unsqueeze(0)
777
-
778
- s = self.vae_scale_factor_spatial
779
- H_latent = pooled.shape[-2] // s
780
- W_latent = pooled.shape[-1] // s
781
- pooled = F.interpolate(pooled, size=(
782
- pooled.shape[2], H_latent, W_latent), mode="nearest")
783
-
784
- latent_mask = pooled.permute(0, 2, 1, 3, 4)
785
- return latent_mask
786
-
787
-
788
- def compute_hw_from_area(image_height: int, image_width: int, max_area: int, mod_value: int) -> tuple:
789
- """Compute (height, width) with proper aspect ratio and rounding."""
790
- aspect_ratio = image_height / image_width
791
- height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
792
- width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
793
- return int(height), int(width)
794
-
795
-
796
- @spaces.GPU(duration=300)
797
- def run_ttm_cog_generation(
798
- first_frame_path: str,
799
- motion_signal_path: str,
800
- mask_path: str,
801
- prompt: str,
802
- tweak_index: int = 4,
803
- tstrong_index: int = 9,
804
- num_frames: int = 49,
805
- num_inference_steps: int = 50,
806
- guidance_scale: float = 6.0,
807
- seed: int = 0,
808
- progress=gr.Progress()
809
- ):
810
- """
811
- Run TTM-style video generation using CogVideoX pipeline.
812
- Uses the generated motion signal and mask to guide video generation.
813
- """
814
- if not TTM_COG_AVAILABLE:
815
- return None, "❌ CogVideoX TTM is not available. Please install diffusers package."
816
-
817
- if first_frame_path is None or motion_signal_path is None or mask_path is None:
818
- return None, "❌ Please generate TTM inputs first (first_frame, motion_signal, mask)"
819
-
820
- progress(0, desc="Loading CogVideoX TTM pipeline...")
821
-
822
- try:
823
- # Get or load the pipeline
824
- pipe = get_ttm_cog_pipeline()
825
- if pipe is None:
826
- return None, "❌ Failed to load CogVideoX TTM pipeline"
827
-
828
- pipe = pipe.to("cuda")
829
-
830
- # Create helper
831
- ttm_helper = CogVideoXTTMHelper(pipe)
832
-
833
- progress(0.1, desc="Loading inputs...")
834
-
835
- # Load first frame
836
- image = load_image(first_frame_path)
837
-
838
- # Get dimensions
839
- height = pipe.transformer.config.sample_height * \
840
- ttm_helper.vae_scale_factor_spatial
841
- width = pipe.transformer.config.sample_width * \
842
- ttm_helper.vae_scale_factor_spatial
843
-
844
- device = "cuda"
845
- generator = torch.Generator(device=device).manual_seed(seed)
846
-
847
- progress(0.15, desc="Encoding prompt...")
848
-
849
- # Encode prompt
850
- do_classifier_free_guidance = guidance_scale > 1.0
851
- prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
852
- prompt=prompt,
853
- negative_prompt="",
854
- do_classifier_free_guidance=do_classifier_free_guidance,
855
- num_videos_per_prompt=1,
856
- max_sequence_length=226,
857
- device=device,
858
- )
859
- if do_classifier_free_guidance:
860
- prompt_embeds = torch.cat(
861
- [negative_prompt_embeds, prompt_embeds], dim=0)
862
-
863
- progress(0.2, desc="Preparing latents...")
864
-
865
- # Prepare timesteps
866
- pipe.scheduler.set_timesteps(num_inference_steps, device=device)
867
- timesteps = pipe.scheduler.timesteps
868
-
869
- # Prepare latents
870
- latent_frames = (
871
- num_frames - 1) // ttm_helper.vae_scale_factor_temporal + 1
872
-
873
- # Handle padding for CogVideoX 1.5
874
- patch_size_t = pipe.transformer.config.patch_size_t
875
- additional_frames = 0
876
- if patch_size_t is not None and latent_frames % patch_size_t != 0:
877
- additional_frames = patch_size_t - latent_frames % patch_size_t
878
- num_frames += additional_frames * ttm_helper.vae_scale_factor_temporal
879
-
880
- # Preprocess image
881
- image_tensor = ttm_helper.video_processor.preprocess(image, height=height, width=width).to(
882
- device, dtype=prompt_embeds.dtype
883
- )
884
-
885
- latent_channels = pipe.transformer.config.in_channels // 2
886
- latents, image_latents = pipe.prepare_latents(
887
- image_tensor,
888
- 1, # batch_size
889
- latent_channels,
890
- num_frames,
891
- height,
892
- width,
893
- prompt_embeds.dtype,
894
- device,
895
- generator,
896
- None,
897
- )
898
-
899
- progress(0.3, desc="Loading motion signal and mask...")
900
-
901
- # Load motion signal video
902
- ref_vid = load_video_to_tensor(motion_signal_path).to(device=device)
903
- refB, refC, refT, refH, refW = ref_vid.shape
904
- ref_vid = F.interpolate(
905
- ref_vid.permute(0, 2, 1, 3, 4).reshape(
906
- refB*refT, refC, refH, refW),
907
- size=(height, width), mode="bicubic", align_corners=True,
908
- ).reshape(refB, refT, refC, height, width).permute(0, 2, 1, 3, 4)
909
-
910
- ref_vid = ttm_helper.video_processor.normalize(
911
- ref_vid.to(dtype=pipe.vae.dtype))
912
- ref_latents = ttm_helper.encode_frames(ref_vid).float().detach()
913
-
914
- # Load mask video
915
- ref_mask = load_video_to_tensor(mask_path).to(device=device)
916
- mB, mC, mT, mH, mW = ref_mask.shape
917
- ref_mask = F.interpolate(
918
- ref_mask.permute(0, 2, 1, 3, 4).reshape(mB*mT, mC, mH, mW),
919
- size=(height, width), mode="nearest",
920
- ).reshape(mB, mT, mC, height, width).permute(0, 2, 1, 3, 4)
921
- ref_mask = ref_mask[0].permute(1, 0, 2, 3).contiguous()
922
-
923
- if len(ref_mask.shape) == 4:
924
- ref_mask = ref_mask.unsqueeze(0)
925
-
926
- ref_mask = ref_mask[0, :, :1].contiguous()
927
- ref_mask = (ref_mask > 0.5).float().max(dim=1, keepdim=True)[0]
928
- motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(ref_mask)
929
- background_mask = 1.0 - motion_mask
930
-
931
- progress(0.35, desc="Initializing TTM denoising...")
932
-
933
- # Initialize with noisy reference latents at tweak timestep
934
- if tweak_index >= 0:
935
- tweak = timesteps[tweak_index]
936
- fixed_noise = randn_tensor(
937
- ref_latents.shape,
938
- generator=generator,
939
- device=ref_latents.device,
940
- dtype=ref_latents.dtype,
941
- )
942
- noisy_latents = pipe.scheduler.add_noise(
943
- ref_latents, fixed_noise, tweak.long())
944
- latents = noisy_latents.to(
945
- dtype=latents.dtype, device=latents.device)
946
- else:
947
- fixed_noise = randn_tensor(
948
- ref_latents.shape,
949
- generator=generator,
950
- device=ref_latents.device,
951
- dtype=ref_latents.dtype,
952
- )
953
- tweak_index = 0
954
-
955
- # Prepare extra step kwargs
956
- extra_step_kwargs = pipe.prepare_extra_step_kwargs(generator, 0.0)
957
-
958
- # Create rotary embeddings if required
959
- image_rotary_emb = (
960
- pipe._prepare_rotary_positional_embeddings(
961
- height, width, latents.size(1), device)
962
- if pipe.transformer.config.use_rotary_positional_embeddings
963
- else None
964
- )
965
-
966
- # Create ofs embeddings if required
967
- ofs_emb = None if pipe.transformer.config.ofs_embed_dim is None else latents.new_full(
968
- (1,), fill_value=2.0)
969
-
970
- progress(0.4, desc="Running TTM denoising loop...")
971
-
972
- # Denoising loop
973
- total_steps = len(timesteps) - tweak_index
974
- old_pred_original_sample = None
975
-
976
- for i, t in enumerate(timesteps[tweak_index:]):
977
- step_progress = 0.4 + 0.5 * (i / total_steps)
978
- progress(step_progress,
979
- desc=f"Denoising step {i+1}/{total_steps}...")
980
-
981
- latent_model_input = torch.cat(
982
- [latents] * 2) if do_classifier_free_guidance else latents
983
- latent_model_input = pipe.scheduler.scale_model_input(
984
- latent_model_input, t)
985
-
986
- latent_image_input = torch.cat(
987
- [image_latents] * 2) if do_classifier_free_guidance else image_latents
988
- latent_model_input = torch.cat(
989
- [latent_model_input, latent_image_input], dim=2)
990
-
991
- timestep = t.expand(latent_model_input.shape[0])
992
-
993
- # Predict noise
994
- noise_pred = pipe.transformer(
995
- hidden_states=latent_model_input,
996
- encoder_hidden_states=prompt_embeds,
997
- timestep=timestep,
998
- ofs=ofs_emb,
999
- image_rotary_emb=image_rotary_emb,
1000
- return_dict=False,
1001
- )[0]
1002
- noise_pred = noise_pred.float()
1003
-
1004
- # Perform guidance
1005
- if do_classifier_free_guidance:
1006
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1007
- noise_pred = noise_pred_uncond + guidance_scale * \
1008
- (noise_pred_text - noise_pred_uncond)
1009
-
1010
- # Compute previous noisy sample
1011
- if not isinstance(pipe.scheduler, CogVideoXDPMScheduler):
1012
- latents, old_pred_original_sample = pipe.scheduler.step(
1013
- noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1014
- )
1015
- else:
1016
- latents, old_pred_original_sample = pipe.scheduler.step(
1017
- noise_pred,
1018
- old_pred_original_sample,
1019
- t,
1020
- timesteps[i - 1] if i > 0 else None,
1021
- latents,
1022
- **extra_step_kwargs,
1023
- return_dict=False,
1024
- )
1025
-
1026
- # TTM: In between tweak and tstrong, replace mask with noisy reference latents
1027
- in_between_tweak_tstrong = (i + tweak_index) < tstrong_index
1028
-
1029
- if in_between_tweak_tstrong:
1030
- if i + tweak_index + 1 < len(timesteps):
1031
- prev_t = timesteps[i + tweak_index + 1]
1032
- noisy_latents = pipe.scheduler.add_noise(ref_latents, fixed_noise, prev_t.long()).to(
1033
- dtype=latents.dtype, device=latents.device
1034
- )
1035
- latents = latents * background_mask + noisy_latents * motion_mask
1036
- else:
1037
- latents = latents * background_mask + ref_latents * motion_mask
1038
-
1039
- latents = latents.to(prompt_embeds.dtype)
1040
-
1041
- progress(0.9, desc="Decoding video...")
1042
-
1043
- # Decode latents
1044
- latents = latents[:, additional_frames:]
1045
- frames = pipe.decode_latents(latents)
1046
- video = ttm_helper.video_processor.postprocess_video(
1047
- video=frames, output_type="pil")
1048
-
1049
- progress(0.95, desc="Saving video...")
1050
-
1051
- # Save video
1052
- temp_dir = create_user_temp_dir()
1053
- output_path = os.path.join(temp_dir, "ttm_output.mp4")
1054
- export_to_video(video[0], output_path, fps=8)
1055
-
1056
- progress(1.0, desc="Done!")
1057
-
1058
- return output_path, f"✅ CogVideoX TTM video generated successfully!\n\n**Parameters:**\n- Model: CogVideoX-5B\n- tweak_index: {tweak_index}\n- tstrong_index: {tstrong_index}\n- guidance_scale: {guidance_scale}\n- seed: {seed}"
1059
-
1060
- except Exception as e:
1061
- logger.error(f"Error in CogVideoX TTM generation: {e}")
1062
- import traceback
1063
- traceback.print_exc()
1064
- return None, f"❌ Error: {str(e)}"
1065
-
1066
-
1067
- @spaces.GPU(duration=300)
1068
- def run_ttm_wan_generation(
1069
- first_frame_path: str,
1070
- motion_signal_path: str,
1071
- mask_path: str,
1072
- prompt: str,
1073
- negative_prompt: str = "",
1074
- tweak_index: int = 3,
1075
- tstrong_index: int = 7,
1076
- num_frames: int = 81,
1077
- num_inference_steps: int = 50,
1078
- guidance_scale: float = 3.5,
1079
- seed: int = 0,
1080
- progress=gr.Progress()
1081
- ):
1082
- """
1083
- Run TTM-style video generation using Wan 2.2 pipeline.
1084
- This is the recommended model for TTM as it produces higher-quality results.
1085
- """
1086
- if not TTM_WAN_AVAILABLE:
1087
- return None, "❌ Wan TTM is not available. Please install diffusers with Wan support."
1088
-
1089
- if first_frame_path is None or motion_signal_path is None or mask_path is None:
1090
- return None, "❌ Please generate TTM inputs first (first_frame, motion_signal, mask)"
1091
-
1092
- progress(0, desc="Loading Wan 2.2 TTM pipeline...")
1093
-
1094
- try:
1095
- # Get or load the pipeline
1096
- pipe = get_ttm_wan_pipeline()
1097
- if pipe is None:
1098
- return None, "❌ Failed to load Wan TTM pipeline"
1099
-
1100
- pipe = pipe.to("cuda")
1101
-
1102
- # Create helper
1103
- ttm_helper = WanTTMHelper(pipe)
1104
-
1105
- progress(0.1, desc="Loading inputs...")
1106
-
1107
- # Load first frame
1108
- image = load_image(first_frame_path)
1109
-
1110
- # Get dimensions - compute based on image aspect ratio
1111
- max_area = 480 * 832
1112
- mod_value = ttm_helper.vae_scale_factor_spatial * \
1113
- pipe.transformer.config.patch_size[1]
1114
- height, width = compute_hw_from_area(
1115
- image.height, image.width, max_area, mod_value)
1116
- image = image.resize((width, height))
1117
-
1118
- device = "cuda"
1119
- gen_device = device if device.startswith("cuda") else "cpu"
1120
- generator = torch.Generator(device=gen_device).manual_seed(seed)
1121
-
1122
- progress(0.15, desc="Encoding prompt...")
1123
-
1124
- # Encode prompt
1125
- do_classifier_free_guidance = guidance_scale > 1.0
1126
- prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
1127
- prompt=prompt,
1128
- negative_prompt=negative_prompt if negative_prompt else None,
1129
- do_classifier_free_guidance=do_classifier_free_guidance,
1130
- num_videos_per_prompt=1,
1131
- max_sequence_length=512,
1132
- device=device,
1133
- )
1134
-
1135
- # Get transformer dtype
1136
- transformer_dtype = pipe.transformer.dtype
1137
- prompt_embeds = prompt_embeds.to(transformer_dtype)
1138
- if negative_prompt_embeds is not None:
1139
- negative_prompt_embeds = negative_prompt_embeds.to(
1140
- transformer_dtype)
1141
-
1142
- # Encode image embedding if transformer supports it
1143
- image_embeds = None
1144
- if pipe.transformer.config.image_dim is not None:
1145
- image_embeds = pipe.encode_image(image, device)
1146
- image_embeds = image_embeds.repeat(1, 1, 1)
1147
- image_embeds = image_embeds.to(transformer_dtype)
1148
-
1149
- progress(0.2, desc="Preparing latents...")
1150
-
1151
- # Prepare timesteps
1152
- pipe.scheduler.set_timesteps(num_inference_steps, device=device)
1153
- timesteps = pipe.scheduler.timesteps
1154
-
1155
- # Adjust num_frames to be valid for VAE
1156
- if num_frames % ttm_helper.vae_scale_factor_temporal != 1:
1157
- num_frames = num_frames // ttm_helper.vae_scale_factor_temporal * \
1158
- ttm_helper.vae_scale_factor_temporal + 1
1159
- num_frames = max(num_frames, 1)
1160
-
1161
- # Prepare latent variables
1162
- num_channels_latents = pipe.vae.config.z_dim
1163
- image_tensor = ttm_helper.video_processor.preprocess(
1164
- image, height=height, width=width).to(device, dtype=torch.float32)
1165
-
1166
- latents_outputs = pipe.prepare_latents(
1167
- image_tensor,
1168
- 1, # batch_size
1169
- num_channels_latents,
1170
- height,
1171
- width,
1172
- num_frames,
1173
- torch.float32,
1174
- device,
1175
- generator,
1176
- None,
1177
- None, # last_image
1178
- )
1179
-
1180
- if hasattr(pipe, 'config') and pipe.config.expand_timesteps:
1181
- latents, condition, first_frame_mask = latents_outputs
1182
- else:
1183
- latents, condition = latents_outputs
1184
- first_frame_mask = None
1185
-
1186
- progress(0.3, desc="Loading motion signal and mask...")
1187
-
1188
- # Load motion signal video
1189
- ref_vid = load_video_to_tensor(motion_signal_path).to(device=device)
1190
- refB, refC, refT, refH, refW = ref_vid.shape
1191
- ref_vid = F.interpolate(
1192
- ref_vid.permute(0, 2, 1, 3, 4).reshape(
1193
- refB*refT, refC, refH, refW),
1194
- size=(height, width), mode="bicubic", align_corners=True,
1195
- ).reshape(refB, refT, refC, height, width).permute(0, 2, 1, 3, 4)
1196
-
1197
- ref_vid = ttm_helper.video_processor.normalize(
1198
- ref_vid.to(dtype=pipe.vae.dtype))
1199
- ref_latents = retrieve_latents(
1200
- pipe.vae.encode(ref_vid), sample_mode="argmax")
1201
-
1202
- # Normalize latents
1203
- latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
1204
- 1, pipe.vae.config.z_dim, 1, 1, 1).to(ref_latents.device, ref_latents.dtype)
1205
- latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(
1206
- 1, pipe.vae.config.z_dim, 1, 1, 1).to(ref_latents.device, ref_latents.dtype)
1207
- ref_latents = (ref_latents - latents_mean) * latents_std
1208
-
1209
- # Load mask video
1210
- ref_mask = load_video_to_tensor(mask_path).to(device=device)
1211
- mB, mC, mT, mH, mW = ref_mask.shape
1212
- ref_mask = F.interpolate(
1213
- ref_mask.permute(0, 2, 1, 3, 4).reshape(mB*mT, mC, mH, mW),
1214
- size=(height, width), mode="nearest",
1215
- ).reshape(mB, mT, mC, height, width).permute(0, 2, 1, 3, 4)
1216
- mask_tc_hw = ref_mask[0].permute(1, 0, 2, 3).contiguous()
1217
-
1218
- # Align time dimension
1219
- if mask_tc_hw.shape[0] > num_frames:
1220
- mask_tc_hw = mask_tc_hw[:num_frames]
1221
- elif mask_tc_hw.shape[0] < num_frames:
1222
- return None, f"❌ num_frames ({num_frames}) > mask frames ({mask_tc_hw.shape[0]}). Please use more mask frames."
1223
-
1224
- # Reduce channels if needed
1225
- if mask_tc_hw.shape[1] > 1:
1226
- mask_t1_hw = (mask_tc_hw > 0.5).any(dim=1, keepdim=True).float()
1227
- else:
1228
- mask_t1_hw = (mask_tc_hw > 0.5).float()
1229
-
1230
- motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(
1231
- mask_t1_hw).permute(0, 2, 1, 3, 4).contiguous()
1232
- background_mask = 1.0 - motion_mask
1233
-
1234
- progress(0.35, desc="Initializing TTM denoising...")
1235
-
1236
- # Initialize with noisy reference latents at tweak timestep
1237
- if tweak_index >= 0 and tweak_index < len(timesteps):
1238
- tweak = timesteps[tweak_index]
1239
- fixed_noise = randn_tensor(
1240
- ref_latents.shape,
1241
- generator=generator,
1242
- device=ref_latents.device,
1243
- dtype=ref_latents.dtype,
1244
- )
1245
- tweak_t = torch.as_tensor(
1246
- tweak, device=ref_latents.device, dtype=torch.long).view(1)
1247
- noisy_latents = pipe.scheduler.add_noise(
1248
- ref_latents, fixed_noise, tweak_t.long())
1249
- latents = noisy_latents.to(
1250
- dtype=latents.dtype, device=latents.device)
1251
- else:
1252
- fixed_noise = randn_tensor(
1253
- ref_latents.shape,
1254
- generator=generator,
1255
- device=ref_latents.device,
1256
- dtype=ref_latents.dtype,
1257
- )
1258
- tweak_index = 0
1259
-
1260
- progress(0.4, desc="Running TTM denoising loop...")
1261
-
1262
- # Denoising loop
1263
- total_steps = len(timesteps) - tweak_index
1264
-
1265
- for i, t in enumerate(timesteps[tweak_index:]):
1266
- step_progress = 0.4 + 0.5 * (i / total_steps)
1267
- progress(step_progress,
1268
- desc=f"Denoising step {i+1}/{total_steps}...")
1269
-
1270
- # Prepare model input
1271
- if first_frame_mask is not None:
1272
- latent_model_input = (1 - first_frame_mask) * \
1273
- condition + first_frame_mask * latents
1274
- latent_model_input = latent_model_input.to(transformer_dtype)
1275
- temp_ts = (first_frame_mask[0][0][:, ::2, ::2] * t).flatten()
1276
- timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
1277
- else:
1278
- latent_model_input = torch.cat(
1279
- [latents, condition], dim=1).to(transformer_dtype)
1280
- timestep = t.expand(latents.shape[0])
1281
-
1282
- # Predict noise (conditional)
1283
- noise_pred = pipe.transformer(
1284
- hidden_states=latent_model_input,
1285
- timestep=timestep,
1286
- encoder_hidden_states=prompt_embeds,
1287
- encoder_hidden_states_image=image_embeds,
1288
- return_dict=False,
1289
- )[0]
1290
-
1291
- # CFG
1292
- if do_classifier_free_guidance:
1293
- noise_uncond = pipe.transformer(
1294
- hidden_states=latent_model_input,
1295
- timestep=timestep,
1296
- encoder_hidden_states=negative_prompt_embeds,
1297
- encoder_hidden_states_image=image_embeds,
1298
- return_dict=False,
1299
- )[0]
1300
- noise_pred = noise_uncond + guidance_scale * \
1301
- (noise_pred - noise_uncond)
1302
-
1303
- # Scheduler step
1304
- latents = pipe.scheduler.step(
1305
- noise_pred, t, latents, return_dict=False)[0]
1306
-
1307
- # TTM: In between tweak and tstrong, replace mask with noisy reference latents
1308
- in_between_tweak_tstrong = (i + tweak_index) < tstrong_index
1309
-
1310
- if in_between_tweak_tstrong:
1311
- if i + tweak_index + 1 < len(timesteps):
1312
- prev_t = timesteps[i + tweak_index + 1]
1313
- prev_t = torch.as_tensor(
1314
- prev_t, device=ref_latents.device, dtype=torch.long).view(1)
1315
- noisy_latents = pipe.scheduler.add_noise(ref_latents, fixed_noise, prev_t.long()).to(
1316
- dtype=latents.dtype, device=latents.device
1317
- )
1318
- latents = latents * background_mask + noisy_latents * motion_mask
1319
- else:
1320
- latents = latents * background_mask + \
1321
- ref_latents.to(dtype=latents.dtype,
1322
- device=latents.device) * motion_mask
1323
-
1324
- progress(0.9, desc="Decoding video...")
1325
-
1326
- # Apply first frame mask if used
1327
- if first_frame_mask is not None:
1328
- latents = (1 - first_frame_mask) * condition + \
1329
- first_frame_mask * latents
1330
-
1331
- # Decode latents
1332
- latents = latents.to(pipe.vae.dtype)
1333
- latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
1334
- 1, pipe.vae.config.z_dim, 1, 1, 1).to(latents.device, latents.dtype)
1335
- latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(
1336
- 1, pipe.vae.config.z_dim, 1, 1, 1).to(latents.device, latents.dtype)
1337
- latents = latents / latents_std + latents_mean
1338
- video = pipe.vae.decode(latents, return_dict=False)[0]
1339
- video = ttm_helper.video_processor.postprocess_video(
1340
- video, output_type="pil")
1341
-
1342
- progress(0.95, desc="Saving video...")
1343
-
1344
- # Save video
1345
- temp_dir = create_user_temp_dir()
1346
- output_path = os.path.join(temp_dir, "ttm_wan_output.mp4")
1347
- export_to_video(video[0], output_path, fps=16)
1348
-
1349
- progress(1.0, desc="Done!")
1350
-
1351
- return output_path, f"✅ Wan 2.2 TTM video generated successfully!\n\n**Parameters:**\n- Model: Wan2.2-14B\n- tweak_index: {tweak_index}\n- tstrong_index: {tstrong_index}\n- guidance_scale: {guidance_scale}\n- seed: {seed}"
1352
-
1353
- except Exception as e:
1354
- logger.error(f"Error in Wan TTM generation: {e}")
1355
- import traceback
1356
- traceback.print_exc()
1357
- return None, f"❌ Error: {str(e)}"
1358
-
1359
-
1360
- def run_ttm_generation(
1361
- first_frame_path: str,
1362
- motion_signal_path: str,
1363
- mask_path: str,
1364
- prompt: str,
1365
- negative_prompt: str,
1366
- model_choice: str,
1367
- tweak_index: int,
1368
- tstrong_index: int,
1369
- num_frames: int,
1370
- num_inference_steps: int,
1371
- guidance_scale: float,
1372
- seed: int,
1373
- progress=gr.Progress()
1374
- ):
1375
- """
1376
- Router function that calls the appropriate TTM generation based on model choice.
1377
- """
1378
- if "Wan" in model_choice:
1379
- return run_ttm_wan_generation(
1380
- first_frame_path=first_frame_path,
1381
- motion_signal_path=motion_signal_path,
1382
- mask_path=mask_path,
1383
- prompt=prompt,
1384
- negative_prompt=negative_prompt,
1385
- tweak_index=tweak_index,
1386
- tstrong_index=tstrong_index,
1387
- num_frames=num_frames,
1388
- num_inference_steps=num_inference_steps,
1389
- guidance_scale=guidance_scale,
1390
- seed=seed,
1391
- progress=progress,
1392
- )
1393
- else:
1394
- return run_ttm_cog_generation(
1395
- first_frame_path=first_frame_path,
1396
- motion_signal_path=motion_signal_path,
1397
- mask_path=mask_path,
1398
- prompt=prompt,
1399
- tweak_index=tweak_index,
1400
- tstrong_index=tstrong_index,
1401
- num_frames=num_frames,
1402
- num_inference_steps=num_inference_steps,
1403
- guidance_scale=guidance_scale,
1404
- seed=seed,
1405
- progress=progress,
1406
- )
1407
-
1408
-
1409
  # Create Gradio interface
1410
- logger.info("🎨 Creating Gradio interface...")
1411
- sys.stdout.flush()
1412
 
1413
  with gr.Blocks(
1414
  theme=gr.themes.Soft(),
1415
  title="🎬 Video to Point Cloud Renderer",
1416
  css="""
1417
  .gradio-container {
1418
- max-width: 1400px !important;
1419
  margin: auto !important;
1420
  }
1421
  """
1422
  ) as demo:
1423
  gr.Markdown("""
1424
- # 🎬 Video to Point Cloud Renderer + TTM Video Generation
1425
 
1426
- Upload a video to generate a 3D point cloud, render it from a new camera perspective,
1427
- and optionally run **Time-to-Move (TTM)** for motion-controlled video generation.
1428
 
1429
- **Workflow:**
1430
- 1. **Step 1**: Upload a video and select camera movement → Generate motion signal & mask
1431
- 2. **Step 2**: (Optional) Run TTM to generate a high-quality video with the motion signal
 
1432
 
1433
- **TTM (Time-to-Move)** uses dual-clock denoising to guide video generation using:
1434
- - `first_frame.png`: Starting frame
1435
- - `motion_signal.mp4`: Warped video showing desired motion
1436
- - `mask.mp4`: Binary mask for motion regions
1437
  """)
1438
 
1439
- # State to store paths for TTM
1440
- first_frame_state = gr.State(None)
1441
- motion_signal_state = gr.State(None)
1442
- mask_state = gr.State(None)
1443
-
1444
- with gr.Tabs():
1445
- with gr.Tab("📥 Step 1: Generate Motion Signal"):
1446
- with gr.Row():
1447
- with gr.Column(scale=1):
1448
- gr.Markdown("### 📥 Input")
1449
- video_input = gr.Video(
1450
- label="Upload Video",
1451
- format="mp4",
1452
- height=300
1453
- )
1454
-
1455
- camera_movement = gr.Dropdown(
1456
- choices=CAMERA_MOVEMENTS,
1457
- value="static",
1458
- label="🎥 Camera Movement",
1459
- info="Select how the camera should move in the rendered video"
1460
- )
1461
-
1462
- generate_ttm = gr.Checkbox(
1463
- label="🎯 Generate TTM Inputs",
1464
- value=True,
1465
- info="Generate motion_signal.mp4 and mask.mp4 for Time-to-Move"
1466
- )
1467
-
1468
- generate_btn = gr.Button(
1469
- "🚀 Generate Motion Signal", variant="primary", size="lg")
1470
-
1471
- with gr.Column(scale=1):
1472
- gr.Markdown("### 📤 Rendered Output")
1473
- output_video = gr.Video(
1474
- label="Rendered Video",
1475
- height=250
1476
- )
1477
- first_frame_output = gr.Image(
1478
- label="First Frame (first_frame.png)",
1479
- height=150
1480
- )
1481
-
1482
- with gr.Row():
1483
- with gr.Column(scale=1):
1484
- gr.Markdown("### 🎯 TTM: Motion Signal")
1485
- motion_signal_output = gr.Video(
1486
- label="Motion Signal Video (motion_signal.mp4)",
1487
- height=250
1488
- )
1489
- with gr.Column(scale=1):
1490
- gr.Markdown("### 🎭 TTM: Mask")
1491
- mask_output = gr.Video(
1492
- label="Mask Video (mask.mp4)",
1493
- height=250
1494
- )
1495
-
1496
- status_text = gr.Markdown("Ready to process...")
1497
-
1498
- with gr.Tab("🎬 Step 2: TTM Video Generation"):
1499
- cog_available = "✅" if TTM_COG_AVAILABLE else "❌"
1500
- wan_available = "✅" if TTM_WAN_AVAILABLE else "❌"
1501
- gr.Markdown(f"""
1502
- ### 🎬 Time-to-Move (TTM) Video Generation
1503
-
1504
- **Model Availability:**
1505
- - {cog_available} CogVideoX-5B-I2V
1506
- - {wan_available} Wan 2.2-14B (Recommended - higher quality)
1507
-
1508
- **TTM Parameters:**
1509
- - **tweak_index**: When denoising starts *outside* the mask (lower = more dynamic background)
1510
- - **tstrong_index**: When denoising starts *inside* the mask (higher = more constrained motion)
1511
-
1512
- **Recommended values:**
1513
- - CogVideoX - Cut-and-Drag: `tweak_index=4`, `tstrong_index=9`
1514
- - CogVideoX - Camera control: `tweak_index=3`, `tstrong_index=7`
1515
- - **Wan 2.2 (Recommended)**: `tweak_index=3`, `tstrong_index=7`
1516
- """)
1517
-
1518
- with gr.Row():
1519
- with gr.Column(scale=1):
1520
- gr.Markdown("### ⚙️ TTM Settings")
1521
-
1522
- ttm_model_choice = gr.Dropdown(
1523
- choices=TTM_MODELS,
1524
- value=TTM_MODELS[1] if TTM_WAN_AVAILABLE else TTM_MODELS[0],
1525
- label="Model",
1526
- info="Wan 2.2 is recommended for higher quality"
1527
- )
1528
-
1529
- ttm_prompt = gr.Textbox(
1530
- label="Prompt",
1531
- placeholder="Describe the video content...",
1532
- value="A high quality video, smooth motion, natural lighting",
1533
- lines=2
1534
- )
1535
-
1536
- ttm_negative_prompt = gr.Textbox(
1537
- label="Negative Prompt (Wan only)",
1538
- placeholder="Things to avoid in the video...",
1539
- value="",
1540
- lines=1,
1541
- visible=True
1542
- )
1543
-
1544
- with gr.Row():
1545
- ttm_tweak_index = gr.Slider(
1546
- minimum=0,
1547
- maximum=20,
1548
- value=3,
1549
- step=1,
1550
- label="tweak_index",
1551
- info="When background denoising starts"
1552
- )
1553
- ttm_tstrong_index = gr.Slider(
1554
- minimum=0,
1555
- maximum=30,
1556
- value=7,
1557
- step=1,
1558
- label="tstrong_index",
1559
- info="When mask region denoising starts"
1560
- )
1561
-
1562
- with gr.Row():
1563
- ttm_num_frames = gr.Slider(
1564
- minimum=17,
1565
- maximum=81,
1566
- value=49,
1567
- step=4,
1568
- label="Number of Frames"
1569
- )
1570
- ttm_guidance_scale = gr.Slider(
1571
- minimum=1.0,
1572
- maximum=15.0,
1573
- value=3.5,
1574
- step=0.5,
1575
- label="Guidance Scale"
1576
- )
1577
-
1578
- with gr.Row():
1579
- ttm_num_steps = gr.Slider(
1580
- minimum=20,
1581
- maximum=100,
1582
- value=50,
1583
- step=5,
1584
- label="Inference Steps"
1585
- )
1586
- ttm_seed = gr.Number(
1587
- value=0,
1588
- label="Seed",
1589
- precision=0
1590
- )
1591
-
1592
- ttm_generate_btn = gr.Button(
1593
- "🎬 Generate TTM Video",
1594
- variant="primary",
1595
- size="lg",
1596
- interactive=TTM_AVAILABLE
1597
- )
1598
-
1599
- with gr.Column(scale=1):
1600
- gr.Markdown("### 📤 TTM Output")
1601
- ttm_output_video = gr.Video(
1602
- label="TTM Generated Video",
1603
- height=400
1604
- )
1605
- ttm_status_text = gr.Markdown(
1606
- "Upload a video in Step 1 first, then run TTM here.")
1607
-
1608
- # TTM Input preview
1609
- with gr.Accordion("📁 TTM Input Files (from Step 1)", open=False):
1610
- with gr.Row():
1611
- ttm_preview_first_frame = gr.Image(
1612
- label="First Frame",
1613
- height=150
1614
- )
1615
- ttm_preview_motion = gr.Video(
1616
- label="Motion Signal",
1617
- height=150
1618
- )
1619
- ttm_preview_mask = gr.Video(
1620
- label="Mask",
1621
- height=150
1622
- )
1623
-
1624
- # Helper function to update states and preview
1625
- def process_and_update_states(video_path, camera_movement, generate_ttm_flag, progress=gr.Progress()):
1626
- result = process_video(video_path, camera_movement,
1627
- generate_ttm_flag, progress)
1628
- output_vid, motion_sig, mask_vid, first_frame, status = result
1629
-
1630
- # Return all outputs including state updates and previews
1631
- return (
1632
- output_vid, # output_video
1633
- motion_sig, # motion_signal_output
1634
- mask_vid, # mask_output
1635
- first_frame, # first_frame_output
1636
- status, # status_text
1637
- first_frame, # first_frame_state
1638
- motion_sig, # motion_signal_state
1639
- mask_vid, # mask_state
1640
- first_frame, # ttm_preview_first_frame
1641
- motion_sig, # ttm_preview_motion
1642
- mask_vid, # ttm_preview_mask
1643
- )
1644
 
1645
  # Event handlers
1646
  generate_btn.click(
1647
- fn=process_and_update_states,
1648
  inputs=[video_input, camera_movement, generate_ttm],
1649
- outputs=[
1650
- output_video, motion_signal_output, mask_output, first_frame_output, status_text,
1651
- first_frame_state, motion_signal_state, mask_state,
1652
- ttm_preview_first_frame, ttm_preview_motion, ttm_preview_mask
1653
- ]
1654
- )
1655
-
1656
- # TTM generation event
1657
- ttm_generate_btn.click(
1658
- fn=run_ttm_generation,
1659
- inputs=[
1660
- first_frame_state,
1661
- motion_signal_state,
1662
- mask_state,
1663
- ttm_prompt,
1664
- ttm_negative_prompt,
1665
- ttm_model_choice,
1666
- ttm_tweak_index,
1667
- ttm_tstrong_index,
1668
- ttm_num_frames,
1669
- ttm_num_steps,
1670
- ttm_guidance_scale,
1671
- ttm_seed
1672
- ],
1673
- outputs=[ttm_output_video, ttm_status_text]
1674
  )
1675
 
1676
  # Examples
1677
  gr.Markdown("### 📁 Examples")
1678
  if os.path.exists("./examples"):
1679
- example_videos = [f for f in os.listdir(
1680
- "./examples") if f.endswith(".mp4")][:4]
1681
  if example_videos:
1682
  gr.Examples(
1683
- examples=[[f"./examples/{v}", "move_forward", True]
1684
- for v in example_videos],
1685
  inputs=[video_input, camera_movement, generate_ttm],
1686
- outputs=[
1687
- output_video, motion_signal_output, mask_output, first_frame_output, status_text,
1688
- first_frame_state, motion_signal_state, mask_state,
1689
- ttm_preview_first_frame, ttm_preview_motion, ttm_preview_mask
1690
- ],
1691
- fn=process_and_update_states,
1692
  cache_examples=False
1693
  )
1694
 
1695
  # Launch
1696
- logger.info("✅ Gradio interface created successfully!")
1697
- logger.info("=" * 50)
1698
- logger.info("Application ready to launch")
1699
- logger.info("=" * 50)
1700
- sys.stdout.flush()
1701
-
1702
  if __name__ == "__main__":
1703
- logger.info("Starting Gradio server...")
1704
- sys.stdout.flush()
1705
  demo.launch(share=False)
 
 
1
  import gradio as gr
2
  import os
3
  import numpy as np
 
7
  from pathlib import Path
8
  from einops import rearrange
9
  from typing import Union
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  try:
11
  import spaces
 
12
  except ImportError:
13
+ def spaces(func):
14
+ return func
 
 
 
 
 
 
 
 
 
15
  import torch
 
 
 
 
16
  import torchvision.transforms as T
17
+ import logging
18
  from concurrent.futures import ThreadPoolExecutor
19
  import atexit
20
  import uuid
 
 
 
21
  import decord
 
 
22
 
23
+ from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
24
+ from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
25
+ from models.SpaTrackV2.models.predictor import Predictor
26
+ from models.SpaTrackV2.models.utils import get_points_on_a_grid
27
 
28
+ # Configure logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # Constants
33
  MAX_FRAMES = 80
 
46
  "move_down"
47
  ]
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Thread pool for delayed deletion
50
  thread_pool_executor = ThreadPoolExecutor(max_workers=2)
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def delete_later(path: Union[str, os.PathLike], delay: int = 600):
53
  """Delete file or directory after specified delay"""
54
  def _delete():
 
67
  thread_pool_executor.submit(_wait_and_delete)
68
  atexit.register(_delete)
69
 
 
70
  def create_user_temp_dir():
71
  """Create a unique temporary directory for each user session"""
72
  session_id = str(uuid.uuid4())[:8]
 
75
  delete_later(temp_dir, delay=600)
76
  return temp_dir
77
 
78
+ # Global model initialization
79
+ print("🚀 Initializing models...")
80
+ vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
81
+ vggt4track_model.eval()
82
+ vggt4track_model = vggt4track_model.to("cuda")
83
 
84
+ tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline")
85
+ tracker_model.eval()
86
+ print("Models loaded successfully!")
 
87
 
 
88
  gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
 
 
89
 
90
 
91
  def generate_camera_trajectory(num_frames: int, movement_type: str,
 
111
  if movement_type == "static":
112
  pass # Keep identity
113
  elif movement_type == "move_forward":
114
+ ext[2, 3] = -speed * t # Move along -Z (forward in OpenGL convention)
 
115
  elif movement_type == "move_backward":
116
  ext[2, 3] = speed * t # Move along +Z
117
  elif movement_type == "move_left":
 
168
  base_dir = os.path.dirname(output_path)
169
  motion_signal_path = os.path.join(base_dir, "motion_signal.mp4")
170
  mask_path = os.path.join(base_dir, "mask.mp4")
171
+ out_motion_signal = cv2.VideoWriter(motion_signal_path, fourcc, fps, (W, H))
 
172
  out_mask = cv2.VideoWriter(mask_path, fourcc, fps, (W, H))
173
 
174
  # Create meshgrid for pixel coordinates
 
248
  if hole_mask.sum() == 0:
249
  break
250
  dilated = cv2.dilate(motion_signal_frame, kernel, iterations=1)
251
+ motion_signal_frame = np.where(hole_mask[:, :, None] > 0, dilated, motion_signal_frame)
252
+ hole_mask = (motion_signal_frame.sum(axis=-1) == 0).astype(np.uint8)
 
 
253
 
254
  # Write TTM outputs if enabled
255
  if generate_ttm_inputs:
256
  # Motion signal: warped frame with NN inpainting
257
+ motion_signal_bgr = cv2.cvtColor(motion_signal_frame, cv2.COLOR_RGB2BGR)
 
258
  out_motion_signal.write(motion_signal_bgr)
259
 
260
  # Mask: binary mask of valid (projected) pixels - white where valid, black where holes
261
+ mask_frame = np.stack([valid_mask, valid_mask, valid_mask], axis=-1)
 
262
  out_mask.write(mask_frame)
263
 
264
  # For the rendered output, use the same inpainted result
 
278
  }
279
 
280
 
281
+ @spaces.GPU
282
  def run_spatial_tracker(video_tensor: torch.Tensor):
283
  """
284
  GPU-intensive spatial tracking function.
 
289
  Returns:
290
  Dictionary containing tracking results
291
  """
 
 
 
 
 
 
 
 
 
 
 
292
  # Run VGGT to get depth and camera poses
293
  video_input = preprocess_image(video_tensor)[None].cuda()
294
 
 
 
 
295
  with torch.no_grad():
296
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
297
  predictions = vggt4track_model(video_input / 255)
 
300
  depth_map = predictions["points_map"][..., 2]
301
  depth_conf = predictions["unc_metric"]
302
 
 
 
 
303
  depth_tensor = depth_map.squeeze().cpu().numpy()
304
  extrs = extrinsic.squeeze().cpu().numpy()
305
  intrs = intrinsic.squeeze().cpu().numpy()
 
307
  unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
308
 
309
  # Setup tracker
 
 
 
310
  tracker_model.spatrack.track_num = 512
311
  tracker_model.to("cuda")
312
 
313
  # Get grid points for tracking
314
  frame_H, frame_W = video_tensor_gpu.shape[2:]
315
  grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
316
+ query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()
 
 
 
 
317
 
318
  # Run tracker
319
  with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
 
341
  conf_depth = T.Resize((new_h, new_w))(conf_depth)
342
  intrs_out[:, :2, :] = intrs_out[:, :2, :] * scale
343
 
 
 
 
344
  # Move results to CPU and return
345
+ return {
346
  'video_out': video_out.cpu(),
347
  'point_map': point_map.cpu(),
348
  'conf_depth': conf_depth.cpu(),
 
350
  'c2w_traj': c2w_traj.cpu(),
351
  }
352
 
 
 
 
 
 
353
 
354
  def process_video(video_path: str, camera_movement: str, generate_ttm: bool = True, progress=gr.Progress()):
355
  """Main processing function
 
405
  c2w_traj = tracking_results['c2w_traj']
406
 
407
  # Get RGB frames and depth
408
+ rgb_frames = rearrange(video_out.numpy(), "T C H W -> T H W C").astype(np.uint8)
 
409
  depth_frames = point_map[:, 2].numpy()
410
  depth_conf_np = conf_depth.numpy()
411
 
 
416
  intrs_np = intrs_out.numpy()
417
  extrs_np = torch.inverse(c2w_traj).numpy() # world-to-camera
418
 
419
+ progress(0.7, desc=f"Generating {camera_movement} camera trajectory...")
 
420
 
421
  # Calculate scene scale from depth
422
  valid_depth = depth_frames[depth_frames > 0]
 
471
  return None, None, None, None, f"❌ Error: {str(e)}"
472
 
473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  # Create Gradio interface
475
+ print("🎨 Creating Gradio interface...")
 
476
 
477
  with gr.Blocks(
478
  theme=gr.themes.Soft(),
479
  title="🎬 Video to Point Cloud Renderer",
480
  css="""
481
  .gradio-container {
482
+ max-width: 1200px !important;
483
  margin: auto !important;
484
  }
485
  """
486
  ) as demo:
487
  gr.Markdown("""
488
+ # 🎬 Video to Point Cloud Renderer (TTM Compatible)
489
 
490
+ Upload a video to generate a 3D point cloud and render it from a new camera perspective.
491
+ Generates outputs compatible with **Time-to-Move (TTM)** motion-controlled video generation.
492
 
493
+ **How it works:**
494
+ 1. Upload a video
495
+ 2. Select a camera movement type
496
+ 3. Click "Generate" to create the rendered video and TTM inputs
497
 
498
+ **TTM Inputs:**
499
+ - `first_frame.png`: The first frame of the original video
500
+ - `motion_signal.mp4`: Warped video with nearest-neighbor inpainting
501
+ - `mask.mp4`: Binary mask showing valid projected pixels (white) vs holes (black)
502
  """)
503
 
504
+ with gr.Row():
505
+ with gr.Column(scale=1):
506
+ gr.Markdown("### 📥 Input")
507
+ video_input = gr.Video(
508
+ label="Upload Video",
509
+ format="mp4",
510
+ height=300
511
+ )
512
+
513
+ camera_movement = gr.Dropdown(
514
+ choices=CAMERA_MOVEMENTS,
515
+ value="static",
516
+ label="🎥 Camera Movement",
517
+ info="Select how the camera should move in the rendered video"
518
+ )
519
+
520
+ generate_ttm = gr.Checkbox(
521
+ label="🎯 Generate TTM Inputs",
522
+ value=True,
523
+ info="Generate motion_signal.mp4 and mask.mp4 for Time-to-Move"
524
+ )
525
+
526
+ generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
527
+
528
+ with gr.Column(scale=1):
529
+ gr.Markdown("### 📤 Rendered Output")
530
+ output_video = gr.Video(
531
+ label="Rendered Video",
532
+ height=250
533
+ )
534
+ first_frame_output = gr.Image(
535
+ label="First Frame (first_frame.png)",
536
+ height=150
537
+ )
538
+
539
+ with gr.Row():
540
+ with gr.Column(scale=1):
541
+ gr.Markdown("### 🎯 TTM: Motion Signal")
542
+ motion_signal_output = gr.Video(
543
+ label="Motion Signal Video (motion_signal.mp4)",
544
+ height=250
545
+ )
546
+ with gr.Column(scale=1):
547
+ gr.Markdown("### 🎭 TTM: Mask")
548
+ mask_output = gr.Video(
549
+ label="Mask Video (mask.mp4)",
550
+ height=250
551
+ )
552
+
553
+ status_text = gr.Markdown("Ready to process...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  # Event handlers
556
  generate_btn.click(
557
+ fn=process_video,
558
  inputs=[video_input, camera_movement, generate_ttm],
559
+ outputs=[output_video, motion_signal_output, mask_output, first_frame_output, status_text]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
  )
561
 
562
  # Examples
563
  gr.Markdown("### 📁 Examples")
564
  if os.path.exists("./examples"):
565
+ example_videos = [f for f in os.listdir("./examples") if f.endswith(".mp4")][:4]
 
566
  if example_videos:
567
  gr.Examples(
568
+ examples=[[f"./examples/{v}", "move_forward", True] for v in example_videos],
 
569
  inputs=[video_input, camera_movement, generate_ttm],
570
+ outputs=[output_video, motion_signal_output, mask_output, first_frame_output, status_text],
571
+ fn=process_video,
 
 
 
 
572
  cache_examples=False
573
  )
574
 
575
  # Launch
 
 
 
 
 
 
576
  if __name__ == "__main__":
 
 
577
  demo.launch(share=False)
src/config.py DELETED
@@ -1,44 +0,0 @@
1
- import torch
2
-
3
- MAX_FRAMES = 80
4
- OUTPUT_FPS = 24
5
- RENDER_WIDTH = 512
6
- RENDER_HEIGHT = 384
7
-
8
- CAMERA_MOVEMENTS = [
9
- "static",
10
- "move_forward",
11
- "move_backward",
12
- "move_left",
13
- "move_right",
14
- "move_up",
15
- "move_down"
16
- ]
17
-
18
- TTM_COG_MODEL_ID = "THUDM/CogVideoX-5b-I2V"
19
- TTM_WAN_MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
20
- TTM_DTYPE = torch.bfloat16
21
- TTM_DEFAULT_NUM_FRAMES = 49
22
- TTM_DEFAULT_NUM_INFERENCE_STEPS = 50
23
-
24
- TTM_COG_AVAILABLE = False
25
- TTM_WAN_AVAILABLE = False
26
- try:
27
- from diffusers import CogVideoXImageToVideoPipeline
28
- TTM_COG_AVAILABLE = True
29
- except ImportError:
30
- pass
31
-
32
- try:
33
- from diffusers import AutoencoderKLWan, WanTransformer3DModel
34
- TTM_WAN_AVAILABLE = True
35
- except ImportError:
36
- pass
37
-
38
- TTM_AVAILABLE = TTM_COG_AVAILABLE or TTM_WAN_AVAILABLE
39
-
40
- TTM_MODELS = []
41
- if TTM_COG_AVAILABLE:
42
- TTM_MODELS.append("CogVideoX-5B")
43
- if TTM_WAN_AVAILABLE:
44
- TTM_MODELS.append("Wan2.2-14B (Recommended)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/model_manager.py DELETED
@@ -1,62 +0,0 @@
1
- from models.SpaTrackV2.models.predictor import Predictor
2
- from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
3
- import logging
4
- from .config import (
5
- TTM_COG_AVAILABLE, TTM_WAN_AVAILABLE,
6
- TTM_COG_MODEL_ID, TTM_WAN_MODEL_ID, TTM_DTYPE
7
- )
8
-
9
logger = logging.getLogger(__name__)

# Module-level model singletons shared across the app. The spatial pair is
# populated eagerly at import time (init_spatial_models is called at the
# bottom of this module); the two TTM pipelines are loaded lazily on first use.
vggt4track_model = None
tracker_model = None
ttm_cog_pipeline = None
ttm_wan_pipeline = None
15
-
16
-
17
def init_spatial_models():
    """Load the VGGT front-end and SpatialTrackerV2 predictor into module globals.

    Populates ``vggt4track_model`` (moved to CUDA) and ``tracker_model``
    (not moved to CUDA here — the tracking code moves it at call time).
    """
    global vggt4track_model, tracker_model
    print("🚀 Initializing models...")
    vggt4track_model = VGGT4Track.from_pretrained(
        "Yuxihenry/SpatialTrackerV2_Front")
    vggt4track_model.eval()
    vggt4track_model = vggt4track_model.to("cuda")

    tracker_model = Predictor.from_pretrained(
        "Yuxihenry/SpatialTrackerV2-Offline")
    tracker_model.eval()
    print("✅ Spatial Models loaded successfully!")
29
-
30
-
31
def get_ttm_cog_pipeline():
    """Lazily load and cache the CogVideoX I2V pipeline.

    Returns the cached pipeline, or None when the backend is unavailable.
    The pipeline is left on CPU here; callers move it to CUDA. VAE tiling
    and slicing are enabled to lower peak memory during encode/decode.
    """
    global ttm_cog_pipeline
    if ttm_cog_pipeline is None and TTM_COG_AVAILABLE:
        # Import deferred so this module loads even without the backend.
        from diffusers import CogVideoXImageToVideoPipeline
        logger.info("Loading TTM CogVideoX pipeline...")
        ttm_cog_pipeline = CogVideoXImageToVideoPipeline.from_pretrained(
            TTM_COG_MODEL_ID,
            torch_dtype=TTM_DTYPE,
            low_cpu_mem_usage=True,
        )
        ttm_cog_pipeline.vae.enable_tiling()
        ttm_cog_pipeline.vae.enable_slicing()
        logger.info("TTM CogVideoX pipeline loaded successfully!")
    return ttm_cog_pipeline
45
-
46
-
47
def get_ttm_wan_pipeline():
    """Lazily load and cache the Wan 2.2 I2V pipeline.

    Returns the cached pipeline, or None when the backend is unavailable.
    Left on CPU here; callers move it to CUDA. VAE tiling and slicing are
    enabled to lower peak memory during encode/decode.
    """
    global ttm_wan_pipeline
    if ttm_wan_pipeline is None and TTM_WAN_AVAILABLE:
        # Import deferred so this module loads even without the backend.
        from diffusers import WanImageToVideoPipeline
        logger.info("Loading TTM Wan 2.2 pipeline...")
        ttm_wan_pipeline = WanImageToVideoPipeline.from_pretrained(
            TTM_WAN_MODEL_ID,
            torch_dtype=TTM_DTYPE,
        )
        ttm_wan_pipeline.vae.enable_tiling()
        ttm_wan_pipeline.vae.enable_slicing()
        logger.info("TTM Wan 2.2 pipeline loaded successfully!")
    return ttm_wan_pipeline
60
-
61
-
62
- init_spatial_models()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/spatial_pipeline.py DELETED
@@ -1,277 +0,0 @@
1
- import os
2
- import cv2
3
- import numpy as np
4
- import torch
5
- import decord
6
- import gradio as gr
7
- import torchvision.transforms as T
8
- from einops import rearrange
9
-
10
- from .config import MAX_FRAMES, OUTPUT_FPS
11
- from .utils import logger, create_user_temp_dir
12
- from . import model_manager
13
- from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
14
- from models.SpaTrackV2.models.utils import get_points_on_a_grid
15
-
16
# Fall back to a no-op GPU decorator when not running on HF Spaces.
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(func=None, duration=None):
            # Mirrors spaces.GPU: works bare (@GPU) or with args (@GPU(duration=...)).
            def decorator(f):
                return f
            return decorator if func is None else func
25
-
26
-
27
- def generate_camera_trajectory(num_frames: int, movement_type: str,
28
- base_intrinsics: np.ndarray,
29
- scene_scale: float = 1.0) -> np.ndarray:
30
- speed = scene_scale * 0.02
31
- extrinsics = np.zeros((num_frames, 4, 4), dtype=np.float32)
32
-
33
- for t in range(num_frames):
34
- ext = np.eye(4, dtype=np.float32)
35
- if movement_type == "static":
36
- pass
37
- elif movement_type == "move_forward":
38
- ext[2, 3] = -speed * t
39
- elif movement_type == "move_backward":
40
- ext[2, 3] = speed * t
41
- elif movement_type == "move_left":
42
- ext[0, 3] = -speed * t
43
- elif movement_type == "move_right":
44
- ext[0, 3] = speed * t
45
- elif movement_type == "move_up":
46
- ext[1, 3] = -speed * t
47
- elif movement_type == "move_down":
48
- ext[1, 3] = speed * t
49
- extrinsics[t] = ext
50
- return extrinsics
51
-
52
-
53
def render_from_pointcloud(rgb_frames, depth_frames, intrinsics, original_extrinsics,
                           new_extrinsics, output_path, fps=24, generate_ttm_inputs=False):
    """Re-render each frame from a displaced camera via point-cloud splatting.

    Every frame is unprojected to world space with its depth map and original
    camera, re-projected through the new camera (frame-0 pose composed with
    ``new_extrinsics[t]``), z-buffered, and hole-filled by iterative dilation.

    Args:
        rgb_frames: (T, H, W, 3) uint8 RGB frames.
        depth_frames: (T, H, W) depth maps; 0 marks invalid pixels.
        intrinsics: (T, 3, 3) per-frame camera intrinsics.
        original_extrinsics: (T, 4, 4) world-to-camera matrices of the source.
        new_extrinsics: (T, 4, 4) relative offsets applied to the frame-0 pose.
        output_path: destination mp4 path for the rendered video.
        fps: output frame rate.
        generate_ttm_inputs: additionally write motion_signal.mp4 (inpainted
            render) and mask.mp4 (255 where a splat landed) beside the output.

    Returns:
        dict with keys 'rendered', 'motion_signal', 'mask' (file paths; the
        last two are None unless generate_ttm_inputs is True).
    """
    T, H, W, _ = rgb_frames.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (W, H))

    motion_signal_path = None
    mask_path = None
    out_motion_signal = None
    out_mask = None

    if generate_ttm_inputs:
        base_dir = os.path.dirname(output_path)
        motion_signal_path = os.path.join(base_dir, "motion_signal.mp4")
        mask_path = os.path.join(base_dir, "mask.mp4")
        out_motion_signal = cv2.VideoWriter(
            motion_signal_path, fourcc, fps, (W, H))
        out_mask = cv2.VideoWriter(mask_path, fourcc, fps, (W, H))

    # Pixel grid in homogeneous coordinates, reused for every frame.
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    ones = np.ones_like(u)

    for t in range(T):
        rgb = rgb_frames[t]
        depth = depth_frames[t]
        K = intrinsics[t]
        orig_c2w = np.linalg.inv(original_extrinsics[t])

        # The new trajectory is anchored at the first frame's pose.
        if t == 0:
            base_c2w = orig_c2w.copy()
        new_c2w = base_c2w @ new_extrinsics[t]
        new_w2c = np.linalg.inv(new_c2w)
        K_inv = np.linalg.inv(K)

        # Unproject pixels -> camera rays -> 3D points -> world space,
        # then reproject into the displaced camera.
        pixels = np.stack([u, v, ones], axis=-1).reshape(-1, 3)
        rays_cam = (K_inv @ pixels.T).T
        points_cam = rays_cam * depth.reshape(-1, 1)
        points_world = (orig_c2w[:3, :3] @ points_cam.T).T + orig_c2w[:3, 3]
        points_new_cam = (new_w2c[:3, :3] @ points_world.T).T + new_w2c[:3, 3]
        points_proj = (K @ points_new_cam.T).T

        # Perspective divide; clamp z to avoid division by ~0.
        z = np.clip(points_proj[:, 2:3], 1e-6, None)
        uv_new = points_proj[:, :2] / z

        rendered = np.zeros((H, W, 3), dtype=np.uint8)
        z_buffer = np.full((H, W), np.inf, dtype=np.float32)
        colors = rgb.reshape(-1, 3)
        depths_new = points_new_cam[:, 2]

        # Rasterization loop (simplified): nearest-pixel splat with z-test.
        for i in range(len(uv_new)):
            uu, vv = int(round(uv_new[i, 0])), int(round(uv_new[i, 1]))
            if 0 <= uu < W and 0 <= vv < H and depths_new[i] > 0:
                if depths_new[i] < z_buffer[vv, uu]:
                    z_buffer[vv, uu] = depths_new[i]
                    rendered[vv, uu] = colors[i]

        # Inpainting for TTM: grow valid pixels into holes until none remain.
        # NOTE: a pure-black source pixel also reads as a "hole" here.
        valid_mask = (rendered.sum(axis=-1) > 0).astype(np.uint8) * 255
        motion_signal_frame = rendered.copy()
        hole_mask = (motion_signal_frame.sum(axis=-1) == 0).astype(np.uint8)

        if hole_mask.sum() > 0:
            kernel = np.ones((3, 3), np.uint8)
            # max(H, W) dilation rounds is always enough to close any hole.
            for _ in range(max(H, W)):
                if hole_mask.sum() == 0:
                    break
                dilated = cv2.dilate(motion_signal_frame, kernel, iterations=1)
                motion_signal_frame = np.where(
                    hole_mask[:, :, None] > 0, dilated, motion_signal_frame)
                hole_mask = (motion_signal_frame.sum(
                    axis=-1) == 0).astype(np.uint8)

        if generate_ttm_inputs:
            out_motion_signal.write(cv2.cvtColor(
                motion_signal_frame, cv2.COLOR_RGB2BGR))
            out_mask.write(np.stack([valid_mask]*3, axis=-1))

        # The main output is the inpainted frame as well.
        out.write(cv2.cvtColor(motion_signal_frame, cv2.COLOR_RGB2BGR))

    out.release()
    if generate_ttm_inputs:
        out_motion_signal.release()
        out_mask.release()

    return {'rendered': output_path, 'motion_signal': motion_signal_path, 'mask': mask_path}
139
-
140
-
141
@spaces.GPU
def run_spatial_tracker(video_tensor: torch.Tensor):
    """Run VGGT + SpatialTrackerV2 on a clip and return per-frame 3D geometry.

    Args:
        video_tensor: (T, C, H, W) frames with values in [0, 255].

    Returns:
        dict of CPU tensors:
            'video_out':  RGB frames as used by the tracker (possibly resized),
            'point_map':  per-pixel 3D point map,
            'conf_depth': depth confidence map,
            'intrs_out':  per-frame intrinsics (rescaled if frames were resized),
            'c2w_traj':   estimated camera-to-world trajectory.
    """
    video_input = preprocess_image(video_tensor)[None].cuda()

    # Use global models from model_manager.
    # Stage 1: monocular geometry (poses, intrinsics, depth) from VGGT.
    with torch.no_grad():
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            predictions = model_manager.vggt4track_model(video_input / 255)
            extrinsic = predictions["poses_pred"]
            intrinsic = predictions["intrs"]
            depth_map = predictions["points_map"][..., 2]
            depth_conf = predictions["unc_metric"]

    depth_tensor = depth_map.squeeze().cpu().numpy()
    extrs = extrinsic.squeeze().cpu().numpy()
    intrs = intrinsic.squeeze().cpu().numpy()
    video_tensor_gpu = video_input.squeeze()
    # Pixels below 0.5 confidence are treated as unreliable depth.
    unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5

    model_manager.tracker_model.spatrack.track_num = 512
    model_manager.tracker_model.to("cuda")

    # Seed the tracker with a regular 30x30 grid of query points on frame 0
    # (the prepended zero column is the query timestamp).
    frame_H, frame_W = video_tensor_gpu.shape[2:]
    grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[
        0].numpy()

    # Stage 2: dense 3D tracking conditioned on the VGGT geometry.
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        results = model_manager.tracker_model.forward(
            video_tensor_gpu, depth=depth_tensor,
            intrs=intrs, extrs=extrs,
            queries=query_xyt,
            fps=1, full_point=False, iters_track=4,
            query_no_BA=True, fixed_cam=False, stage=1,
            unc_metric=unc_metric,
            support_frame=len(video_tensor_gpu)-1, replace_ratio=0.2
        )

    # Unpack tuple from tracker.
    c2w_traj, intrs_out, point_map, conf_depth, track3d_pred, track2d_pred, vis_pred, conf_pred, video_out = results

    # Resize logic (abbreviated): cap the longer side at 384 px and scale the
    # intrinsics' focal/principal rows to stay consistent with the new size.
    max_size = 384
    h, w = video_out.shape[2:]
    scale = min(max_size / h, max_size / w)
    if scale < 1:
        new_h, new_w = int(h * scale), int(w * scale)
        video_out = T.Resize((new_h, new_w))(video_out)
        point_map = T.Resize((new_h, new_w))(point_map)
        conf_depth = T.Resize((new_h, new_w))(conf_depth)
        intrs_out[:, :2, :] = intrs_out[:, :2, :] * scale

    return {
        'video_out': video_out.cpu(),
        'point_map': point_map.cpu(),
        'conf_depth': conf_depth.cpu(),
        'intrs_out': intrs_out.cpu(),
        'c2w_traj': c2w_traj.cpu(),
    }
200
-
201
-
202
def process_video(video_path: str, camera_movement: str, generate_ttm: bool = True, progress=gr.Progress()):
    """End-to-end pipeline: 3D-track a video, then re-render it from a new camera.

    Args:
        video_path: path to the uploaded video (None yields an error message).
        camera_movement: one of the CAMERA_MOVEMENTS presets.
        generate_ttm: also emit the motion-signal/mask videos and first frame
            consumed by the downstream TTM generator.
        progress: Gradio progress reporter.

    Returns:
        (rendered_video, motion_signal, mask, first_frame, status_message);
        any of the first four may be None.
    """
    if video_path is None:
        return None, None, None, None, "❌ Please upload a video first"

    progress(0, desc="Initializing...")
    temp_dir = create_user_temp_dir()
    out_dir = os.path.join(temp_dir, "results")
    os.makedirs(out_dir, exist_ok=True)

    try:
        progress(0.1, desc="Loading video...")
        video_reader = decord.VideoReader(video_path)
        # (T, H, W, C) uint8 -> (T, C, H, W) float.
        video_tensor = torch.from_numpy(
            video_reader.get_batch(range(len(video_reader))).asnumpy()
        ).permute(0, 3, 1, 2).float()

        # Subsample evenly so at most MAX_FRAMES frames are processed.
        fps_skip = max(1, len(video_tensor) // MAX_FRAMES)
        video_tensor = video_tensor[::fps_skip][:MAX_FRAMES]

        # Downscale so the short side is at most 336 px, keeping dims even.
        h, w = video_tensor.shape[2:]
        scale = 336 / min(h, w)
        if scale < 1:
            new_h, new_w = int(h * scale) // 2 * 2, int(w * scale) // 2 * 2
            video_tensor = T.Resize((new_h, new_w))(video_tensor)

        progress(0.4, desc="Running 3D tracking...")
        tracking_results = run_spatial_tracker(video_tensor)

        video_out = tracking_results['video_out']
        point_map = tracking_results['point_map']
        conf_depth = tracking_results['conf_depth']
        intrs_out = tracking_results['intrs_out']
        c2w_traj = tracking_results['c2w_traj']

        rgb_frames = rearrange(
            video_out.numpy(), "T C H W -> T H W C").astype(np.uint8)
        # Z channel of the point map is the depth; zero out low-confidence pixels.
        depth_frames = point_map[:, 2].numpy()
        depth_frames[conf_depth.numpy() < 0.5] = 0

        intrs_np = intrs_out.numpy()
        # Invert camera-to-world poses to get world-to-camera extrinsics.
        extrs_np = torch.inverse(c2w_traj).numpy()

        progress(
            0.7, desc=f"Generating {camera_movement} camera trajectory...")
        # Median valid depth scales the camera step to the scene size.
        valid_depth = depth_frames[depth_frames > 0]
        scene_scale = np.median(valid_depth) if len(valid_depth) > 0 else 1.0

        new_extrinsics = generate_camera_trajectory(
            len(rgb_frames), camera_movement, intrs_np, scene_scale
        )

        progress(0.8, desc="Rendering video...")
        output_video_path = os.path.join(out_dir, "rendered_video.mp4")
        render_results = render_from_pointcloud(
            rgb_frames, depth_frames, intrs_np, extrs_np,
            new_extrinsics, output_video_path, fps=OUTPUT_FPS,
            generate_ttm_inputs=generate_ttm
        )

        first_frame_path = None
        if generate_ttm:
            # Save frame 0 as the conditioning image for the TTM generator.
            first_frame_path = os.path.join(out_dir, "first_frame.png")
            cv2.imwrite(first_frame_path, cv2.cvtColor(
                rgb_frames[0], cv2.COLOR_RGB2BGR))

        status_msg = f"✅ Video rendered successfully with '{camera_movement}'!"
        if generate_ttm:
            status_msg += "\n\n📁 **TTM outputs generated**"

        return render_results['rendered'], render_results.get('motion_signal'), render_results.get('mask'), first_frame_path, status_msg

    except Exception as e:
        logger.error(f"Error processing video: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None, None, f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ttm_pipeline.py DELETED
@@ -1,303 +0,0 @@
1
- import os
2
- import torch
3
- import torch.nn.functional as F
4
- import gradio as gr
5
- from diffusers.utils import export_to_video, load_image
6
- from diffusers.utils.torch_utils import randn_tensor
7
- from diffusers.pipelines.wan.pipeline_wan_i2v import retrieve_latents
8
-
9
-
10
# Fall back to a no-op GPU decorator when not running on HF Spaces.
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(func=None, duration=None):
            # Mirrors spaces.GPU: works bare (@GPU) or with args (@GPU(duration=...)).
            def decorator(f):
                return f
            return decorator if func is None else func
19
-
20
-
21
- from .config import TTM_COG_AVAILABLE, TTM_WAN_AVAILABLE
22
- from .utils import create_user_temp_dir, load_video_to_tensor
23
- from . import model_manager
24
-
25
- # --- Helper Classes ---
26
-
27
-
28
class CogVideoXTTMHelper:
    """VAE bookkeeping for TTM with CogVideoX: encodes reference frames and
    downsamples pixel-space masks onto the latent grid."""

    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.vae = pipeline.vae
        # Spatial downsample factor = 2^(VAE levels - 1); temporal factor and
        # latent scaling come straight from the VAE config.
        self.vae_scale_factor_spatial = 2 ** (
            len(self.vae.config.block_out_channels) - 1)
        self.vae_scale_factor_temporal = self.vae.config.temporal_compression_ratio
        self.vae_scaling_factor_image = self.vae.config.scaling_factor
        self.video_processor = pipeline.video_processor

    @torch.no_grad()
    def encode_frames(self, frames: torch.Tensor) -> torch.Tensor:
        """Encode frames (B, C, T, H, W as passed by the caller) to scaled
        latents, returned with channel/time axes swapped to (B, T, C, H, W)."""
        latents = self.vae.encode(
            frames)[0].sample() * self.vae_scaling_factor_image
        return latents.permute(0, 2, 1, 3, 4).contiguous()

    def convert_rgb_mask_to_latent_mask(self, mask: torch.Tensor) -> torch.Tensor:
        """Downsample a (T, C, H, W) pixel mask to the latent grid.

        Keeps frame 0 plus every k-th later frame (k = temporal compression),
        then nearest-neighbour shrinks H and W by the spatial factor.
        Returns (1, T_latent, C, H_latent, W_latent).
        """
        k = self.vae_scale_factor_temporal
        # Frame 0 is encoded on its own; later latent frames cover k frames each.
        mask_sampled = torch.cat([mask[0:1], mask[1::k]], dim=0)
        pooled = mask_sampled.permute(1, 0, 2, 3).unsqueeze(0)
        s = self.vae_scale_factor_spatial
        H_l, W_l = pooled.shape[-2] // s, pooled.shape[-1] // s
        pooled = F.interpolate(pooled, size=(
            pooled.shape[2], H_l, W_l), mode="nearest")
        return pooled.permute(0, 2, 1, 3, 4)
53
-
54
-
55
class WanTTMHelper:
    """VAE bookkeeping for TTM with Wan 2.2: maps pixel-space masks onto the
    latent grid using the VAE's compression factors."""

    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.vae = pipeline.vae
        vae_cfg = self.vae.config
        self.vae_scale_factor_temporal = vae_cfg.scale_factor_temporal
        self.vae_scale_factor_spatial = vae_cfg.scale_factor_spatial
        self.video_processor = pipeline.video_processor

    def convert_rgb_mask_to_latent_mask(self, mask: torch.Tensor) -> torch.Tensor:
        """Downsample a (T, C, H, W) pixel mask to latent resolution.

        Temporally keeps frame 0 plus every k-th following frame (matching the
        VAE's temporal compression); spatially shrinks by the VAE's spatial
        factor using nearest-neighbour sampling.
        Returns a (1, T_latent, C, H_latent, W_latent) tensor.
        """
        stride_t = self.vae_scale_factor_temporal
        stride_s = self.vae_scale_factor_spatial
        # Frame 0 stands alone; each later latent frame covers stride_t frames.
        subsampled = torch.cat((mask[:1], mask[1::stride_t]), dim=0)
        # (T', C, H, W) -> (1, C, T', H, W) so interpolate treats T' as depth.
        volume = subsampled.permute(1, 0, 2, 3).unsqueeze(0)
        target_h = volume.shape[-2] // stride_s
        target_w = volume.shape[-1] // stride_s
        volume = F.interpolate(
            volume,
            size=(volume.shape[2], target_h, target_w),
            mode="nearest",
        )
        # Back to (1, T', C, H_latent, W_latent).
        return volume.permute(0, 2, 1, 3, 4)
72
-
73
-
74
def compute_hw_from_area(h, w, max_area, mod_value):
    """Pick an output (height, width) with roughly `max_area` pixels that
    preserves the h/w aspect ratio.

    Both dimensions are rounded down to a multiple of `mod_value` so they
    satisfy the model's VAE/patch divisibility requirements.

    Args:
        h, w: source image height and width in pixels.
        max_area: target pixel budget (height * width).
        mod_value: required divisor for both output dimensions.

    Returns:
        (height, width) as ints, each a multiple of `mod_value`.
    """
    aspect = h / w
    # Bug fix: the original called np.sqrt, but this module never imports
    # numpy, so the call raised NameError at runtime. Plain exponentiation
    # computes the same value with no extra import.
    height = round((max_area * aspect) ** 0.5) // mod_value * mod_value
    width = round((max_area / aspect) ** 0.5) // mod_value * mod_value
    return int(height), int(width)
79
-
80
- # --- Generation Functions ---
81
-
82
-
83
@spaces.GPU(duration=300)
def run_ttm_cog_generation(first_frame_path, motion_signal_path, mask_path, prompt,
                           tweak_index=4, tstrong_index=9, num_frames=49,
                           num_inference_steps=50, guidance_scale=6.0, seed=0, progress=gr.Progress()):
    """TTM generation with CogVideoX-5B image-to-video.

    Denoising starts from the rendered motion-signal video noised to
    ``timesteps[tweak_index]`` instead of pure noise; until ``tstrong_index``
    the masked motion region is re-anchored to the (re-noised) reference
    latents after each scheduler step, so camera motion follows the render
    while the rest of the frame is synthesized by the model.

    Returns (output_video_path, status_message), or (None, error_message).
    """
    if not TTM_COG_AVAILABLE:
        return None, "❌ CogVideoX TTM not available."

    pipe = model_manager.get_ttm_cog_pipeline()
    if not pipe:
        return None, "❌ Failed to load pipeline"
    pipe = pipe.to("cuda")
    ttm_helper = CogVideoXTTMHelper(pipe)

    device = "cuda"
    generator = torch.Generator(device=device).manual_seed(seed)

    # Output resolution comes from the transformer's native latent sample size.
    image = load_image(first_frame_path)
    height = pipe.transformer.config.sample_height * \
        ttm_helper.vae_scale_factor_spatial
    width = pipe.transformer.config.sample_width * \
        ttm_helper.vae_scale_factor_spatial

    do_cfg = guidance_scale > 1.0
    prompt_embeds, neg_embeds = pipe.encode_prompt(
        prompt, "", do_cfg, 1, 226, device)
    if do_cfg:
        # Batch [uncond, cond] for classifier-free guidance.
        prompt_embeds = torch.cat([neg_embeds, prompt_embeds], dim=0)

    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = pipe.scheduler.timesteps

    # NOTE(review): latent_frames is computed but never used below.
    latent_frames = (
        num_frames - 1) // ttm_helper.vae_scale_factor_temporal + 1
    image_tensor = ttm_helper.video_processor.preprocess(
        image, height=height, width=width).to(device, dtype=prompt_embeds.dtype)
    latent_channels = pipe.transformer.config.in_channels // 2
    latents, image_latents = pipe.prepare_latents(
        image_tensor, 1, latent_channels, num_frames, height, width, prompt_embeds.dtype, device, generator, None)

    # Encode the rendered motion-signal video as reference latents.
    ref_vid = load_video_to_tensor(motion_signal_path).to(device)
    ref_vid = F.interpolate(ref_vid.permute(0, 2, 1, 3, 4).flatten(0, 1), size=(
        height, width), mode="bicubic").view(1, -1, 3, height, width).permute(0, 2, 1, 3, 4)
    ref_vid = ttm_helper.video_processor.normalize(
        ref_vid.to(dtype=pipe.vae.dtype))
    ref_latents = ttm_helper.encode_frames(ref_vid).float().detach()

    # Latent-space mask separating the moving region from the background.
    ref_mask = load_video_to_tensor(mask_path).to(device)
    ref_mask = F.interpolate(ref_mask.permute(0, 2, 1, 3, 4).flatten(0, 1), size=(
        height, width), mode="nearest").view(1, -1, 3, height, width).permute(0, 2, 1, 3, 4)
    motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(
        ref_mask[0, :, :1].permute(1, 0, 2, 3).contiguous())
    background_mask = 1.0 - motion_mask

    # Start denoising from the reference latents noised to timesteps[tweak_index].
    fixed_noise = randn_tensor(
        ref_latents.shape, generator=generator, device=device, dtype=ref_latents.dtype)
    if tweak_index >= 0:
        latents = pipe.scheduler.add_noise(
            ref_latents, fixed_noise, timesteps[tweak_index].long()).to(dtype=latents.dtype)

    extra_step_kwargs = pipe.prepare_extra_step_kwargs(generator, 0.0)

    for i, t in enumerate(timesteps[tweak_index:]):
        progress(0.4 + 0.5 * (i / len(timesteps)), desc="Denoising...")

        latent_input = torch.cat([latents] * 2) if do_cfg else latents
        latent_input = pipe.scheduler.scale_model_input(latent_input, t)
        # CogVideoX I2V conditions by concatenating the first-frame latents
        # along dim 2.
        latent_input = torch.cat([latent_input, torch.cat(
            [image_latents]*2) if do_cfg else image_latents], dim=2)

        noise_pred = pipe.transformer(hidden_states=latent_input, encoder_hidden_states=prompt_embeds, timestep=t.expand(
            latent_input.shape[0]), return_dict=False)[0].float()

        if do_cfg:
            uncond, text = noise_pred.chunk(2)
            noise_pred = uncond + guidance_scale * (text - uncond)

        latents = pipe.scheduler.step(
            noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

        # TTM anchoring: until tstrong_index, overwrite the motion region with
        # the reference latents noised to the *next* timestep so the rendered
        # camera motion is preserved through early denoising.
        if (i + tweak_index) < tstrong_index:
            next_t = timesteps[i + tweak_index + 1] if i + \
                tweak_index + 1 < len(timesteps) else None
            if next_t is not None:
                noisy_ref = pipe.scheduler.add_noise(
                    ref_latents, fixed_noise, next_t.long()).to(dtype=latents.dtype)
                latents = latents * background_mask + noisy_ref * motion_mask
            else:
                latents = latents * background_mask + ref_latents * motion_mask

    latents = latents.to(prompt_embeds.dtype)

    frames = pipe.decode_latents(latents)
    video = ttm_helper.video_processor.postprocess_video(
        video=frames, output_type="pil")

    out_path = os.path.join(create_user_temp_dir(), "ttm_cog_out.mp4")
    export_to_video(video[0], out_path, fps=8)
    return out_path, "✅ Video Generated"
181
-
182
-
183
@spaces.GPU(duration=300)
def run_ttm_wan_generation(first_frame_path, motion_signal_path, mask_path, prompt, negative_prompt="",
                           tweak_index=3, tstrong_index=7, num_frames=81, num_inference_steps=50,
                           guidance_scale=3.5, seed=0, progress=gr.Progress()):
    """TTM generation with Wan 2.2 image-to-video.

    Same scheme as the CogVideoX variant: denoising starts from the rendered
    motion-signal video noised to ``timesteps[tweak_index]``; until
    ``tstrong_index`` the masked motion region is re-anchored to the
    (re-noised) reference latents after every scheduler step.

    Returns (output_video_path, status_message), or (None, error_message).
    """
    if not TTM_WAN_AVAILABLE:
        return None, "❌ Wan TTM not available."

    pipe = model_manager.get_ttm_wan_pipeline()
    if not pipe:
        return None, "❌ Failed to load pipeline"
    pipe = pipe.to("cuda")
    ttm_helper = WanTTMHelper(pipe)

    device = "cuda"
    generator = torch.Generator(device=device).manual_seed(seed)

    # Fit the first frame to roughly 480x832 pixels, keeping aspect ratio and
    # the VAE/patch divisibility constraint.
    image = load_image(first_frame_path)
    h, w = compute_hw_from_area(image.height, image.width, 480*832,
                                ttm_helper.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1])
    image = image.resize((w, h))

    do_cfg = guidance_scale > 1.0
    prompt_embeds, neg_embeds = pipe.encode_prompt(
        prompt, negative_prompt, do_cfg, 1, 512, device)
    prompt_embeds = prompt_embeds.to(pipe.transformer.dtype)
    if neg_embeds is not None:
        neg_embeds = neg_embeds.to(pipe.transformer.dtype)

    # Image-conditioning embeds only exist for Wan variants with an image encoder.
    image_embeds = pipe.encode_image(image, device).repeat(1, 1, 1).to(
        pipe.transformer.dtype) if pipe.transformer.config.image_dim else None

    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = pipe.scheduler.timesteps

    # Wan requires num_frames ≡ 1 (mod temporal scale factor); round down.
    if num_frames % ttm_helper.vae_scale_factor_temporal != 1:
        num_frames = num_frames // ttm_helper.vae_scale_factor_temporal * \
            ttm_helper.vae_scale_factor_temporal + 1

    image_tensor = ttm_helper.video_processor.preprocess(
        image, height=h, width=w).to(device, dtype=torch.float32)
    latents, condition = pipe.prepare_latents(
        image_tensor, 1, pipe.vae.config.z_dim, h, w, num_frames, torch.float32, device, generator, None, None)

    # Encode the rendered motion-signal video to reference latents.
    ref_vid = load_video_to_tensor(motion_signal_path).to(device)
    ref_vid = F.interpolate(ref_vid.permute(0, 2, 1, 3, 4).flatten(0, 1), size=(
        h, w), mode="bicubic").view(1, -1, 3, h, w).permute(0, 2, 1, 3, 4)
    ref_vid = ttm_helper.video_processor.normalize(
        ref_vid.to(dtype=pipe.vae.dtype))
    ref_latents = retrieve_latents(
        pipe.vae.encode(ref_vid), sample_mode="argmax")

    # Normalize latents with the VAE's per-channel statistics.
    mean = torch.tensor(pipe.vae.config.latents_mean).view(
        1, -1, 1, 1, 1).to(device, ref_latents.dtype)
    std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(1, -
                                                               1, 1, 1, 1).to(device, ref_latents.dtype)
    ref_latents = (ref_latents - mean) * std

    # Binarize the mask video and pool it onto the latent grid.
    ref_mask = load_video_to_tensor(mask_path).to(device)
    ref_mask = F.interpolate(ref_mask.permute(0, 2, 1, 3, 4).flatten(0, 1), size=(
        h, w), mode="nearest").view(1, -1, 3, h, w).permute(0, 2, 1, 3, 4)
    mask_tc_hw = ref_mask[0].permute(1, 0, 2, 3).contiguous()[:num_frames]
    motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(
        (mask_tc_hw > 0.5).float()).permute(0, 2, 1, 3, 4).contiguous()
    background_mask = 1.0 - motion_mask

    # Start denoising from the reference latents noised to timesteps[tweak_index].
    fixed_noise = randn_tensor(
        ref_latents.shape, generator=generator, device=device, dtype=ref_latents.dtype)
    if tweak_index >= 0:
        latents = pipe.scheduler.add_noise(ref_latents, fixed_noise, torch.as_tensor(
            timesteps[tweak_index], device=device).long())

    for i, t in enumerate(timesteps[tweak_index:]):
        progress(0.4 + 0.5 * (i / len(timesteps)), desc=f"Step {i}")

        # Wan conditions by concatenating the image-condition latents channel-wise.
        latent_in = torch.cat([latents, condition], dim=1).to(
            pipe.transformer.dtype)
        ts = t.expand(latents.shape[0])

        noise_pred = pipe.transformer(hidden_states=latent_in, timestep=ts, encoder_hidden_states=prompt_embeds,
                                      encoder_hidden_states_image=image_embeds, return_dict=False)[0]

        if do_cfg:
            # CFG is run as two forward passes rather than a batched pass.
            noise_uncond = pipe.transformer(hidden_states=latent_in, timestep=ts, encoder_hidden_states=neg_embeds,
                                            encoder_hidden_states_image=image_embeds, return_dict=False)[0]
            noise_pred = noise_uncond + guidance_scale * \
                (noise_pred - noise_uncond)

        latents = pipe.scheduler.step(
            noise_pred, t, latents, return_dict=False)[0]

        # TTM anchoring of the motion region (see docstring).
        if (i + tweak_index) < tstrong_index:
            next_t = timesteps[i + tweak_index + 1] if i + \
                tweak_index + 1 < len(timesteps) else None
            if next_t is not None:
                noisy_ref = pipe.scheduler.add_noise(
                    ref_latents, fixed_noise, torch.as_tensor(next_t, device=device).long())
                latents = latents * background_mask + noisy_ref * motion_mask
            else:
                latents = latents * background_mask + \
                    ref_latents.to(latents.dtype) * motion_mask

    # Undo latent normalization and decode to pixels.
    latents = latents.to(pipe.vae.dtype)
    latents = latents / std + mean
    video = pipe.vae.decode(latents, return_dict=False)[0]
    video = ttm_helper.video_processor.postprocess_video(
        video, output_type="pil")

    out_path = os.path.join(create_user_temp_dir(), "ttm_wan_out.mp4")
    export_to_video(video[0], out_path, fps=16)
    return out_path, "✅ Video Generated"
293
-
294
-
295
def run_ttm_generation(first_frame_path, motion_signal_path, mask_path, prompt, negative_prompt,
                       model_choice, tweak_index, tstrong_index, num_frames, num_inference_steps,
                       guidance_scale, seed, progress=gr.Progress()):
    """Dispatch TTM generation to the backend named in `model_choice`.

    Any choice containing "Wan" routes to the Wan 2.2 pipeline (which also
    consumes `negative_prompt`); every other choice falls back to CogVideoX.
    Returns whatever the chosen backend returns: (video_path, status).
    """
    use_wan = "Wan" in model_choice
    if use_wan:
        return run_ttm_wan_generation(first_frame_path, motion_signal_path, mask_path,
                                      prompt, negative_prompt,
                                      tweak_index, tstrong_index, num_frames,
                                      num_inference_steps, guidance_scale, seed, progress)
    # CogVideoX path has no negative-prompt parameter, so it is dropped here.
    return run_ttm_cog_generation(first_frame_path, motion_signal_path, mask_path,
                                  prompt,
                                  tweak_index, tstrong_index, num_frames,
                                  num_inference_steps, guidance_scale, seed, progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- import cv2
3
- import time
4
- import shutil
5
- import logging
6
- import uuid
7
- import torch
8
- import numpy as np
9
- import atexit
10
- from concurrent.futures import ThreadPoolExecutor
11
- from typing import Union
12
-
13
# basicConfig is a no-op if the application already configured logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Small shared pool used to run delayed temp-file cleanup off the main thread.
thread_pool_executor = ThreadPoolExecutor(max_workers=2)
17
-
18
def delete_later(path: Union[str, os.PathLike], delay: int = 600):
    """Schedule best-effort removal of `path` after `delay` seconds.

    The same cleanup is also registered with atexit, so the artifact is
    removed even when the process exits before the timer fires. Failures
    are logged as warnings, never raised.
    """
    def _remove_path():
        try:
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
        except Exception as e:
            logger.warning(f"Failed to delete {path}: {e}")

    def _remove_after_delay():
        time.sleep(delay)
        _remove_path()

    thread_pool_executor.submit(_remove_after_delay)
    atexit.register(_remove_path)
34
-
35
def create_user_temp_dir():
    """Create and return a per-session scratch directory under temp_local/.

    A short random id keeps concurrent sessions from colliding; the directory
    is scheduled for deletion after ten minutes via delete_later.
    """
    short_id = str(uuid.uuid4())[:8]
    temp_dir = os.path.join("temp_local", f"session_{short_id}")
    os.makedirs(temp_dir, exist_ok=True)
    delete_later(temp_dir, delay=600)
    return temp_dir
41
-
42
def load_video_to_tensor(video_path: str) -> torch.Tensor:
    """Decode a video file into a float tensor of shape (1, C, T, H, W) in [0, 1].

    Frames are read with OpenCV and converted from BGR to RGB before stacking.
    """
    capture = cv2.VideoCapture(video_path)
    rgb_frames = []
    while True:
        ok, bgr_frame = capture.read()
        if not ok:
            break
        rgb_frames.append(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
    capture.release()

    stacked = torch.tensor(np.array(rgb_frames))           # (T, H, W, C) uint8
    video = stacked.permute(0, 3, 1, 2).float() / 255.0    # (T, C, H, W) in [0, 1]
    return video.unsqueeze(0).permute(0, 2, 1, 3, 4)       # (1, C, T, H, W)