add some logs to check run issue
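Below is a minimal, self-contained sketch of the startup-logging and lazy model-loading pattern this change moves app.py to: unbuffered logging to stdout so messages reach the Space's run logs, and models loaded only inside a @spaces.GPU-decorated call (with a mock decorator when the spaces package is absent). The names run_inference, _load_model, and _MODEL are illustrative placeholders, not functions from app.py.

# Sketch only; assumes the real `spaces` package is present on HF Spaces and absent locally.
import logging
import sys

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

try:
    import spaces  # real package when running on HF Spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(func=None, duration=None):
            # Supports both @spaces.GPU and @spaces.GPU(duration=...)
            def decorator(f):
                return f
            return decorator if func is None else func

_MODEL = None  # loaded on first GPU call, not at import time

def _load_model():
    global _MODEL
    if _MODEL is None:
        logger.info("Loading model lazily inside the GPU context...")
        _MODEL = object()  # stand-in for e.g. SomeModel.from_pretrained(...)
    return _MODEL

@spaces.GPU(duration=180)
def run_inference(x):
    model = _load_model()
    logger.info("Running inference...")
    sys.stdout.flush()  # make sure the log line reaches the Space's log stream
    return x  # placeholder result

if __name__ == "__main__":
    run_inference(42)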
app.py CHANGED
@@ -1,3 +1,4 @@
+import sys
 import gradio as gr
 import os
 import numpy as np
@@ -6,32 +7,75 @@ import time
 import shutil
 from pathlib import Path
 from einops import rearrange
-from typing import Union
+from typing import Union
+
+# Force unbuffered output for HF Spaces logs
+os.environ['PYTHONUNBUFFERED'] = '1'
+
+# Configure logging FIRST before any other imports
+import logging
+logging.basicConfig(
+level=logging.INFO,
+format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+handlers=[
+logging.StreamHandler(sys.stdout)
+]
+)
+logger = logging.getLogger(__name__)
+logger.info("=" * 50)
+logger.info("Starting application initialization...")
+logger.info("=" * 50)
+sys.stdout.flush()
+
 try:
 import spaces
+logger.info("✅ HF Spaces module imported successfully")
 except ImportError:
+logger.warning("⚠️ HF Spaces module not available, using mock")
 class spaces:
 @staticmethod
 def GPU(func=None, duration=None):
 def decorator(f):
 return f
 return decorator if func is None else func
+sys.stdout.flush()
+
+logger.info("Importing torch...")
+sys.stdout.flush()
 import torch
+logger.info(f"✅ Torch imported. Version: {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
+sys.stdout.flush()
+
 import torch.nn.functional as F
 import torchvision.transforms as T
-import logging
 from concurrent.futures import ThreadPoolExecutor
 import atexit
 import uuid
+
+logger.info("Importing decord...")
+sys.stdout.flush()
 import decord
+logger.info("✅ Decord imported successfully")
+sys.stdout.flush()
+
 from PIL import Image

-
-
-
-from models.SpaTrackV2.models.
+logger.info("Importing SpaTrack models...")
+sys.stdout.flush()
+try:
+from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
+from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
+from models.SpaTrackV2.models.predictor import Predictor
+from models.SpaTrackV2.models.utils import get_points_on_a_grid
+logger.info("✅ SpaTrack models imported successfully")
+except Exception as e:
+logger.error(f"❌ Failed to import SpaTrack models: {e}")
+raise
+sys.stdout.flush()

 # TTM imports (optional - will be loaded on demand)
+logger.info("Checking TTM (diffusers) availability...")
+sys.stdout.flush()
 TTM_COG_AVAILABLE = False
 TTM_WAN_AVAILABLE = False
 try:
@@ -41,8 +85,10 @@ try:
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 TTM_COG_AVAILABLE = True
-
-
+logger.info("✅ CogVideoX TTM available")
+except ImportError as e:
+logger.info(f"ℹ️ CogVideoX TTM not available: {e}")
+sys.stdout.flush()

 try:
 from diffusers import AutoencoderKLWan, WanTransformer3DModel
@@ -54,17 +100,17 @@ try:
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 TTM_WAN_AVAILABLE = True
-
-
+logger.info("✅ Wan TTM available")
+except ImportError as e:
+logger.info(f"ℹ️ Wan TTM not available: {e}")
+sys.stdout.flush()

 TTM_AVAILABLE = TTM_COG_AVAILABLE or TTM_WAN_AVAILABLE
 if not TTM_AVAILABLE:
-
-
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+logger.warning("⚠️ Diffusers not available. TTM features will be disabled.")
+else:
+logger.info(f"TTM Status - CogVideoX: {TTM_COG_AVAILABLE}, Wan: {TTM_WAN_AVAILABLE}")
+sys.stdout.flush()

 # Constants
 MAX_FRAMES = 80
@@ -97,9 +143,12 @@ if TTM_COG_AVAILABLE:
 if TTM_WAN_AVAILABLE:
 TTM_MODELS.append("Wan2.2-14B (Recommended)")

-# Global
+# Global model instances (lazy loaded for HF Spaces GPU compatibility)
+vggt4track_model = None
+tracker_model = None
 ttm_cog_pipeline = None
 ttm_wan_pipeline = None
+MODELS_LOADED = False


 def load_video_to_tensor(video_path: str) -> torch.Tensor:
@@ -150,11 +199,56 @@ def get_ttm_wan_pipeline():
 ttm_wan_pipeline.vae.enable_slicing()
 logger.info("TTM Wan 2.2 pipeline loaded successfully!")
 return ttm_wan_pipeline
-
+
+
+logger.info("Setting up thread pool and utility functions...")
+sys.stdout.flush()

 # Thread pool for delayed deletion
 thread_pool_executor = ThreadPoolExecutor(max_workers=2)

+
+def load_models():
+"""Load models lazily when GPU is available (inside @spaces.GPU decorated function)."""
+global vggt4track_model, tracker_model, MODELS_LOADED
+
+if MODELS_LOADED:
+logger.info("Models already loaded, skipping...")
+return
+
+logger.info("🚀 Starting model loading...")
+sys.stdout.flush()
+
+try:
+logger.info("Loading VGGT4Track model from 'Yuxihenry/SpatialTrackerV2_Front'...")
+sys.stdout.flush()
+vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
+vggt4track_model.eval()
+logger.info("✅ VGGT4Track model loaded, moving to CUDA...")
+sys.stdout.flush()
+vggt4track_model = vggt4track_model.to("cuda")
+logger.info("✅ VGGT4Track model on CUDA")
+sys.stdout.flush()
+
+logger.info("Loading Predictor model from 'Yuxihenry/SpatialTrackerV2-Offline'...")
+sys.stdout.flush()
+tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline")
+tracker_model.eval()
+logger.info("✅ Predictor model loaded")
+sys.stdout.flush()
+
+MODELS_LOADED = True
+logger.info("✅ All models loaded successfully!")
+sys.stdout.flush()
+
+except Exception as e:
+logger.error(f"❌ Failed to load models: {e}")
+import traceback
+traceback.print_exc()
+sys.stdout.flush()
+raise
+
+
 def delete_later(path: Union[str, os.PathLike], delay: int = 600):
 """Delete file or directory after specified delay"""
 def _delete():
@@ -173,6 +267,7 @@ def delete_later(path: Union[str, os.PathLike], delay: int = 600):
 thread_pool_executor.submit(_wait_and_delete)
 atexit.register(_delete)

+
 def create_user_temp_dir():
 """Create a unique temporary directory for each user session"""
 session_id = str(uuid.uuid4())[:8]
@@ -181,17 +276,16 @@ def create_user_temp_dir():
 delete_later(temp_dir, delay=600)
 return temp_dir

-# Global model initialization
-print("🚀 Initializing models...")
-vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
-vggt4track_model.eval()
-vggt4track_model = vggt4track_model.to("cuda")

-
-
-
+# Note: Models are loaded lazily inside @spaces.GPU decorated functions
+# This is required for HF Spaces ZeroGPU compatibility
+logger.info("Models will be loaded lazily when GPU is available")
+sys.stdout.flush()

+logger.info("Setting up Gradio static paths...")
 gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
+logger.info("✅ Static paths configured")
+sys.stdout.flush()


 def generate_camera_trajectory(num_frames: int, movement_type: str,
@@ -217,7 +311,8 @@ def generate_camera_trajectory(num_frames: int, movement_type: str,
 if movement_type == "static":
 pass # Keep identity
 elif movement_type == "move_forward":
-
+# Move along -Z (forward in OpenGL convention)
+ext[2, 3] = -speed * t
 elif movement_type == "move_backward":
 ext[2, 3] = speed * t # Move along +Z
 elif movement_type == "move_left":
@@ -274,7 +369,8 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
 base_dir = os.path.dirname(output_path)
 motion_signal_path = os.path.join(base_dir, "motion_signal.mp4")
 mask_path = os.path.join(base_dir, "mask.mp4")
-out_motion_signal = cv2.VideoWriter(
+out_motion_signal = cv2.VideoWriter(
+motion_signal_path, fourcc, fps, (W, H))
 out_mask = cv2.VideoWriter(mask_path, fourcc, fps, (W, H))

 # Create meshgrid for pixel coordinates
@@ -354,17 +450,21 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
 if hole_mask.sum() == 0:
 break
 dilated = cv2.dilate(motion_signal_frame, kernel, iterations=1)
-motion_signal_frame = np.where(
-
+motion_signal_frame = np.where(
+hole_mask[:, :, None] > 0, dilated, motion_signal_frame)
+hole_mask = (motion_signal_frame.sum(
+axis=-1) == 0).astype(np.uint8)

 # Write TTM outputs if enabled
 if generate_ttm_inputs:
 # Motion signal: warped frame with NN inpainting
-motion_signal_bgr = cv2.cvtColor(
+motion_signal_bgr = cv2.cvtColor(
+motion_signal_frame, cv2.COLOR_RGB2BGR)
 out_motion_signal.write(motion_signal_bgr)

 # Mask: binary mask of valid (projected) pixels - white where valid, black where holes
-mask_frame = np.stack(
+mask_frame = np.stack(
+[valid_mask, valid_mask, valid_mask], axis=-1)
 out_mask.write(mask_frame)

 # For the rendered output, use the same inpainted result
@@ -384,7 +484,7 @@ def render_from_pointcloud(rgb_frames: np.ndarray,
 }


-@spaces.GPU
+@spaces.GPU(duration=180)
 def run_spatial_tracker(video_tensor: torch.Tensor):
 """
 GPU-intensive spatial tracking function.
@@ -395,9 +495,23 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
 Returns:
 Dictionary containing tracking results
 """
+global vggt4track_model, tracker_model
+
+logger.info("run_spatial_tracker: Starting GPU execution...")
+sys.stdout.flush()
+
+# Load models if not already loaded (lazy loading for HF Spaces)
+load_models()
+
+logger.info("run_spatial_tracker: Preprocessing video input...")
+sys.stdout.flush()
+
 # Run VGGT to get depth and camera poses
 video_input = preprocess_image(video_tensor)[None].cuda()

+logger.info("run_spatial_tracker: Running VGGT inference...")
+sys.stdout.flush()
+
 with torch.no_grad():
 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
 predictions = vggt4track_model(video_input / 255)
@@ -406,6 +520,9 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
 depth_map = predictions["points_map"][..., 2]
 depth_conf = predictions["unc_metric"]

+logger.info("run_spatial_tracker: VGGT inference complete")
+sys.stdout.flush()
+
 depth_tensor = depth_map.squeeze().cpu().numpy()
 extrs = extrinsic.squeeze().cpu().numpy()
 intrs = intrinsic.squeeze().cpu().numpy()
@@ -413,13 +530,20 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
 unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5

 # Setup tracker
+logger.info("run_spatial_tracker: Setting up tracker...")
+sys.stdout.flush()
+
 tracker_model.spatrack.track_num = 512
 tracker_model.to("cuda")

 # Get grid points for tracking
 frame_H, frame_W = video_tensor_gpu.shape[2:]
 grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
-query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[
+query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[
+0].numpy()
+
+logger.info("run_spatial_tracker: Running 3D tracker...")
+sys.stdout.flush()

 # Run tracker
 with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
@@ -447,8 +571,11 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
 conf_depth = T.Resize((new_h, new_w))(conf_depth)
 intrs_out[:, :2, :] = intrs_out[:, :2, :] * scale

+logger.info("run_spatial_tracker: Moving results to CPU...")
+sys.stdout.flush()
+
 # Move results to CPU and return
-
+result = {
 'video_out': video_out.cpu(),
 'point_map': point_map.cpu(),
 'conf_depth': conf_depth.cpu(),
@@ -456,6 +583,11 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
 'c2w_traj': c2w_traj.cpu(),
 }

+logger.info("run_spatial_tracker: Complete!")
+sys.stdout.flush()
+
+return result
+

 def process_video(video_path: str, camera_movement: str, generate_ttm: bool = True, progress=gr.Progress()):
 """Main processing function
@@ -511,7 +643,8 @@ def process_video(video_path: str, camera_movement: str, generate_ttm: bool = Tr
 c2w_traj = tracking_results['c2w_traj']

 # Get RGB frames and depth
-rgb_frames = rearrange(
+rgb_frames = rearrange(
+video_out.numpy(), "T C H W -> T H W C").astype(np.uint8)
 depth_frames = point_map[:, 2].numpy()
 depth_conf_np = conf_depth.numpy()

@@ -522,7 +655,8 @@ def process_video(video_path: str, camera_movement: str, generate_ttm: bool = Tr
 intrs_np = intrs_out.numpy()
 extrs_np = torch.inverse(c2w_traj).numpy() # world-to-camera

-progress(
+progress(
+0.7, desc=f"Generating {camera_movement} camera trajectory...")

 # Calculate scene scale from depth
 valid_depth = depth_frames[depth_frames > 0]
@@ -586,7 +720,8 @@ class CogVideoXTTMHelper:
 self.vae = pipeline.vae
 self.transformer = pipeline.transformer
 self.scheduler = pipeline.scheduler
-self.vae_scale_factor_spatial = 2 ** (
+self.vae_scale_factor_spatial = 2 ** (
+len(self.vae.config.block_out_channels) - 1)
 self.vae_scale_factor_temporal = self.vae.config.temporal_compression_ratio
 self.vae_scaling_factor_image = self.vae.config.scaling_factor
 self.video_processor = pipeline.video_processor
@@ -596,7 +731,8 @@ class CogVideoXTTMHelper:
 """Encode video frames into latent space. Input shape (B, C, F, H, W), expected range [-1, 1]."""
 latents = self.vae.encode(frames)[0].sample()
 latents = latents * self.vae_scaling_factor_image
-
+# (B, C, F, H, W) -> (B, F, C, H, W)
+return latents.permute(0, 2, 1, 3, 4).contiguous()

 def convert_rgb_mask_to_latent_mask(self, mask: torch.Tensor) -> torch.Tensor:
 """Convert a per-frame mask [T, 1, H, W] to latent resolution [1, T_latent, 1, H', W']."""
@@ -610,7 +746,8 @@ class CogVideoXTTMHelper:
 s = self.vae_scale_factor_spatial
 H_latent = pooled.shape[-2] // s
 W_latent = pooled.shape[-1] // s
-pooled = F.interpolate(pooled, size=(
+pooled = F.interpolate(pooled, size=(
+pooled.shape[2], H_latent, W_latent), mode="nearest")

 latent_mask = pooled.permute(0, 2, 1, 3, 4)
 return latent_mask
@@ -641,7 +778,8 @@ class WanTTMHelper:
 s = self.vae_scale_factor_spatial
 H_latent = pooled.shape[-2] // s
 W_latent = pooled.shape[-1] // s
-pooled = F.interpolate(pooled, size=(
+pooled = F.interpolate(pooled, size=(
+pooled.shape[2], H_latent, W_latent), mode="nearest")

 latent_mask = pooled.permute(0, 2, 1, 3, 4)
 return latent_mask
@@ -698,8 +836,10 @@ def run_ttm_cog_generation(
 image = load_image(first_frame_path)

 # Get dimensions
-height = pipe.transformer.config.sample_height *
-
+height = pipe.transformer.config.sample_height * \
+ttm_helper.vae_scale_factor_spatial
+width = pipe.transformer.config.sample_width * \
+ttm_helper.vae_scale_factor_spatial

 device = "cuda"
 generator = torch.Generator(device=device).manual_seed(seed)
@@ -717,7 +857,8 @@ def run_ttm_cog_generation(
 device=device,
 )
 if do_classifier_free_guidance:
-prompt_embeds = torch.cat(
+prompt_embeds = torch.cat(
+[negative_prompt_embeds, prompt_embeds], dim=0)

 progress(0.2, desc="Preparing latents...")

@@ -726,7 +867,8 @@ def run_ttm_cog_generation(
 timesteps = pipe.scheduler.timesteps

 # Prepare latents
-latent_frames = (
+latent_frames = (
+num_frames - 1) // ttm_helper.vae_scale_factor_temporal + 1

 # Handle padding for CogVideoX 1.5
 patch_size_t = pipe.transformer.config.patch_size_t
@@ -760,11 +902,13 @@ def run_ttm_cog_generation(
 ref_vid = load_video_to_tensor(motion_signal_path).to(device=device)
 refB, refC, refT, refH, refW = ref_vid.shape
 ref_vid = F.interpolate(
-ref_vid.permute(0, 2, 1, 3, 4).reshape(
+ref_vid.permute(0, 2, 1, 3, 4).reshape(
+refB*refT, refC, refH, refW),
 size=(height, width), mode="bicubic", align_corners=True,
 ).reshape(refB, refT, refC, height, width).permute(0, 2, 1, 3, 4)

-ref_vid = ttm_helper.video_processor.normalize(
+ref_vid = ttm_helper.video_processor.normalize(
+ref_vid.to(dtype=pipe.vae.dtype))
 ref_latents = ttm_helper.encode_frames(ref_vid).float().detach()

 # Load mask video
@@ -795,8 +939,10 @@ def run_ttm_cog_generation(
 device=ref_latents.device,
 dtype=ref_latents.dtype,
 )
-noisy_latents = pipe.scheduler.add_noise(
-
+noisy_latents = pipe.scheduler.add_noise(
+ref_latents, fixed_noise, tweak.long())
+latents = noisy_latents.to(
+dtype=latents.dtype, device=latents.device)
 else:
 fixed_noise = randn_tensor(
 ref_latents.shape,
@@ -811,13 +957,15 @@ def run_ttm_cog_generation(

 # Create rotary embeddings if required
 image_rotary_emb = (
-pipe._prepare_rotary_positional_embeddings(
+pipe._prepare_rotary_positional_embeddings(
+height, width, latents.size(1), device)
 if pipe.transformer.config.use_rotary_positional_embeddings
 else None
 )

 # Create ofs embeddings if required
-ofs_emb = None if pipe.transformer.config.ofs_embed_dim is None else latents.new_full(
+ofs_emb = None if pipe.transformer.config.ofs_embed_dim is None else latents.new_full(
+(1,), fill_value=2.0)

 progress(0.4, desc="Running TTM denoising loop...")

@@ -827,13 +975,18 @@ def run_ttm_cog_generation(

 for i, t in enumerate(timesteps[tweak_index:]):
 step_progress = 0.4 + 0.5 * (i / total_steps)
-progress(step_progress,
+progress(step_progress,
+desc=f"Denoising step {i+1}/{total_steps}...")

-latent_model_input = torch.cat(
-
+latent_model_input = torch.cat(
+[latents] * 2) if do_classifier_free_guidance else latents
+latent_model_input = pipe.scheduler.scale_model_input(
+latent_model_input, t)

-latent_image_input = torch.cat(
-
+latent_image_input = torch.cat(
+[image_latents] * 2) if do_classifier_free_guidance else image_latents
+latent_model_input = torch.cat(
+[latent_model_input, latent_image_input], dim=2)

 timestep = t.expand(latent_model_input.shape[0])

@@ -851,7 +1004,8 @@ def run_ttm_cog_generation(
 # Perform guidance
 if do_classifier_free_guidance:
 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-noise_pred = noise_pred_uncond + guidance_scale *
+noise_pred = noise_pred_uncond + guidance_scale * \
+(noise_pred_text - noise_pred_uncond)

 # Compute previous noisy sample
 if not isinstance(pipe.scheduler, CogVideoXDPMScheduler):
@@ -889,7 +1043,8 @@ def run_ttm_cog_generation(
 # Decode latents
 latents = latents[:, additional_frames:]
 frames = pipe.decode_latents(latents)
-video = ttm_helper.video_processor.postprocess_video(
+video = ttm_helper.video_processor.postprocess_video(
+video=frames, output_type="pil")

 progress(0.95, desc="Saving video...")

@@ -954,8 +1109,10 @@ def run_ttm_wan_generation(

 # Get dimensions - compute based on image aspect ratio
 max_area = 480 * 832
-mod_value = ttm_helper.vae_scale_factor_spatial *
-
+mod_value = ttm_helper.vae_scale_factor_spatial * \
+pipe.transformer.config.patch_size[1]
+height, width = compute_hw_from_area(
+image.height, image.width, max_area, mod_value)
 image = image.resize((width, height))

 device = "cuda"
@@ -979,7 +1136,8 @@ def run_ttm_wan_generation(
 transformer_dtype = pipe.transformer.dtype
 prompt_embeds = prompt_embeds.to(transformer_dtype)
 if negative_prompt_embeds is not None:
-negative_prompt_embeds = negative_prompt_embeds.to(
+negative_prompt_embeds = negative_prompt_embeds.to(
+transformer_dtype)

 # Encode image embedding if transformer supports it
 image_embeds = None
@@ -996,12 +1154,14 @@ def run_ttm_wan_generation(

 # Adjust num_frames to be valid for VAE
 if num_frames % ttm_helper.vae_scale_factor_temporal != 1:
-num_frames = num_frames // ttm_helper.vae_scale_factor_temporal *
+num_frames = num_frames // ttm_helper.vae_scale_factor_temporal * \
+ttm_helper.vae_scale_factor_temporal + 1
 num_frames = max(num_frames, 1)

 # Prepare latent variables
 num_channels_latents = pipe.vae.config.z_dim
-image_tensor = ttm_helper.video_processor.preprocess(
+image_tensor = ttm_helper.video_processor.preprocess(
+image, height=height, width=width).to(device, dtype=torch.float32)

 latents_outputs = pipe.prepare_latents(
 image_tensor,
@@ -1029,16 +1189,21 @@ def run_ttm_wan_generation(
 ref_vid = load_video_to_tensor(motion_signal_path).to(device=device)
 refB, refC, refT, refH, refW = ref_vid.shape
 ref_vid = F.interpolate(
-ref_vid.permute(0, 2, 1, 3, 4).reshape(
+ref_vid.permute(0, 2, 1, 3, 4).reshape(
+refB*refT, refC, refH, refW),
 size=(height, width), mode="bicubic", align_corners=True,
 ).reshape(refB, refT, refC, height, width).permute(0, 2, 1, 3, 4)

-ref_vid = ttm_helper.video_processor.normalize(
-
+ref_vid = ttm_helper.video_processor.normalize(
+ref_vid.to(dtype=pipe.vae.dtype))
+ref_latents = retrieve_latents(
+pipe.vae.encode(ref_vid), sample_mode="argmax")

 # Normalize latents
-latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
-
+latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
+1, pipe.vae.config.z_dim, 1, 1, 1).to(ref_latents.device, ref_latents.dtype)
+latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(
+1, pipe.vae.config.z_dim, 1, 1, 1).to(ref_latents.device, ref_latents.dtype)
 ref_latents = (ref_latents - latents_mean) * latents_std

 # Load mask video
@@ -1062,7 +1227,8 @@ def run_ttm_wan_generation(
 else:
 mask_t1_hw = (mask_tc_hw > 0.5).float()

-motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(
+motion_mask = ttm_helper.convert_rgb_mask_to_latent_mask(
+mask_t1_hw).permute(0, 2, 1, 3, 4).contiguous()
 background_mask = 1.0 - motion_mask

 progress(0.35, desc="Initializing TTM denoising...")
@@ -1076,9 +1242,12 @@ def run_ttm_wan_generation(
 device=ref_latents.device,
 dtype=ref_latents.dtype,
 )
-tweak_t = torch.as_tensor(
-
-
+tweak_t = torch.as_tensor(
+tweak, device=ref_latents.device, dtype=torch.long).view(1)
+noisy_latents = pipe.scheduler.add_noise(
+ref_latents, fixed_noise, tweak_t.long())
+latents = noisy_latents.to(
+dtype=latents.dtype, device=latents.device)
 else:
 fixed_noise = randn_tensor(
 ref_latents.shape,
@@ -1095,16 +1264,19 @@ def run_ttm_wan_generation(

 for i, t in enumerate(timesteps[tweak_index:]):
 step_progress = 0.4 + 0.5 * (i / total_steps)
-progress(step_progress,
+progress(step_progress,
+desc=f"Denoising step {i+1}/{total_steps}...")

 # Prepare model input
 if first_frame_mask is not None:
-latent_model_input = (1 - first_frame_mask) *
+latent_model_input = (1 - first_frame_mask) * \
+condition + first_frame_mask * latents
 latent_model_input = latent_model_input.to(transformer_dtype)
 temp_ts = (first_frame_mask[0][0][:, ::2, ::2] * t).flatten()
 timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
 else:
-latent_model_input = torch.cat(
+latent_model_input = torch.cat(
+[latents, condition], dim=1).to(transformer_dtype)
 timestep = t.expand(latents.shape[0])

 # Predict noise (conditional)
@@ -1125,10 +1297,12 @@ def run_ttm_wan_generation(
 encoder_hidden_states_image=image_embeds,
 return_dict=False,
 )[0]
-noise_pred = noise_uncond + guidance_scale *
+noise_pred = noise_uncond + guidance_scale * \
+(noise_pred - noise_uncond)

 # Scheduler step
-latents = pipe.scheduler.step(
+latents = pipe.scheduler.step(
+noise_pred, t, latents, return_dict=False)[0]

 # TTM: In between tweak and tstrong, replace mask with noisy reference latents
 in_between_tweak_tstrong = (i + tweak_index) < tstrong_index
@@ -1136,27 +1310,34 @@ def run_ttm_wan_generation(
 if in_between_tweak_tstrong:
 if i + tweak_index + 1 < len(timesteps):
 prev_t = timesteps[i + tweak_index + 1]
-prev_t = torch.as_tensor(
+prev_t = torch.as_tensor(
+prev_t, device=ref_latents.device, dtype=torch.long).view(1)
 noisy_latents = pipe.scheduler.add_noise(ref_latents, fixed_noise, prev_t.long()).to(
 dtype=latents.dtype, device=latents.device
 )
 latents = latents * background_mask + noisy_latents * motion_mask
 else:
-latents = latents * background_mask +
+latents = latents * background_mask + \
+ref_latents.to(dtype=latents.dtype,
+device=latents.device) * motion_mask

 progress(0.9, desc="Decoding video...")

 # Apply first frame mask if used
 if first_frame_mask is not None:
-latents = (1 - first_frame_mask) * condition +
+latents = (1 - first_frame_mask) * condition + \
+first_frame_mask * latents

 # Decode latents
 latents = latents.to(pipe.vae.dtype)
-latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
-
+latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(
+1, pipe.vae.config.z_dim, 1, 1, 1).to(latents.device, latents.dtype)
+latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(
+1, pipe.vae.config.z_dim, 1, 1, 1).to(latents.device, latents.dtype)
 latents = latents / latents_std + latents_mean
 video = pipe.vae.decode(latents, return_dict=False)[0]
-video = ttm_helper.video_processor.postprocess_video(
+video = ttm_helper.video_processor.postprocess_video(
+video, output_type="pil")

 progress(0.95, desc="Saving video...")

@@ -1226,7 +1407,8 @@ def run_ttm_generation(


 # Create Gradio interface
-
+logger.info("🎨 Creating Gradio interface...")
+sys.stdout.flush()

 with gr.Blocks(
 theme=gr.themes.Soft(),
@@ -1283,7 +1465,8 @@ with gr.Blocks(
 info="Generate motion_signal.mp4 and mask.mp4 for Time-to-Move"
 )

-generate_btn = gr.Button(
+generate_btn = gr.Button(
+"🚀 Generate Motion Signal", variant="primary", size="lg")

 with gr.Column(scale=1):
 gr.Markdown("### 📤 Rendered Output")
@@ -1419,7 +1602,8 @@ with gr.Blocks(
 label="TTM Generated Video",
 height=400
 )
-ttm_status_text = gr.Markdown(
+ttm_status_text = gr.Markdown(
+"Upload a video in Step 1 first, then run TTM here.")

 # TTM Input preview
 with gr.Accordion("📁 TTM Input Files (from Step 1)", open=False):
@@ -1439,7 +1623,8 @@ with gr.Blocks(

 # Helper function to update states and preview
 def process_and_update_states(video_path, camera_movement, generate_ttm_flag, progress=gr.Progress()):
-result = process_video(video_path, camera_movement,
+result = process_video(video_path, camera_movement,
+generate_ttm_flag, progress)
 output_vid, motion_sig, mask_vid, first_frame, status = result

 # Return all outputs including state updates and previews
@@ -1491,10 +1676,12 @@ with gr.Blocks(
 # Examples
 gr.Markdown("### 📁 Examples")
 if os.path.exists("./examples"):
-example_videos = [f for f in os.listdir(
+example_videos = [f for f in os.listdir(
+"./examples") if f.endswith(".mp4")][:4]
 if example_videos:
 gr.Examples(
-examples=[[f"./examples/{v}", "move_forward", True]
+examples=[[f"./examples/{v}", "move_forward", True]
+for v in example_videos],
 inputs=[video_input, camera_movement, generate_ttm],
 outputs=[
 output_video, motion_signal_output, mask_output, first_frame_output, status_text,
@@ -1506,5 +1693,13 @@ with gr.Blocks(
 )

 # Launch
+logger.info("✅ Gradio interface created successfully!")
+logger.info("=" * 50)
+logger.info("Application ready to launch")
+logger.info("=" * 50)
+sys.stdout.flush()
+
 if __name__ == "__main__":
+logger.info("Starting Gradio server...")
+sys.stdout.flush()
 demo.launch(share=False)
|