Spaces:

abreza
/

SpatialTrackerV2_ttm

Sleeping

App Files Files Community

abreza committed on Dec 24, 2025

Commit

9462c17

1 Parent(s): 85cb605

move

Browse files

vggt4track_model to CUDA inside Hugging Face ZeroGPU inference

Files changed (1) hide show

app.py +10 -8

app.py CHANGED Viewed

@@ -87,9 +87,9 @@ def create_user_temp_dir():
 # Global model initialization for Spatial Tracker
 print("🚀 Initializing tracking models...")
-vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
 vggt4track_model.eval()
-vggt4track_model = vggt4track_model.to("cuda")
 if not hasattr(vggt4track_model, 'infer'):
     vggt4track_model.infer = vggt4track_model.forward
@@ -105,7 +105,6 @@ wan_pipeline.vae.enable_tiling()
 wan_pipeline.vae.enable_slicing()
 print("✅ Tracking models loaded successfully!")
 gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
@@ -201,7 +200,6 @@ def render_from_pointcloud(rgb_frames, depth_frames, intrinsics, original_extrin
     return {'rendered': output_path, 'motion_signal': motion_signal_path, 'mask': mask_path}
 @spaces.GPU
 def run_spatial_tracker(video_tensor: torch.Tensor):
     """
@@ -216,6 +214,8 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
     # Run VGGT to get depth and camera poses
     video_input = preprocess_image(video_tensor)[None].cuda()
     with torch.no_grad():
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             predictions = vggt4track_model(video_input / 255)
@@ -237,7 +237,8 @@ def run_spatial_tracker(video_tensor: torch.Tensor):
     # Get grid points for tracking
     frame_H, frame_W = video_tensor_gpu.shape[2:]
     grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
-    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()
     # Run tracker
     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
@@ -292,7 +293,6 @@ def run_wan_ttm_generation(prompt, tweak_index, tstrong_index, first_frame_path,
         "毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
     )
     wan_pipeline.to("cuda")
     # Match resolution logic from run_wan.py
@@ -390,7 +390,8 @@ def process_video(video_path, camera_movement, generate_ttm=True, progress=gr.Pr
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="🎬 TTM Wan Video Generator") as demo:
     gr.Markdown("# 🎬 Video to Point Cloud & TTM Wan Generator")
-    gr.Markdown("Transform standard videos into 3D-aware motion signals for Time-to-Move (TTM) generation.")
     # Shared state for TTM files - initialized as empty strings
     first_frame_file = gr.State("")
@@ -437,7 +438,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="🎬 TTM Wan Video Generator") as
             # the path string instead of the raw pixel array.
             motion_signal_output = gr.Video(label="motion_signal.mp4")
             mask_output = gr.Video(label="mask.mp4")
-            first_frame_output = gr.Image(label="first_frame.png", type="filepath")
     # --- Event Handlers ---

 # Global model initialization for Spatial Tracker
 print("🚀 Initializing tracking models...")
+vggt4track_model = VGGT4Track.from_pretrained(
+    "Yuxihenry/SpatialTrackerV2_Front")
 vggt4track_model.eval()
 if not hasattr(vggt4track_model, 'infer'):
     vggt4track_model.infer = vggt4track_model.forward
 wan_pipeline.vae.enable_slicing()
 print("✅ Tracking models loaded successfully!")
 gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
     return {'rendered': output_path, 'motion_signal': motion_signal_path, 'mask': mask_path}
 @spaces.GPU
 def run_spatial_tracker(video_tensor: torch.Tensor):
     """
     # Run VGGT to get depth and camera poses
     video_input = preprocess_image(video_tensor)[None].cuda()
+    vggt4track_model = vggt4track_model.to("cuda")
     with torch.no_grad():
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             predictions = vggt4track_model(video_input / 255)
     # Get grid points for tracking
     frame_H, frame_W = video_tensor_gpu.shape[2:]
     grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
+    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[
+        0].numpy()
     # Run tracker
     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
         "毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
     )
     wan_pipeline.to("cuda")
     # Match resolution logic from run_wan.py
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="🎬 TTM Wan Video Generator") as demo:
     gr.Markdown("# 🎬 Video to Point Cloud & TTM Wan Generator")
+    gr.Markdown(
+        "Transform standard videos into 3D-aware motion signals for Time-to-Move (TTM) generation.")
     # Shared state for TTM files - initialized as empty strings
     first_frame_file = gr.State("")
             # the path string instead of the raw pixel array.
             motion_signal_output = gr.Video(label="motion_signal.mp4")
             mask_output = gr.Video(label="mask.mp4")
+            first_frame_output = gr.Image(
+                label="first_frame.png", type="filepath")
     # --- Event Handlers ---