Spaces:
Running on Zero
Running on Zero
xiaoyuxi commited on
Commit ·
e43b66a
1
Parent(s): 4d35051
vggt_da
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +69 -0
- _viz/viz_template.html +1769 -0
- app.py +508 -645
- app_3rd/README.md +12 -0
- app_3rd/sam_utils/hf_sam_predictor.py +129 -0
- app_3rd/sam_utils/inference.py +123 -0
- app_3rd/spatrack_utils/infer_track.py +195 -0
- app_release.py +1278 -0
- config/__init__.py +0 -0
- config/magic_infer_moge.yaml +48 -0
- frontend_app_local.py +1036 -0
- models/SpaTrackV2/models/SpaTrack.py +758 -0
- models/SpaTrackV2/models/__init__.py +0 -0
- models/SpaTrackV2/models/blocks.py +519 -0
- models/SpaTrackV2/models/camera_transform.py +248 -0
- models/SpaTrackV2/models/depth_refiner/backbone.py +472 -0
- models/SpaTrackV2/models/depth_refiner/decode_head.py +619 -0
- models/SpaTrackV2/models/depth_refiner/depth_refiner.py +115 -0
- models/SpaTrackV2/models/depth_refiner/network.py +429 -0
- models/SpaTrackV2/models/depth_refiner/stablilization_attention.py +1187 -0
- models/SpaTrackV2/models/depth_refiner/stablizer.py +342 -0
- models/SpaTrackV2/models/predictor.py +153 -0
- models/SpaTrackV2/models/tracker3D/TrackRefiner.py +1478 -0
- models/SpaTrackV2/models/tracker3D/co_tracker/cotracker_base.py +418 -0
- models/SpaTrackV2/models/tracker3D/co_tracker/utils.py +929 -0
- models/SpaTrackV2/models/tracker3D/delta_utils/__init__.py +0 -0
- models/SpaTrackV2/models/tracker3D/delta_utils/blocks.py +842 -0
- models/SpaTrackV2/models/tracker3D/delta_utils/upsample_transformer.py +438 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/alignment.py +471 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/ba.py +538 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/blocks.py +15 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/dynamic_point_refine.py +0 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/geometry_numpy.py +401 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/geometry_torch.py +323 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/pointmap_updator.py +104 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/simple_vit_1d.py +125 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/tools.py +289 -0
- models/SpaTrackV2/models/tracker3D/spatrack_modules/utils.py +1006 -0
- models/SpaTrackV2/models/utils.py +1221 -0
- models/SpaTrackV2/utils/embeddings.py +247 -0
- models/SpaTrackV2/utils/model_utils.py +444 -0
- models/SpaTrackV2/utils/visualizer.py +352 -0
- models/moge/__init__.py +0 -0
- models/moge/model/__init__.py +18 -0
- models/moge/model/dinov2/__init__.py +6 -0
- models/moge/model/dinov2/hub/__init__.py +4 -0
- models/moge/model/dinov2/hub/backbones.py +156 -0
- models/moge/model/dinov2/hub/utils.py +39 -0
- models/moge/model/dinov2/layers/__init__.py +11 -0
- models/moge/model/dinov2/layers/attention.py +89 -0
.gitignore
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ignore the multi media
|
| 2 |
+
checkpoints
|
| 3 |
+
**/checkpoints/
|
| 4 |
+
**/temp/
|
| 5 |
+
temp
|
| 6 |
+
assets_dev
|
| 7 |
+
assets/example0/results
|
| 8 |
+
assets/example0/snowboard.npz
|
| 9 |
+
assets/example1/results
|
| 10 |
+
assets/davis_eval
|
| 11 |
+
assets/*/results
|
| 12 |
+
*gradio*
|
| 13 |
+
#
|
| 14 |
+
models/monoD/zoeDepth/ckpts/*
|
| 15 |
+
models/monoD/depth_anything/ckpts/*
|
| 16 |
+
vis_results
|
| 17 |
+
dist_encrypted
|
| 18 |
+
# remove the dependencies
|
| 19 |
+
deps
|
| 20 |
+
|
| 21 |
+
# filter the __pycache__ files
|
| 22 |
+
__pycache__/
|
| 23 |
+
/**/**/__pycache__
|
| 24 |
+
/**/__pycache__
|
| 25 |
+
|
| 26 |
+
outputs
|
| 27 |
+
scripts/lauch_exp/config
|
| 28 |
+
scripts/lauch_exp/submit_job.log
|
| 29 |
+
scripts/lauch_exp/hydra_output
|
| 30 |
+
scripts/lauch_wulan
|
| 31 |
+
scripts/custom_video
|
| 32 |
+
# ignore the visualizer
|
| 33 |
+
viser
|
| 34 |
+
viser_result
|
| 35 |
+
benchmark/results
|
| 36 |
+
benchmark
|
| 37 |
+
|
| 38 |
+
ossutil_output
|
| 39 |
+
|
| 40 |
+
prev_version
|
| 41 |
+
spat_ceres
|
| 42 |
+
wandb
|
| 43 |
+
*.log
|
| 44 |
+
seg_target.py
|
| 45 |
+
|
| 46 |
+
eval_davis.py
|
| 47 |
+
eval_multiple_gpu.py
|
| 48 |
+
eval_pose_scan.py
|
| 49 |
+
eval_single_gpu.py
|
| 50 |
+
|
| 51 |
+
infer_cam.py
|
| 52 |
+
infer_stream.py
|
| 53 |
+
|
| 54 |
+
*.egg-info/
|
| 55 |
+
**/*.egg-info
|
| 56 |
+
|
| 57 |
+
eval_kinectics.py
|
| 58 |
+
models/SpaTrackV2/datasets
|
| 59 |
+
|
| 60 |
+
scripts
|
| 61 |
+
config/fix_2d.yaml
|
| 62 |
+
|
| 63 |
+
models/SpaTrackV2/datasets
|
| 64 |
+
scripts/
|
| 65 |
+
|
| 66 |
+
models/**/build
|
| 67 |
+
models/**/dist
|
| 68 |
+
|
| 69 |
+
temp_local
|
_viz/viz_template.html
ADDED
|
@@ -0,0 +1,1769 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>3D Point Cloud Visualizer</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--primary: #9b59b6; /* Brighter purple for dark mode */
|
| 10 |
+
--primary-light: #3a2e4a;
|
| 11 |
+
--secondary: #a86add;
|
| 12 |
+
--accent: #ff6e6e;
|
| 13 |
+
--bg: #1a1a1a;
|
| 14 |
+
--surface: #2c2c2c;
|
| 15 |
+
--text: #e0e0e0;
|
| 16 |
+
--text-secondary: #a0a0a0;
|
| 17 |
+
--border: #444444;
|
| 18 |
+
--shadow: rgba(0, 0, 0, 0.2);
|
| 19 |
+
--shadow-hover: rgba(0, 0, 0, 0.3);
|
| 20 |
+
|
| 21 |
+
--space-sm: 16px;
|
| 22 |
+
--space-md: 24px;
|
| 23 |
+
--space-lg: 32px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
body {
|
| 27 |
+
margin: 0;
|
| 28 |
+
overflow: hidden;
|
| 29 |
+
background: var(--bg);
|
| 30 |
+
color: var(--text);
|
| 31 |
+
font-family: 'Inter', sans-serif;
|
| 32 |
+
-webkit-font-smoothing: antialiased;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
#canvas-container {
|
| 36 |
+
position: absolute;
|
| 37 |
+
width: 100%;
|
| 38 |
+
height: 100%;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
#ui-container {
|
| 42 |
+
position: absolute;
|
| 43 |
+
top: 0;
|
| 44 |
+
left: 0;
|
| 45 |
+
width: 100%;
|
| 46 |
+
height: 100%;
|
| 47 |
+
pointer-events: none;
|
| 48 |
+
z-index: 10;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
#status-bar {
|
| 52 |
+
position: absolute;
|
| 53 |
+
top: 16px;
|
| 54 |
+
left: 16px;
|
| 55 |
+
background: rgba(30, 30, 30, 0.9);
|
| 56 |
+
padding: 8px 16px;
|
| 57 |
+
border-radius: 8px;
|
| 58 |
+
pointer-events: auto;
|
| 59 |
+
box-shadow: 0 4px 6px var(--shadow);
|
| 60 |
+
backdrop-filter: blur(4px);
|
| 61 |
+
border: 1px solid var(--border);
|
| 62 |
+
color: var(--text);
|
| 63 |
+
transition: opacity 0.5s ease, transform 0.5s ease;
|
| 64 |
+
font-weight: 500;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
#status-bar.hidden {
|
| 68 |
+
opacity: 0;
|
| 69 |
+
transform: translateY(-20px);
|
| 70 |
+
pointer-events: none;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
#control-panel {
|
| 74 |
+
position: absolute;
|
| 75 |
+
bottom: 16px;
|
| 76 |
+
left: 50%;
|
| 77 |
+
transform: translateX(-50%);
|
| 78 |
+
background: rgba(44, 44, 44, 0.95);
|
| 79 |
+
padding: 12px 16px;
|
| 80 |
+
border-radius: 12px;
|
| 81 |
+
display: flex;
|
| 82 |
+
gap: 16px;
|
| 83 |
+
align-items: center;
|
| 84 |
+
pointer-events: auto;
|
| 85 |
+
box-shadow: 0 4px 10px var(--shadow);
|
| 86 |
+
backdrop-filter: blur(4px);
|
| 87 |
+
border: 1px solid var(--border);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
#timeline {
|
| 91 |
+
width: 400px;
|
| 92 |
+
height: 8px;
|
| 93 |
+
background: rgba(255, 255, 255, 0.1);
|
| 94 |
+
border-radius: 4px;
|
| 95 |
+
position: relative;
|
| 96 |
+
cursor: pointer;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
#progress {
|
| 100 |
+
position: absolute;
|
| 101 |
+
height: 100%;
|
| 102 |
+
background: var(--primary);
|
| 103 |
+
border-radius: 4px;
|
| 104 |
+
width: 0%;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
#playback-controls {
|
| 108 |
+
display: flex;
|
| 109 |
+
gap: 8px;
|
| 110 |
+
align-items: center;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
button {
|
| 114 |
+
background: rgba(255, 255, 255, 0.08);
|
| 115 |
+
border: 1px solid var(--border);
|
| 116 |
+
color: var(--text);
|
| 117 |
+
padding: 8px 12px;
|
| 118 |
+
border-radius: 6px;
|
| 119 |
+
cursor: pointer;
|
| 120 |
+
display: flex;
|
| 121 |
+
align-items: center;
|
| 122 |
+
justify-content: center;
|
| 123 |
+
transition: background 0.2s, transform 0.2s;
|
| 124 |
+
font-family: 'Inter', sans-serif;
|
| 125 |
+
font-weight: 500;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
button:hover {
|
| 129 |
+
background: rgba(255, 255, 255, 0.15);
|
| 130 |
+
transform: translateY(-1px);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
button.active {
|
| 134 |
+
background: var(--primary);
|
| 135 |
+
color: white;
|
| 136 |
+
box-shadow: 0 2px 8px rgba(155, 89, 182, 0.4);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
select, input {
|
| 140 |
+
background: rgba(255, 255, 255, 0.08);
|
| 141 |
+
border: 1px solid var(--border);
|
| 142 |
+
color: var(--text);
|
| 143 |
+
padding: 8px 12px;
|
| 144 |
+
border-radius: 6px;
|
| 145 |
+
cursor: pointer;
|
| 146 |
+
font-family: 'Inter', sans-serif;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.icon {
|
| 150 |
+
width: 20px;
|
| 151 |
+
height: 20px;
|
| 152 |
+
fill: currentColor;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.tooltip {
|
| 156 |
+
position: absolute;
|
| 157 |
+
bottom: 100%;
|
| 158 |
+
left: 50%;
|
| 159 |
+
transform: translateX(-50%);
|
| 160 |
+
background: var(--surface);
|
| 161 |
+
color: var(--text);
|
| 162 |
+
padding: 6px 12px;
|
| 163 |
+
border-radius: 6px;
|
| 164 |
+
font-size: 14px;
|
| 165 |
+
white-space: nowrap;
|
| 166 |
+
margin-bottom: 8px;
|
| 167 |
+
opacity: 0;
|
| 168 |
+
transition: opacity 0.2s;
|
| 169 |
+
pointer-events: none;
|
| 170 |
+
box-shadow: 0 2px 4px var(--shadow);
|
| 171 |
+
border: 1px solid var(--border);
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
button:hover .tooltip {
|
| 175 |
+
opacity: 1;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
#settings-panel {
|
| 179 |
+
position: absolute;
|
| 180 |
+
top: 16px;
|
| 181 |
+
right: 16px;
|
| 182 |
+
background: rgba(44, 44, 44, 0.98);
|
| 183 |
+
padding: 20px;
|
| 184 |
+
border-radius: 12px;
|
| 185 |
+
width: 300px;
|
| 186 |
+
max-height: calc(100vh - 40px);
|
| 187 |
+
overflow-y: auto;
|
| 188 |
+
pointer-events: auto;
|
| 189 |
+
box-shadow: 0 4px 15px var(--shadow);
|
| 190 |
+
backdrop-filter: blur(4px);
|
| 191 |
+
border: 1px solid var(--border);
|
| 192 |
+
display: block;
|
| 193 |
+
opacity: 1;
|
| 194 |
+
scrollbar-width: thin;
|
| 195 |
+
scrollbar-color: var(--primary-light) transparent;
|
| 196 |
+
transition: transform 0.35s ease-in-out, opacity 0.3s ease-in-out;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
#settings-panel.is-hidden {
|
| 200 |
+
transform: translateX(calc(100% + 20px));
|
| 201 |
+
opacity: 0;
|
| 202 |
+
pointer-events: none;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
#settings-panel::-webkit-scrollbar {
|
| 206 |
+
width: 6px;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
#settings-panel::-webkit-scrollbar-track {
|
| 210 |
+
background: transparent;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
#settings-panel::-webkit-scrollbar-thumb {
|
| 214 |
+
background-color: var(--primary-light);
|
| 215 |
+
border-radius: 6px;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
@media (max-height: 700px) {
|
| 219 |
+
#settings-panel {
|
| 220 |
+
max-height: calc(100vh - 40px);
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
@media (max-width: 768px) {
|
| 225 |
+
#control-panel {
|
| 226 |
+
width: 90%;
|
| 227 |
+
flex-wrap: wrap;
|
| 228 |
+
justify-content: center;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
#timeline {
|
| 232 |
+
width: 100%;
|
| 233 |
+
order: 3;
|
| 234 |
+
margin-top: 10px;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
#settings-panel {
|
| 238 |
+
width: 280px;
|
| 239 |
+
right: 10px;
|
| 240 |
+
top: 10px;
|
| 241 |
+
max-height: calc(100vh - 20px);
|
| 242 |
+
}
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
.settings-group {
|
| 246 |
+
margin-bottom: 16px;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.settings-group h3 {
|
| 250 |
+
margin: 0 0 8px 0;
|
| 251 |
+
font-size: 14px;
|
| 252 |
+
font-weight: 500;
|
| 253 |
+
color: var(--text-secondary);
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
.slider-container {
|
| 257 |
+
display: flex;
|
| 258 |
+
align-items: center;
|
| 259 |
+
gap: 12px;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
.slider-container label {
|
| 263 |
+
min-width: 80px;
|
| 264 |
+
font-size: 14px;
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
input[type="range"] {
|
| 268 |
+
flex-grow: 1;
|
| 269 |
+
height: 4px;
|
| 270 |
+
-webkit-appearance: none;
|
| 271 |
+
background: rgba(255, 255, 255, 0.1);
|
| 272 |
+
border-radius: 2px;
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
input[type="range"]::-webkit-slider-thumb {
|
| 276 |
+
-webkit-appearance: none;
|
| 277 |
+
width: 16px;
|
| 278 |
+
height: 16px;
|
| 279 |
+
border-radius: 50%;
|
| 280 |
+
background: var(--primary);
|
| 281 |
+
cursor: pointer;
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
.toggle-switch {
|
| 285 |
+
position: relative;
|
| 286 |
+
display: inline-block;
|
| 287 |
+
width: 40px;
|
| 288 |
+
height: 20px;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.toggle-switch input {
|
| 292 |
+
opacity: 0;
|
| 293 |
+
width: 0;
|
| 294 |
+
height: 0;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
.toggle-slider {
|
| 298 |
+
position: absolute;
|
| 299 |
+
cursor: pointer;
|
| 300 |
+
top: 0;
|
| 301 |
+
left: 0;
|
| 302 |
+
right: 0;
|
| 303 |
+
bottom: 0;
|
| 304 |
+
background: rgba(255, 255, 255, 0.1);
|
| 305 |
+
transition: .4s;
|
| 306 |
+
border-radius: 20px;
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
.toggle-slider:before {
|
| 310 |
+
position: absolute;
|
| 311 |
+
content: "";
|
| 312 |
+
height: 16px;
|
| 313 |
+
width: 16px;
|
| 314 |
+
left: 2px;
|
| 315 |
+
bottom: 2px;
|
| 316 |
+
background: var(--surface);
|
| 317 |
+
border: 1px solid var(--border);
|
| 318 |
+
transition: .4s;
|
| 319 |
+
border-radius: 50%;
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
input:checked + .toggle-slider {
|
| 323 |
+
background: var(--primary);
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
input:checked + .toggle-slider:before {
|
| 327 |
+
transform: translateX(20px);
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.checkbox-container {
|
| 331 |
+
display: flex;
|
| 332 |
+
align-items: center;
|
| 333 |
+
gap: 8px;
|
| 334 |
+
margin-bottom: 8px;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.checkbox-container label {
|
| 338 |
+
font-size: 14px;
|
| 339 |
+
cursor: pointer;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
#loading-overlay {
|
| 343 |
+
position: absolute;
|
| 344 |
+
top: 0;
|
| 345 |
+
left: 0;
|
| 346 |
+
width: 100%;
|
| 347 |
+
height: 100%;
|
| 348 |
+
background: var(--bg);
|
| 349 |
+
display: flex;
|
| 350 |
+
flex-direction: column;
|
| 351 |
+
align-items: center;
|
| 352 |
+
justify-content: center;
|
| 353 |
+
z-index: 100;
|
| 354 |
+
transition: opacity 0.5s;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
#loading-overlay.fade-out {
|
| 358 |
+
opacity: 0;
|
| 359 |
+
pointer-events: none;
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
.spinner {
|
| 363 |
+
width: 50px;
|
| 364 |
+
height: 50px;
|
| 365 |
+
border: 5px solid rgba(155, 89, 182, 0.2);
|
| 366 |
+
border-radius: 50%;
|
| 367 |
+
border-top-color: var(--primary);
|
| 368 |
+
animation: spin 1s ease-in-out infinite;
|
| 369 |
+
margin-bottom: 16px;
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
@keyframes spin {
|
| 373 |
+
to { transform: rotate(360deg); }
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
#loading-text {
|
| 377 |
+
margin-top: 16px;
|
| 378 |
+
font-size: 18px;
|
| 379 |
+
color: var(--text);
|
| 380 |
+
font-weight: 500;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
#frame-counter {
|
| 384 |
+
color: var(--text-secondary);
|
| 385 |
+
font-size: 14px;
|
| 386 |
+
font-weight: 500;
|
| 387 |
+
min-width: 120px;
|
| 388 |
+
text-align: center;
|
| 389 |
+
padding: 0 8px;
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
.control-btn {
|
| 393 |
+
background: rgba(255, 255, 255, 0.08);
|
| 394 |
+
border: 1px solid var(--border);
|
| 395 |
+
padding: 8px 12px;
|
| 396 |
+
border-radius: 6px;
|
| 397 |
+
cursor: pointer;
|
| 398 |
+
display: flex;
|
| 399 |
+
align-items: center;
|
| 400 |
+
justify-content: center;
|
| 401 |
+
transition: all 0.2s ease;
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
.control-btn:hover {
|
| 405 |
+
background: rgba(255, 255, 255, 0.15);
|
| 406 |
+
transform: translateY(-1px);
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
.control-btn.active {
|
| 410 |
+
background: var(--primary);
|
| 411 |
+
color: white;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
.control-btn.active:hover {
|
| 415 |
+
background: var(--primary);
|
| 416 |
+
box-shadow: 0 2px 8px rgba(155, 89, 182, 0.4);
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
#settings-toggle-btn {
|
| 420 |
+
position: relative;
|
| 421 |
+
border-radius: 6px;
|
| 422 |
+
z-index: 20;
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
#settings-toggle-btn.active {
|
| 426 |
+
background: var(--primary);
|
| 427 |
+
color: white;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
#status-bar,
|
| 431 |
+
#control-panel,
|
| 432 |
+
#settings-panel,
|
| 433 |
+
button,
|
| 434 |
+
input,
|
| 435 |
+
select,
|
| 436 |
+
.toggle-switch {
|
| 437 |
+
pointer-events: auto;
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
h2 {
|
| 441 |
+
font-size: 1.2rem;
|
| 442 |
+
font-weight: 600;
|
| 443 |
+
margin-top: 0;
|
| 444 |
+
margin-bottom: var(--space-md);
|
| 445 |
+
color: var(--primary);
|
| 446 |
+
cursor: move;
|
| 447 |
+
user-select: none;
|
| 448 |
+
display: flex;
|
| 449 |
+
align-items: center;
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
.drag-handle {
|
| 453 |
+
font-size: 14px;
|
| 454 |
+
margin-right: 8px;
|
| 455 |
+
opacity: 0.6;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
h2:hover .drag-handle {
|
| 459 |
+
opacity: 1;
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
.loading-subtitle {
|
| 463 |
+
font-size: 14px;
|
| 464 |
+
color: var(--text-secondary);
|
| 465 |
+
margin-top: 8px;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
#reset-view-btn {
|
| 469 |
+
background: var(--primary-light);
|
| 470 |
+
color: var(--primary);
|
| 471 |
+
border: 1px solid rgba(155, 89, 182, 0.2);
|
| 472 |
+
font-weight: 600;
|
| 473 |
+
transition: all 0.2s;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
#reset-view-btn:hover {
|
| 477 |
+
background: var(--primary);
|
| 478 |
+
color: white;
|
| 479 |
+
transform: translateY(-2px);
|
| 480 |
+
box-shadow: 0 4px 8px rgba(155, 89, 182, 0.3);
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
#settings-panel.visible {
|
| 484 |
+
display: block;
|
| 485 |
+
opacity: 1;
|
| 486 |
+
animation: slideIn 0.3s ease forwards;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
@keyframes slideIn {
|
| 490 |
+
from {
|
| 491 |
+
transform: translateY(20px);
|
| 492 |
+
opacity: 0;
|
| 493 |
+
}
|
| 494 |
+
to {
|
| 495 |
+
transform: translateY(0);
|
| 496 |
+
opacity: 1;
|
| 497 |
+
}
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
.dragging {
|
| 501 |
+
opacity: 0.9;
|
| 502 |
+
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.15) !important;
|
| 503 |
+
transition: none !important;
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
/* Tooltip for draggable element */
|
| 507 |
+
.tooltip-drag {
|
| 508 |
+
position: absolute;
|
| 509 |
+
left: 50%;
|
| 510 |
+
transform: translateX(-50%);
|
| 511 |
+
background: var(--primary);
|
| 512 |
+
color: white;
|
| 513 |
+
font-size: 12px;
|
| 514 |
+
padding: 4px 8px;
|
| 515 |
+
border-radius: 4px;
|
| 516 |
+
opacity: 0;
|
| 517 |
+
pointer-events: none;
|
| 518 |
+
transition: opacity 0.3s;
|
| 519 |
+
white-space: nowrap;
|
| 520 |
+
bottom: 100%;
|
| 521 |
+
margin-bottom: 8px;
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
h2:hover .tooltip-drag {
|
| 525 |
+
opacity: 1;
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
.btn-group {
|
| 529 |
+
display: flex;
|
| 530 |
+
margin-top: 16px;
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
#reset-view-btn, #reset-settings-btn {
|
| 534 |
+
background: var(--primary-light);
|
| 535 |
+
color: var(--primary);
|
| 536 |
+
border: 1px solid rgba(155, 89, 182, 0.2);
|
| 537 |
+
font-weight: 600;
|
| 538 |
+
transition: all 0.2s;
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
#reset-view-btn:hover, #reset-settings-btn:hover {
|
| 542 |
+
background: var(--primary);
|
| 543 |
+
color: white;
|
| 544 |
+
transform: translateY(-2px);
|
| 545 |
+
box-shadow: 0 4px 8px rgba(155, 89, 182, 0.3);
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
#show-settings-btn {
|
| 549 |
+
position: absolute;
|
| 550 |
+
top: 16px;
|
| 551 |
+
right: 16px;
|
| 552 |
+
z-index: 15;
|
| 553 |
+
display: none;
|
| 554 |
+
}
|
| 555 |
+
</style>
|
| 556 |
+
</head>
|
| 557 |
+
<body>
|
| 558 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 559 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 560 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
| 561 |
+
|
| 562 |
+
<div id="canvas-container"></div>
|
| 563 |
+
|
| 564 |
+
<div id="ui-container">
|
| 565 |
+
<div id="status-bar">Initializing...</div>
|
| 566 |
+
|
| 567 |
+
<div id="control-panel">
|
| 568 |
+
<button id="play-pause-btn" class="control-btn">
|
| 569 |
+
<svg class="icon" viewBox="0 0 24 24">
|
| 570 |
+
<path id="play-icon" d="M8 5v14l11-7z"/>
|
| 571 |
+
<path id="pause-icon" d="M6 19h4V5H6v14zm8-14v14h4V5h-4z" style="display: none;"/>
|
| 572 |
+
</svg>
|
| 573 |
+
<span class="tooltip">Play/Pause</span>
|
| 574 |
+
</button>
|
| 575 |
+
|
| 576 |
+
<div id="timeline">
|
| 577 |
+
<div id="progress"></div>
|
| 578 |
+
</div>
|
| 579 |
+
|
| 580 |
+
<div id="frame-counter">Frame 0 / 0</div>
|
| 581 |
+
|
| 582 |
+
<div id="playback-controls">
|
| 583 |
+
<button id="speed-btn" class="control-btn">1x</button>
|
| 584 |
+
</div>
|
| 585 |
+
</div>
|
| 586 |
+
|
| 587 |
+
<div id="settings-panel">
|
| 588 |
+
<h2>
|
| 589 |
+
<span class="drag-handle">☰</span>
|
| 590 |
+
Visualization Settings
|
| 591 |
+
<button id="hide-settings-btn" class="control-btn" style="margin-left: auto; padding: 4px;" title="Hide Panel">
|
| 592 |
+
<svg class="icon" viewBox="0 0 24 24" style="width: 18px; height: 18px;">
|
| 593 |
+
<path d="M14.59 7.41L18.17 11H4v2h14.17l-3.58 3.59L16 18l6-6-6-6-1.41 1.41z"/>
|
| 594 |
+
</svg>
|
| 595 |
+
</button>
|
| 596 |
+
</h2>
|
| 597 |
+
|
| 598 |
+
<div class="settings-group">
|
| 599 |
+
<h3>Point Cloud</h3>
|
| 600 |
+
<div class="slider-container">
|
| 601 |
+
<label for="point-size">Size</label>
|
| 602 |
+
<input type="range" id="point-size" min="0.005" max="0.1" step="0.005" value="0.03">
|
| 603 |
+
</div>
|
| 604 |
+
<div class="slider-container">
|
| 605 |
+
<label for="point-opacity">Opacity</label>
|
| 606 |
+
<input type="range" id="point-opacity" min="0.1" max="1" step="0.05" value="1">
|
| 607 |
+
</div>
|
| 608 |
+
<div class="slider-container">
|
| 609 |
+
<label for="max-depth">Max Depth</label>
|
| 610 |
+
<input type="range" id="max-depth" min="0.1" max="10" step="0.2" value="100">
|
| 611 |
+
</div>
|
| 612 |
+
</div>
|
| 613 |
+
|
| 614 |
+
<div class="settings-group">
|
| 615 |
+
<h3>Trajectory</h3>
|
| 616 |
+
<div class="checkbox-container">
|
| 617 |
+
<label class="toggle-switch">
|
| 618 |
+
<input type="checkbox" id="show-trajectory" checked>
|
| 619 |
+
<span class="toggle-slider"></span>
|
| 620 |
+
</label>
|
| 621 |
+
<label for="show-trajectory">Show Trajectory</label>
|
| 622 |
+
</div>
|
| 623 |
+
<div class="checkbox-container">
|
| 624 |
+
<label class="toggle-switch">
|
| 625 |
+
<input type="checkbox" id="enable-rich-trail">
|
| 626 |
+
<span class="toggle-slider"></span>
|
| 627 |
+
</label>
|
| 628 |
+
<label for="enable-rich-trail">Visual-Rich Trail</label>
|
| 629 |
+
</div>
|
| 630 |
+
<div class="slider-container">
|
| 631 |
+
<label for="trajectory-line-width">Line Width</label>
|
| 632 |
+
<input type="range" id="trajectory-line-width" min="0.5" max="5" step="0.5" value="1.5">
|
| 633 |
+
</div>
|
| 634 |
+
<div class="slider-container">
|
| 635 |
+
<label for="trajectory-ball-size">Ball Size</label>
|
| 636 |
+
<input type="range" id="trajectory-ball-size" min="0.005" max="0.05" step="0.001" value="0.02">
|
| 637 |
+
</div>
|
| 638 |
+
<div class="slider-container">
|
| 639 |
+
<label for="trajectory-history">History Frames</label>
|
| 640 |
+
<input type="range" id="trajectory-history" min="1" max="500" step="1" value="30">
|
| 641 |
+
</div>
|
| 642 |
+
<div class="slider-container" id="tail-opacity-container" style="display: none;">
|
| 643 |
+
<label for="trajectory-fade">Tail Opacity</label>
|
| 644 |
+
<input type="range" id="trajectory-fade" min="0" max="1" step="0.05" value="0.0">
|
| 645 |
+
</div>
|
| 646 |
+
</div>
|
| 647 |
+
|
| 648 |
+
<div class="settings-group">
|
| 649 |
+
<h3>Camera</h3>
|
| 650 |
+
<div class="checkbox-container">
|
| 651 |
+
<label class="toggle-switch">
|
| 652 |
+
<input type="checkbox" id="show-camera-frustum" checked>
|
| 653 |
+
<span class="toggle-slider"></span>
|
| 654 |
+
</label>
|
| 655 |
+
<label for="show-camera-frustum">Show Camera Frustum</label>
|
| 656 |
+
</div>
|
| 657 |
+
<div class="slider-container">
|
| 658 |
+
<label for="frustum-size">Size</label>
|
| 659 |
+
<input type="range" id="frustum-size" min="0.02" max="0.5" step="0.01" value="0.2">
|
| 660 |
+
</div>
|
| 661 |
+
</div>
|
| 662 |
+
|
| 663 |
+
<div class="settings-group">
|
| 664 |
+
<div class="btn-group">
|
| 665 |
+
<button id="reset-view-btn" style="flex: 1; margin-right: 5px;">Reset View</button>
|
| 666 |
+
<button id="reset-settings-btn" style="flex: 1; margin-left: 5px;">Reset Settings</button>
|
| 667 |
+
</div>
|
| 668 |
+
</div>
|
| 669 |
+
</div>
|
| 670 |
+
|
| 671 |
+
<button id="show-settings-btn" class="control-btn" title="Show Settings">
|
| 672 |
+
<svg class="icon" viewBox="0 0 24 24">
|
| 673 |
+
<path d="M19.14,12.94c0.04-0.3,0.06-0.61,0.06-0.94c0-0.32-0.02-0.64-0.07-0.94l2.03-1.58c0.18-0.14,0.23-0.41,0.12-0.61 l-1.92-3.32c-0.12-0.22-0.37-0.29-0.59-0.22l-2.39,0.96c-0.5-0.38-1.03-0.7-1.62-0.94L14.4,2.81c-0.04-0.24-0.24-0.41-0.48-0.41 h-3.84c-0.24,0-0.43,0.17-0.47,0.41L9.25,5.35C8.66,5.59,8.12,5.92,7.63,6.29L5.24,5.33c-0.22-0.08-0.47,0-0.59,0.22L2.74,8.87 C2.62,9.08,2.66,9.34,2.86,9.48l2.03,1.58C4.84,11.36,4.8,11.69,4.8,12s0.02,0.64,0.07,0.94l-2.03,1.58 c-0.18,0.14-0.23,0.41-0.12,0.61l1.92,3.32c0.12,0.22,0.37,0.29,0.59,0.22l2.39-0.96c0.5,0.38,1.03,0.7,1.62,0.94l0.36,2.54 c0.04,0.24,0.24,0.41,0.48,0.41h3.84c0.24,0,0.44-0.17,0.47-0.41l0.36-2.54c0.59-0.24,1.13-0.56,1.62-0.94l2.39,0.96 c0.22,0.08,0.47,0,0.59-0.22l1.92-3.32c0.12-0.22,0.07-0.47-0.12-0.61L19.14,12.94z M12,15.6c-1.98,0-3.6-1.62-3.6-3.6 s1.62-3.6,3.6-3.6s3.6,1.62,3.6,3.6S13.98,15.6,12,15.6z"/>
|
| 674 |
+
</svg>
|
| 675 |
+
</button>
|
| 676 |
+
</div>
|
| 677 |
+
|
| 678 |
+
<div id="loading-overlay">
|
| 679 |
+
<!-- <div class="spinner"></div> -->
|
| 680 |
+
<div id="loading-text"></div>
|
| 681 |
+
<div class="loading-subtitle" style="font-size: xx-large;">Interactive Viewer of 3D Tracking</div>
|
| 682 |
+
</div>
|
| 683 |
+
|
| 684 |
+
<!-- Libraries -->
|
| 685 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>
|
| 686 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/build/three.min.js"></script>
|
| 687 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/controls/OrbitControls.js"></script>
|
| 688 |
+
<script src="https://cdn.jsdelivr.net/npm/dat.gui@0.7.7/build/dat.gui.min.js"></script>
|
| 689 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/lines/LineSegmentsGeometry.js"></script>
|
| 690 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/lines/LineGeometry.js"></script>
|
| 691 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/lines/LineMaterial.js"></script>
|
| 692 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/lines/LineSegments2.js"></script>
|
| 693 |
+
<script src="https://cdn.jsdelivr.net/npm/three@0.132.2/examples/js/lines/Line2.js"></script>
|
| 694 |
+
|
| 695 |
+
<script>
|
| 696 |
+
class PointCloudVisualizer {
|
| 697 |
+
constructor() {
|
| 698 |
+
this.data = null;
|
| 699 |
+
this.config = {};
|
| 700 |
+
this.currentFrame = 0;
|
| 701 |
+
this.isPlaying = false;
|
| 702 |
+
this.playbackSpeed = 1;
|
| 703 |
+
this.lastFrameTime = 0;
|
| 704 |
+
this.defaultSettings = null;
|
| 705 |
+
|
| 706 |
+
this.ui = {
|
| 707 |
+
statusBar: document.getElementById('status-bar'),
|
| 708 |
+
playPauseBtn: document.getElementById('play-pause-btn'),
|
| 709 |
+
speedBtn: document.getElementById('speed-btn'),
|
| 710 |
+
timeline: document.getElementById('timeline'),
|
| 711 |
+
progress: document.getElementById('progress'),
|
| 712 |
+
settingsPanel: document.getElementById('settings-panel'),
|
| 713 |
+
loadingOverlay: document.getElementById('loading-overlay'),
|
| 714 |
+
loadingText: document.getElementById('loading-text'),
|
| 715 |
+
settingsToggleBtn: document.getElementById('settings-toggle-btn'),
|
| 716 |
+
frameCounter: document.getElementById('frame-counter'),
|
| 717 |
+
pointSize: document.getElementById('point-size'),
|
| 718 |
+
pointOpacity: document.getElementById('point-opacity'),
|
| 719 |
+
maxDepth: document.getElementById('max-depth'),
|
| 720 |
+
showTrajectory: document.getElementById('show-trajectory'),
|
| 721 |
+
enableRichTrail: document.getElementById('enable-rich-trail'),
|
| 722 |
+
trajectoryLineWidth: document.getElementById('trajectory-line-width'),
|
| 723 |
+
trajectoryBallSize: document.getElementById('trajectory-ball-size'),
|
| 724 |
+
trajectoryHistory: document.getElementById('trajectory-history'),
|
| 725 |
+
trajectoryFade: document.getElementById('trajectory-fade'),
|
| 726 |
+
tailOpacityContainer: document.getElementById('tail-opacity-container'),
|
| 727 |
+
resetViewBtn: document.getElementById('reset-view-btn'),
|
| 728 |
+
showCameraFrustum: document.getElementById('show-camera-frustum'),
|
| 729 |
+
frustumSize: document.getElementById('frustum-size'),
|
| 730 |
+
hideSettingsBtn: document.getElementById('hide-settings-btn'),
|
| 731 |
+
showSettingsBtn: document.getElementById('show-settings-btn')
|
| 732 |
+
};
|
| 733 |
+
|
| 734 |
+
this.scene = null;
|
| 735 |
+
this.camera = null;
|
| 736 |
+
this.renderer = null;
|
| 737 |
+
this.controls = null;
|
| 738 |
+
this.pointCloud = null;
|
| 739 |
+
this.trajectories = [];
|
| 740 |
+
this.cameraFrustum = null;
|
| 741 |
+
|
| 742 |
+
this.initThreeJS();
|
| 743 |
+
this.loadDefaultSettings().then(() => {
|
| 744 |
+
this.initEventListeners();
|
| 745 |
+
this.loadData();
|
| 746 |
+
});
|
| 747 |
+
}
|
| 748 |
+
|
| 749 |
+
async loadDefaultSettings() {
|
| 750 |
+
try {
|
| 751 |
+
const urlParams = new URLSearchParams(window.location.search);
|
| 752 |
+
const dataPath = urlParams.get('data') || '';
|
| 753 |
+
|
| 754 |
+
const defaultSettings = {
|
| 755 |
+
pointSize: 0.03,
|
| 756 |
+
pointOpacity: 1.0,
|
| 757 |
+
showTrajectory: true,
|
| 758 |
+
trajectoryLineWidth: 2.5,
|
| 759 |
+
trajectoryBallSize: 0.015,
|
| 760 |
+
trajectoryHistory: 0,
|
| 761 |
+
showCameraFrustum: true,
|
| 762 |
+
frustumSize: 0.2
|
| 763 |
+
};
|
| 764 |
+
|
| 765 |
+
if (!dataPath) {
|
| 766 |
+
this.defaultSettings = defaultSettings;
|
| 767 |
+
this.applyDefaultSettings();
|
| 768 |
+
return;
|
| 769 |
+
}
|
| 770 |
+
|
| 771 |
+
// Try to extract dataset and videoId from the data path
|
| 772 |
+
// Expected format: demos/datasetname/videoid.bin
|
| 773 |
+
const pathParts = dataPath.split('/');
|
| 774 |
+
if (pathParts.length < 3) {
|
| 775 |
+
this.defaultSettings = defaultSettings;
|
| 776 |
+
this.applyDefaultSettings();
|
| 777 |
+
return;
|
| 778 |
+
}
|
| 779 |
+
|
| 780 |
+
const datasetName = pathParts[pathParts.length - 2];
|
| 781 |
+
let videoId = pathParts[pathParts.length - 1].replace('.bin', '');
|
| 782 |
+
|
| 783 |
+
// Load settings from data.json
|
| 784 |
+
const response = await fetch('./data.json');
|
| 785 |
+
if (!response.ok) {
|
| 786 |
+
this.defaultSettings = defaultSettings;
|
| 787 |
+
this.applyDefaultSettings();
|
| 788 |
+
return;
|
| 789 |
+
}
|
| 790 |
+
|
| 791 |
+
const settingsData = await response.json();
|
| 792 |
+
|
| 793 |
+
// Check if this dataset and video exist
|
| 794 |
+
if (settingsData[datasetName] && settingsData[datasetName][videoId]) {
|
| 795 |
+
this.defaultSettings = settingsData[datasetName][videoId];
|
| 796 |
+
} else {
|
| 797 |
+
this.defaultSettings = defaultSettings;
|
| 798 |
+
}
|
| 799 |
+
|
| 800 |
+
this.applyDefaultSettings();
|
| 801 |
+
} catch (error) {
|
| 802 |
+
console.error("Error loading default settings:", error);
|
| 803 |
+
|
| 804 |
+
this.defaultSettings = {
|
| 805 |
+
pointSize: 0.03,
|
| 806 |
+
pointOpacity: 1.0,
|
| 807 |
+
showTrajectory: true,
|
| 808 |
+
trajectoryLineWidth: 2.5,
|
| 809 |
+
trajectoryBallSize: 0.015,
|
| 810 |
+
trajectoryHistory: 0,
|
| 811 |
+
showCameraFrustum: true,
|
| 812 |
+
frustumSize: 0.2
|
| 813 |
+
};
|
| 814 |
+
|
| 815 |
+
this.applyDefaultSettings();
|
| 816 |
+
}
|
| 817 |
+
}
|
| 818 |
+
|
| 819 |
+
applyDefaultSettings() {
|
| 820 |
+
if (!this.defaultSettings) return;
|
| 821 |
+
|
| 822 |
+
if (this.ui.pointSize) {
|
| 823 |
+
this.ui.pointSize.value = this.defaultSettings.pointSize;
|
| 824 |
+
}
|
| 825 |
+
|
| 826 |
+
if (this.ui.pointOpacity) {
|
| 827 |
+
this.ui.pointOpacity.value = this.defaultSettings.pointOpacity;
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
if (this.ui.maxDepth) {
|
| 831 |
+
this.ui.maxDepth.value = this.defaultSettings.maxDepth || 100.0;
|
| 832 |
+
}
|
| 833 |
+
|
| 834 |
+
if (this.ui.showTrajectory) {
|
| 835 |
+
this.ui.showTrajectory.checked = this.defaultSettings.showTrajectory;
|
| 836 |
+
}
|
| 837 |
+
|
| 838 |
+
if (this.ui.trajectoryLineWidth) {
|
| 839 |
+
this.ui.trajectoryLineWidth.value = this.defaultSettings.trajectoryLineWidth;
|
| 840 |
+
}
|
| 841 |
+
|
| 842 |
+
if (this.ui.trajectoryBallSize) {
|
| 843 |
+
this.ui.trajectoryBallSize.value = this.defaultSettings.trajectoryBallSize;
|
| 844 |
+
}
|
| 845 |
+
|
| 846 |
+
if (this.ui.trajectoryHistory) {
|
| 847 |
+
this.ui.trajectoryHistory.value = this.defaultSettings.trajectoryHistory;
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
if (this.ui.showCameraFrustum) {
|
| 851 |
+
this.ui.showCameraFrustum.checked = this.defaultSettings.showCameraFrustum;
|
| 852 |
+
}
|
| 853 |
+
|
| 854 |
+
if (this.ui.frustumSize) {
|
| 855 |
+
this.ui.frustumSize.value = this.defaultSettings.frustumSize;
|
| 856 |
+
}
|
| 857 |
+
}
|
| 858 |
+
|
| 859 |
+
initThreeJS() {
|
| 860 |
+
this.scene = new THREE.Scene();
|
| 861 |
+
this.scene.background = new THREE.Color(0x1a1a1a);
|
| 862 |
+
|
| 863 |
+
this.camera = new THREE.PerspectiveCamera(60, window.innerWidth / window.innerHeight, 0.1, 10000);
|
| 864 |
+
this.camera.position.set(0, 0, 0);
|
| 865 |
+
|
| 866 |
+
this.renderer = new THREE.WebGLRenderer({ antialias: true });
|
| 867 |
+
this.renderer.setPixelRatio(window.devicePixelRatio);
|
| 868 |
+
this.renderer.setSize(window.innerWidth, window.innerHeight);
|
| 869 |
+
document.getElementById('canvas-container').appendChild(this.renderer.domElement);
|
| 870 |
+
|
| 871 |
+
this.controls = new THREE.OrbitControls(this.camera, this.renderer.domElement);
|
| 872 |
+
this.controls.enableDamping = true;
|
| 873 |
+
this.controls.dampingFactor = 0.05;
|
| 874 |
+
this.controls.target.set(0, 0, 0);
|
| 875 |
+
this.controls.minDistance = 0.1;
|
| 876 |
+
this.controls.maxDistance = 1000;
|
| 877 |
+
this.controls.update();
|
| 878 |
+
|
| 879 |
+
const ambientLight = new THREE.AmbientLight(0xffffff, 0.5);
|
| 880 |
+
this.scene.add(ambientLight);
|
| 881 |
+
|
| 882 |
+
const directionalLight = new THREE.DirectionalLight(0xffffff, 0.8);
|
| 883 |
+
directionalLight.position.set(1, 1, 1);
|
| 884 |
+
this.scene.add(directionalLight);
|
| 885 |
+
}
|
| 886 |
+
|
| 887 |
+
initEventListeners() {
|
| 888 |
+
window.addEventListener('resize', () => this.onWindowResize());
|
| 889 |
+
|
| 890 |
+
this.ui.playPauseBtn.addEventListener('click', () => this.togglePlayback());
|
| 891 |
+
|
| 892 |
+
this.ui.timeline.addEventListener('click', (e) => {
|
| 893 |
+
const rect = this.ui.timeline.getBoundingClientRect();
|
| 894 |
+
const pos = (e.clientX - rect.left) / rect.width;
|
| 895 |
+
this.seekTo(pos);
|
| 896 |
+
});
|
| 897 |
+
|
| 898 |
+
this.ui.speedBtn.addEventListener('click', () => this.cyclePlaybackSpeed());
|
| 899 |
+
|
| 900 |
+
this.ui.pointSize.addEventListener('input', () => this.updatePointCloudSettings());
|
| 901 |
+
this.ui.pointOpacity.addEventListener('input', () => this.updatePointCloudSettings());
|
| 902 |
+
this.ui.maxDepth.addEventListener('input', () => this.updatePointCloudSettings());
|
| 903 |
+
this.ui.showTrajectory.addEventListener('change', () => {
|
| 904 |
+
this.trajectories.forEach(trajectory => {
|
| 905 |
+
trajectory.visible = this.ui.showTrajectory.checked;
|
| 906 |
+
});
|
| 907 |
+
});
|
| 908 |
+
|
| 909 |
+
this.ui.enableRichTrail.addEventListener('change', () => {
|
| 910 |
+
this.ui.tailOpacityContainer.style.display = this.ui.enableRichTrail.checked ? 'flex' : 'none';
|
| 911 |
+
this.updateTrajectories(this.currentFrame);
|
| 912 |
+
});
|
| 913 |
+
|
| 914 |
+
this.ui.trajectoryLineWidth.addEventListener('input', () => this.updateTrajectorySettings());
|
| 915 |
+
this.ui.trajectoryBallSize.addEventListener('input', () => this.updateTrajectorySettings());
|
| 916 |
+
this.ui.trajectoryHistory.addEventListener('input', () => {
|
| 917 |
+
this.updateTrajectories(this.currentFrame);
|
| 918 |
+
});
|
| 919 |
+
this.ui.trajectoryFade.addEventListener('input', () => {
|
| 920 |
+
this.updateTrajectories(this.currentFrame);
|
| 921 |
+
});
|
| 922 |
+
|
| 923 |
+
this.ui.resetViewBtn.addEventListener('click', () => this.resetView());
|
| 924 |
+
|
| 925 |
+
const resetSettingsBtn = document.getElementById('reset-settings-btn');
|
| 926 |
+
if (resetSettingsBtn) {
|
| 927 |
+
resetSettingsBtn.addEventListener('click', () => this.resetSettings());
|
| 928 |
+
}
|
| 929 |
+
|
| 930 |
+
document.addEventListener('keydown', (e) => {
|
| 931 |
+
if (e.key === 'Escape' && this.ui.settingsPanel.classList.contains('visible')) {
|
| 932 |
+
this.ui.settingsPanel.classList.remove('visible');
|
| 933 |
+
this.ui.settingsToggleBtn.classList.remove('active');
|
| 934 |
+
}
|
| 935 |
+
});
|
| 936 |
+
|
| 937 |
+
if (this.ui.settingsToggleBtn) {
|
| 938 |
+
this.ui.settingsToggleBtn.addEventListener('click', () => {
|
| 939 |
+
const isVisible = this.ui.settingsPanel.classList.toggle('visible');
|
| 940 |
+
this.ui.settingsToggleBtn.classList.toggle('active', isVisible);
|
| 941 |
+
|
| 942 |
+
if (isVisible) {
|
| 943 |
+
const panelRect = this.ui.settingsPanel.getBoundingClientRect();
|
| 944 |
+
const viewportHeight = window.innerHeight;
|
| 945 |
+
|
| 946 |
+
if (panelRect.bottom > viewportHeight) {
|
| 947 |
+
this.ui.settingsPanel.style.bottom = 'auto';
|
| 948 |
+
this.ui.settingsPanel.style.top = '80px';
|
| 949 |
+
}
|
| 950 |
+
}
|
| 951 |
+
});
|
| 952 |
+
}
|
| 953 |
+
|
| 954 |
+
if (this.ui.frustumSize) {
|
| 955 |
+
this.ui.frustumSize.addEventListener('input', () => this.updateFrustumDimensions());
|
| 956 |
+
}
|
| 957 |
+
|
| 958 |
+
this.makeElementDraggable(this.ui.settingsPanel);
|
| 959 |
+
|
| 960 |
+
if (this.ui.hideSettingsBtn && this.ui.showSettingsBtn && this.ui.settingsPanel) {
|
| 961 |
+
this.ui.hideSettingsBtn.addEventListener('click', () => {
|
| 962 |
+
this.ui.settingsPanel.classList.add('is-hidden');
|
| 963 |
+
this.ui.showSettingsBtn.style.display = 'flex';
|
| 964 |
+
});
|
| 965 |
+
|
| 966 |
+
this.ui.showSettingsBtn.addEventListener('click', () => {
|
| 967 |
+
this.ui.settingsPanel.classList.remove('is-hidden');
|
| 968 |
+
this.ui.showSettingsBtn.style.display = 'none';
|
| 969 |
+
});
|
| 970 |
+
}
|
| 971 |
+
}
|
| 972 |
+
|
| 973 |
+
makeElementDraggable(element) {
|
| 974 |
+
let pos1 = 0, pos2 = 0, pos3 = 0, pos4 = 0;
|
| 975 |
+
|
| 976 |
+
const dragHandle = element.querySelector('h2');
|
| 977 |
+
|
| 978 |
+
if (dragHandle) {
|
| 979 |
+
dragHandle.onmousedown = dragMouseDown;
|
| 980 |
+
dragHandle.title = "Drag to move panel";
|
| 981 |
+
} else {
|
| 982 |
+
element.onmousedown = dragMouseDown;
|
| 983 |
+
}
|
| 984 |
+
|
| 985 |
+
function dragMouseDown(e) {
|
| 986 |
+
e = e || window.event;
|
| 987 |
+
e.preventDefault();
|
| 988 |
+
pos3 = e.clientX;
|
| 989 |
+
pos4 = e.clientY;
|
| 990 |
+
document.onmouseup = closeDragElement;
|
| 991 |
+
document.onmousemove = elementDrag;
|
| 992 |
+
|
| 993 |
+
element.classList.add('dragging');
|
| 994 |
+
}
|
| 995 |
+
|
| 996 |
+
function elementDrag(e) {
|
| 997 |
+
e = e || window.event;
|
| 998 |
+
e.preventDefault();
|
| 999 |
+
pos1 = pos3 - e.clientX;
|
| 1000 |
+
pos2 = pos4 - e.clientY;
|
| 1001 |
+
pos3 = e.clientX;
|
| 1002 |
+
pos4 = e.clientY;
|
| 1003 |
+
|
| 1004 |
+
const newTop = element.offsetTop - pos2;
|
| 1005 |
+
const newLeft = element.offsetLeft - pos1;
|
| 1006 |
+
|
| 1007 |
+
const viewportWidth = window.innerWidth;
|
| 1008 |
+
const viewportHeight = window.innerHeight;
|
| 1009 |
+
|
| 1010 |
+
const panelRect = element.getBoundingClientRect();
|
| 1011 |
+
|
| 1012 |
+
const maxTop = viewportHeight - 50;
|
| 1013 |
+
const maxLeft = viewportWidth - 50;
|
| 1014 |
+
|
| 1015 |
+
element.style.top = Math.min(Math.max(newTop, 0), maxTop) + "px";
|
| 1016 |
+
element.style.left = Math.min(Math.max(newLeft, 0), maxLeft) + "px";
|
| 1017 |
+
|
| 1018 |
+
// Remove bottom/right settings when dragging
|
| 1019 |
+
element.style.bottom = 'auto';
|
| 1020 |
+
element.style.right = 'auto';
|
| 1021 |
+
}
|
| 1022 |
+
|
| 1023 |
+
function closeDragElement() {
|
| 1024 |
+
document.onmouseup = null;
|
| 1025 |
+
document.onmousemove = null;
|
| 1026 |
+
|
| 1027 |
+
element.classList.remove('dragging');
|
| 1028 |
+
}
|
| 1029 |
+
}
|
| 1030 |
+
|
| 1031 |
+
async loadData() {
|
| 1032 |
+
try {
|
| 1033 |
+
// this.ui.loadingText.textContent = "Loading binary data...";
|
| 1034 |
+
|
| 1035 |
+
let arrayBuffer;
|
| 1036 |
+
|
| 1037 |
+
if (window.embeddedBase64) {
|
| 1038 |
+
// Base64 embedded path
|
| 1039 |
+
const binaryString = atob(window.embeddedBase64);
|
| 1040 |
+
const len = binaryString.length;
|
| 1041 |
+
const bytes = new Uint8Array(len);
|
| 1042 |
+
for (let i = 0; i < len; i++) {
|
| 1043 |
+
bytes[i] = binaryString.charCodeAt(i);
|
| 1044 |
+
}
|
| 1045 |
+
arrayBuffer = bytes.buffer;
|
| 1046 |
+
} else {
|
| 1047 |
+
// Default fetch path (fallback)
|
| 1048 |
+
const urlParams = new URLSearchParams(window.location.search);
|
| 1049 |
+
const dataPath = urlParams.get('data') || 'data.bin';
|
| 1050 |
+
|
| 1051 |
+
const response = await fetch(dataPath);
|
| 1052 |
+
if (!response.ok) throw new Error(`Failed to load ${dataPath}`);
|
| 1053 |
+
arrayBuffer = await response.arrayBuffer();
|
| 1054 |
+
}
|
| 1055 |
+
|
| 1056 |
+
const dataView = new DataView(arrayBuffer);
|
| 1057 |
+
const headerLen = dataView.getUint32(0, true);
|
| 1058 |
+
|
| 1059 |
+
const headerText = new TextDecoder("utf-8").decode(arrayBuffer.slice(4, 4 + headerLen));
|
| 1060 |
+
const header = JSON.parse(headerText);
|
| 1061 |
+
|
| 1062 |
+
const compressedBlob = new Uint8Array(arrayBuffer, 4 + headerLen);
|
| 1063 |
+
const decompressed = pako.inflate(compressedBlob).buffer;
|
| 1064 |
+
|
| 1065 |
+
const arrays = {};
|
| 1066 |
+
for (const key in header) {
|
| 1067 |
+
if (key === "meta") continue;
|
| 1068 |
+
|
| 1069 |
+
const meta = header[key];
|
| 1070 |
+
const { dtype, shape, offset, length } = meta;
|
| 1071 |
+
const slice = decompressed.slice(offset, offset + length);
|
| 1072 |
+
|
| 1073 |
+
let typedArray;
|
| 1074 |
+
switch (dtype) {
|
| 1075 |
+
case "uint8": typedArray = new Uint8Array(slice); break;
|
| 1076 |
+
case "uint16": typedArray = new Uint16Array(slice); break;
|
| 1077 |
+
case "float32": typedArray = new Float32Array(slice); break;
|
| 1078 |
+
case "float64": typedArray = new Float64Array(slice); break;
|
| 1079 |
+
default: throw new Error(`Unknown dtype: ${dtype}`);
|
| 1080 |
+
}
|
| 1081 |
+
|
| 1082 |
+
arrays[key] = { data: typedArray, shape: shape };
|
| 1083 |
+
}
|
| 1084 |
+
|
| 1085 |
+
this.data = arrays;
|
| 1086 |
+
this.config = header.meta;
|
| 1087 |
+
|
| 1088 |
+
this.initCameraWithCorrectFOV();
|
| 1089 |
+
this.ui.loadingText.textContent = "Creating point cloud...";
|
| 1090 |
+
|
| 1091 |
+
this.initPointCloud();
|
| 1092 |
+
this.initTrajectories();
|
| 1093 |
+
|
| 1094 |
+
setTimeout(() => {
|
| 1095 |
+
this.ui.loadingOverlay.classList.add('fade-out');
|
| 1096 |
+
this.ui.statusBar.classList.add('hidden');
|
| 1097 |
+
this.startAnimation();
|
| 1098 |
+
}, 500);
|
| 1099 |
+
} catch (error) {
|
| 1100 |
+
console.error("Error loading data:", error);
|
| 1101 |
+
this.ui.statusBar.textContent = `Error: ${error.message}`;
|
| 1102 |
+
// this.ui.loadingText.textContent = `Error loading data: ${error.message}`;
|
| 1103 |
+
}
|
| 1104 |
+
}
|
| 1105 |
+
|
| 1106 |
+
initPointCloud() {
|
| 1107 |
+
const numPoints = this.config.resolution[0] * this.config.resolution[1];
|
| 1108 |
+
const positions = new Float32Array(numPoints * 3);
|
| 1109 |
+
const colors = new Float32Array(numPoints * 3);
|
| 1110 |
+
|
| 1111 |
+
const geometry = new THREE.BufferGeometry();
|
| 1112 |
+
geometry.setAttribute('position', new THREE.BufferAttribute(positions, 3).setUsage(THREE.DynamicDrawUsage));
|
| 1113 |
+
geometry.setAttribute('color', new THREE.BufferAttribute(colors, 3).setUsage(THREE.DynamicDrawUsage));
|
| 1114 |
+
|
| 1115 |
+
const pointSize = parseFloat(this.ui.pointSize.value) || this.defaultSettings.pointSize;
|
| 1116 |
+
const pointOpacity = parseFloat(this.ui.pointOpacity.value) || this.defaultSettings.pointOpacity;
|
| 1117 |
+
|
| 1118 |
+
const material = new THREE.PointsMaterial({
|
| 1119 |
+
size: pointSize,
|
| 1120 |
+
vertexColors: true,
|
| 1121 |
+
transparent: true,
|
| 1122 |
+
opacity: pointOpacity,
|
| 1123 |
+
sizeAttenuation: true
|
| 1124 |
+
});
|
| 1125 |
+
|
| 1126 |
+
this.pointCloud = new THREE.Points(geometry, material);
|
| 1127 |
+
this.scene.add(this.pointCloud);
|
| 1128 |
+
}
|
| 1129 |
+
|
| 1130 |
+
initTrajectories() {
|
| 1131 |
+
if (!this.data.trajectories) return;
|
| 1132 |
+
|
| 1133 |
+
this.trajectories.forEach(trajectory => {
|
| 1134 |
+
if (trajectory.userData.lineSegments) {
|
| 1135 |
+
trajectory.userData.lineSegments.forEach(segment => {
|
| 1136 |
+
segment.geometry.dispose();
|
| 1137 |
+
segment.material.dispose();
|
| 1138 |
+
});
|
| 1139 |
+
}
|
| 1140 |
+
this.scene.remove(trajectory);
|
| 1141 |
+
});
|
| 1142 |
+
this.trajectories = [];
|
| 1143 |
+
|
| 1144 |
+
const shape = this.data.trajectories.shape;
|
| 1145 |
+
if (!shape || shape.length < 2) return;
|
| 1146 |
+
|
| 1147 |
+
const [totalFrames, numTrajectories] = shape;
|
| 1148 |
+
const palette = this.createColorPalette(numTrajectories);
|
| 1149 |
+
const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
|
| 1150 |
+
const maxHistory = 500; // Max value of the history slider, for the object pool
|
| 1151 |
+
|
| 1152 |
+
for (let i = 0; i < numTrajectories; i++) {
|
| 1153 |
+
const trajectoryGroup = new THREE.Group();
|
| 1154 |
+
|
| 1155 |
+
const ballSize = parseFloat(this.ui.trajectoryBallSize.value);
|
| 1156 |
+
const sphereGeometry = new THREE.SphereGeometry(ballSize, 16, 16);
|
| 1157 |
+
const sphereMaterial = new THREE.MeshBasicMaterial({ color: palette[i], transparent: true });
|
| 1158 |
+
const positionMarker = new THREE.Mesh(sphereGeometry, sphereMaterial);
|
| 1159 |
+
trajectoryGroup.add(positionMarker);
|
| 1160 |
+
|
| 1161 |
+
// High-Performance Line (default)
|
| 1162 |
+
const simpleLineGeometry = new THREE.BufferGeometry();
|
| 1163 |
+
const simpleLinePositions = new Float32Array(maxHistory * 3);
|
| 1164 |
+
simpleLineGeometry.setAttribute('position', new THREE.BufferAttribute(simpleLinePositions, 3).setUsage(THREE.DynamicDrawUsage));
|
| 1165 |
+
const simpleLine = new THREE.Line(simpleLineGeometry, new THREE.LineBasicMaterial({ color: palette[i] }));
|
| 1166 |
+
simpleLine.frustumCulled = false;
|
| 1167 |
+
trajectoryGroup.add(simpleLine);
|
| 1168 |
+
|
| 1169 |
+
// High-Quality Line Segments (for rich trail)
|
| 1170 |
+
const lineSegments = [];
|
| 1171 |
+
const lineWidth = parseFloat(this.ui.trajectoryLineWidth.value);
|
| 1172 |
+
|
| 1173 |
+
// Create a pool of line segment objects
|
| 1174 |
+
for (let j = 0; j < maxHistory - 1; j++) {
|
| 1175 |
+
const lineGeometry = new THREE.LineGeometry();
|
| 1176 |
+
lineGeometry.setPositions([0, 0, 0, 0, 0, 0]);
|
| 1177 |
+
const lineMaterial = new THREE.LineMaterial({
|
| 1178 |
+
color: palette[i],
|
| 1179 |
+
linewidth: lineWidth,
|
| 1180 |
+
resolution: resolution,
|
| 1181 |
+
transparent: true,
|
| 1182 |
+
depthWrite: false, // Correctly handle transparency
|
| 1183 |
+
opacity: 0
|
| 1184 |
+
});
|
| 1185 |
+
const segment = new THREE.Line2(lineGeometry, lineMaterial);
|
| 1186 |
+
segment.frustumCulled = false;
|
| 1187 |
+
segment.visible = false; // Start with all segments hidden
|
| 1188 |
+
trajectoryGroup.add(segment);
|
| 1189 |
+
lineSegments.push(segment);
|
| 1190 |
+
}
|
| 1191 |
+
|
| 1192 |
+
trajectoryGroup.userData = {
|
| 1193 |
+
marker: positionMarker,
|
| 1194 |
+
simpleLine: simpleLine,
|
| 1195 |
+
lineSegments: lineSegments,
|
| 1196 |
+
color: palette[i]
|
| 1197 |
+
};
|
| 1198 |
+
|
| 1199 |
+
this.scene.add(trajectoryGroup);
|
| 1200 |
+
this.trajectories.push(trajectoryGroup);
|
| 1201 |
+
}
|
| 1202 |
+
|
| 1203 |
+
const showTrajectory = this.ui.showTrajectory.checked;
|
| 1204 |
+
this.trajectories.forEach(trajectory => trajectory.visible = showTrajectory);
|
| 1205 |
+
}
|
| 1206 |
+
|
| 1207 |
+
createColorPalette(count) {
|
| 1208 |
+
const colors = [];
|
| 1209 |
+
const hueStep = 360 / count;
|
| 1210 |
+
|
| 1211 |
+
for (let i = 0; i < count; i++) {
|
| 1212 |
+
const hue = (i * hueStep) % 360;
|
| 1213 |
+
const color = new THREE.Color().setHSL(hue / 360, 0.8, 0.6);
|
| 1214 |
+
colors.push(color);
|
| 1215 |
+
}
|
| 1216 |
+
|
| 1217 |
+
return colors;
|
| 1218 |
+
}
|
| 1219 |
+
|
| 1220 |
+
updatePointCloud(frameIndex) {
|
| 1221 |
+
if (!this.data || !this.pointCloud) return;
|
| 1222 |
+
|
| 1223 |
+
const positions = this.pointCloud.geometry.attributes.position.array;
|
| 1224 |
+
const colors = this.pointCloud.geometry.attributes.color.array;
|
| 1225 |
+
|
| 1226 |
+
const rgbVideo = this.data.rgb_video;
|
| 1227 |
+
const depthsRgb = this.data.depths_rgb;
|
| 1228 |
+
const intrinsics = this.data.intrinsics;
|
| 1229 |
+
const invExtrinsics = this.data.inv_extrinsics;
|
| 1230 |
+
|
| 1231 |
+
const width = this.config.resolution[0];
|
| 1232 |
+
const height = this.config.resolution[1];
|
| 1233 |
+
const numPoints = width * height;
|
| 1234 |
+
|
| 1235 |
+
const K = this.get3x3Matrix(intrinsics.data, intrinsics.shape, frameIndex);
|
| 1236 |
+
const fx = K[0][0], fy = K[1][1], cx = K[0][2], cy = K[1][2];
|
| 1237 |
+
|
| 1238 |
+
const invExtrMat = this.get4x4Matrix(invExtrinsics.data, invExtrinsics.shape, frameIndex);
|
| 1239 |
+
const transform = this.getTransformElements(invExtrMat);
|
| 1240 |
+
|
| 1241 |
+
const rgbFrame = this.getFrame(rgbVideo.data, rgbVideo.shape, frameIndex);
|
| 1242 |
+
const depthFrame = this.getFrame(depthsRgb.data, depthsRgb.shape, frameIndex);
|
| 1243 |
+
|
| 1244 |
+
const maxDepth = parseFloat(this.ui.maxDepth.value) || 10.0;
|
| 1245 |
+
|
| 1246 |
+
let validPointCount = 0;
|
| 1247 |
+
|
| 1248 |
+
for (let i = 0; i < numPoints; i++) {
|
| 1249 |
+
const xPix = i % width;
|
| 1250 |
+
const yPix = Math.floor(i / width);
|
| 1251 |
+
|
| 1252 |
+
const d0 = depthFrame[i * 3];
|
| 1253 |
+
const d1 = depthFrame[i * 3 + 1];
|
| 1254 |
+
const depthEncoded = d0 | (d1 << 8);
|
| 1255 |
+
const depthValue = (depthEncoded / ((1 << 16) - 1)) *
|
| 1256 |
+
(this.config.depthRange[1] - this.config.depthRange[0]) +
|
| 1257 |
+
this.config.depthRange[0];
|
| 1258 |
+
|
| 1259 |
+
if (depthValue === 0 || depthValue > maxDepth) {
|
| 1260 |
+
continue;
|
| 1261 |
+
}
|
| 1262 |
+
|
| 1263 |
+
const X = ((xPix - cx) * depthValue) / fx;
|
| 1264 |
+
const Y = ((yPix - cy) * depthValue) / fy;
|
| 1265 |
+
const Z = depthValue;
|
| 1266 |
+
|
| 1267 |
+
const tx = transform.m11 * X + transform.m12 * Y + transform.m13 * Z + transform.m14;
|
| 1268 |
+
const ty = transform.m21 * X + transform.m22 * Y + transform.m23 * Z + transform.m24;
|
| 1269 |
+
const tz = transform.m31 * X + transform.m32 * Y + transform.m33 * Z + transform.m34;
|
| 1270 |
+
|
| 1271 |
+
const index = validPointCount * 3;
|
| 1272 |
+
positions[index] = tx;
|
| 1273 |
+
positions[index + 1] = -ty;
|
| 1274 |
+
positions[index + 2] = -tz;
|
| 1275 |
+
|
| 1276 |
+
colors[index] = rgbFrame[i * 3] / 255;
|
| 1277 |
+
colors[index + 1] = rgbFrame[i * 3 + 1] / 255;
|
| 1278 |
+
colors[index + 2] = rgbFrame[i * 3 + 2] / 255;
|
| 1279 |
+
|
| 1280 |
+
validPointCount++;
|
| 1281 |
+
}
|
| 1282 |
+
|
| 1283 |
+
this.pointCloud.geometry.setDrawRange(0, validPointCount);
|
| 1284 |
+
this.pointCloud.geometry.attributes.position.needsUpdate = true;
|
| 1285 |
+
this.pointCloud.geometry.attributes.color.needsUpdate = true;
|
| 1286 |
+
this.pointCloud.geometry.computeBoundingSphere(); // Important for camera culling
|
| 1287 |
+
|
| 1288 |
+
this.updateTrajectories(frameIndex);
|
| 1289 |
+
|
| 1290 |
+
const progress = (frameIndex + 1) / this.config.totalFrames;
|
| 1291 |
+
this.ui.progress.style.width = `${progress * 100}%`;
|
| 1292 |
+
|
| 1293 |
+
if (this.ui.frameCounter && this.config.totalFrames) {
|
| 1294 |
+
this.ui.frameCounter.textContent = `Frame ${frameIndex} / ${this.config.totalFrames - 1}`;
|
| 1295 |
+
}
|
| 1296 |
+
|
| 1297 |
+
this.updateCameraFrustum(frameIndex);
|
| 1298 |
+
}
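A note on the geometry above: each depth pixel arrives as a 16-bit code packed into the first two channels of `depths_rgb`, is mapped back to metric depth through `depthRange`, unprojected with the per-frame intrinsics, moved to world space with `inv_extrinsics`, and finally mirrored in Y and Z for the three.js scene. A NumPy sketch of the same per-pixel math follows; the helper name `unproject_frame` is hypothetical, not code from this repo.

```python
# NumPy mirror of the per-pixel math in updatePointCloud(), for sanity-checking
# exported data; assumes depth_rgb packs a 16-bit depth code in channels 0 and 1.
import numpy as np

def unproject_frame(depth_rgb, K, inv_extrinsic, depth_range, max_depth=10.0):
    h, w, _ = depth_rgb.shape
    code = depth_rgb[..., 0].astype(np.uint16) | (depth_rgb[..., 1].astype(np.uint16) << 8)
    depth = code / 65535.0 * (depth_range[1] - depth_range[0]) + depth_range[0]

    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    xs, ys = np.meshgrid(np.arange(w), np.arange(h))
    X = (xs - cx) * depth / fx
    Y = (ys - cy) * depth / fy
    cam = np.stack([X, Y, depth, np.ones_like(depth)], axis=-1).reshape(-1, 4)

    world = cam @ inv_extrinsic.T          # camera -> world, rows of the 4x4 matrix
    pts = world[:, :3].copy()
    pts[:, 1] *= -1                        # same Y/Z flip as the viewer applies
    pts[:, 2] *= -1

    d = depth.reshape(-1)
    return pts[(d > 0) & (d <= max_depth)]
```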
|
| 1299 |
+
|
| 1300 |
+
updateTrajectories(frameIndex) {
|
| 1301 |
+
if (!this.data.trajectories || this.trajectories.length === 0) return;
|
| 1302 |
+
|
| 1303 |
+
const trajectoryData = this.data.trajectories.data;
|
| 1304 |
+
const [totalFrames, numTrajectories] = this.data.trajectories.shape;
|
| 1305 |
+
const historyFrames = parseInt(this.ui.trajectoryHistory.value);
|
| 1306 |
+
const tailOpacity = parseFloat(this.ui.trajectoryFade.value);
|
| 1307 |
+
|
| 1308 |
+
const isRichMode = this.ui.enableRichTrail.checked;
|
| 1309 |
+
|
| 1310 |
+
for (let i = 0; i < numTrajectories; i++) {
|
| 1311 |
+
const trajectoryGroup = this.trajectories[i];
|
| 1312 |
+
const { marker, simpleLine, lineSegments } = trajectoryGroup.userData;
|
| 1313 |
+
|
| 1314 |
+
const currentPos = new THREE.Vector3();
|
| 1315 |
+
const currentOffset = (frameIndex * numTrajectories + i) * 3;
|
| 1316 |
+
|
| 1317 |
+
currentPos.x = trajectoryData[currentOffset];
|
| 1318 |
+
currentPos.y = -trajectoryData[currentOffset + 1];
|
| 1319 |
+
currentPos.z = -trajectoryData[currentOffset + 2];
|
| 1320 |
+
|
| 1321 |
+
marker.position.copy(currentPos);
|
| 1322 |
+
marker.material.opacity = 1.0;
|
| 1323 |
+
|
| 1324 |
+
const historyToShow = Math.min(historyFrames, frameIndex + 1);
|
| 1325 |
+
|
| 1326 |
+
if (isRichMode) {
|
| 1327 |
+
// --- High-Quality Mode ---
|
| 1328 |
+
simpleLine.visible = false;
|
| 1329 |
+
|
| 1330 |
+
for (let j = 0; j < lineSegments.length; j++) {
|
| 1331 |
+
const segment = lineSegments[j];
|
| 1332 |
+
if (j < historyToShow - 1) {
|
| 1333 |
+
const headFrame = frameIndex - j;
|
| 1334 |
+
const tailFrame = frameIndex - j - 1;
|
| 1335 |
+
const headOffset = (headFrame * numTrajectories + i) * 3;
|
| 1336 |
+
const tailOffset = (tailFrame * numTrajectories + i) * 3;
|
| 1337 |
+
const positions = [
|
| 1338 |
+
trajectoryData[headOffset], -trajectoryData[headOffset + 1], -trajectoryData[headOffset + 2],
|
| 1339 |
+
trajectoryData[tailOffset], -trajectoryData[tailOffset + 1], -trajectoryData[tailOffset + 2]
|
| 1340 |
+
];
|
| 1341 |
+
segment.geometry.setPositions(positions);
|
| 1342 |
+
const headOpacity = 1.0;
|
| 1343 |
+
const normalizedAge = j / Math.max(1, historyToShow - 2);
|
| 1344 |
+
const alpha = headOpacity - (headOpacity - tailOpacity) * normalizedAge;
|
| 1345 |
+
segment.material.opacity = Math.max(0, alpha);
|
| 1346 |
+
segment.visible = true;
|
| 1347 |
+
} else {
|
| 1348 |
+
segment.visible = false;
|
| 1349 |
+
}
|
| 1350 |
+
}
|
| 1351 |
+
} else {
|
| 1352 |
+
// --- Performance Mode ---
|
| 1353 |
+
lineSegments.forEach(s => s.visible = false);
|
| 1354 |
+
simpleLine.visible = true;
|
| 1355 |
+
|
| 1356 |
+
const positions = simpleLine.geometry.attributes.position.array;
|
| 1357 |
+
for (let j = 0; j < historyToShow; j++) {
|
| 1358 |
+
const historyFrame = Math.max(0, frameIndex - j);
|
| 1359 |
+
const offset = (historyFrame * numTrajectories + i) * 3;
|
| 1360 |
+
positions[j * 3] = trajectoryData[offset];
|
| 1361 |
+
positions[j * 3 + 1] = -trajectoryData[offset + 1];
|
| 1362 |
+
positions[j * 3 + 2] = -trajectoryData[offset + 2];
|
| 1363 |
+
}
|
| 1364 |
+
simpleLine.geometry.setDrawRange(0, historyToShow);
|
| 1365 |
+
simpleLine.geometry.attributes.position.needsUpdate = true;
|
| 1366 |
+
}
|
| 1367 |
+
}
|
| 1368 |
+
}
|
| 1369 |
+
|
| 1370 |
+
updateTrajectorySettings() {
|
| 1371 |
+
if (!this.trajectories || this.trajectories.length === 0) return;
|
| 1372 |
+
|
| 1373 |
+
const ballSize = parseFloat(this.ui.trajectoryBallSize.value);
|
| 1374 |
+
const lineWidth = parseFloat(this.ui.trajectoryLineWidth.value);
|
| 1375 |
+
|
| 1376 |
+
this.trajectories.forEach(trajectoryGroup => {
|
| 1377 |
+
const { marker, lineSegments } = trajectoryGroup.userData;
|
| 1378 |
+
|
| 1379 |
+
marker.geometry.dispose();
|
| 1380 |
+
marker.geometry = new THREE.SphereGeometry(ballSize, 16, 16);
|
| 1381 |
+
|
| 1382 |
+
// Line width only affects rich mode
|
| 1383 |
+
lineSegments.forEach(segment => {
|
| 1384 |
+
if (segment.material) {
|
| 1385 |
+
segment.material.linewidth = lineWidth;
|
| 1386 |
+
}
|
| 1387 |
+
});
|
| 1388 |
+
});
|
| 1389 |
+
|
| 1390 |
+
this.updateTrajectories(this.currentFrame);
|
| 1391 |
+
}
|
| 1392 |
+
|
| 1393 |
+
getDepthColor(normalizedDepth) {
|
| 1394 |
+
const hue = (1 - normalizedDepth) * 240 / 360;
|
| 1395 |
+
const color = new THREE.Color().setHSL(hue, 1.0, 0.5);
|
| 1396 |
+
return color;
|
| 1397 |
+
}
|
| 1398 |
+
|
| 1399 |
+
getFrame(typedArray, shape, frameIndex) {
|
| 1400 |
+
const [T, H, W, C] = shape;
|
| 1401 |
+
const frameSize = H * W * C;
|
| 1402 |
+
const offset = frameIndex * frameSize;
|
| 1403 |
+
return typedArray.subarray(offset, offset + frameSize);
|
| 1404 |
+
}
|
| 1405 |
+
|
| 1406 |
+
get3x3Matrix(typedArray, shape, frameIndex) {
|
| 1407 |
+
const frameSize = 9;
|
| 1408 |
+
const offset = frameIndex * frameSize;
|
| 1409 |
+
const K = [];
|
| 1410 |
+
for (let i = 0; i < 3; i++) {
|
| 1411 |
+
const row = [];
|
| 1412 |
+
for (let j = 0; j < 3; j++) {
|
| 1413 |
+
row.push(typedArray[offset + i * 3 + j]);
|
| 1414 |
+
}
|
| 1415 |
+
K.push(row);
|
| 1416 |
+
}
|
| 1417 |
+
return K;
|
| 1418 |
+
}
|
| 1419 |
+
|
| 1420 |
+
get4x4Matrix(typedArray, shape, frameIndex) {
|
| 1421 |
+
const frameSize = 16;
|
| 1422 |
+
const offset = frameIndex * frameSize;
|
| 1423 |
+
const M = [];
|
| 1424 |
+
for (let i = 0; i < 4; i++) {
|
| 1425 |
+
const row = [];
|
| 1426 |
+
for (let j = 0; j < 4; j++) {
|
| 1427 |
+
row.push(typedArray[offset + i * 4 + j]);
|
| 1428 |
+
}
|
| 1429 |
+
M.push(row);
|
| 1430 |
+
}
|
| 1431 |
+
return M;
|
| 1432 |
+
}
|
| 1433 |
+
|
| 1434 |
+
getTransformElements(matrix) {
|
| 1435 |
+
return {
|
| 1436 |
+
m11: matrix[0][0], m12: matrix[0][1], m13: matrix[0][2], m14: matrix[0][3],
|
| 1437 |
+
m21: matrix[1][0], m22: matrix[1][1], m23: matrix[1][2], m24: matrix[1][3],
|
| 1438 |
+
m31: matrix[2][0], m32: matrix[2][1], m33: matrix[2][2], m34: matrix[2][3]
|
| 1439 |
+
};
|
| 1440 |
+
}
|
| 1441 |
+
|
| 1442 |
+
togglePlayback() {
|
| 1443 |
+
this.isPlaying = !this.isPlaying;
|
| 1444 |
+
|
| 1445 |
+
const playIcon = document.getElementById('play-icon');
|
| 1446 |
+
const pauseIcon = document.getElementById('pause-icon');
|
| 1447 |
+
|
| 1448 |
+
if (this.isPlaying) {
|
| 1449 |
+
playIcon.style.display = 'none';
|
| 1450 |
+
pauseIcon.style.display = 'block';
|
| 1451 |
+
this.lastFrameTime = performance.now();
|
| 1452 |
+
} else {
|
| 1453 |
+
playIcon.style.display = 'block';
|
| 1454 |
+
pauseIcon.style.display = 'none';
|
| 1455 |
+
}
|
| 1456 |
+
}
|
| 1457 |
+
|
| 1458 |
+
cyclePlaybackSpeed() {
|
| 1459 |
+
const speeds = [0.5, 1, 2, 4, 8];
|
| 1460 |
+
const speedRates = speeds.map(s => s * this.config.baseFrameRate);
|
| 1461 |
+
|
| 1462 |
+
let currentIndex = 0;
|
| 1463 |
+
const normalizedSpeed = this.playbackSpeed / this.config.baseFrameRate;
|
| 1464 |
+
|
| 1465 |
+
for (let i = 0; i < speeds.length; i++) {
|
| 1466 |
+
if (Math.abs(normalizedSpeed - speeds[i]) < Math.abs(normalizedSpeed - speeds[currentIndex])) {
|
| 1467 |
+
currentIndex = i;
|
| 1468 |
+
}
|
| 1469 |
+
}
|
| 1470 |
+
|
| 1471 |
+
const nextIndex = (currentIndex + 1) % speeds.length;
|
| 1472 |
+
this.playbackSpeed = speedRates[nextIndex];
|
| 1473 |
+
this.ui.speedBtn.textContent = `${speeds[nextIndex]}x`;
|
| 1474 |
+
|
| 1475 |
+
if (speeds[nextIndex] === 1) {
|
| 1476 |
+
this.ui.speedBtn.classList.remove('active');
|
| 1477 |
+
} else {
|
| 1478 |
+
this.ui.speedBtn.classList.add('active');
|
| 1479 |
+
}
|
| 1480 |
+
}
|
| 1481 |
+
|
| 1482 |
+
seekTo(position) {
|
| 1483 |
+
const frameIndex = Math.floor(position * this.config.totalFrames);
|
| 1484 |
+
this.currentFrame = Math.max(0, Math.min(frameIndex, this.config.totalFrames - 1));
|
| 1485 |
+
this.updatePointCloud(this.currentFrame);
|
| 1486 |
+
}
|
| 1487 |
+
|
| 1488 |
+
updatePointCloudSettings() {
|
| 1489 |
+
if (!this.pointCloud) return;
|
| 1490 |
+
|
| 1491 |
+
const size = parseFloat(this.ui.pointSize.value);
|
| 1492 |
+
const opacity = parseFloat(this.ui.pointOpacity.value);
|
| 1493 |
+
|
| 1494 |
+
this.pointCloud.material.size = size;
|
| 1495 |
+
this.pointCloud.material.opacity = opacity;
|
| 1496 |
+
this.pointCloud.material.needsUpdate = true;
|
| 1497 |
+
|
| 1498 |
+
this.updatePointCloud(this.currentFrame);
|
| 1499 |
+
}
|
| 1500 |
+
|
| 1501 |
+
updateControls() {
|
| 1502 |
+
if (!this.controls) return;
|
| 1503 |
+
this.controls.update();
|
| 1504 |
+
}
|
| 1505 |
+
|
| 1506 |
+
resetView() {
|
| 1507 |
+
if (!this.camera || !this.controls) return;
|
| 1508 |
+
|
| 1509 |
+
// Reset camera position
|
| 1510 |
+
this.camera.position.set(0, 0, this.config.cameraZ || 0);
|
| 1511 |
+
|
| 1512 |
+
// Reset controls
|
| 1513 |
+
this.controls.reset();
|
| 1514 |
+
|
| 1515 |
+
// Set target slightly in front of camera
|
| 1516 |
+
this.controls.target.set(0, 0, -1);
|
| 1517 |
+
this.controls.update();
|
| 1518 |
+
|
| 1519 |
+
// Show status message
|
| 1520 |
+
this.ui.statusBar.textContent = "View reset";
|
| 1521 |
+
this.ui.statusBar.classList.remove('hidden');
|
| 1522 |
+
|
| 1523 |
+
// Hide status message after a few seconds
|
| 1524 |
+
setTimeout(() => {
|
| 1525 |
+
this.ui.statusBar.classList.add('hidden');
|
| 1526 |
+
}, 3000);
|
| 1527 |
+
}
|
| 1528 |
+
|
| 1529 |
+
onWindowResize() {
|
| 1530 |
+
if (!this.camera || !this.renderer) return;
|
| 1531 |
+
|
| 1532 |
+
const windowAspect = window.innerWidth / window.innerHeight;
|
| 1533 |
+
this.camera.aspect = windowAspect;
|
| 1534 |
+
this.camera.updateProjectionMatrix();
|
| 1535 |
+
this.renderer.setSize(window.innerWidth, window.innerHeight);
|
| 1536 |
+
|
| 1537 |
+
if (this.trajectories && this.trajectories.length > 0) {
|
| 1538 |
+
const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
|
| 1539 |
+
this.trajectories.forEach(trajectory => {
|
| 1540 |
+
const { lineSegments } = trajectory.userData;
|
| 1541 |
+
if (lineSegments && lineSegments.length > 0) {
|
| 1542 |
+
lineSegments.forEach(segment => {
|
| 1543 |
+
if (segment.material && segment.material.resolution) {
|
| 1544 |
+
segment.material.resolution.copy(resolution);
|
| 1545 |
+
}
|
| 1546 |
+
});
|
| 1547 |
+
}
|
| 1548 |
+
});
|
| 1549 |
+
}
|
| 1550 |
+
|
| 1551 |
+
if (this.cameraFrustum) {
|
| 1552 |
+
const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
|
| 1553 |
+
this.cameraFrustum.children.forEach(line => {
|
| 1554 |
+
if (line.material && line.material.resolution) {
|
| 1555 |
+
line.material.resolution.copy(resolution);
|
| 1556 |
+
}
|
| 1557 |
+
});
|
| 1558 |
+
}
|
| 1559 |
+
}
|
| 1560 |
+
|
| 1561 |
+
startAnimation() {
|
| 1562 |
+
this.isPlaying = true;
|
| 1563 |
+
this.lastFrameTime = performance.now();
|
| 1564 |
+
|
| 1565 |
+
this.camera.position.set(0, 0, this.config.cameraZ || 0);
|
| 1566 |
+
this.controls.target.set(0, 0, -1);
|
| 1567 |
+
this.controls.update();
|
| 1568 |
+
|
| 1569 |
+
this.playbackSpeed = this.config.baseFrameRate;
|
| 1570 |
+
|
| 1571 |
+
document.getElementById('play-icon').style.display = 'none';
|
| 1572 |
+
document.getElementById('pause-icon').style.display = 'block';
|
| 1573 |
+
|
| 1574 |
+
this.animate();
|
| 1575 |
+
}
|
| 1576 |
+
|
| 1577 |
+
animate() {
|
| 1578 |
+
requestAnimationFrame(() => this.animate());
|
| 1579 |
+
|
| 1580 |
+
if (this.controls) {
|
| 1581 |
+
this.controls.update();
|
| 1582 |
+
}
|
| 1583 |
+
|
| 1584 |
+
if (this.isPlaying && this.data) {
|
| 1585 |
+
const now = performance.now();
|
| 1586 |
+
const delta = (now - this.lastFrameTime) / 1000;
|
| 1587 |
+
|
| 1588 |
+
const framesToAdvance = Math.floor(delta * this.config.baseFrameRate * this.playbackSpeed);
|
| 1589 |
+
if (framesToAdvance > 0) {
|
| 1590 |
+
this.currentFrame = (this.currentFrame + framesToAdvance) % this.config.totalFrames;
|
| 1591 |
+
this.lastFrameTime = now;
|
| 1592 |
+
this.updatePointCloud(this.currentFrame);
|
| 1593 |
+
}
|
| 1594 |
+
}
|
| 1595 |
+
|
| 1596 |
+
if (this.renderer && this.scene && this.camera) {
|
| 1597 |
+
this.renderer.render(this.scene, this.camera);
|
| 1598 |
+
}
|
| 1599 |
+
}
|
| 1600 |
+
|
| 1601 |
+
initCameraWithCorrectFOV() {
|
| 1602 |
+
const fov = this.config.fov || 60;
|
| 1603 |
+
|
| 1604 |
+
const windowAspect = window.innerWidth / window.innerHeight;
|
| 1605 |
+
|
| 1606 |
+
this.camera = new THREE.PerspectiveCamera(
|
| 1607 |
+
fov,
|
| 1608 |
+
windowAspect,
|
| 1609 |
+
0.1,
|
| 1610 |
+
10000
|
| 1611 |
+
);
|
| 1612 |
+
|
| 1613 |
+
this.controls.object = this.camera;
|
| 1614 |
+
this.controls.update();
|
| 1615 |
+
|
| 1616 |
+
this.initCameraFrustum();
|
| 1617 |
+
}
|
| 1618 |
+
|
| 1619 |
+
initCameraFrustum() {
|
| 1620 |
+
this.cameraFrustum = new THREE.Group();
|
| 1621 |
+
|
| 1622 |
+
this.scene.add(this.cameraFrustum);
|
| 1623 |
+
|
| 1624 |
+
this.initCameraFrustumGeometry();
|
| 1625 |
+
|
| 1626 |
+
const showCameraFrustum = this.ui.showCameraFrustum ? this.ui.showCameraFrustum.checked : (this.defaultSettings ? this.defaultSettings.showCameraFrustum : false);
|
| 1627 |
+
|
| 1628 |
+
this.cameraFrustum.visible = showCameraFrustum;
|
| 1629 |
+
}
|
| 1630 |
+
|
| 1631 |
+
initCameraFrustumGeometry() {
|
| 1632 |
+
const fov = this.config.fov || 60;
|
| 1633 |
+
const originalAspect = this.config.original_aspect_ratio || 1.33;
|
| 1634 |
+
|
| 1635 |
+
const size = parseFloat(this.ui.frustumSize.value) || this.defaultSettings.frustumSize;
|
| 1636 |
+
|
| 1637 |
+
const halfHeight = Math.tan(THREE.MathUtils.degToRad(fov / 2)) * size;
|
| 1638 |
+
const halfWidth = halfHeight * originalAspect;
|
| 1639 |
+
|
| 1640 |
+
const vertices = [
|
| 1641 |
+
new THREE.Vector3(0, 0, 0),
|
| 1642 |
+
new THREE.Vector3(-halfWidth, -halfHeight, size),
|
| 1643 |
+
new THREE.Vector3(halfWidth, -halfHeight, size),
|
| 1644 |
+
new THREE.Vector3(halfWidth, halfHeight, size),
|
| 1645 |
+
new THREE.Vector3(-halfWidth, halfHeight, size)
|
| 1646 |
+
];
|
| 1647 |
+
|
| 1648 |
+
const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
|
| 1649 |
+
|
| 1650 |
+
const linePairs = [
|
| 1651 |
+
[1, 2], [2, 3], [3, 4], [4, 1],
|
| 1652 |
+
[0, 1], [0, 2], [0, 3], [0, 4]
|
| 1653 |
+
];
|
| 1654 |
+
|
| 1655 |
+
const colors = {
|
| 1656 |
+
edge: new THREE.Color(0x3366ff),
|
| 1657 |
+
ray: new THREE.Color(0x33cc66)
|
| 1658 |
+
};
|
| 1659 |
+
|
| 1660 |
+
linePairs.forEach((pair, index) => {
|
| 1661 |
+
const positions = [
|
| 1662 |
+
vertices[pair[0]].x, vertices[pair[0]].y, vertices[pair[0]].z,
|
| 1663 |
+
vertices[pair[1]].x, vertices[pair[1]].y, vertices[pair[1]].z
|
| 1664 |
+
];
|
| 1665 |
+
|
| 1666 |
+
const lineGeometry = new THREE.LineGeometry();
|
| 1667 |
+
lineGeometry.setPositions(positions);
|
| 1668 |
+
|
| 1669 |
+
let color = index < 4 ? colors.edge : colors.ray;
|
| 1670 |
+
|
| 1671 |
+
const lineMaterial = new THREE.LineMaterial({
|
| 1672 |
+
color: color,
|
| 1673 |
+
linewidth: 2,
|
| 1674 |
+
resolution: resolution,
|
| 1675 |
+
dashed: false
|
| 1676 |
+
});
|
| 1677 |
+
|
| 1678 |
+
const line = new THREE.Line2(lineGeometry, lineMaterial);
|
| 1679 |
+
this.cameraFrustum.add(line);
|
| 1680 |
+
});
|
| 1681 |
+
}
|
| 1682 |
+
|
| 1683 |
+
updateCameraFrustum(frameIndex) {
|
| 1684 |
+
if (!this.cameraFrustum || !this.data) return;
|
| 1685 |
+
|
| 1686 |
+
const invExtrinsics = this.data.inv_extrinsics;
|
| 1687 |
+
if (!invExtrinsics) return;
|
| 1688 |
+
|
| 1689 |
+
const invExtrMat = this.get4x4Matrix(invExtrinsics.data, invExtrinsics.shape, frameIndex);
|
| 1690 |
+
|
| 1691 |
+
const matrix = new THREE.Matrix4();
|
| 1692 |
+
matrix.set(
|
| 1693 |
+
invExtrMat[0][0], invExtrMat[0][1], invExtrMat[0][2], invExtrMat[0][3],
|
| 1694 |
+
invExtrMat[1][0], invExtrMat[1][1], invExtrMat[1][2], invExtrMat[1][3],
|
| 1695 |
+
invExtrMat[2][0], invExtrMat[2][1], invExtrMat[2][2], invExtrMat[2][3],
|
| 1696 |
+
invExtrMat[3][0], invExtrMat[3][1], invExtrMat[3][2], invExtrMat[3][3]
|
| 1697 |
+
);
|
| 1698 |
+
|
| 1699 |
+
const position = new THREE.Vector3();
|
| 1700 |
+
position.setFromMatrixPosition(matrix);
|
| 1701 |
+
|
| 1702 |
+
const rotMatrix = new THREE.Matrix4().extractRotation(matrix);
|
| 1703 |
+
|
| 1704 |
+
const coordinateCorrection = new THREE.Matrix4().makeRotationX(Math.PI);
|
| 1705 |
+
|
| 1706 |
+
const finalRotation = new THREE.Matrix4().multiplyMatrices(coordinateCorrection, rotMatrix);
|
| 1707 |
+
|
| 1708 |
+
const quaternion = new THREE.Quaternion();
|
| 1709 |
+
quaternion.setFromRotationMatrix(finalRotation);
|
| 1710 |
+
|
| 1711 |
+
position.y = -position.y;
|
| 1712 |
+
position.z = -position.z;
|
| 1713 |
+
|
| 1714 |
+
this.cameraFrustum.position.copy(position);
|
| 1715 |
+
this.cameraFrustum.quaternion.copy(quaternion);
|
| 1716 |
+
|
| 1717 |
+
const showCameraFrustum = this.ui.showCameraFrustum ? this.ui.showCameraFrustum.checked : this.defaultSettings.showCameraFrustum;
|
| 1718 |
+
|
| 1719 |
+
if (this.cameraFrustum.visible !== showCameraFrustum) {
|
| 1720 |
+
this.cameraFrustum.visible = showCameraFrustum;
|
| 1721 |
+
}
|
| 1722 |
+
|
| 1723 |
+
const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
|
| 1724 |
+
this.cameraFrustum.children.forEach(line => {
|
| 1725 |
+
if (line.material && line.material.resolution) {
|
| 1726 |
+
line.material.resolution.copy(resolution);
|
| 1727 |
+
}
|
| 1728 |
+
});
|
| 1729 |
+
}
|
| 1730 |
+
|
| 1731 |
+
updateFrustumDimensions() {
|
| 1732 |
+
if (!this.cameraFrustum) return;
|
| 1733 |
+
|
| 1734 |
+
while(this.cameraFrustum.children.length > 0) {
|
| 1735 |
+
const child = this.cameraFrustum.children[0];
|
| 1736 |
+
if (child.geometry) child.geometry.dispose();
|
| 1737 |
+
if (child.material) child.material.dispose();
|
| 1738 |
+
this.cameraFrustum.remove(child);
|
| 1739 |
+
}
|
| 1740 |
+
|
| 1741 |
+
this.initCameraFrustumGeometry();
|
| 1742 |
+
|
| 1743 |
+
this.updateCameraFrustum(this.currentFrame);
|
| 1744 |
+
}
|
| 1745 |
+
|
| 1746 |
+
resetSettings() {
|
| 1747 |
+
if (!this.defaultSettings) return;
|
| 1748 |
+
|
| 1749 |
+
this.applyDefaultSettings();
|
| 1750 |
+
|
| 1751 |
+
this.updatePointCloudSettings();
|
| 1752 |
+
this.updateTrajectorySettings();
|
| 1753 |
+
this.updateFrustumDimensions();
|
| 1754 |
+
|
| 1755 |
+
this.ui.statusBar.textContent = "Settings reset to defaults";
|
| 1756 |
+
this.ui.statusBar.classList.remove('hidden');
|
| 1757 |
+
|
| 1758 |
+
setTimeout(() => {
|
| 1759 |
+
this.ui.statusBar.classList.add('hidden');
|
| 1760 |
+
}, 3000);
|
| 1761 |
+
}
|
| 1762 |
+
}
|
| 1763 |
+
|
| 1764 |
+
window.addEventListener('DOMContentLoaded', () => {
    new PointCloudVisualizer();
});
</script>
</body>
</html>
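Since the template resolves its input from the `data` query parameter (falling back to `data.bin`), a rendered scene can be previewed with any static file server. A minimal launcher sketch, assuming the exported binary is saved as `_viz/scene.bin` (both the port and the file name are assumptions):

```python
# Serve the _viz folder and open the viewer with a ?data= query string,
# which loadData() reads through URLSearchParams.
import functools
import http.server
import socketserver
import webbrowser

PORT = 8000  # assumption: any free local port works
handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory="_viz")
with socketserver.TCPServer(("", PORT), handler) as httpd:
    webbrowser.open(f"http://localhost:{PORT}/viz_template.html?data=scene.bin")
    httpd.serve_forever()  # Ctrl+C to stop
```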
|
app.py
CHANGED
|
@@ -4,160 +4,363 @@ import json
|
|
| 4 |
import numpy as np
|
| 5 |
import cv2
|
| 6 |
import base64
|
| 7 |
-
import requests
|
| 8 |
import time
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
#
|
| 40 |
try:
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
if
|
| 45 |
-
|
| 46 |
-
username = user_info.get('name', 'Unknown')
|
| 47 |
-
print(f"✅ Authenticated as: {username}")
|
| 48 |
-
|
| 49 |
-
# Check if user has access to the specific space
|
| 50 |
-
space_url = f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}"
|
| 51 |
-
space_response = requests.get(space_url, headers=headers, timeout=5)
|
| 52 |
-
|
| 53 |
-
if space_response.status_code == 200:
|
| 54 |
-
print("✅ You have access to the backend Space")
|
| 55 |
-
return True
|
| 56 |
-
elif space_response.status_code == 401:
|
| 57 |
-
print("❌ You don't have access to the backend Space")
|
| 58 |
-
print("🔧 Solutions:")
|
| 59 |
-
print(" 1. Contact the Space owner to add you as collaborator")
|
| 60 |
-
print(" 2. Ask the owner to make the Space public")
|
| 61 |
-
return False
|
| 62 |
-
elif space_response.status_code == 404:
|
| 63 |
-
print("❌ Backend Space not found")
|
| 64 |
-
print("🔧 Please check if the Space URL is correct")
|
| 65 |
-
return False
|
| 66 |
-
else:
|
| 67 |
-
print(f"⚠️ Unexpected response checking Space access: {space_response.status_code}")
|
| 68 |
-
return False
|
| 69 |
-
|
| 70 |
-
else:
|
| 71 |
-
print(f"❌ Token validation failed: {response.status_code}")
|
| 72 |
-
print("🔧 Your token might be invalid or expired")
|
| 73 |
-
return False
|
| 74 |
|
| 75 |
except Exception as e:
|
| 76 |
-
print(f"
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
print("✅ Backend space appears to be running")
|
| 110 |
-
return True
|
| 111 |
-
|
| 112 |
-
elif response.status_code == 401:
|
| 113 |
-
print("❌ Authentication failed (HTTP 401)")
|
| 114 |
-
print("🔧 This means:")
|
| 115 |
-
print(" - The backend Space is private")
|
| 116 |
-
print(" - Your HF Token doesn't have access to this Space")
|
| 117 |
-
print(" - You need to be added as a collaborator to the Space")
|
| 118 |
-
print(" - Or the Space owner needs to make it public")
|
| 119 |
-
return False
|
| 120 |
-
|
| 121 |
-
elif response.status_code == 404:
|
| 122 |
-
print("❌ Backend space not found (HTTP 404)")
|
| 123 |
-
print("🔧 Please check if the Space URL is correct:")
|
| 124 |
-
print(f" Current URL: {BACKEND_SPACE_URL}")
|
| 125 |
-
return False
|
| 126 |
-
|
| 127 |
-
else:
|
| 128 |
-
print(f"❌ Backend space not accessible (HTTP {response.status_code})")
|
| 129 |
-
print(f"🔧 Response: {response.text[:200]}...")
|
| 130 |
-
return False
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
def
|
| 140 |
-
|
| 141 |
-
global backend_client, BACKEND_AVAILABLE
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
def numpy_to_base64(arr):
|
| 163 |
"""Convert numpy array to base64 string"""
|
|
@@ -167,24 +370,6 @@ def base64_to_numpy(b64_str, shape, dtype):
|
|
| 167 |
"""Convert base64 string back to numpy array"""
|
| 168 |
return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
|
| 169 |
|
| 170 |
-
def base64_to_image(b64_str):
|
| 171 |
-
"""Convert base64 string to numpy image array"""
|
| 172 |
-
if not b64_str:
|
| 173 |
-
return None
|
| 174 |
-
try:
|
| 175 |
-
# Decode base64 to bytes
|
| 176 |
-
img_bytes = base64.b64decode(b64_str)
|
| 177 |
-
# Convert bytes to numpy array
|
| 178 |
-
nparr = np.frombuffer(img_bytes, np.uint8)
|
| 179 |
-
# Decode image
|
| 180 |
-
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
|
| 181 |
-
# Convert BGR to RGB
|
| 182 |
-
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
| 183 |
-
return img
|
| 184 |
-
except Exception as e:
|
| 185 |
-
print(f"Error converting base64 to image: {e}")
|
| 186 |
-
return None
|
| 187 |
-
|
| 188 |
def get_video_name(video_path):
|
| 189 |
"""Extract video name without extension"""
|
| 190 |
return os.path.splitext(os.path.basename(video_path))[0]
|
|
@@ -197,7 +382,6 @@ def extract_first_frame(video_path):
|
|
| 197 |
cap.release()
|
| 198 |
|
| 199 |
if ret:
|
| 200 |
-
# Convert BGR to RGB
|
| 201 |
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
| 202 |
return frame_rgb
|
| 203 |
else:
|
|
@@ -214,116 +398,65 @@ def handle_video_upload(video):
|
|
| 214 |
gr.update(value=756),
|
| 215 |
gr.update(value=3))
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
# Call the unified API with upload_video function type - fix: use handle_file wrapper
|
| 224 |
-
result = backend_client.predict(
|
| 225 |
-
"upload_video", # function_type
|
| 226 |
-
handle_file(video), # video file - wrapped with handle_file
|
| 227 |
-
"", # original_image_state (not used for upload)
|
| 228 |
-
[], # selected_points (not used for upload)
|
| 229 |
-
"positive_point", # point_type (not used for upload)
|
| 230 |
-
0, # point_x (not used for upload)
|
| 231 |
-
0, # point_y (not used for upload)
|
| 232 |
-
50, # grid_size (not used for upload)
|
| 233 |
-
756, # vo_points (not used for upload)
|
| 234 |
-
3, # fps (not used for upload)
|
| 235 |
-
api_name="/unified_api"
|
| 236 |
-
)
|
| 237 |
-
|
| 238 |
-
print(f"✅ Backend video upload API call successful!")
|
| 239 |
-
print(f"🔧 Result type: {type(result)}")
|
| 240 |
-
print(f"🔧 Result: {result}")
|
| 241 |
-
|
| 242 |
-
# Parse the result - expect a dict with success status
|
| 243 |
-
if isinstance(result, dict) and result.get("success"):
|
| 244 |
-
# Extract data from backend response
|
| 245 |
-
original_image_state = result.get("original_image_state", "")
|
| 246 |
-
display_image = result.get("display_image", None)
|
| 247 |
-
selected_points = result.get("selected_points", [])
|
| 248 |
-
|
| 249 |
-
# Fix: Convert display_image from list back to numpy array if needed
|
| 250 |
-
if isinstance(display_image, list):
|
| 251 |
-
display_image = np.array(display_image, dtype=np.uint8)
|
| 252 |
-
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 253 |
-
|
| 254 |
-
# Get video settings based on video name
|
| 255 |
-
video_name = get_video_name(video)
|
| 256 |
-
print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
|
| 257 |
-
grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
|
| 258 |
-
print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
|
| 259 |
-
|
| 260 |
-
return (original_image_state, display_image, selected_points,
|
| 261 |
-
gr.update(value=grid_size_val),
|
| 262 |
-
gr.update(value=vo_points_val),
|
| 263 |
-
gr.update(value=fps_val))
|
| 264 |
-
else:
|
| 265 |
-
print("Backend processing failed, using local fallback")
|
| 266 |
-
# Fallback to local processing
|
| 267 |
-
pass
|
| 268 |
-
except Exception as e:
|
| 269 |
-
print(f"Backend API call failed: {e}")
|
| 270 |
-
# Fallback to local processing
|
| 271 |
-
pass
|
| 272 |
-
|
| 273 |
-
# Fallback: local processing
|
| 274 |
-
print("Using local video processing...")
|
| 275 |
-
display_image = extract_first_frame(video)
|
| 276 |
-
|
| 277 |
-
if display_image is not None:
|
| 278 |
-
# Create a state format compatible with backend
|
| 279 |
-
import tempfile
|
| 280 |
-
import shutil
|
| 281 |
-
|
| 282 |
-
# Create a temporary directory for this session
|
| 283 |
-
session_id = str(int(time.time() * 1000)) # Use timestamp as session ID
|
| 284 |
-
temp_dir = os.path.join("temp_frontend", f"session_{session_id}")
|
| 285 |
-
os.makedirs(temp_dir, exist_ok=True)
|
| 286 |
-
|
| 287 |
-
# Copy video to temp directory with standardized name
|
| 288 |
-
video_name = get_video_name(video)
|
| 289 |
-
temp_video_path = os.path.join(temp_dir, f"{video_name}.mp4")
|
| 290 |
-
shutil.copy(video, temp_video_path)
|
| 291 |
-
|
| 292 |
-
# Create state format compatible with backend
|
| 293 |
-
frame_data = {
|
| 294 |
-
'data': numpy_to_base64(display_image),
|
| 295 |
-
'shape': display_image.shape,
|
| 296 |
-
'dtype': str(display_image.dtype),
|
| 297 |
-
'temp_dir': temp_dir,
|
| 298 |
-
'video_name': video_name,
|
| 299 |
-
'video_path': temp_video_path # Keep for backward compatibility
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
original_image_state = json.dumps(frame_data)
|
| 303 |
-
else:
|
| 304 |
-
# Fallback to simple state if frame extraction fails
|
| 305 |
-
original_image_state = json.dumps({
|
| 306 |
-
"video_path": video,
|
| 307 |
-
"frame": "local_processing_failed"
|
| 308 |
-
})
|
| 309 |
-
|
| 310 |
-
# Get video settings
|
| 311 |
video_name = get_video_name(video)
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
| 323 |
return (None, None, [],
|
| 324 |
gr.update(value=50),
|
| 325 |
gr.update(value=756),
|
| 326 |
gr.update(value=3))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
|
| 329 |
"""Handle point selection for SAM"""
|
|
@@ -331,357 +464,142 @@ def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.Sele
|
|
| 331 |
return None, []
|
| 332 |
|
| 333 |
try:
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
# Call the unified API with select_point function type
|
| 340 |
-
result = backend_client.predict(
|
| 341 |
-
"select_point", # function_type
|
| 342 |
-
None, # video file (not used for select_point)
|
| 343 |
-
original_img, # original_image_state
|
| 344 |
-
sel_pix, # selected_points
|
| 345 |
-
point_type, # point_type
|
| 346 |
-
evt.index[0], # point_x
|
| 347 |
-
evt.index[1], # point_y
|
| 348 |
-
50, # grid_size (not used for select_point)
|
| 349 |
-
756, # vo_points (not used for select_point)
|
| 350 |
-
3, # fps (not used for select_point)
|
| 351 |
-
api_name="/unified_api"
|
| 352 |
-
)
|
| 353 |
-
|
| 354 |
-
print(f"✅ Backend select point API call successful!")
|
| 355 |
-
print(f"🔧 Result type: {type(result)}")
|
| 356 |
-
print(f"🔧 Result: {result}")
|
| 357 |
-
|
| 358 |
-
# Parse the result - expect a dict with success status
|
| 359 |
-
if isinstance(result, dict) and result.get("success"):
|
| 360 |
-
display_image = result.get("display_image", None)
|
| 361 |
-
new_sel_pix = result.get("selected_points", sel_pix)
|
| 362 |
-
|
| 363 |
-
# Fix: Convert display_image from list back to numpy array if needed
|
| 364 |
-
if isinstance(display_image, list):
|
| 365 |
-
display_image = np.array(display_image, dtype=np.uint8)
|
| 366 |
-
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 367 |
-
|
| 368 |
-
return display_image, new_sel_pix
|
| 369 |
-
else:
|
| 370 |
-
print("Backend processing failed, using local fallback")
|
| 371 |
-
# Fallback to local processing
|
| 372 |
-
pass
|
| 373 |
-
except Exception as e:
|
| 374 |
-
print(f"Backend API call failed: {e}")
|
| 375 |
-
|
| 376 |
-
# Check for specific gradio_client errors
|
| 377 |
-
if "AppError" in str(type(e)):
|
| 378 |
-
print("🔧 Backend Space has internal errors (AppError)")
|
| 379 |
-
print("🔧 The backend Space code has bugs or configuration issues")
|
| 380 |
-
print("🔧 Contact the Space owner to fix the backend implementation")
|
| 381 |
-
elif "Could not fetch config" in str(e):
|
| 382 |
-
print("🔧 Config fetch failed - possible Gradio version mismatch")
|
| 383 |
-
print("🔧 Frontend and backend may be using incompatible Gradio versions")
|
| 384 |
-
elif "timeout" in str(e).lower():
|
| 385 |
-
print("🔧 Backend request timed out - Space might be overloaded")
|
| 386 |
-
else:
|
| 387 |
-
print(f"🔧 Unexpected error type: {type(e).__name__}")
|
| 388 |
-
|
| 389 |
-
print("🔄 Showing error message instead of visualization...")
|
| 390 |
-
# Fallback to local processing
|
| 391 |
-
pass
|
| 392 |
|
| 393 |
-
#
|
| 394 |
-
|
|
|
|
|
|
|
| 395 |
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
video_path = state_data.get("video_path")
|
| 400 |
-
except:
|
| 401 |
-
video_path = None
|
| 402 |
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
if display_image is not None:
|
| 407 |
-
# Add point to the image with enhanced visualization
|
| 408 |
-
x, y = evt.index[0], evt.index[1]
|
| 409 |
-
color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
|
| 410 |
-
|
| 411 |
-
# Draw a larger, more visible point
|
| 412 |
-
cv2.circle(display_image, (x, y), 8, color, -1)
|
| 413 |
-
cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)
|
| 414 |
-
|
| 415 |
-
# Add point to selected points list - fix logic to match local version
|
| 416 |
-
new_sel_pix = sel_pix.copy() if sel_pix else []
|
| 417 |
-
new_sel_pix.append([x, y, point_type])
|
| 418 |
-
|
| 419 |
-
return display_image, new_sel_pix
|
| 420 |
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
except Exception as e:
|
| 424 |
-
print(f"Error in select_point: {e}")
|
| 425 |
return None, []
|
| 426 |
|
| 427 |
def reset_points(original_img: str, sel_pix):
|
| 428 |
-
"""Reset points and
|
| 429 |
if original_img is None:
|
| 430 |
return None, []
|
| 431 |
|
| 432 |
try:
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# Call the unified API with reset_points function type
|
| 439 |
-
result = backend_client.predict(
|
| 440 |
-
"reset_points", # function_type
|
| 441 |
-
None, # video file (not used for reset_points)
|
| 442 |
-
original_img, # original_image_state
|
| 443 |
-
sel_pix, # selected_points
|
| 444 |
-
"positive_point", # point_type (not used for reset_points)
|
| 445 |
-
0, # point_x (not used for reset_points)
|
| 446 |
-
0, # point_y (not used for reset_points)
|
| 447 |
-
50, # grid_size (not used for reset_points)
|
| 448 |
-
756, # vo_points (not used for reset_points)
|
| 449 |
-
3, # fps (not used for reset_points)
|
| 450 |
-
api_name="/unified_api"
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
print(f"✅ Backend reset points API call successful!")
|
| 454 |
-
print(f"🔧 Result: {result}")
|
| 455 |
-
|
| 456 |
-
# Parse the result
|
| 457 |
-
if isinstance(result, dict) and result.get("success"):
|
| 458 |
-
display_image = result.get("display_image", None)
|
| 459 |
-
new_sel_pix = result.get("selected_points", [])
|
| 460 |
-
|
| 461 |
-
# Fix: Convert display_image from list back to numpy array if needed
|
| 462 |
-
if isinstance(display_image, list):
|
| 463 |
-
display_image = np.array(display_image, dtype=np.uint8)
|
| 464 |
-
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 465 |
-
|
| 466 |
-
return display_image, new_sel_pix
|
| 467 |
-
else:
|
| 468 |
-
print("Backend processing failed, using local fallback")
|
| 469 |
-
# Fallback to local processing
|
| 470 |
-
pass
|
| 471 |
-
except Exception as e:
|
| 472 |
-
print(f"Backend API call failed: {e}")
|
| 473 |
-
# Fallback to local processing
|
| 474 |
-
pass
|
| 475 |
|
| 476 |
-
#
|
| 477 |
-
|
| 478 |
|
| 479 |
-
#
|
| 480 |
-
|
| 481 |
-
state_data = json.loads(original_img)
|
| 482 |
-
video_path = state_data.get("video_path")
|
| 483 |
-
except:
|
| 484 |
-
video_path = None
|
| 485 |
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
| 490 |
|
| 491 |
-
|
|
|
|
| 492 |
|
| 493 |
except Exception as e:
|
| 494 |
-
print(f"Error in reset_points: {e}")
|
| 495 |
return None, []
|
| 496 |
|
| 497 |
-
gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
|
| 498 |
-
|
| 499 |
def launch_viz(grid_size, vo_points, fps, original_image_state):
|
| 500 |
"""Launch visualization with user-specific temp directory"""
|
| 501 |
if original_image_state is None:
|
| 502 |
return None, None, None
|
| 503 |
|
| 504 |
try:
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
print(f"🔧 Original image state type: {type(original_image_state)}")
|
| 510 |
-
print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
|
| 511 |
-
|
| 512 |
-
# Validate and potentially fix the original_image_state format
|
| 513 |
-
state_to_send = original_image_state
|
| 514 |
-
|
| 515 |
-
# Check if this is a local processing state that needs to be converted
|
| 516 |
-
try:
|
| 517 |
-
if isinstance(original_image_state, str):
|
| 518 |
-
parsed_state = json.loads(original_image_state)
|
| 519 |
-
if "video_path" in parsed_state and "frame" in parsed_state:
|
| 520 |
-
# This is a local processing state, we need to handle differently
|
| 521 |
-
print("🔧 Detected local processing state, cannot use backend for tracking")
|
| 522 |
-
print("🔧 Backend requires proper video upload state from backend API")
|
| 523 |
-
# Fall through to local processing
|
| 524 |
-
raise ValueError("Local state cannot be processed by backend")
|
| 525 |
-
except json.JSONDecodeError:
|
| 526 |
-
print("🔧 Invalid JSON state, cannot send to backend")
|
| 527 |
-
raise ValueError("Invalid state format")
|
| 528 |
-
|
| 529 |
-
# Call the unified API with run_tracker function type
|
| 530 |
-
result = backend_client.predict(
|
| 531 |
-
"run_tracker", # function_type
|
| 532 |
-
None, # video file (not used for run_tracker)
|
| 533 |
-
state_to_send, # original_image_state
|
| 534 |
-
[], # selected_points (not used for run_tracker)
|
| 535 |
-
"positive_point", # point_type (not used for run_tracker)
|
| 536 |
-
0, # point_x (not used for run_tracker)
|
| 537 |
-
0, # point_y (not used for run_tracker)
|
| 538 |
-
grid_size, # grid_size
|
| 539 |
-
vo_points, # vo_points
|
| 540 |
-
fps, # fps
|
| 541 |
-
api_name="/unified_api"
|
| 542 |
-
)
|
| 543 |
-
|
| 544 |
-
print(f"✅ Backend API call successful!")
|
| 545 |
-
print(f"🔧 Result type: {type(result)}")
|
| 546 |
-
print(f"🔧 Result: {result}")
|
| 547 |
-
|
| 548 |
-
# Parse the result
|
| 549 |
-
if isinstance(result, dict) and result.get("success"):
|
| 550 |
-
viz_html = result.get("viz_html", "")
|
| 551 |
-
track_video_path = result.get("track_video_path", "")
|
| 552 |
-
track_video_content = result.get("track_video_content", None)
|
| 553 |
-
track_video_filename = result.get("track_video_filename", "tracked_video.mp4")
|
| 554 |
-
|
| 555 |
-
# Save HTML to _viz directory (like local version)
|
| 556 |
-
viz_dir = './_viz'
|
| 557 |
-
os.makedirs(viz_dir, exist_ok=True)
|
| 558 |
-
random_path = f'./_viz/_{time.time()}.html'
|
| 559 |
-
|
| 560 |
-
with open(random_path, 'w', encoding='utf-8') as f:
|
| 561 |
-
f.write(viz_html)
|
| 562 |
-
|
| 563 |
-
# Create iframe HTML
|
| 564 |
-
iframe_html = f"""
|
| 565 |
-
<div style='border: 3px solid #667eea; border-radius: 10px;
|
| 566 |
-
background: #f8f9ff; height: 650px; width: 100%;
|
| 567 |
-
box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
|
| 568 |
-
margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
|
| 569 |
-
<iframe id="viz_iframe" src="/gradio_api/file={random_path}"
|
| 570 |
-
width="100%" height="650" frameborder="0"
|
| 571 |
-
style="border: none; display: block; width: 100%; height: 650px;
|
| 572 |
-
margin: 0; padding: 0; border-radius: 7px;">
|
| 573 |
-
</iframe>
|
| 574 |
-
</div>
|
| 575 |
-
"""
|
| 576 |
-
|
| 577 |
-
print(f"💾 HTML saved to: {random_path}")
|
| 578 |
-
print(f"📊 HTML content preview: {viz_html[:200]}...")
|
| 579 |
-
|
| 580 |
-
# If we have base64 encoded video content, save it as a temporary file
|
| 581 |
-
local_video_path = None
|
| 582 |
-
if track_video_content:
|
| 583 |
-
try:
|
| 584 |
-
# Create a temporary file for the video
|
| 585 |
-
temp_video_dir = "temp_frontend_videos"
|
| 586 |
-
os.makedirs(temp_video_dir, exist_ok=True)
|
| 587 |
-
|
| 588 |
-
# Generate unique filename to avoid conflicts
|
| 589 |
-
timestamp = str(int(time.time() * 1000))
|
| 590 |
-
local_video_path = os.path.join(temp_video_dir, f"{timestamp}_{track_video_filename}")
|
| 591 |
-
|
| 592 |
-
# Decode base64 and save as video file
|
| 593 |
-
video_bytes = base64.b64decode(track_video_content)
|
| 594 |
-
with open(local_video_path, 'wb') as f:
|
| 595 |
-
f.write(video_bytes)
|
| 596 |
-
|
| 597 |
-
print(f"✅ Successfully saved tracking video to: {local_video_path}")
|
| 598 |
-
print(f"�� Video file size: {len(video_bytes)} bytes")
|
| 599 |
-
|
| 600 |
-
except Exception as e:
|
| 601 |
-
print(f"❌ Failed to process tracking video: {e}")
|
| 602 |
-
local_video_path = None
|
| 603 |
-
else:
|
| 604 |
-
print("⚠️ No tracking video content received from backend")
|
| 605 |
-
|
| 606 |
-
# Return the iframe HTML, the video path, and the HTML file path (for download)
|
| 607 |
-
return iframe_html, local_video_path, random_path
|
| 608 |
-
else:
|
| 609 |
-
error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
|
| 610 |
-
print(f"❌ Backend processing failed: {error_msg}")
|
| 611 |
-
# Fall through to error message
|
| 612 |
-
pass
|
| 613 |
-
except Exception as e:
|
| 614 |
-
print(f"❌ Backend API call failed: {e}")
|
| 615 |
-
print(f"🔧 Error type: {type(e)}")
|
| 616 |
-
print(f"🔧 Error details: {str(e)}")
|
| 617 |
-
|
| 618 |
-
# Check for specific gradio_client errors
|
| 619 |
-
if "AppError" in str(type(e)):
|
| 620 |
-
print("🔧 Backend Space has internal errors (AppError)")
|
| 621 |
-
print("🔧 The backend Space code has bugs or configuration issues")
|
| 622 |
-
print("🔧 Contact the Space owner to fix the backend implementation")
|
| 623 |
-
elif "Could not fetch config" in str(e):
|
| 624 |
-
print("🔧 Config fetch failed - possible Gradio version mismatch")
|
| 625 |
-
print("🔧 Frontend and backend may be using incompatible Gradio versions")
|
| 626 |
-
elif "timeout" in str(e).lower():
|
| 627 |
-
print("🔧 Backend request timed out - Space might be overloaded")
|
| 628 |
-
elif "Expecting value" in str(e):
|
| 629 |
-
print("🔧 JSON parsing error in backend - state format mismatch")
|
| 630 |
-
print("🔧 This happens when using local processing state with backend API")
|
| 631 |
-
print("🔧 Please upload video again to use backend processing")
|
| 632 |
-
else:
|
| 633 |
-
print(f"🔧 Unexpected error type: {type(e).__name__}")
|
| 634 |
-
|
| 635 |
-
print("🔄 Showing error message instead of visualization...")
|
| 636 |
-
# Fall through to error message
|
| 637 |
-
pass
|
| 638 |
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
try:
|
| 642 |
-
if isinstance(original_image_state, str):
|
| 643 |
-
parsed_state = json.loads(original_image_state)
|
| 644 |
-
if "video_path" in parsed_state:
|
| 645 |
-
video_name = os.path.basename(parsed_state["video_path"])
|
| 646 |
-
state_info = f"Video: {video_name}"
|
| 647 |
-
except:
|
| 648 |
-
state_info = "State format unknown"
|
| 649 |
|
| 650 |
-
#
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
<p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
|
| 666 |
-
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
|
| 667 |
-
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
|
| 668 |
-
<p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
|
| 669 |
-
<p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
|
| 670 |
-
<p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
|
| 671 |
-
</div>
|
| 672 |
-
<div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
|
| 673 |
-
<p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
|
| 674 |
-
<p style='color: #1976d2; font-size: 13px; margin: 0;'>
|
| 675 |
-
Try uploading your video again - this should properly initialize the backend state for tracking.
|
| 676 |
-
</p>
|
| 677 |
-
</div>
|
| 678 |
-
</div>
|
| 679 |
-
"""
|
| 680 |
-
return error_message, None, None
|
| 681 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
     except Exception as e:
-        print(f"Error in launch_viz: {e}")
-        return
 
 def clear_all():
     """Clear all buffers and temporary files"""

@@ -699,10 +617,6 @@ def clear_all_with_download():
             None,  # tracking_video_download
             None)  # HTML download component
 
-def update_tracker_model(model_name):
-    """Update tracker model (placeholder function)"""
-    return
-
 def get_video_settings(video_name):
     """Get video-specific settings based on video name"""
     video_settings = {

@@ -726,68 +640,14 @@ def get_video_settings(video_name):
         "cinema_1": (45, 756, 3),
     }
 
-    return video_settings.get(video_name, (50, 756, 3))
-
-def test_backend_connection():
-    """Test if backend is actually working"""
-    global BACKEND_AVAILABLE
-    if not backend_client:
-        return False
-
-    try:
-        print("Testing backend connection with a simple call...")
-        # Check if we have fns available
-        if hasattr(backend_client, 'fns') and backend_client.fns:
-            print("✅ Backend API functions are available")
-            print(f"🔧 Available function indices: {list(backend_client.fns.keys())}")
-            return True
-        else:
-            print("❌ Backend API functions not found")
-            return False
-    except Exception as e:
-        print(f"❌ Backend connection test failed: {e}")
-        return False
-
-def test_backend_api():
-    """Test specific backend API functions"""
-    if not BACKEND_AVAILABLE or not backend_client:
-        print("❌ Backend not available for testing")
-        return False
-
-    try:
-        print("🧪 Testing backend API functions...")
-
-        # Test if fns exist and show available indices
-        if hasattr(backend_client, 'fns') and backend_client.fns:
-            print(f"✅ Backend has {len(backend_client.fns)} functions available")
-            for idx in backend_client.fns.keys():
-                print(f"✅ Function {idx} is available")
-        else:
-            print("❌ No functions found in backend API")
-            return False
-
-        return True
-
-    except Exception as e:
-        print(f"❌ Backend API test failed: {e}")
-        return False
-
-# Initialize the backend connection
-print("🚀 Initializing frontend application...")
-result = initialize_backend()
-
-# Test backend connection if available
-if result and BACKEND_AVAILABLE:
-    print("✅ Backend connection successful!")
-else:
-    print("❌ Backend connection failed!")
 
 # Create the Gradio interface
 print("🎨 Creating Gradio interface...")
 
 with gr.Blocks(
     theme=gr.themes.Soft(),
-    title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)
     css="""
     .gradio-container {
         max-width: 1200px !important;

@@ -997,7 +857,11 @@ with gr.Blocks(
     """
 ) as demo:
 
     gr.Markdown("""
     Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model.
 
     **⚡ Quick Start:** Upload video → Click "Start Tracking Now!"

@@ -1010,9 +874,8 @@ with gr.Blocks(
 
     """)
 
-    # Status indicator
-
-    gr.Markdown(f"**Status:** {status_info} | Backend: {BACKEND_SPACE_URL}")
 
     # Main content area - video upload left, 3D visualization right
     with gr.Row():

@@ -1151,7 +1014,7 @@ with gr.Blocks(
     with gr.Row():
         reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")
 
-    # Downloads section - hidden but still functional for
     with gr.Row(visible=False):
         with gr.Column(scale=1):
             tracking_video_download = gr.File(

@@ -1266,8 +1129,8 @@
 
 # Launch the interface
 if __name__ == "__main__":
-    print("🌟 Launching SpatialTracker V2
-    print(
 
     demo.launch(
         server_name="0.0.0.0",
 import numpy as np
 import cv2
 import base64
 import time
+import tempfile
+import shutil
+import glob
+import threading
+import subprocess
+import struct
+import zlib
 from pathlib import Path
+from einops import rearrange
+from typing import List, Tuple, Union
+import torch
+import logging
+from concurrent.futures import ThreadPoolExecutor
+import atexit
+import uuid
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
+# Import custom modules with error handling
+try:
+    from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
+    from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
+except ImportError as e:
+    logger.error(f"Failed to import custom modules: {e}")
+    raise
 
+try:
+    import spaces
+except ImportError:
+    # Fallback for local development
+    def spaces(func):
+        return func
 
+# Constants
+MAX_FRAMES = 80
+COLORS = [(0, 0, 255), (0, 255, 255)]  # BGR: Red for negative, Yellow for positive
+MARKERS = [1, 5]  # Cross for negative, Star for positive
+MARKER_SIZE = 8
+
+# Thread pool for delayed deletion
+thread_pool_executor = ThreadPoolExecutor(max_workers=2)
+
+def delete_later(path: Union[str, os.PathLike], delay: int = 600):
+    """Delete file or directory after specified delay (default 10 minutes)"""
+    def _delete():
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+            elif os.path.isdir(path):
+                shutil.rmtree(path)
+        except Exception as e:
+            logger.warning(f"Failed to delete {path}: {e}")
+
+    def _wait_and_delete():
+        time.sleep(delay)
+        _delete()
+
+    thread_pool_executor.submit(_wait_and_delete)
+    atexit.register(_delete)
+
+def create_user_temp_dir():
+    """Create a unique temporary directory for each user session"""
+    session_id = str(uuid.uuid4())[:8]  # Short unique ID
+    temp_dir = os.path.join("temp_local", f"session_{session_id}")
+    os.makedirs(temp_dir, exist_ok=True)
 
+    # Schedule deletion after 10 minutes
+    delete_later(temp_dir, delay=600)
+
+    return temp_dir
+
+from huggingface_hub import hf_hub_download
+# init the model
+os.environ["VGGT_DIR"] = hf_hub_download("Yuxihenry/SpatialTrackerCkpts", "spatrack_front.pth")  #, force_download=True)
+
+if os.environ.get("VGGT_DIR", None) is not None:
+    from models.vggt.vggt.models.vggt_moe import VGGT_MoE
+    from models.vggt.vggt.utils.load_fn import preprocess_image
+    vggt_model = VGGT_MoE()
+    vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")), strict=False)
+    vggt_model.eval()
+    vggt_model = vggt_model.to("cuda")
+
+# Global model initialization
+print("🚀 Initializing local models...")
+tracker_model, _ = get_tracker_predictor(".", vo_points=756)
+predictor = get_sam_predictor()
+print("✅ Models loaded successfully!")
+
+gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
+
+@spaces.GPU
+def gpu_run_inference(predictor_arg, image, points, boxes):
+    """GPU-accelerated SAM inference"""
+    if predictor_arg is None:
+        print("Initializing SAM predictor inside GPU function...")
+        predictor_arg = get_sam_predictor(predictor=predictor)
 
+    # Ensure predictor is on GPU
     try:
+        if hasattr(predictor_arg, 'model'):
+            predictor_arg.model = predictor_arg.model.cuda()
+        elif hasattr(predictor_arg, 'sam'):
+            predictor_arg.sam = predictor_arg.sam.cuda()
+        elif hasattr(predictor_arg, 'to'):
+            predictor_arg = predictor_arg.to('cuda')
 
+        if hasattr(image, 'cuda'):
+            image = image.cuda()
 
     except Exception as e:
+        print(f"Warning: Could not move predictor to GPU: {e}")
+
+    return run_inference(predictor_arg, image, points, boxes)
 
+@spaces.GPU
+def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name, grid_size, vo_points, fps):
+    """GPU-accelerated tracking"""
+    import torchvision.transforms as T
+    import decord
+
+    if tracker_model_arg is None or tracker_viser_arg is None:
+        print("Initializing tracker models inside GPU function...")
+        out_dir = os.path.join(temp_dir, "results")
+        os.makedirs(out_dir, exist_ok=True)
+        tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points, tracker_model=tracker_model)
+
+    # Setup paths
+    video_path = os.path.join(temp_dir, f"{video_name}.mp4")
+    mask_path = os.path.join(temp_dir, f"{video_name}.png")
+    out_dir = os.path.join(temp_dir, "results")
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Load video using decord
+    video_reader = decord.VideoReader(video_path)
+    video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)
+
+    # Resize to ensure minimum side is 336
+    h, w = video_tensor.shape[2:]
+    scale = max(224 / h, 224 / w)
+    if scale < 1:
+        new_h, new_w = int(h * scale), int(w * scale)
+        video_tensor = T.Resize((new_h, new_w))(video_tensor)
+    video_tensor = video_tensor[::fps].float()[:MAX_FRAMES]
+
+    # Move to GPU
+    video_tensor = video_tensor.cuda()
+    print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")
+
+    depth_tensor = None
+    intrs = None
+    extrs = None
+    data_npz_load = {}
+
+    # run vggt
+    if os.environ.get("VGGT_DIR", None) is not None:
+        # process the image tensor
+        video_tensor = preprocess_image(video_tensor)[None]
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                # Predict attributes including cameras, depth maps, and point maps.
+                predictions = vggt_model(video_tensor.cuda()/255)
+                extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
+                depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
 
+        depth_tensor = depth_map.squeeze().cpu().numpy()
+        extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
+        extrs = extrinsic.squeeze().cpu().numpy()
+        intrs = intrinsic.squeeze().cpu().numpy()
+        video_tensor = video_tensor.squeeze()
+        #NOTE: 20% of the depth is not reliable
+        # threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
+        unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
+
+    # Load and process mask
+    if os.path.exists(mask_path):
+        mask = cv2.imread(mask_path)
+        mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
+        mask = mask.sum(axis=-1)>0
+    else:
+        mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
+        grid_size = 10
+
+    # Get frame dimensions and create grid points
+    frame_H, frame_W = video_tensor.shape[2:]
+    grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")
+
+    # Sample mask values at grid points and filter
+    if os.path.exists(mask_path):
+        grid_pts_int = grid_pts[0].long()
+        mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
+        grid_pts = grid_pts[:, mask_values]
+
+    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
+    print(f"Query points shape: {query_xyt.shape}")
+
+    # Run model inference
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        (
+            c2w_traj, intrs, point_map, conf_depth,
+            track3d_pred, track2d_pred, vis_pred, conf_pred, video
+        ) = tracker_model_arg.forward(video_tensor, depth=depth_tensor,
+                                      intrs=intrs, extrs=extrs,
+                                      queries=query_xyt,
+                                      fps=1, full_point=False, iters_track=4,
+                                      query_no_BA=True, fixed_cam=False, stage=1,
+                                      support_frame=len(video_tensor)-1, replace_ratio=0.2)
 
+    # Resize results to avoid large I/O
+    max_size = 224
+    h, w = video.shape[2:]
+    scale = min(max_size / h, max_size / w)
+    if scale < 1:
+        new_h, new_w = int(h * scale), int(w * scale)
+        video = T.Resize((new_h, new_w))(video)
+        video_tensor = T.Resize((new_h, new_w))(video_tensor)
+        point_map = T.Resize((new_h, new_w))(point_map)
+        track2d_pred[...,:2] = track2d_pred[...,:2] * scale
+        intrs[:,:2,:] = intrs[:,:2,:] * scale
+        conf_depth = T.Resize((new_h, new_w))(conf_depth)
 
+    # Visualize tracks
+    tracker_viser_arg.visualize(video=video[None],
+                                tracks=track2d_pred[None][...,:2],
+                                visibility=vis_pred[None], filename="test")
+
+    # Save in tapip3d format
+    data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
+    data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
+    data_npz_load["intrinsics"] = intrs.cpu().numpy()
+    data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
+    data_npz_load["video"] = (video_tensor).cpu().numpy()/255
+    data_npz_load["visibs"] = vis_pred.cpu().numpy()
+    data_npz_load["confs"] = conf_pred.cpu().numpy()
+    data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
+    np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
 
+    return None
+
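The hunk above ends by dumping every tracking output into `results/result.npz`. As a quick orientation for anyone consuming that file outside the app, here is a minimal sketch of reading it back; the key names mirror the `data_npz_load` assignments above, while the session path and the shape comments are assumptions, not guarantees.

```python
# Sketch: load the tapip3d-style result.npz written by gpu_run_tracker.
# The session path below is hypothetical; shapes are expectations from the code above.
import numpy as np

data = np.load("temp_local/session_demo/results/result.npz")

coords = data["coords"]          # world-frame 3D tracks, roughly (T, N, 3)
extrinsics = data["extrinsics"]  # world-to-camera matrices, roughly (T, 4, 4)
intrinsics = data["intrinsics"]  # camera intrinsics, roughly (T, 3, 3)
depths = data["depths"]          # per-frame depth maps, roughly (T, H, W)
video = data["video"]            # RGB frames scaled to [0, 1], roughly (T, 3, H, W)
visibs = data["visibs"]          # per-track visibility flags
confs = data["confs"]            # per-track confidence scores

for key in data.files:
    print(key, data[key].shape)
```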
+def compress_and_write(filename, header, blob):
+    header_bytes = json.dumps(header).encode("utf-8")
+    header_len = struct.pack("<I", len(header_bytes))
+    with open(filename, "wb") as f:
+        f.write(header_len)
+        f.write(header_bytes)
+        f.write(blob)
 
+def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
+    fixed_size = (width, height)
 
+    data = np.load(npz_file)
+    extrinsics = data["extrinsics"]
+    intrinsics = data["intrinsics"]
+    trajs = data["coords"]
+    T, C, H, W = data["video"].shape
+
+    fx = intrinsics[0, 0, 0]
+    fy = intrinsics[0, 1, 1]
+    fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
+    fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
+    original_aspect_ratio = (W / fx) / (H / fy)
+
+    rgb_video = (rearrange(data["video"], "T C H W -> T H W C") * 255).astype(np.uint8)
+    rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
+                          for frame in rgb_video])
+
+    depth_video = data["depths"].astype(np.float32)
+    if "confs_depth" in data.keys():
+        confs = (data["confs_depth"].astype(np.float32) > 0.5).astype(np.float32)
+        depth_video = depth_video * confs
+    depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
+                            for frame in depth_video])
+
+    scale_x = fixed_size[0] / W
+    scale_y = fixed_size[1] / H
+    intrinsics = intrinsics.copy()
+    intrinsics[:, 0, :] *= scale_x
+    intrinsics[:, 1, :] *= scale_y
+
+    min_depth = float(depth_video.min()) * 0.8
+    max_depth = float(depth_video.max()) * 1.5
+
+    depth_normalized = (depth_video - min_depth) / (max_depth - min_depth)
+    depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)
+
+    depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
+    depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
+    depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)
+
+    first_frame_inv = np.linalg.inv(extrinsics[0])
+    normalized_extrinsics = np.array([first_frame_inv @ ext for ext in extrinsics])
+
+    normalized_trajs = np.zeros_like(trajs)
+    for t in range(T):
+        homogeneous_trajs = np.concatenate([trajs[t], np.ones((trajs.shape[1], 1))], axis=1)
+        transformed_trajs = (first_frame_inv @ homogeneous_trajs.T).T
+        normalized_trajs[t] = transformed_trajs[:, :3]
+
+    arrays = {
+        "rgb_video": rgb_video,
+        "depths_rgb": depths_rgb,
+        "intrinsics": intrinsics,
+        "extrinsics": normalized_extrinsics,
+        "inv_extrinsics": np.linalg.inv(normalized_extrinsics),
+        "trajectories": normalized_trajs.astype(np.float32),
+        "cameraZ": 0.0
+    }
+
+    header = {}
+    blob_parts = []
+    offset = 0
+    for key, arr in arrays.items():
+        arr = np.ascontiguousarray(arr)
+        arr_bytes = arr.tobytes()
+        header[key] = {
+            "dtype": str(arr.dtype),
+            "shape": arr.shape,
+            "offset": offset,
+            "length": len(arr_bytes)
+        }
+        blob_parts.append(arr_bytes)
+        offset += len(arr_bytes)
+
+    raw_blob = b"".join(blob_parts)
+    compressed_blob = zlib.compress(raw_blob, level=9)
+
+    header["meta"] = {
+        "depthRange": [min_depth, max_depth],
+        "totalFrames": int(T),
+        "resolution": fixed_size,
+        "baseFrameRate": fps,
+        "numTrajectoryPoints": normalized_trajs.shape[1],
+        "fov": float(fov_y),
+        "fov_x": float(fov_x),
+        "original_aspect_ratio": float(original_aspect_ratio),
+        "fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
+    }
+
+    compress_and_write('./_viz/data.bin', header, compressed_blob)
+    with open('./_viz/data.bin', "rb") as f:
+        encoded_blob = base64.b64encode(f.read()).decode("ascii")
+    os.unlink('./_viz/data.bin')
+
+    random_path = f'./_viz/_{time.time()}.html'
+    with open('./_viz/viz_template.html') as f:
+        html_template = f.read()
+    html_out = html_template.replace(
+        "<head>",
+        f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
+    )
+    with open(random_path, 'w') as f:
+        f.write(html_out)
+
+    return random_path
 
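For reference, the packed file produced by `compress_and_write` is a 4-byte little-endian header length, a JSON header describing each array's dtype/shape/offset/length, and a zlib-compressed blob of raw array bytes; depth is stored as a 16-bit value split across the R (low byte) and G (high byte) channels of `depths_rgb` and normalized by `meta.depthRange`. Below is a minimal reader-side sketch; the standalone `data.bin` path is an assumption, since in the app the same bytes are base64-embedded into the generated HTML instead of being kept on disk.

```python
# Sketch: decode the binary layout written by compress_and_write above.
import json, struct, zlib
import numpy as np

with open("data.bin", "rb") as f:  # hypothetical on-disk copy of the packed blob
    (header_len,) = struct.unpack("<I", f.read(4))       # 4-byte little-endian length
    header = json.loads(f.read(header_len).decode("utf-8"))
    blob = zlib.decompress(f.read())                      # remaining bytes: zlib blob

arrays = {}
for key, meta in header.items():
    if key == "meta":
        continue
    raw = blob[meta["offset"]: meta["offset"] + meta["length"]]
    arrays[key] = np.frombuffer(raw, dtype=meta["dtype"]).reshape(meta["shape"])

# Recover metric depth from the 16-bit value packed into the R (low) and G (high) channels.
min_d, max_d = header["meta"]["depthRange"]
rgb = arrays["depths_rgb"].astype(np.uint16)
depth16 = rgb[..., 0] | (rgb[..., 1] << 8)
depth = depth16.astype(np.float32) / ((1 << 16) - 1) * (max_d - min_d) + min_d
print(depth.shape, depth.min(), depth.max())
```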
 def numpy_to_base64(arr):
     """Convert numpy array to base64 string"""
...
     """Convert base64 string back to numpy array"""
     return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
 
 def get_video_name(video_path):
     """Extract video name without extension"""
     return os.path.splitext(os.path.basename(video_path))[0]
...
     cap.release()
 
     if ret:
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         return frame_rgb
     else:
...
                 gr.update(value=756),
                 gr.update(value=3))
 
+    # Create user-specific temporary directory
+    user_temp_dir = create_user_temp_dir()
+
+    # Get original video name and copy to temp directory
+    if isinstance(video, str):
         video_name = get_video_name(video)
+        video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
+        shutil.copy(video, video_path)
+    else:
+        video_name = get_video_name(video.name)
+        video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
+        with open(video_path, 'wb') as f:
+            f.write(video.read())
+
+    print(f"📁 Video saved to: {video_path}")
+
+    # Extract first frame
+    frame = extract_first_frame(video_path)
+    if frame is None:
         return (None, None, [],
                 gr.update(value=50),
                 gr.update(value=756),
                 gr.update(value=3))
+
+    # Resize frame to have minimum side length of 336
+    h, w = frame.shape[:2]
+    scale = 336 / min(h, w)
+    new_h, new_w = int(h * scale)//2*2, int(w * scale)//2*2
+    frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+
+    # Store frame data with temp directory info
+    frame_data = {
+        'data': numpy_to_base64(frame),
+        'shape': frame.shape,
+        'dtype': str(frame.dtype),
+        'temp_dir': user_temp_dir,
+        'video_name': video_name,
+        'video_path': video_path
+    }
+
+    # Get video-specific settings
+    print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
+    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
+    print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
+
+    return (json.dumps(frame_data), frame, [],
+            gr.update(value=grid_size_val),
+            gr.update(value=vo_points_val),
+            gr.update(value=fps_val))
+
+def save_masks(o_masks, video_name, temp_dir):
+    """Save binary masks to files in user-specific temp directory"""
+    o_files = []
+    for mask, _ in o_masks:
+        o_mask = np.uint8(mask.squeeze() * 255)
+        o_file = os.path.join(temp_dir, f"{video_name}.png")
+        cv2.imwrite(o_file, o_mask)
+        o_files.append(o_file)
+    return o_files
 
 def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
     """Handle point selection for SAM"""
...
         return None, []
 
     try:
+        # Convert stored image data back to numpy array
+        frame_data = json.loads(original_img)
+        original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
+        temp_dir = frame_data.get('temp_dir', 'temp_local')
+        video_name = frame_data.get('video_name', 'video')
 
+        # Create a display image for visualization
+        display_img = original_img_array.copy()
+        new_sel_pix = sel_pix.copy() if sel_pix else []
+        new_sel_pix.append((evt.index, 1 if point_type == 'positive_point' else 0))
 
+        print(f"🎯 Running SAM inference for point: {evt.index}, type: {point_type}")
+        # Run SAM inference
+        o_masks = gpu_run_inference(None, original_img_array, new_sel_pix, [])
 
+        # Draw points on display image
+        for point, label in new_sel_pix:
+            cv2.drawMarker(display_img, point, COLORS[label], markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)
 
+        # Draw mask overlay on display image
+        if o_masks:
+            mask = o_masks[0][0]
+            overlay = display_img.copy()
+            overlay[mask.squeeze()!=0] = [20, 60, 200]  # Light blue
+            display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
+
+            # Save mask for tracking
+            save_masks(o_masks, video_name, temp_dir)
+            print(f"✅ Mask saved for video: {video_name}")
+
+        return display_img, new_sel_pix
 
     except Exception as e:
+        print(f"❌ Error in select_point: {e}")
         return None, []
 
 def reset_points(original_img: str, sel_pix):
+    """Reset all points and clear the mask"""
     if original_img is None:
         return None, []
 
     try:
+        # Convert stored image data back to numpy array
+        frame_data = json.loads(original_img)
+        original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
+        temp_dir = frame_data.get('temp_dir', 'temp_local')
 
+        # Create a display image (just the original image)
+        display_img = original_img_array.copy()
 
+        # Clear all points
+        new_sel_pix = []
 
+        # Clear any existing masks
+        for mask_file in glob.glob(os.path.join(temp_dir, "*.png")):
+            try:
+                os.remove(mask_file)
+            except Exception as e:
+                logger.warning(f"Failed to remove mask file {mask_file}: {e}")
 
+        print("🔄 Points and masks reset")
+        return display_img, new_sel_pix
 
     except Exception as e:
+        print(f"❌ Error in reset_points: {e}")
         return None, []
 
 def launch_viz(grid_size, vo_points, fps, original_image_state):
     """Launch visualization with user-specific temp directory"""
     if original_image_state is None:
         return None, None, None
 
     try:
+        # Get user's temp directory from stored frame data
+        frame_data = json.loads(original_image_state)
+        temp_dir = frame_data.get('temp_dir', 'temp_local')
+        video_name = frame_data.get('video_name', 'video')
 
+        print(f"🚀 Starting tracking for video: {video_name}")
+        print(f"📊 Parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
 
+        # Check for mask files
+        mask_files = glob.glob(os.path.join(temp_dir, "*.png"))
+        video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
+
+        if not video_files:
+            print("❌ No video file found")
+            return "❌ Error: No video file found", None, None
+
+        video_path = video_files[0]
+        mask_path = mask_files[0] if mask_files else None
+
+        # Run tracker
+        print("🎯 Running tracker...")
+        out_dir = os.path.join(temp_dir, "results")
+        os.makedirs(out_dir, exist_ok=True)
 
+        gpu_run_tracker(None, None, temp_dir, video_name, grid_size, vo_points, fps)
+
+        # Process results
+        npz_path = os.path.join(out_dir, "result.npz")
+        track2d_video = os.path.join(out_dir, "test_pred_track.mp4")
+
+        if os.path.exists(npz_path):
+            print("📊 Processing 3D visualization...")
+            html_path = process_point_cloud_data(npz_path)
+
+            # Schedule deletion of generated files
+            delete_later(html_path, delay=600)
+            if os.path.exists(track2d_video):
+                delete_later(track2d_video, delay=600)
+            delete_later(npz_path, delay=600)
+
+            # Create iframe HTML
+            iframe_html = f"""
+            <div style='border: 3px solid #667eea; border-radius: 10px;
+                        background: #f8f9ff; height: 650px; width: 100%;
+                        box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
+                        margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
+                <iframe id="viz_iframe" src="/gradio_api/file={html_path}"
+                        width="100%" height="650" frameborder="0"
+                        style="border: none; display: block; width: 100%; height: 650px;
+                               margin: 0; padding: 0; border-radius: 7px;">
+                </iframe>
+            </div>
+            """
+
+            print("✅ Tracking completed successfully!")
+            return iframe_html, track2d_video if os.path.exists(track2d_video) else None, html_path
+        else:
+            print("❌ Tracking failed - no results generated")
+            return "❌ Error: Tracking failed to generate results", None, None
+
     except Exception as e:
+        print(f"❌ Error in launch_viz: {e}")
+        return f"❌ Error: {str(e)}", None, None
 
 def clear_all():
     """Clear all buffers and temporary files"""
...
             None,  # tracking_video_download
             None)  # HTML download component
 
 def get_video_settings(video_name):
     """Get video-specific settings based on video name"""
     video_settings = {
...
         "cinema_1": (45, 756, 3),
     }
 
+    return video_settings.get(video_name, (50, 756, 3))
 
 # Create the Gradio interface
 print("🎨 Creating Gradio interface...")
 
 with gr.Blocks(
     theme=gr.themes.Soft(),
+    title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)",
     css="""
     .gradio-container {
         max-width: 1200px !important;
...
     """
 ) as demo:
 
+    # Add prominent main title
+
     gr.Markdown("""
+    # ✨ SpatialTrackerV2
+
     Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model.
 
     **⚡ Quick Start:** Upload video → Click "Start Tracking Now!"
...
 
     """)
 
+    # Status indicator
+    gr.Markdown("**Status:** 🟢 Local Processing Mode")
 
     # Main content area - video upload left, 3D visualization right
     with gr.Row():
...
     with gr.Row():
         reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")
 
+    # Downloads section - hidden but still functional for local processing
     with gr.Row(visible=False):
         with gr.Column(scale=1):
             tracking_video_download = gr.File(
...
 
 # Launch the interface
 if __name__ == "__main__":
+    print("🌟 Launching SpatialTracker V2 Local Version...")
+    print("🔗 Running in Local Processing Mode")
 
     demo.launch(
         server_name="0.0.0.0",
app_3rd/README.md ADDED
@@ -0,0 +1,12 @@
+# 🌟 SpatialTrackerV2 Integrated with SAM 🌟
+SAM receives a point prompt and generates a mask for the target object, facilitating easy interaction to obtain the object's 3D trajectories with SpaTrack2.
+
+## Installation
+```
+
+python -m pip install git+https://github.com/facebookresearch/segment-anything.git
+cd app_3rd/sam_utils
+mkdir checkpoints
+cd checkpoints
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
+```
app_3rd/sam_utils/hf_sam_predictor.py ADDED
@@ -0,0 +1,129 @@
+import gc
+import numpy as np
+import torch
+from typing import Optional, Tuple, List, Union
+import warnings
+import cv2
+try:
+    from transformers import SamModel, SamProcessor
+    from huggingface_hub import hf_hub_download
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+    warnings.warn("transformers or huggingface_hub not available. HF SAM models will not work.")
+
+# Hugging Face model mapping
+HF_MODELS = {
+    'vit_b': 'facebook/sam-vit-base',
+    'vit_l': 'facebook/sam-vit-large',
+    'vit_h': 'facebook/sam-vit-huge'
+}
+
+class HFSamPredictor:
+    """
+    Hugging Face version of SamPredictor that wraps the transformers SAM models.
+    This class provides the same interface as the original SamPredictor for seamless integration.
+    """
+
+    def __init__(self, model: SamModel, processor: SamProcessor, device: Optional[str] = None):
+        """
+        Initialize the HF SAM predictor.
+
+        Args:
+            model: The SAM model from transformers
+            processor: The SAM processor from transformers
+            device: Device to run the model on ('cuda', 'cpu', etc.)
+        """
+        self.model = model
+        self.processor = processor
+        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+        self.model.to(self.device)
+        self.model.eval()
+
+        # Store the current image and its features
+        self.original_size = None
+        self.input_size = None
+        self.features = None
+        self.image = None
+
+    @classmethod
+    def from_pretrained(cls, model_name: str, device: Optional[str] = None) -> 'HFSamPredictor':
+        """
+        Load a SAM model from Hugging Face Hub.
+
+        Args:
+            model_name: Model name from HF_MODELS or direct HF model path
+            device: Device to load the model on
+
+        Returns:
+            HFSamPredictor instance
+        """
+        if not HF_AVAILABLE:
+            raise ImportError("transformers and huggingface_hub are required for HF SAM models")
+
+        # Map model type to HF model name if needed
+        if model_name in HF_MODELS:
+            model_name = HF_MODELS[model_name]
+
+        print(f"Loading SAM model from Hugging Face: {model_name}")
+
+        # Load model and processor
+        model = SamModel.from_pretrained(model_name)
+        processor = SamProcessor.from_pretrained(model_name)
+        return cls(model, processor, device)
+
+    def preprocess(self, image: np.ndarray,
+                   input_points: List[List[float]], input_labels: List[int]) -> None:
+        """
+        Set the image for prediction. This preprocesses the image and extracts features.
+
+        Args:
+            image: Input image as numpy array (H, W, C) in RGB format
+        """
+        if image.dtype != np.uint8:
+            image = (image * 255).astype(np.uint8)
+
+        self.image = image
+        self.original_size = image.shape[:2]
+
+        # Use dummy point to ensure processor returns original_sizes & reshaped_input_sizes
+        inputs = self.processor(
+            images=image,
+            input_points=input_points,
+            input_labels=input_labels,
+            return_tensors="pt"
+        )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+        self.input_size = inputs['pixel_values'].shape[-2:]
+        self.features = inputs
+        return inputs
+
+
+def get_hf_sam_predictor(model_type: str = 'vit_h', device: Optional[str] = None,
+                         image: Optional[np.ndarray] = None) -> HFSamPredictor:
+    """
+    Get a Hugging Face SAM predictor with the same interface as the original get_sam_predictor.
+
+    Args:
+        model_type: Model type ('vit_b', 'vit_l', 'vit_h')
+        device: Device to run the model on
+        image: Optional image to set immediately
+
+    Returns:
+        HFSamPredictor instance
+    """
+    if not HF_AVAILABLE:
+        raise ImportError("transformers and huggingface_hub are required for HF SAM models")
+
+    if device is None:
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    # Load the predictor
+    predictor = HFSamPredictor.from_pretrained(model_type, device)
+
+    # Set image if provided
+    if image is not None:
+        predictor.set_image(image)
+
+    return predictor
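A hypothetical usage sketch for the wrapper above, mirroring how `_run_hf_inference` in `app_3rd/sam_utils/inference.py` drives it; the image and the single positive click are placeholders, and the nested point/label format follows the calls made in that module.

```python
# Sketch: load the HF port of SAM and run one forward pass for a single click.
import numpy as np
import torch
from app_3rd.sam_utils.hf_sam_predictor import HFSamPredictor

predictor = HFSamPredictor.from_pretrained("vit_b")  # resolves to facebook/sam-vit-base

image = np.zeros((480, 640, 3), dtype=np.uint8)      # placeholder RGB frame
inputs = predictor.preprocess(image, [[[320, 240]]], [[1]])  # one positive point

with torch.no_grad():
    outputs = predictor.model(**inputs)

masks = predictor.processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu(),
)
print(masks[0].shape)  # masks resized back to the original image resolution
```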
app_3rd/sam_utils/inference.py ADDED
@@ -0,0 +1,123 @@
+import gc
+
+import numpy as np
+import torch
+from segment_anything import SamPredictor, sam_model_registry
+
+# Try to import HF SAM support
+try:
+    from app_3rd.sam_utils.hf_sam_predictor import get_hf_sam_predictor, HFSamPredictor
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+
+models = {
+    'vit_b': 'app_3rd/sam_utils/checkpoints/sam_vit_b_01ec64.pth',
+    'vit_l': 'app_3rd/sam_utils/checkpoints/sam_vit_l_0b3195.pth',
+    'vit_h': 'app_3rd/sam_utils/checkpoints/sam_vit_h_4b8939.pth'
+}
+
+
+def get_sam_predictor(model_type='vit_b', device=None, image=None, use_hf=True, predictor=None):
+    """
+    Get SAM predictor with option to use HuggingFace version
+
+    Args:
+        model_type: Model type ('vit_b', 'vit_l', 'vit_h')
+        device: Device to run on
+        image: Optional image to set immediately
+        use_hf: Whether to use HuggingFace SAM instead of original SAM
+    """
+    if predictor is not None:
+        return predictor
+    if use_hf:
+        if not HF_AVAILABLE:
+            raise ImportError("HuggingFace SAM not available. Install transformers and huggingface_hub.")
+        return get_hf_sam_predictor(model_type, device, image)
+
+    # Original SAM logic
+    if device is None and torch.cuda.is_available():
+        device = 'cuda'
+    elif device is None:
+        device = 'cpu'
+    # sam model
+    sam = sam_model_registry[model_type](checkpoint=models[model_type])
+    sam = sam.to(device)
+
+    predictor = SamPredictor(sam)
+    if image is not None:
+        predictor.set_image(image)
+    return predictor
+
+
+def run_inference(predictor, input_x, selected_points, multi_object: bool = False):
+    """
+    Run inference with either original SAM or HF SAM predictor
+
+    Args:
+        predictor: SamPredictor or HFSamPredictor instance
+        input_x: Input image
+        selected_points: List of (point, label) tuples
+        multi_object: Whether to handle multiple objects
+    """
+    if len(selected_points) == 0:
+        return []
+
+    # Check if using HF SAM
+    if isinstance(predictor, HFSamPredictor):
+        return _run_hf_inference(predictor, input_x, selected_points, multi_object)
+    else:
+        return _run_original_inference(predictor, input_x, selected_points, multi_object)
+
+
+def _run_original_inference(predictor: SamPredictor, input_x, selected_points, multi_object: bool = False):
+    """Run inference with original SAM"""
+    points = torch.Tensor(
+        [p for p, _ in selected_points]
+    ).to(predictor.device).unsqueeze(1)
+
+    labels = torch.Tensor(
+        [int(l) for _, l in selected_points]
+    ).to(predictor.device).unsqueeze(1)
+
+    transformed_points = predictor.transform.apply_coords_torch(
+        points, input_x.shape[:2])
+
+    masks, scores, logits = predictor.predict_torch(
+        point_coords=transformed_points[:,0][None],
+        point_labels=labels[:,0][None],
+        multimask_output=False,
+    )
+    masks = masks[0].cpu().numpy()  # N 1 H W, N is the number of points
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return [(masks, 'final_mask')]
+
+
+def _run_hf_inference(predictor: HFSamPredictor, input_x, selected_points, multi_object: bool = False):
+    """Run inference with HF SAM"""
+    # Prepare points and labels for HF SAM
+    select_pts = [[list(p) for p, _ in selected_points]]
+    select_lbls = [[int(l) for _, l in selected_points]]
+
+    # Preprocess inputs
+    inputs = predictor.preprocess(input_x, select_pts, select_lbls)
+
+    # Run inference
+    with torch.no_grad():
+        outputs = predictor.model(**inputs)
+
+    # Post-process masks
+    masks = predictor.processor.image_processor.post_process_masks(
+        outputs.pred_masks.cpu(),
+        inputs["original_sizes"].cpu(),
+        inputs["reshaped_input_sizes"].cpu(),
+    )
+    masks = masks[0][:,:1,...].cpu().numpy()
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return [(masks, 'final_mask')]
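For completeness, a hypothetical end-to-end call showing how the Gradio app is expected to drive this module: build a predictor, collect `(point, label)` clicks, and run one inference pass. The frame and the click coordinates below are placeholders.

```python
# Sketch: one SAM segmentation pass with a positive and a negative click.
import numpy as np
from app_3rd.sam_utils.inference import get_sam_predictor, run_inference

predictor = get_sam_predictor(model_type="vit_b", use_hf=True)   # HF backend, as the app uses
frame = np.zeros((480, 640, 3), dtype=np.uint8)                  # placeholder RGB frame

selected_points = [((320, 240), 1), ((100, 100), 0)]             # (x, y) with label 1=pos, 0=neg
results = run_inference(predictor, frame, selected_points)

masks, tag = results[0]    # binary masks for the prompted object, plus the 'final_mask' tag
print(masks.shape, tag)
```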
app_3rd/spatrack_utils/infer_track.py ADDED
@@ -0,0 +1,195 @@
+import pycolmap
+from models.SpaTrackV2.models.predictor import Predictor
+import yaml
+import easydict
+import os
+import numpy as np
+import cv2
+import torch
+import torchvision.transforms as T
+from PIL import Image
+import io
+import moviepy.editor as mp
+from models.SpaTrackV2.utils.visualizer import Visualizer
+import tqdm
+from models.SpaTrackV2.models.utils import get_points_on_a_grid
+import glob
+from rich import print
+import argparse
+import decord
+from huggingface_hub import hf_hub_download
+
+config = {
+    "ckpt_dir": "Yuxihenry/SpatialTrackerCkpts",  # HuggingFace repo ID
+    "cfg_dir": "config/magic_infer_moge.yaml",
+}
+
+def get_tracker_predictor(output_dir: str, vo_points: int = 756, tracker_model=None):
+    """
+    Initialize and return the tracker predictor and visualizer
+    Args:
+        output_dir: Directory to save visualization results
+        vo_points: Number of points for visual odometry
+    Returns:
+        Tuple of (tracker_predictor, visualizer)
+    """
+    viz = True
+    os.makedirs(output_dir, exist_ok=True)
+
+    with open(config["cfg_dir"], "r") as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    cfg = easydict.EasyDict(cfg)
+    cfg.out_dir = output_dir
+    cfg.model.track_num = vo_points
+
+    # Check if it's a local path or HuggingFace repo
+    if tracker_model is not None:
+        model = tracker_model
+        model.spatrack.track_num = vo_points
+    else:
+        if os.path.exists(config["ckpt_dir"]):
+            # Local file
+            model = Predictor.from_pretrained(config["ckpt_dir"], model_cfg=cfg["model"])
+        else:
+            # HuggingFace repo - download the model
+            print(f"Downloading model from HuggingFace: {config['ckpt_dir']}")
+            checkpoint_path = hf_hub_download(
+                repo_id=config["ckpt_dir"],
+                repo_type="model",
+                filename="SpaTrack3_offline.pth"
+            )
+            model = Predictor.from_pretrained(checkpoint_path, model_cfg=cfg["model"])
+        model.eval()
+        model.to("cuda")
+
+    viser = Visualizer(save_dir=cfg.out_dir, grayscale=True,
+                       fps=10, pad_value=0, tracks_leave_trace=5)
+
+    return model, viser
+
+def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3):
+    """
+    Run tracking on a video sequence
+    Args:
+        model: Tracker predictor instance
+        viser: Visualizer instance
+        temp_dir: Directory containing temporary files
+        video_name: Name of the video file (without extension)
+        grid_size: Size of the tracking grid
+        vo_points: Number of points for visual odometry
+        fps: Frames per second for visualization
+    """
+    # Setup paths
+    video_path = os.path.join(temp_dir, f"{video_name}.mp4")
+    mask_path = os.path.join(temp_dir, f"{video_name}.png")
+    out_dir = os.path.join(temp_dir, "results")
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Load video using decord
+    video_reader = decord.VideoReader(video_path)
+    video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)  # Convert to tensor and permute to (N, C, H, W)
+
+    # resize make sure the shortest side is 336
+    h, w = video_tensor.shape[2:]
+    scale = max(336 / h, 336 / w)
+    if scale < 1:
+        new_h, new_w = int(h * scale), int(w * scale)
+        video_tensor = T.Resize((new_h, new_w))(video_tensor)
+    video_tensor = video_tensor[::fps].float()
+    depth_tensor = None
+    intrs = None
+    extrs = None
+    data_npz_load = {}
+
+    # Load and process mask
+    if os.path.exists(mask_path):
+        mask = cv2.imread(mask_path)
+        mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
+        mask = mask.sum(axis=-1)>0
+    else:
+        mask = np.ones_like(video_tensor[0,0].numpy())>0
+
+    # Get frame dimensions and create grid points
+    frame_H, frame_W = video_tensor.shape[2:]
+    grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cpu")
+
+    # Sample mask values at grid points and filter out points where mask=0
+    if os.path.exists(mask_path):
+        grid_pts_int = grid_pts[0].long()
+        mask_values = mask[grid_pts_int[...,1], grid_pts_int[...,0]]
+        grid_pts = grid_pts[:, mask_values]
+
+    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()
+
+    # run vggt
+    if os.environ.get("VGGT_DIR", None) is not None:
+        vggt_model = VGGT()
+        vggt_model.load_state_dict(torch.load(VGGT_DIR))
+        vggt_model.eval()
+        vggt_model = vggt_model.to("cuda")
+        # process the image tensor
+        video_tensor = preprocess_image(video_tensor)[None]
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            # Predict attributes including cameras, depth maps, and point maps.
+            aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
+            pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
+            # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
+            extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, video_tensor.shape[-2:])
+            # Predict Depth Maps
+            depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_tensor.cuda()/255, ps_idx)
+        # clear the cache
+        del vggt_model, aggregated_tokens_list, ps_idx, pose_enc
+        torch.cuda.empty_cache()
+        depth_tensor = depth_map.squeeze().cpu().numpy()
+        extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
+        extrs[:, :3, :4] = extrinsic.squeeze().cpu().numpy()
+        intrs = intrinsic.squeeze().cpu().numpy()
+        video_tensor = video_tensor.squeeze()
+        #NOTE: 20% of the depth is not reliable
+        # threshold = depth_conf.squeeze().view(-1).quantile(0.5)
+        unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
+
+    # Run model inference
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        (
+            c2w_traj, intrs, point_map, conf_depth,
+            track3d_pred, track2d_pred, vis_pred, conf_pred, video
+        ) = model.forward(video_tensor, depth=depth_tensor,
+                          intrs=intrs, extrs=extrs,
+                          queries=query_xyt,
+                          fps=1, full_point=False, iters_track=4,
+                          query_no_BA=True, fixed_cam=False, stage=1,
+                          support_frame=len(video_tensor)-1, replace_ratio=0.2)
+
+    # Resize results to avoid too large I/O Burden
+    max_size = 336
+    h, w = video.shape[2:]
+    scale = min(max_size / h, max_size / w)
+    if scale < 1:
+        new_h, new_w = int(h * scale), int(w * scale)
+        video = T.Resize((new_h, new_w))(video)
+        video_tensor = T.Resize((new_h, new_w))(video_tensor)
+        point_map = T.Resize((new_h, new_w))(point_map)
+        track2d_pred[...,:2] = track2d_pred[...,:2] * scale
+        intrs[:,:2,:] = intrs[:,:2,:] * scale
+        if depth_tensor is not None:
+            depth_tensor = T.Resize((new_h, new_w))(depth_tensor)
+        conf_depth = T.Resize((new_h, new_w))(conf_depth)
+
+    # Visualize tracks
+    viser.visualize(video=video[None],
+                    tracks=track2d_pred[None][...,:2],
+                    visibility=vis_pred[None], filename="test")
+
+    # Save in tapip3d format
+    data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
+    data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
+    data_npz_load["intrinsics"] = intrs.cpu().numpy()
+    data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
+    data_npz_load["video"] = (video_tensor).cpu().numpy()/255
+    data_npz_load["visibs"] = vis_pred.cpu().numpy()
+    data_npz_load["confs"] = conf_pred.cpu().numpy()
+    data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
+    np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
+
+    print(f"Results saved to {out_dir}.\nTo visualize them with tapip3d, run: [bold yellow]python tapip3d_viz.py {out_dir}/result.npz[/bold yellow]")
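A hypothetical driver mirroring the app's session layout for this module: a temp directory containing `<video_name>.mp4` (and optionally `<video_name>.png` as an object mask), then one call to `run_tracker`. Paths and parameter values below are assumptions, not requirements. Note that the `VGGT_DIR` branch in the file above references names (`VGGT`, `preprocess_image`, `pose_encoding_to_extri_intri`) that are not imported in this module, so the sketch leaves that environment variable unset and lets the tracker estimate geometry itself.

```python
# Sketch: run SpaTrack2 on one video from a prepared session folder.
from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker

temp_dir = "temp_local/demo_session"   # assumed layout: temp_dir/demo.mp4 [+ demo.png mask]
model, viser = get_tracker_predictor(output_dir=f"{temp_dir}/results", vo_points=756)

run_tracker(model, viser, temp_dir, video_name="demo",
            grid_size=50, vo_points=756, fps=3)
# -> writes temp_dir/results/result.npz plus a 2D track preview video
```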
app_release.py ADDED
@@ -0,0 +1,1278 @@
import gradio as gr
import os
import json
import numpy as np
import cv2
import base64
import requests
import time
from typing import List, Tuple
from gradio_client.utils import handle_file
from pathlib import Path

# Backend Space URL - replace with your actual backend space URL
BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Replace with actual backend space URL
hf_token = os.getenv("HF_TOKEN")  # Replace with your actual Hugging Face token

# Debug information
print(f"🔧 Environment Debug Info:")
print(f"  - Backend URL: {BACKEND_SPACE_URL}")
print(f"  - HF Token available: {'Yes' if hf_token else 'No'}")
print(f"  - HF Token length: {len(hf_token) if hf_token else 0}")

# Flag to track if backend is available
BACKEND_AVAILABLE = False
backend_client = None
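Note: the frontend only reads the token via os.getenv("HF_TOKEN"), so a private backend Space will look unreachable unless the variable is set before this module runs. A minimal sketch of the assumed setup (the token value is a placeholder, not a real credential, and this launcher snippet is not part of the commit):

# Hypothetical launcher: set the secret before app_release.py is imported.
import os
os.environ.setdefault("HF_TOKEN", "hf_xxxxxxxxxxxxxxxx")  # placeholder value
# app_release.py then picks it up through os.getenv("HF_TOKEN") at import time.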
| 27 |
+
def check_user_permissions():
|
| 28 |
+
"""Check if user has necessary permissions"""
|
| 29 |
+
print("🔐 Checking user permissions...")
|
| 30 |
+
|
| 31 |
+
if not hf_token:
|
| 32 |
+
print("❌ No HF Token found")
|
| 33 |
+
print("🔧 To get a token:")
|
| 34 |
+
print(" 1. Go to https://huggingface.co/settings/tokens")
|
| 35 |
+
print(" 2. Create a new token with 'read' permissions")
|
| 36 |
+
print(" 3. Set it as environment variable: export HF_TOKEN='your_token'")
|
| 37 |
+
return False
|
| 38 |
+
|
| 39 |
+
# Try to access user info
|
| 40 |
+
try:
|
| 41 |
+
headers = {'Authorization': f'Bearer {hf_token}'}
|
| 42 |
+
response = requests.get('https://huggingface.co/api/whoami', headers=headers, timeout=5)
|
| 43 |
+
|
| 44 |
+
if response.status_code == 200:
|
| 45 |
+
user_info = response.json()
|
| 46 |
+
username = user_info.get('name', 'Unknown')
|
| 47 |
+
print(f"✅ Authenticated as: {username}")
|
| 48 |
+
|
| 49 |
+
# Check if user has access to the specific space
|
| 50 |
+
space_url = f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}"
|
| 51 |
+
space_response = requests.get(space_url, headers=headers, timeout=5)
|
| 52 |
+
|
| 53 |
+
if space_response.status_code == 200:
|
| 54 |
+
print("✅ You have access to the backend Space")
|
| 55 |
+
return True
|
| 56 |
+
elif space_response.status_code == 401:
|
| 57 |
+
print("❌ You don't have access to the backend Space")
|
| 58 |
+
print("🔧 Solutions:")
|
| 59 |
+
print(" 1. Contact the Space owner to add you as collaborator")
|
| 60 |
+
print(" 2. Ask the owner to make the Space public")
|
| 61 |
+
return False
|
| 62 |
+
elif space_response.status_code == 404:
|
| 63 |
+
print("❌ Backend Space not found")
|
| 64 |
+
print("🔧 Please check if the Space URL is correct")
|
| 65 |
+
return False
|
| 66 |
+
else:
|
| 67 |
+
print(f"⚠️ Unexpected response checking Space access: {space_response.status_code}")
|
| 68 |
+
return False
|
| 69 |
+
|
| 70 |
+
else:
|
| 71 |
+
print(f"❌ Token validation failed: {response.status_code}")
|
| 72 |
+
print("🔧 Your token might be invalid or expired")
|
| 73 |
+
return False
|
| 74 |
+
|
| 75 |
+
except Exception as e:
|
| 76 |
+
print(f"❌ Error checking permissions: {e}")
|
| 77 |
+
return False
|
| 78 |
+
|
| 79 |
+
def check_backend_space_status():
|
| 80 |
+
"""Check if backend space is running via HTTP request"""
|
| 81 |
+
try:
|
| 82 |
+
backend_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"
|
| 83 |
+
print(f"🔍 Checking backend space status: {backend_url}")
|
| 84 |
+
|
| 85 |
+
# Prepare headers with authentication if token is available
|
| 86 |
+
headers = {}
|
| 87 |
+
if hf_token:
|
| 88 |
+
headers['Authorization'] = f'Bearer {hf_token}'
|
| 89 |
+
print(f"🔐 Using HF Token for authentication")
|
| 90 |
+
|
| 91 |
+
# Try to access the space page
|
| 92 |
+
response = requests.get(backend_url, headers=headers, timeout=10)
|
| 93 |
+
|
| 94 |
+
if response.status_code == 200:
|
| 95 |
+
print("✅ Backend space page is accessible")
|
| 96 |
+
|
| 97 |
+
# Check if space is running (look for common indicators)
|
| 98 |
+
page_content = response.text.lower()
|
| 99 |
+
if "runtime error" in page_content:
|
| 100 |
+
print("❌ Backend space has runtime error")
|
| 101 |
+
return False
|
| 102 |
+
elif "building" in page_content:
|
| 103 |
+
print("🔄 Backend space is building...")
|
| 104 |
+
return False
|
| 105 |
+
elif "sleeping" in page_content:
|
| 106 |
+
print("😴 Backend space is sleeping")
|
| 107 |
+
return False
|
| 108 |
+
else:
|
| 109 |
+
print("✅ Backend space appears to be running")
|
| 110 |
+
return True
|
| 111 |
+
|
| 112 |
+
elif response.status_code == 401:
|
| 113 |
+
print("❌ Authentication failed (HTTP 401)")
|
| 114 |
+
print("🔧 This means:")
|
| 115 |
+
print(" - The backend Space is private")
|
| 116 |
+
print(" - Your HF Token doesn't have access to this Space")
|
| 117 |
+
print(" - You need to be added as a collaborator to the Space")
|
| 118 |
+
print(" - Or the Space owner needs to make it public")
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
elif response.status_code == 404:
|
| 122 |
+
print("❌ Backend space not found (HTTP 404)")
|
| 123 |
+
print("🔧 Please check if the Space URL is correct:")
|
| 124 |
+
print(f" Current URL: {BACKEND_SPACE_URL}")
|
| 125 |
+
return False
|
| 126 |
+
|
| 127 |
+
else:
|
| 128 |
+
print(f"❌ Backend space not accessible (HTTP {response.status_code})")
|
| 129 |
+
print(f"🔧 Response: {response.text[:200]}...")
|
| 130 |
+
return False
|
| 131 |
+
|
| 132 |
+
except requests.RequestException as e:
|
| 133 |
+
print(f"❌ Failed to check backend space status: {e}")
|
| 134 |
+
return False
|
| 135 |
+
except Exception as e:
|
| 136 |
+
print(f"❌ Unexpected error checking backend: {e}")
|
| 137 |
+
return False
|
| 138 |
+
|
def initialize_backend():
    """Initialize backend connection using gradio_client"""
    global backend_client, BACKEND_AVAILABLE

    try:
        from gradio_client import Client

        # Connect to HF Space
        if hf_token:
            backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)
        else:
            backend_client = Client(BACKEND_SPACE_URL)

        # Test the connection
        backend_client.view_api()
        BACKEND_AVAILABLE = True
        return True

    except Exception as e:
        print(f"❌ Backend connection failed: {e}")
        BACKEND_AVAILABLE = False
        return False

def numpy_to_base64(arr):
    """Convert numpy array to base64 string"""
    return base64.b64encode(arr.tobytes()).decode('utf-8')

def base64_to_numpy(b64_str, shape, dtype):
    """Convert base64 string back to numpy array"""
    return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)

def base64_to_image(b64_str):
    """Convert base64 string to numpy image array"""
    if not b64_str:
        return None
    try:
        # Decode base64 to bytes
        img_bytes = base64.b64decode(b64_str)
        # Convert bytes to numpy array
        nparr = np.frombuffer(img_bytes, np.uint8)
        # Decode image
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # Convert BGR to RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img
    except Exception as e:
        print(f"Error converting base64 to image: {e}")
        return None

def get_video_name(video_path):
    """Extract video name without extension"""
    return os.path.splitext(os.path.basename(video_path))[0]

def extract_first_frame(video_path):
    """Extract first frame from video file"""
    try:
        cap = cv2.VideoCapture(video_path)
        ret, frame = cap.read()
        cap.release()

        if ret:
            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            return frame_rgb
        else:
            return None
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
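These helpers move raw frame data between the frontend and the backend as base64 strings, so the array's shape and dtype must be carried alongside the payload. A small round-trip sketch, assuming the helpers above are in scope and using a dummy frame in place of a real first frame:

import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a decoded first frame

payload = {
    'data': numpy_to_base64(frame),   # raw bytes, base64-encoded
    'shape': frame.shape,             # needed to undo .tobytes()
    'dtype': str(frame.dtype),
}

restored = base64_to_numpy(payload['data'], payload['shape'], payload['dtype'])
assert restored.shape == frame.shape and str(restored.dtype) == payload['dtype']

This is the same data/shape/dtype convention that handle_video_upload uses below when it builds the local-fallback state string.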
| 209 |
+
def handle_video_upload(video):
|
| 210 |
+
"""Handle video upload and extract first frame"""
|
| 211 |
+
if video is None:
|
| 212 |
+
return (None, None, [],
|
| 213 |
+
gr.update(value=50),
|
| 214 |
+
gr.update(value=756),
|
| 215 |
+
gr.update(value=3))
|
| 216 |
+
|
| 217 |
+
try:
|
| 218 |
+
if BACKEND_AVAILABLE and backend_client:
|
| 219 |
+
# Try to use backend API
|
| 220 |
+
try:
|
| 221 |
+
print("🔧 Calling backend API for video upload...")
|
| 222 |
+
|
| 223 |
+
# Call the unified API with upload_video function type - fix: use handle_file wrapper
|
| 224 |
+
result = backend_client.predict(
|
| 225 |
+
"upload_video", # function_type
|
| 226 |
+
handle_file(video), # video file - wrapped with handle_file
|
| 227 |
+
"", # original_image_state (not used for upload)
|
| 228 |
+
[], # selected_points (not used for upload)
|
| 229 |
+
"positive_point", # point_type (not used for upload)
|
| 230 |
+
0, # point_x (not used for upload)
|
| 231 |
+
0, # point_y (not used for upload)
|
| 232 |
+
50, # grid_size (not used for upload)
|
| 233 |
+
756, # vo_points (not used for upload)
|
| 234 |
+
3, # fps (not used for upload)
|
| 235 |
+
api_name="/unified_api"
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
print(f"✅ Backend video upload API call successful!")
|
| 239 |
+
print(f"🔧 Result type: {type(result)}")
|
| 240 |
+
print(f"🔧 Result: {result}")
|
| 241 |
+
|
| 242 |
+
# Parse the result - expect a dict with success status
|
| 243 |
+
if isinstance(result, dict) and result.get("success"):
|
| 244 |
+
# Extract data from backend response
|
| 245 |
+
original_image_state = result.get("original_image_state", "")
|
| 246 |
+
display_image = result.get("display_image", None)
|
| 247 |
+
selected_points = result.get("selected_points", [])
|
| 248 |
+
|
| 249 |
+
# Fix: Convert display_image from list back to numpy array if needed
|
| 250 |
+
if isinstance(display_image, list):
|
| 251 |
+
display_image = np.array(display_image, dtype=np.uint8)
|
| 252 |
+
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 253 |
+
|
| 254 |
+
# Get video settings based on video name
|
| 255 |
+
video_name = get_video_name(video)
|
| 256 |
+
print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
|
| 257 |
+
grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
|
| 258 |
+
print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
|
| 259 |
+
|
| 260 |
+
return (original_image_state, display_image, selected_points,
|
| 261 |
+
gr.update(value=grid_size_val),
|
| 262 |
+
gr.update(value=vo_points_val),
|
| 263 |
+
gr.update(value=fps_val))
|
| 264 |
+
else:
|
| 265 |
+
print("Backend processing failed, using local fallback")
|
| 266 |
+
# Fallback to local processing
|
| 267 |
+
pass
|
| 268 |
+
except Exception as e:
|
| 269 |
+
print(f"Backend API call failed: {e}")
|
| 270 |
+
# Fallback to local processing
|
| 271 |
+
pass
|
| 272 |
+
|
| 273 |
+
# Fallback: local processing
|
| 274 |
+
print("Using local video processing...")
|
| 275 |
+
display_image = extract_first_frame(video)
|
| 276 |
+
|
| 277 |
+
if display_image is not None:
|
| 278 |
+
# Create a state format compatible with backend
|
| 279 |
+
import tempfile
|
| 280 |
+
import shutil
|
| 281 |
+
|
| 282 |
+
# Create a temporary directory for this session
|
| 283 |
+
session_id = str(int(time.time() * 1000)) # Use timestamp as session ID
|
| 284 |
+
temp_dir = os.path.join("temp_frontend", f"session_{session_id}")
|
| 285 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 286 |
+
|
| 287 |
+
# Copy video to temp directory with standardized name
|
| 288 |
+
video_name = get_video_name(video)
|
| 289 |
+
temp_video_path = os.path.join(temp_dir, f"{video_name}.mp4")
|
| 290 |
+
shutil.copy(video, temp_video_path)
|
| 291 |
+
|
| 292 |
+
# Create state format compatible with backend
|
| 293 |
+
frame_data = {
|
| 294 |
+
'data': numpy_to_base64(display_image),
|
| 295 |
+
'shape': display_image.shape,
|
| 296 |
+
'dtype': str(display_image.dtype),
|
| 297 |
+
'temp_dir': temp_dir,
|
| 298 |
+
'video_name': video_name,
|
| 299 |
+
'video_path': temp_video_path # Keep for backward compatibility
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
original_image_state = json.dumps(frame_data)
|
| 303 |
+
else:
|
| 304 |
+
# Fallback to simple state if frame extraction fails
|
| 305 |
+
original_image_state = json.dumps({
|
| 306 |
+
"video_path": video,
|
| 307 |
+
"frame": "local_processing_failed"
|
| 308 |
+
})
|
| 309 |
+
|
| 310 |
+
# Get video settings
|
| 311 |
+
video_name = get_video_name(video)
|
| 312 |
+
print(f"🎬 Local fallback - Video path: '{video}' -> Video name: '{video_name}'")
|
| 313 |
+
grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
|
| 314 |
+
print(f"🎬 Local fallback - Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
|
| 315 |
+
|
| 316 |
+
return (original_image_state, display_image, [],
|
| 317 |
+
gr.update(value=grid_size_val),
|
| 318 |
+
gr.update(value=vo_points_val),
|
| 319 |
+
gr.update(value=fps_val))
|
| 320 |
+
|
| 321 |
+
except Exception as e:
|
| 322 |
+
print(f"Error in handle_video_upload: {e}")
|
| 323 |
+
return (None, None, [],
|
| 324 |
+
gr.update(value=50),
|
| 325 |
+
gr.update(value=756),
|
| 326 |
+
gr.update(value=3))
|
| 327 |
+
|
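Every backend round trip in this file goes through the single /unified_api endpoint with the same ten positional arguments; only function_type and a few slots change between upload, point selection, reset, and tracking. A hedged refactoring sketch (not part of this commit) of a wrapper those call sites could share:

def call_unified_api(function_type, video=None, state="", points=None,
                     point_type="positive_point", x=0, y=0,
                     grid_size=50, vo_points=756, fps=3):
    """Thin wrapper over backend_client.predict for the /unified_api endpoint.

    The argument order mirrors the predict() calls in this file; slots that a
    given function_type does not use simply keep their defaults.
    """
    return backend_client.predict(
        function_type,   # "upload_video" | "select_point" | "reset_points" | "run_tracker"
        video,           # handle_file(path) for uploads, otherwise None
        state,           # original_image_state JSON string
        points or [],    # selected_points
        point_type,
        x, y,
        grid_size, vo_points, fps,
        api_name="/unified_api",
    )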
| 328 |
+
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
|
| 329 |
+
"""Handle point selection for SAM"""
|
| 330 |
+
if original_img is None:
|
| 331 |
+
return None, []
|
| 332 |
+
|
| 333 |
+
try:
|
| 334 |
+
if BACKEND_AVAILABLE and backend_client:
|
| 335 |
+
# Try to use backend API
|
| 336 |
+
try:
|
| 337 |
+
print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
|
| 338 |
+
|
| 339 |
+
# Call the unified API with select_point function type
|
| 340 |
+
result = backend_client.predict(
|
| 341 |
+
"select_point", # function_type
|
| 342 |
+
None, # video file (not used for select_point)
|
| 343 |
+
original_img, # original_image_state
|
| 344 |
+
sel_pix, # selected_points
|
| 345 |
+
point_type, # point_type
|
| 346 |
+
evt.index[0], # point_x
|
| 347 |
+
evt.index[1], # point_y
|
| 348 |
+
50, # grid_size (not used for select_point)
|
| 349 |
+
756, # vo_points (not used for select_point)
|
| 350 |
+
3, # fps (not used for select_point)
|
| 351 |
+
api_name="/unified_api"
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
print(f"✅ Backend select point API call successful!")
|
| 355 |
+
print(f"🔧 Result type: {type(result)}")
|
| 356 |
+
print(f"🔧 Result: {result}")
|
| 357 |
+
|
| 358 |
+
# Parse the result - expect a dict with success status
|
| 359 |
+
if isinstance(result, dict) and result.get("success"):
|
| 360 |
+
display_image = result.get("display_image", None)
|
| 361 |
+
new_sel_pix = result.get("selected_points", sel_pix)
|
| 362 |
+
|
| 363 |
+
# Fix: Convert display_image from list back to numpy array if needed
|
| 364 |
+
if isinstance(display_image, list):
|
| 365 |
+
display_image = np.array(display_image, dtype=np.uint8)
|
| 366 |
+
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 367 |
+
|
| 368 |
+
return display_image, new_sel_pix
|
| 369 |
+
else:
|
| 370 |
+
print("Backend processing failed, using local fallback")
|
| 371 |
+
# Fallback to local processing
|
| 372 |
+
pass
|
| 373 |
+
except Exception as e:
|
| 374 |
+
print(f"Backend API call failed: {e}")
|
| 375 |
+
|
| 376 |
+
# Check for specific gradio_client errors
|
| 377 |
+
if "AppError" in str(type(e)):
|
| 378 |
+
print("🔧 Backend Space has internal errors (AppError)")
|
| 379 |
+
print("🔧 The backend Space code has bugs or configuration issues")
|
| 380 |
+
print("🔧 Contact the Space owner to fix the backend implementation")
|
| 381 |
+
elif "Could not fetch config" in str(e):
|
| 382 |
+
print("🔧 Config fetch failed - possible Gradio version mismatch")
|
| 383 |
+
print("🔧 Frontend and backend may be using incompatible Gradio versions")
|
| 384 |
+
elif "timeout" in str(e).lower():
|
| 385 |
+
print("🔧 Backend request timed out - Space might be overloaded")
|
| 386 |
+
else:
|
| 387 |
+
print(f"🔧 Unexpected error type: {type(e).__name__}")
|
| 388 |
+
|
| 389 |
+
print("🔄 Showing error message instead of visualization...")
|
| 390 |
+
# Fallback to local processing
|
| 391 |
+
pass
|
| 392 |
+
|
| 393 |
+
# Fallback: local processing with improved visualization
|
| 394 |
+
print("Using local point selection with enhanced visualization...")
|
| 395 |
+
|
| 396 |
+
# Parse original image state
|
| 397 |
+
try:
|
| 398 |
+
state_data = json.loads(original_img)
|
| 399 |
+
video_path = state_data.get("video_path")
|
| 400 |
+
except:
|
| 401 |
+
video_path = None
|
| 402 |
+
|
| 403 |
+
if video_path:
|
| 404 |
+
# Re-extract frame and add point with mask visualization
|
| 405 |
+
display_image = extract_first_frame(video_path)
|
| 406 |
+
if display_image is not None:
|
| 407 |
+
# Add point to the image with enhanced visualization
|
| 408 |
+
x, y = evt.index[0], evt.index[1]
|
| 409 |
+
color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
|
| 410 |
+
|
| 411 |
+
# Draw a larger, more visible point
|
| 412 |
+
cv2.circle(display_image, (x, y), 8, color, -1)
|
| 413 |
+
cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)
|
| 414 |
+
|
| 415 |
+
# Add point to selected points list - fix logic to match local version
|
| 416 |
+
new_sel_pix = sel_pix.copy() if sel_pix else []
|
| 417 |
+
new_sel_pix.append([x, y, point_type])
|
| 418 |
+
|
| 419 |
+
return display_image, new_sel_pix
|
| 420 |
+
|
| 421 |
+
return None, []
|
| 422 |
+
|
| 423 |
+
except Exception as e:
|
| 424 |
+
print(f"Error in select_point: {e}")
|
| 425 |
+
return None, []
|
| 426 |
+
|
| 427 |
+
def reset_points(original_img: str, sel_pix):
|
| 428 |
+
"""Reset points and restore original image"""
|
| 429 |
+
if original_img is None:
|
| 430 |
+
return None, []
|
| 431 |
+
|
| 432 |
+
try:
|
| 433 |
+
if BACKEND_AVAILABLE and backend_client:
|
| 434 |
+
# Try to use backend API
|
| 435 |
+
try:
|
| 436 |
+
print("🔧 Calling backend reset points API...")
|
| 437 |
+
|
| 438 |
+
# Call the unified API with reset_points function type
|
| 439 |
+
result = backend_client.predict(
|
| 440 |
+
"reset_points", # function_type
|
| 441 |
+
None, # video file (not used for reset_points)
|
| 442 |
+
original_img, # original_image_state
|
| 443 |
+
sel_pix, # selected_points
|
| 444 |
+
"positive_point", # point_type (not used for reset_points)
|
| 445 |
+
0, # point_x (not used for reset_points)
|
| 446 |
+
0, # point_y (not used for reset_points)
|
| 447 |
+
50, # grid_size (not used for reset_points)
|
| 448 |
+
756, # vo_points (not used for reset_points)
|
| 449 |
+
3, # fps (not used for reset_points)
|
| 450 |
+
api_name="/unified_api"
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
+
print(f"✅ Backend reset points API call successful!")
|
| 454 |
+
print(f"🔧 Result: {result}")
|
| 455 |
+
|
| 456 |
+
# Parse the result
|
| 457 |
+
if isinstance(result, dict) and result.get("success"):
|
| 458 |
+
display_image = result.get("display_image", None)
|
| 459 |
+
new_sel_pix = result.get("selected_points", [])
|
| 460 |
+
|
| 461 |
+
# Fix: Convert display_image from list back to numpy array if needed
|
| 462 |
+
if isinstance(display_image, list):
|
| 463 |
+
display_image = np.array(display_image, dtype=np.uint8)
|
| 464 |
+
print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
|
| 465 |
+
|
| 466 |
+
return display_image, new_sel_pix
|
| 467 |
+
else:
|
| 468 |
+
print("Backend processing failed, using local fallback")
|
| 469 |
+
# Fallback to local processing
|
| 470 |
+
pass
|
| 471 |
+
except Exception as e:
|
| 472 |
+
print(f"Backend API call failed: {e}")
|
| 473 |
+
# Fallback to local processing
|
| 474 |
+
pass
|
| 475 |
+
|
| 476 |
+
# Fallback: local processing
|
| 477 |
+
print("Using local reset points...")
|
| 478 |
+
|
| 479 |
+
# Parse original image state
|
| 480 |
+
try:
|
| 481 |
+
state_data = json.loads(original_img)
|
| 482 |
+
video_path = state_data.get("video_path")
|
| 483 |
+
except:
|
| 484 |
+
video_path = None
|
| 485 |
+
|
| 486 |
+
if video_path:
|
| 487 |
+
# Re-extract original frame
|
| 488 |
+
display_image = extract_first_frame(video_path)
|
| 489 |
+
return display_image, []
|
| 490 |
+
|
| 491 |
+
return None, []
|
| 492 |
+
|
| 493 |
+
except Exception as e:
|
| 494 |
+
print(f"Error in reset_points: {e}")
|
| 495 |
+
return None, []
|
| 496 |
+
|
| 497 |
+
gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
|
| 498 |
+
|
| 499 |
+
def launch_viz(grid_size, vo_points, fps, original_image_state):
|
| 500 |
+
"""Launch visualization with user-specific temp directory"""
|
| 501 |
+
if original_image_state is None:
|
| 502 |
+
return None, None, None
|
| 503 |
+
|
| 504 |
+
try:
|
| 505 |
+
if BACKEND_AVAILABLE and backend_client:
|
| 506 |
+
# Try to use backend API
|
| 507 |
+
try:
|
| 508 |
+
print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
|
| 509 |
+
print(f"🔧 Original image state type: {type(original_image_state)}")
|
| 510 |
+
print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
|
| 511 |
+
|
| 512 |
+
# Validate and potentially fix the original_image_state format
|
| 513 |
+
state_to_send = original_image_state
|
| 514 |
+
|
| 515 |
+
# Check if this is a local processing state that needs to be converted
|
| 516 |
+
try:
|
| 517 |
+
if isinstance(original_image_state, str):
|
| 518 |
+
parsed_state = json.loads(original_image_state)
|
| 519 |
+
if "video_path" in parsed_state and "frame" in parsed_state:
|
| 520 |
+
# This is a local processing state, we need to handle differently
|
| 521 |
+
print("🔧 Detected local processing state, cannot use backend for tracking")
|
| 522 |
+
print("🔧 Backend requires proper video upload state from backend API")
|
| 523 |
+
# Fall through to local processing
|
| 524 |
+
raise ValueError("Local state cannot be processed by backend")
|
| 525 |
+
except json.JSONDecodeError:
|
| 526 |
+
print("🔧 Invalid JSON state, cannot send to backend")
|
| 527 |
+
raise ValueError("Invalid state format")
|
| 528 |
+
|
| 529 |
+
# Call the unified API with run_tracker function type
|
| 530 |
+
result = backend_client.predict(
|
| 531 |
+
"run_tracker", # function_type
|
| 532 |
+
None, # video file (not used for run_tracker)
|
| 533 |
+
state_to_send, # original_image_state
|
| 534 |
+
[], # selected_points (not used for run_tracker)
|
| 535 |
+
"positive_point", # point_type (not used for run_tracker)
|
| 536 |
+
0, # point_x (not used for run_tracker)
|
| 537 |
+
0, # point_y (not used for run_tracker)
|
| 538 |
+
grid_size, # grid_size
|
| 539 |
+
vo_points, # vo_points
|
| 540 |
+
fps, # fps
|
| 541 |
+
api_name="/unified_api"
|
| 542 |
+
)
|
| 543 |
+
|
| 544 |
+
print(f"✅ Backend API call successful!")
|
| 545 |
+
print(f"🔧 Result type: {type(result)}")
|
| 546 |
+
print(f"🔧 Result: {result}")
|
| 547 |
+
|
| 548 |
+
# Parse the result
|
| 549 |
+
if isinstance(result, dict) and result.get("success"):
|
| 550 |
+
viz_html = result.get("viz_html", "")
|
| 551 |
+
track_video_path = result.get("track_video_path", "")
|
| 552 |
+
track_video_content = result.get("track_video_content", None)
|
| 553 |
+
track_video_filename = result.get("track_video_filename", "tracked_video.mp4")
|
| 554 |
+
|
| 555 |
+
# Save HTML to _viz directory (like local version)
|
| 556 |
+
viz_dir = './_viz'
|
| 557 |
+
os.makedirs(viz_dir, exist_ok=True)
|
| 558 |
+
random_path = f'./_viz/_{time.time()}.html'
|
| 559 |
+
|
| 560 |
+
with open(random_path, 'w', encoding='utf-8') as f:
|
| 561 |
+
f.write(viz_html)
|
| 562 |
+
|
| 563 |
+
# Create iframe HTML
|
| 564 |
+
iframe_html = f"""
|
| 565 |
+
<div style='border: 3px solid #667eea; border-radius: 10px;
|
| 566 |
+
background: #f8f9ff; height: 650px; width: 100%;
|
| 567 |
+
box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
|
| 568 |
+
margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
|
| 569 |
+
<iframe id="viz_iframe" src="/gradio_api/file={random_path}"
|
| 570 |
+
width="100%" height="650" frameborder="0"
|
| 571 |
+
style="border: none; display: block; width: 100%; height: 650px;
|
| 572 |
+
margin: 0; padding: 0; border-radius: 7px;">
|
| 573 |
+
</iframe>
|
| 574 |
+
</div>
|
| 575 |
+
"""
|
| 576 |
+
|
| 577 |
+
print(f"💾 HTML saved to: {random_path}")
|
| 578 |
+
print(f"📊 HTML content preview: {viz_html[:200]}...")
|
| 579 |
+
|
| 580 |
+
# If we have base64 encoded video content, save it as a temporary file
|
| 581 |
+
local_video_path = None
|
| 582 |
+
if track_video_content:
|
| 583 |
+
try:
|
| 584 |
+
# Create a temporary file for the video
|
| 585 |
+
temp_video_dir = "temp_frontend_videos"
|
| 586 |
+
os.makedirs(temp_video_dir, exist_ok=True)
|
| 587 |
+
|
| 588 |
+
# Generate unique filename to avoid conflicts
|
| 589 |
+
timestamp = str(int(time.time() * 1000))
|
| 590 |
+
local_video_path = os.path.join(temp_video_dir, f"{timestamp}_{track_video_filename}")
|
| 591 |
+
|
| 592 |
+
# Decode base64 and save as video file
|
| 593 |
+
video_bytes = base64.b64decode(track_video_content)
|
| 594 |
+
with open(local_video_path, 'wb') as f:
|
| 595 |
+
f.write(video_bytes)
|
| 596 |
+
|
| 597 |
+
print(f"✅ Successfully saved tracking video to: {local_video_path}")
|
| 598 |
+
print(f"🔧 Video file size: {len(video_bytes)} bytes")
|
| 599 |
+
|
| 600 |
+
except Exception as e:
|
| 601 |
+
print(f"❌ Failed to process tracking video: {e}")
|
| 602 |
+
local_video_path = None
|
| 603 |
+
else:
|
| 604 |
+
print("⚠️ No tracking video content received from backend")
|
| 605 |
+
|
| 606 |
+
# 返回iframe HTML、视频路径和HTML文件路径(用于下载)
|
| 607 |
+
return iframe_html, local_video_path, random_path
|
| 608 |
+
else:
|
| 609 |
+
error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
|
| 610 |
+
print(f"❌ Backend processing failed: {error_msg}")
|
| 611 |
+
# Fall through to error message
|
| 612 |
+
pass
|
| 613 |
+
except Exception as e:
|
| 614 |
+
print(f"❌ Backend API call failed: {e}")
|
| 615 |
+
print(f"🔧 Error type: {type(e)}")
|
| 616 |
+
print(f"🔧 Error details: {str(e)}")
|
| 617 |
+
|
| 618 |
+
# Check for specific gradio_client errors
|
| 619 |
+
if "AppError" in str(type(e)):
|
| 620 |
+
print("🔧 Backend Space has internal errors (AppError)")
|
| 621 |
+
print("🔧 The backend Space code has bugs or configuration issues")
|
| 622 |
+
print("🔧 Contact the Space owner to fix the backend implementation")
|
| 623 |
+
elif "Could not fetch config" in str(e):
|
| 624 |
+
print("🔧 Config fetch failed - possible Gradio version mismatch")
|
| 625 |
+
print("🔧 Frontend and backend may be using incompatible Gradio versions")
|
| 626 |
+
elif "timeout" in str(e).lower():
|
| 627 |
+
print("🔧 Backend request timed out - Space might be overloaded")
|
| 628 |
+
elif "Expecting value" in str(e):
|
| 629 |
+
print("🔧 JSON parsing error in backend - state format mismatch")
|
| 630 |
+
print("🔧 This happens when using local processing state with backend API")
|
| 631 |
+
print("🔧 Please upload video again to use backend processing")
|
| 632 |
+
else:
|
| 633 |
+
print(f"🔧 Unexpected error type: {type(e).__name__}")
|
| 634 |
+
|
| 635 |
+
print("🔄 Showing error message instead of visualization...")
|
| 636 |
+
# Fall through to error message
|
| 637 |
+
pass
|
| 638 |
+
|
| 639 |
+
# Create an informative error message based on the state
|
| 640 |
+
state_info = ""
|
| 641 |
+
try:
|
| 642 |
+
if isinstance(original_image_state, str):
|
| 643 |
+
parsed_state = json.loads(original_image_state)
|
| 644 |
+
if "video_path" in parsed_state:
|
| 645 |
+
video_name = os.path.basename(parsed_state["video_path"])
|
| 646 |
+
state_info = f"Video: {video_name}"
|
| 647 |
+
except:
|
| 648 |
+
state_info = "State format unknown"
|
| 649 |
+
|
| 650 |
+
# Fallback: show message that backend is required
|
| 651 |
+
error_message = f"""
|
| 652 |
+
<div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
|
| 653 |
+
<h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Processing Required</h3>
|
| 654 |
+
<p style='color: #2d3436; line-height: 1.6;'>
|
| 655 |
+
The tracking and visualization features require backend processing. The current setup is using local processing which is incompatible with the backend API.
|
| 656 |
+
</p>
|
| 657 |
+
<h4 style='color: #d63031; margin: 15px 0 10px 0;'>Solutions:</h4>
|
| 658 |
+
<ul style='color: #2d3436; line-height: 1.6;'>
|
| 659 |
+
<li><strong>Upload video again:</strong> This will properly initialize the backend state</li>
|
| 660 |
+
<li><strong>Select points on the frame:</strong> Ensure you've clicked on the object to track</li>
|
| 661 |
+
<li><strong>Check backend connection:</strong> Ensure the backend Space is running</li>
|
| 662 |
+
<li><strong>Use compatible state:</strong> Avoid local processing mode</li>
|
| 663 |
+
</ul>
|
| 664 |
+
<div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 15px;'>
|
| 665 |
+
<p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
|
| 666 |
+
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
|
| 667 |
+
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
|
| 668 |
+
<p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
|
| 669 |
+
<p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
|
| 670 |
+
<p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
|
| 671 |
+
</div>
|
| 672 |
+
<div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
|
| 673 |
+
<p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
|
| 674 |
+
<p style='color: #1976d2; font-size: 13px; margin: 0;'>
|
| 675 |
+
Try uploading your video again - this should properly initialize the backend state for tracking.
|
| 676 |
+
</p>
|
| 677 |
+
</div>
|
| 678 |
+
</div>
|
| 679 |
+
"""
|
| 680 |
+
return error_message, None, None
|
| 681 |
+
|
| 682 |
+
except Exception as e:
|
| 683 |
+
print(f"Error in launch_viz: {e}")
|
| 684 |
+
return None, None, None
|

def clear_all():
    """Clear all buffers and temporary files"""
    return (None, None, [],
            gr.update(value=50),
            gr.update(value=756),
            gr.update(value=3))

def clear_all_with_download():
    """Clear all buffers including both download components"""
    return (None, None, [],
            gr.update(value=50),
            gr.update(value=756),
            gr.update(value=3),
            None, # tracking_video_download
            None) # HTML download component

def update_tracker_model(model_name):
    """Update tracker model (placeholder function)"""
    return

def get_video_settings(video_name):
    """Get video-specific settings based on video name"""
    video_settings = {
        "kiss": (45, 700, 10),
        "backpack": (40, 600, 2),
        "kitchen": (60, 800, 3),
        "pillow": (35, 500, 2),
        "handwave": (35, 500, 8),
        "hockey": (45, 700, 2),
        "drifting": (35, 1000, 6),
        "basketball": (45, 1500, 5),
        "ken_block_0": (45, 700, 2),
        "ego_kc1": (45, 500, 4),
        "vertical_place": (45, 500, 3),
        "ego_teaser": (45, 1200, 10),
        "robot_unitree": (45, 500, 4),
        "robot_3": (35, 400, 5),
        "teleop2": (45, 256, 7),
        "pusht": (45, 256, 10),
        "cinema_0": (45, 356, 5),
        "cinema_1": (45, 756, 3),
    }

    return video_settings.get(video_name, (50, 756, 3))
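get_video_settings keys the per-example defaults off the bare file name (as produced by get_video_name), falling back to the generic slider defaults for anything unknown. A short usage sketch:

# Known example clip: returns its tuned (grid_size, vo_points, fps) triple.
assert get_video_settings("basketball") == (45, 1500, 5)

# Unrecognized upload: falls back to the generic defaults shown in the sliders.
assert get_video_settings("my_new_clip") == (50, 756, 3)

# Typical call site, mirroring handle_video_upload:
# grid_size_val, vo_points_val, fps_val = get_video_settings(get_video_name(video_path))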
def test_backend_connection():
    """Test if backend is actually working"""
    global BACKEND_AVAILABLE
    if not backend_client:
        return False

    try:
        print("Testing backend connection with a simple call...")
        # Check if we have fns available
        if hasattr(backend_client, 'fns') and backend_client.fns:
            print("✅ Backend API functions are available")
            print(f"🔧 Available function indices: {list(backend_client.fns.keys())}")
            return True
        else:
            print("❌ Backend API functions not found")
            return False
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False

def test_backend_api():
    """Test specific backend API functions"""
    if not BACKEND_AVAILABLE or not backend_client:
        print("❌ Backend not available for testing")
        return False

    try:
        print("🧪 Testing backend API functions...")

        # Test if fns exist and show available indices
        if hasattr(backend_client, 'fns') and backend_client.fns:
            print(f"✅ Backend has {len(backend_client.fns)} functions available")
            for idx in backend_client.fns.keys():
                print(f"✅ Function {idx} is available")
        else:
            print("❌ No functions found in backend API")
            return False

        return True

    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False

# Initialize the backend connection
print("🚀 Initializing frontend application...")
result = initialize_backend()

# Test backend connection if available
if result and BACKEND_AVAILABLE:
    print("✅ Backend connection successful!")
else:
    print("❌ Backend connection failed!")

# Create the Gradio interface
print("🎨 Creating Gradio interface...")
| 788 |
+
with gr.Blocks(
|
| 789 |
+
theme=gr.themes.Soft(),
|
| 790 |
+
title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2) - Frontend Interface",
|
| 791 |
+
css="""
|
| 792 |
+
.gradio-container {
|
| 793 |
+
max-width: 1200px !important;
|
| 794 |
+
margin: auto !important;
|
| 795 |
+
}
|
| 796 |
+
.gr-button {
|
| 797 |
+
margin: 5px;
|
| 798 |
+
}
|
| 799 |
+
.gr-form {
|
| 800 |
+
background: white;
|
| 801 |
+
border-radius: 10px;
|
| 802 |
+
padding: 20px;
|
| 803 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 804 |
+
}
|
| 805 |
+
/* 固定3D可视化器尺寸 */
|
| 806 |
+
#viz_container {
|
| 807 |
+
height: 650px !important;
|
| 808 |
+
min-height: 650px !important;
|
| 809 |
+
max-height: 650px !important;
|
| 810 |
+
width: 100% !important;
|
| 811 |
+
margin: 0 !important;
|
| 812 |
+
padding: 0 !important;
|
| 813 |
+
overflow: hidden !important;
|
| 814 |
+
}
|
| 815 |
+
#viz_container > div {
|
| 816 |
+
height: 650px !important;
|
| 817 |
+
min-height: 650px !important;
|
| 818 |
+
max-height: 650px !important;
|
| 819 |
+
width: 100% !important;
|
| 820 |
+
margin: 0 !important;
|
| 821 |
+
padding: 0 !important;
|
| 822 |
+
box-sizing: border-box !important;
|
| 823 |
+
}
|
| 824 |
+
#viz_container iframe {
|
| 825 |
+
height: 650px !important;
|
| 826 |
+
min-height: 650px !important;
|
| 827 |
+
max-height: 650px !important;
|
| 828 |
+
width: 100% !important;
|
| 829 |
+
border: none !important;
|
| 830 |
+
display: block !important;
|
| 831 |
+
margin: 0 !important;
|
| 832 |
+
padding: 0 !important;
|
| 833 |
+
box-sizing: border-box !important;
|
| 834 |
+
}
|
| 835 |
+
/* 固定视频上传组件高度 */
|
| 836 |
+
.gr-video {
|
| 837 |
+
height: 300px !important;
|
| 838 |
+
min-height: 300px !important;
|
| 839 |
+
max-height: 300px !important;
|
| 840 |
+
}
|
| 841 |
+
.gr-video video {
|
| 842 |
+
height: 260px !important;
|
| 843 |
+
max-height: 260px !important;
|
| 844 |
+
object-fit: contain !important;
|
| 845 |
+
background: #f8f9fa;
|
| 846 |
+
}
|
| 847 |
+
.gr-video .gr-video-player {
|
| 848 |
+
height: 260px !important;
|
| 849 |
+
max-height: 260px !important;
|
| 850 |
+
}
|
| 851 |
+
/* 水平滚动的示例视频样式 */
|
| 852 |
+
.example-videos .gr-examples {
|
| 853 |
+
overflow: visible !important;
|
| 854 |
+
}
|
| 855 |
+
.example-videos .gr-examples .gr-table-wrapper {
|
| 856 |
+
overflow-x: auto !important;
|
| 857 |
+
overflow-y: hidden !important;
|
| 858 |
+
scrollbar-width: thin;
|
| 859 |
+
scrollbar-color: #667eea #f1f1f1;
|
| 860 |
+
}
|
| 861 |
+
.example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar {
|
| 862 |
+
height: 8px;
|
| 863 |
+
}
|
| 864 |
+
.example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
|
| 865 |
+
background: #f1f1f1;
|
| 866 |
+
border-radius: 4px;
|
| 867 |
+
}
|
| 868 |
+
.example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
|
| 869 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 870 |
+
border-radius: 4px;
|
| 871 |
+
}
|
| 872 |
+
.example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
|
| 873 |
+
background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
|
| 874 |
+
}
|
| 875 |
+
.example-videos .gr-examples .gr-table {
|
| 876 |
+
display: flex !important;
|
| 877 |
+
flex-wrap: nowrap !important;
|
| 878 |
+
min-width: max-content !important;
|
| 879 |
+
gap: 10px !important;
|
| 880 |
+
}
|
| 881 |
+
.example-videos .gr-examples .gr-table tbody {
|
| 882 |
+
display: flex !important;
|
| 883 |
+
flex-direction: row !important;
|
| 884 |
+
flex-wrap: nowrap !important;
|
| 885 |
+
gap: 10px !important;
|
| 886 |
+
}
|
| 887 |
+
.example-videos .gr-examples .gr-table tbody tr {
|
| 888 |
+
display: flex !important;
|
| 889 |
+
flex-direction: column !important;
|
| 890 |
+
min-width: 120px !important;
|
| 891 |
+
max-width: 120px !important;
|
| 892 |
+
margin: 0 !important;
|
| 893 |
+
background: white;
|
| 894 |
+
border-radius: 8px;
|
| 895 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
| 896 |
+
transition: all 0.3s ease;
|
| 897 |
+
cursor: pointer;
|
| 898 |
+
}
|
| 899 |
+
.example-videos .gr-examples .gr-table tbody tr:hover {
|
| 900 |
+
transform: translateY(-2px);
|
| 901 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2);
|
| 902 |
+
}
|
| 903 |
+
.example-videos .gr-examples .gr-table tbody tr td {
|
| 904 |
+
text-align: center !important;
|
| 905 |
+
padding: 8px !important;
|
| 906 |
+
border: none !important;
|
| 907 |
+
}
|
| 908 |
+
.example-videos .gr-examples .gr-table tbody tr td video {
|
| 909 |
+
border-radius: 6px !important;
|
| 910 |
+
width: 100% !important;
|
| 911 |
+
height: auto !important;
|
| 912 |
+
}
|
| 913 |
+
.example-videos .gr-examples .gr-table tbody tr td:last-child {
|
| 914 |
+
font-size: 12px !important;
|
| 915 |
+
font-weight: 500 !important;
|
| 916 |
+
color: #333 !important;
|
| 917 |
+
padding-top: 4px !important;
|
| 918 |
+
}
|
| 919 |
+
|
| 920 |
+
/* 新的水平滚动示例视频样式 */
|
| 921 |
+
.horizontal-examples .gr-examples {
|
| 922 |
+
overflow: visible !important;
|
| 923 |
+
}
|
| 924 |
+
.horizontal-examples .gr-examples .gr-table-wrapper {
|
| 925 |
+
overflow-x: auto !important;
|
| 926 |
+
overflow-y: hidden !important;
|
| 927 |
+
scrollbar-width: thin;
|
| 928 |
+
scrollbar-color: #667eea #f1f1f1;
|
| 929 |
+
padding: 10px 0;
|
| 930 |
+
}
|
| 931 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar {
|
| 932 |
+
height: 8px;
|
| 933 |
+
}
|
| 934 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
|
| 935 |
+
background: #f1f1f1;
|
| 936 |
+
border-radius: 4px;
|
| 937 |
+
}
|
| 938 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
|
| 939 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 940 |
+
border-radius: 4px;
|
| 941 |
+
}
|
| 942 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
|
| 943 |
+
background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
|
| 944 |
+
}
|
| 945 |
+
.horizontal-examples .gr-examples .gr-table {
|
| 946 |
+
display: flex !important;
|
| 947 |
+
flex-wrap: nowrap !important;
|
| 948 |
+
min-width: max-content !important;
|
| 949 |
+
gap: 15px !important;
|
| 950 |
+
padding-bottom: 10px;
|
| 951 |
+
}
|
| 952 |
+
.horizontal-examples .gr-examples .gr-table tbody {
|
| 953 |
+
display: flex !important;
|
| 954 |
+
flex-direction: row !important;
|
| 955 |
+
flex-wrap: nowrap !important;
|
| 956 |
+
gap: 15px !important;
|
| 957 |
+
}
|
| 958 |
+
.horizontal-examples .gr-examples .gr-table tbody tr {
|
| 959 |
+
display: flex !important;
|
| 960 |
+
flex-direction: column !important;
|
| 961 |
+
min-width: 160px !important;
|
| 962 |
+
max-width: 160px !important;
|
| 963 |
+
margin: 0 !important;
|
| 964 |
+
background: white;
|
| 965 |
+
border-radius: 12px;
|
| 966 |
+
box-shadow: 0 3px 12px rgba(0,0,0,0.12);
|
| 967 |
+
transition: all 0.3s ease;
|
| 968 |
+
cursor: pointer;
|
| 969 |
+
overflow: hidden;
|
| 970 |
+
}
|
| 971 |
+
.horizontal-examples .gr-examples .gr-table tbody tr:hover {
|
| 972 |
+
transform: translateY(-4px);
|
| 973 |
+
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
|
| 974 |
+
}
|
| 975 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td {
|
| 976 |
+
text-align: center !important;
|
| 977 |
+
padding: 0 !important;
|
| 978 |
+
border: none !important;
|
| 979 |
+
}
|
| 980 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td:first-child {
|
| 981 |
+
padding: 0 !important;
|
| 982 |
+
}
|
| 983 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td video {
|
| 984 |
+
border-radius: 8px 8px 0 0 !important;
|
| 985 |
+
width: 100% !important;
|
| 986 |
+
height: 90px !important;
|
| 987 |
+
object-fit: cover !important;
|
| 988 |
+
}
|
| 989 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td:last-child {
|
| 990 |
+
font-size: 11px !important;
|
| 991 |
+
font-weight: 600 !important;
|
| 992 |
+
color: #333 !important;
|
| 993 |
+
padding: 8px 12px !important;
|
| 994 |
+
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
|
| 995 |
+
border-radius: 0 0 8px 8px;
|
| 996 |
+
}
|
| 997 |
+
"""
|
| 998 |
+
) as demo:
|
| 999 |
+
|
| 1000 |
+
gr.Markdown("""
|
| 1001 |
+
Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model.
|
| 1002 |
+
|
| 1003 |
+
**⚡ Quick Start:** Upload video → Click "Start Tracking Now!"
|
| 1004 |
+
|
| 1005 |
+
**🔬 Advanced Usage with SAM:**
|
| 1006 |
+
1. Upload a video file or select from examples below
|
| 1007 |
+
2. Expand "Manual Point Selection" to click on specific objects for SAM-guided tracking
|
| 1008 |
+
3. Adjust tracking parameters for optimal performance
|
| 1009 |
+
4. Click "Start Tracking Now!" to begin 3D tracking with SAM guidance
|
| 1010 |
+
|
| 1011 |
+
""")
|
| 1012 |
+
|
| 1013 |
+
# Status indicator - more compact
|
| 1014 |
+
status_info = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Standalone Mode"
|
| 1015 |
+
gr.Markdown(f"**Status:** {status_info} | Backend: {BACKEND_SPACE_URL}")
|
| 1016 |
+
|
| 1017 |
+
# Main content area - video upload left, 3D visualization right
|
| 1018 |
+
with gr.Row():
|
| 1019 |
+
with gr.Column(scale=1):
|
| 1020 |
+
# Video upload section
|
| 1021 |
+
with gr.Group():
|
| 1022 |
+
gr.Markdown("### 📂 Select Video")
|
| 1023 |
+
|
| 1024 |
+
# Define video_input here so it can be referenced in examples
|
| 1025 |
+
video_input = gr.Video(
|
| 1026 |
+
label="Upload Video or Select Example",
|
| 1027 |
+
format="mp4",
|
| 1028 |
+
height=250 # Matched height with 3D viz
|
| 1029 |
+
)
|
| 1030 |
+
|
| 1031 |
+
# Horizontal video examples with slider
|
| 1032 |
+
gr.Markdown("**Examples:** (scroll horizontally to see all videos)")
|
| 1033 |
+
|
| 1034 |
+
# Traditional examples but with horizontal scroll styling
|
| 1035 |
+
with gr.Row(elem_classes=["horizontal-examples"]):
|
| 1036 |
+
gr.Examples(
|
| 1037 |
+
examples=[
|
| 1038 |
+
["./examples/kiss.mp4"],
|
| 1039 |
+
["./examples/backpack.mp4"],
|
| 1040 |
+
["./examples/kitchen.mp4"],
|
| 1041 |
+
["./examples/pillow.mp4"],
|
| 1042 |
+
["./examples/handwave.mp4"],
|
| 1043 |
+
["./examples/hockey.mp4"],
|
| 1044 |
+
["./examples/drifting.mp4"],
|
| 1045 |
+
["./examples/basketball.mp4"],
|
| 1046 |
+
["./examples/ken_block_0.mp4"],
|
| 1047 |
+
["./examples/ego_kc1.mp4"],
|
| 1048 |
+
["./examples/vertical_place.mp4"],
|
| 1049 |
+
["./examples/ego_teaser.mp4"],
|
| 1050 |
+
["./examples/robot_unitree.mp4"],
|
| 1051 |
+
["./examples/robot_3.mp4"],
|
| 1052 |
+
["./examples/teleop2.mp4"],
|
| 1053 |
+
["./examples/pusht.mp4"],
|
| 1054 |
+
["./examples/cinema_0.mp4"],
|
| 1055 |
+
["./examples/cinema_1.mp4"],
|
| 1056 |
+
],
|
| 1057 |
+
inputs=[video_input],
|
| 1058 |
+
outputs=[video_input],
|
| 1059 |
+
fn=None,
|
| 1060 |
+
cache_examples=False,
|
| 1061 |
+
label="",
|
| 1062 |
+
examples_per_page=6 # Show 6 examples per page so they can wrap to multiple rows
|
| 1063 |
+
)
|
| 1064 |
+
|
| 1065 |
+
with gr.Column(scale=2):
|
| 1066 |
+
# 3D Visualization - wider and taller to match left side
|
| 1067 |
+
with gr.Group():
|
| 1068 |
+
gr.Markdown("### 🌐 3D Trajectory Visualization")
|
| 1069 |
+
viz_html = gr.HTML(
|
| 1070 |
+
label="3D Trajectory Visualization",
|
| 1071 |
+
value="""
|
| 1072 |
+
<div style='border: 3px solid #667eea; border-radius: 10px;
|
| 1073 |
+
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
|
| 1074 |
+
text-align: center; height: 650px; display: flex;
|
| 1075 |
+
flex-direction: column; justify-content: center; align-items: center;
|
| 1076 |
+
box-shadow: 0 4px 16px rgba(102, 126, 234, 0.15);
|
| 1077 |
+
margin: 0; padding: 20px; box-sizing: border-box;'>
|
| 1078 |
+
<div style='font-size: 56px; margin-bottom: 25px;'>🌐</div>
|
| 1079 |
+
<h3 style='color: #667eea; margin-bottom: 18px; font-size: 28px; font-weight: 600;'>
|
| 1080 |
+
3D Trajectory Visualization
|
| 1081 |
+
</h3>
|
| 1082 |
+
<p style='color: #666; font-size: 18px; line-height: 1.6; max-width: 550px; margin-bottom: 30px;'>
|
| 1083 |
+
Track any pixels in 3D space with camera motion
|
| 1084 |
+
</p>
|
| 1085 |
+
<div style='background: rgba(102, 126, 234, 0.1); border-radius: 30px;
|
| 1086 |
+
padding: 15px 30px; border: 1px solid rgba(102, 126, 234, 0.2);'>
|
| 1087 |
+
<span style='color: #667eea; font-weight: 600; font-size: 16px;'>
|
| 1088 |
+
⚡ Powered by SpatialTracker V2
|
| 1089 |
+
</span>
|
| 1090 |
+
</div>
|
| 1091 |
+
</div>
|
| 1092 |
+
""",
|
| 1093 |
+
elem_id="viz_container"
|
| 1094 |
+
)
|
| 1095 |
+
|
| 1096 |
+
# Start button section - below video area
|
| 1097 |
+
with gr.Row():
|
| 1098 |
+
with gr.Column(scale=3):
|
| 1099 |
+
launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
|
| 1100 |
+
with gr.Column(scale=1):
|
| 1101 |
+
clear_all_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
|
| 1102 |
+
|
| 1103 |
+
# Tracking parameters section
|
| 1104 |
+
with gr.Row():
|
| 1105 |
+
gr.Markdown("### ⚙️ Tracking Parameters")
|
| 1106 |
+
with gr.Row():
|
| 1107 |
+
grid_size = gr.Slider(
|
| 1108 |
+
minimum=10, maximum=100, step=10, value=50,
|
| 1109 |
+
label="Grid Size", info="Tracking detail level"
|
| 1110 |
+
)
|
| 1111 |
+
vo_points = gr.Slider(
|
| 1112 |
+
minimum=100, maximum=2000, step=50, value=756,
|
| 1113 |
+
label="VO Points", info="Motion accuracy"
|
| 1114 |
+
)
|
| 1115 |
+
fps = gr.Slider(
|
| 1116 |
+
minimum=1, maximum=30, step=1, value=3,
|
| 1117 |
+
label="FPS", info="Processing speed"
|
| 1118 |
+
)
|
| 1119 |
+
|
| 1120 |
+
# Advanced Point Selection with SAM - Collapsed by default
|
| 1121 |
+
with gr.Row():
gr.Markdown("### 🎯 Advanced: Manual Point Selection with SAM")
with gr.Accordion("🔬 SAM Point Selection Controls", open=False):
gr.HTML("""
<div style='margin-bottom: 15px;'>
<ul style='color: #4a5568; font-size: 14px; line-height: 1.6; margin: 0; padding-left: 20px;'>
<li>Click on target objects in the image for SAM-guided segmentation</li>
<li>Positive points: include these areas | Negative points: exclude these areas</li>
<li>Get more accurate 3D tracking results with SAM's powerful segmentation</li>
</ul>
</div>
""")

with gr.Row():
with gr.Column():
interactive_frame = gr.Image(
label="Click to select tracking points with SAM guidance",
type="numpy",
interactive=True,
height=300
)

with gr.Row():
point_type = gr.Radio(
choices=["positive_point", "negative_point"],
value="positive_point",
label="Point Type",
info="Positive: track these areas | Negative: avoid these areas"
)

with gr.Row():
reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")

# Downloads section - hidden but still functional for backend processing
with gr.Row(visible=False):
with gr.Column(scale=1):
tracking_video_download = gr.File(
label="📹 Download 2D Tracking Video",
interactive=False,
visible=False
)
with gr.Column(scale=1):
html_download = gr.File(
label="📄 Download 3D Visualization HTML",
interactive=False,
visible=False
)

# GitHub Star Section
gr.HTML("""
<div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
border-radius: 8px; padding: 20px; margin: 15px 0;
box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
border: 1px solid rgba(102, 126, 234, 0.15);'>
<div style='text-align: center;'>
<h3 style='color: #4a5568; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
⭐ Love SpatialTracker? Give us a Star! ⭐
</h3>
<p style='color: #666; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
Help us grow by starring our repository on GitHub! Your support means a lot to the community. 🚀
</p>
<a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank"
style='display: inline-flex; align-items: center; gap: 8px;
background: rgba(102, 126, 234, 0.1); color: #4a5568;
padding: 10px 20px; border-radius: 25px; text-decoration: none;
font-weight: bold; font-size: 14px; border: 1px solid rgba(102, 126, 234, 0.2);
transition: all 0.3s ease;'
onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-2px)'"
onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
<span style='font-size: 16px;'>⭐</span>
Star SpatialTracker V2 on GitHub
</a>
</div>
</div>
""")

# Acknowledgments Section
gr.HTML("""
<div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
border-radius: 8px; padding: 20px; margin: 15px 0;
box-shadow: 0 2px 8px rgba(255, 193, 7, 0.1);
border: 1px solid rgba(255, 193, 7, 0.2);'>
<div style='text-align: center;'>
<h3 style='color: #5d4037; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
📚 Acknowledgments
</h3>
<p style='color: #5d4037; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work and contribution to the computer vision community!
</p>
<a href="https://github.com/zbw001/TAPIP3D" target="_blank"
style='display: inline-flex; align-items: center; gap: 8px;
background: rgba(255, 193, 7, 0.15); color: #5d4037;
padding: 10px 20px; border-radius: 25px; text-decoration: none;
font-weight: bold; font-size: 14px; border: 1px solid rgba(255, 193, 7, 0.3);
transition: all 0.3s ease;'
onmouseover="this.style.background='rgba(255, 193, 7, 0.25)'; this.style.transform='translateY(-2px)'"
onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'; this.style.transform='translateY(0)'">
📚 Visit TAPIP3D Repository
</a>
</div>
</div>
""")

# Footer
gr.HTML("""
<div style='text-align: center; margin: 20px 0 10px 0;'>
<span style='font-size: 12px; color: #888; font-style: italic;'>
Powered by SpatialTracker V2 | Built with ❤️ for the Computer Vision Community
</span>
</div>
""")

# Hidden state variables
original_image_state = gr.State(None)
selected_points = gr.State([])

# Event handlers
video_input.change(
fn=handle_video_upload,
inputs=[video_input],
outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
)

interactive_frame.select(
fn=select_point,
inputs=[original_image_state, selected_points, point_type],
outputs=[interactive_frame, selected_points]
)

reset_points_btn.click(
fn=reset_points,
inputs=[original_image_state, selected_points],
outputs=[interactive_frame, selected_points]
)

clear_all_btn.click(
fn=clear_all_with_download,
outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, tracking_video_download, html_download]
)

launch_btn.click(
fn=launch_viz,
inputs=[grid_size, vo_points, fps, original_image_state],
outputs=[viz_html, tracking_video_download, html_download]
)

# Launch the interface
if __name__ == "__main__":
print("🌟 Launching SpatialTracker V2 Frontend...")
print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")

demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
debug=True,
show_error=True
)
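
Note on the point-selection wiring above: Gradio's `.select()` event hands the click position to the handler through a `gr.SelectData` object, whose `.index` field carries the (x, y) pixel coordinates of the click; this is what `select_point` consumes. A minimal, standalone sketch of that pattern (a toy example for orientation, not part of this commit):

```python
import gradio as gr
import numpy as np

def on_click(img: np.ndarray, evt: gr.SelectData):
    # evt.index holds the (x, y) pixel position clicked on the displayed image
    x, y = evt.index
    print(f"clicked at ({x}, {y})")
    return img

with gr.Blocks() as toy_demo:
    frame = gr.Image(type="numpy", interactive=True)
    frame.select(fn=on_click, inputs=[frame], outputs=[frame])

# toy_demo.launch()
```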
|
config/__init__.py
ADDED
|
File without changes
|
config/magic_infer_moge.yaml
ADDED
|
@@ -0,0 +1,48 @@
seed: 0
# config the hydra logger, only in hydra `$` can be decoded as cite
data: ./assets/room
vis_track: false
hydra:
run:
dir: .
output_subdir: null
job_logging: {}
hydra_logging: {}
mixed_precision: bf16
visdom:
viz_ip: "localhost"
port: 6666
relax_load: false
res_all: 336
# config the ckpt path
# ckpts: "/mnt/bn/xyxdata/home/codes/my_projs/SpaTrack2/checkpoints/new_base.pth"
ckpts: "Yuxihenry/SpatialTracker_Files"
batch_size: 1
input:
type: image
fps: 1
model_wind_size: 32
model:
backbone_cfg:
ckpt_dir: "checkpoints/model.pt"
chunk_size: 24 # downsample factor for patchified features
ckpt_fwd: true
ft_cfg:
mode: "fix"
paras_name: []
resolution: 336
max_len: 512
Track_cfg:
base_ckpt: "checkpoints/scaled_offline.pth"
base:
stride: 4
corr_radius: 3
window_len: 60
stablizer: True
mode: "online"
s_wind: 200
overlap: 4
track_num: 0

dist_train:
num_nodes: 1
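
For orientation, this YAML is a Hydra-style config; a minimal sketch of loading it with OmegaConf follows (the exact entry point that consumes it is not part of this diff, so treat the snippet as an assumption):

```python
from omegaconf import OmegaConf

# Load the config added above and read a few of its top-level entries.
cfg = OmegaConf.load("config/magic_infer_moge.yaml")
print(cfg.seed)    # 0
print(cfg.data)    # ./assets/room
print(cfg.ckpts)   # Yuxihenry/SpatialTracker_Files
```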
|
frontend_app_local.py
ADDED
|
@@ -0,0 +1,1036 @@
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
import cv2
|
| 6 |
+
import base64
|
| 7 |
+
import time
|
| 8 |
+
import tempfile
|
| 9 |
+
import shutil
|
| 10 |
+
import glob
|
| 11 |
+
import threading
|
| 12 |
+
import subprocess
|
| 13 |
+
import struct
|
| 14 |
+
import zlib
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from einops import rearrange
|
| 17 |
+
from typing import List, Tuple, Union
|
| 18 |
+
import torch
|
| 19 |
+
import logging
|
| 20 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 21 |
+
import atexit
|
| 22 |
+
import uuid
|
| 23 |
+
|
| 24 |
+
# Configure logging
|
| 25 |
+
logging.basicConfig(level=logging.INFO)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
# Import custom modules with error handling
|
| 29 |
+
try:
|
| 30 |
+
from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
|
| 31 |
+
from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
|
| 32 |
+
except ImportError as e:
|
| 33 |
+
logger.error(f"Failed to import custom modules: {e}")
|
| 34 |
+
raise
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
import spaces
|
| 38 |
+
except ImportError:
|
| 39 |
+
# Fallback for local development
|
| 40 |
+
class spaces:  # local fallback shim: exposes a no-op GPU decorator so @spaces.GPU still works
|
| 41 |
+
GPU = staticmethod(lambda func: func)
|
| 42 |
+
|
| 43 |
+
# Constants
|
| 44 |
+
MAX_FRAMES = 80
|
| 45 |
+
COLORS = [(0, 0, 255), (0, 255, 255)] # BGR: Red for negative, Yellow for positive
|
| 46 |
+
MARKERS = [1, 5] # Cross for negative, Star for positive
|
| 47 |
+
MARKER_SIZE = 8
|
| 48 |
+
|
| 49 |
+
# Thread pool for delayed deletion
|
| 50 |
+
thread_pool_executor = ThreadPoolExecutor(max_workers=2)
|
| 51 |
+
|
| 52 |
+
def delete_later(path: Union[str, os.PathLike], delay: int = 600):
|
| 53 |
+
"""Delete file or directory after specified delay (default 10 minutes)"""
|
| 54 |
+
def _delete():
|
| 55 |
+
try:
|
| 56 |
+
if os.path.isfile(path):
|
| 57 |
+
os.remove(path)
|
| 58 |
+
elif os.path.isdir(path):
|
| 59 |
+
shutil.rmtree(path)
|
| 60 |
+
except Exception as e:
|
| 61 |
+
logger.warning(f"Failed to delete {path}: {e}")
|
| 62 |
+
|
| 63 |
+
def _wait_and_delete():
|
| 64 |
+
time.sleep(delay)
|
| 65 |
+
_delete()
|
| 66 |
+
|
| 67 |
+
thread_pool_executor.submit(_wait_and_delete)
|
| 68 |
+
atexit.register(_delete)
|
| 69 |
+
|
| 70 |
+
def create_user_temp_dir():
|
| 71 |
+
"""Create a unique temporary directory for each user session"""
|
| 72 |
+
session_id = str(uuid.uuid4())[:8] # Short unique ID
|
| 73 |
+
temp_dir = os.path.join("temp_local", f"session_{session_id}")
|
| 74 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 75 |
+
|
| 76 |
+
# Schedule deletion after 10 minutes
|
| 77 |
+
delete_later(temp_dir, delay=600)
|
| 78 |
+
|
| 79 |
+
return temp_dir
|
| 80 |
+
|
| 81 |
+
# Initialize VGGT model
|
| 82 |
+
try:
|
| 83 |
+
import vggt
|
| 84 |
+
except:
|
| 85 |
+
subprocess.run(["pip", "install", "-e", "./models/vggt"], check=True)
|
| 86 |
+
|
| 87 |
+
from huggingface_hub import hf_hub_download
|
| 88 |
+
os.environ["VGGT_DIR"] = hf_hub_download("facebook/VGGT-1B", "model.pt")
|
| 89 |
+
|
| 90 |
+
if os.environ.get("VGGT_DIR", None) is not None:
|
| 91 |
+
from vggt.models.vggt import VGGT
|
| 92 |
+
from vggt.utils.load_fn import preprocess_image
|
| 93 |
+
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
|
| 94 |
+
vggt_model = VGGT()
|
| 95 |
+
vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")))
|
| 96 |
+
vggt_model.eval()
|
| 97 |
+
vggt_model = vggt_model.to("cuda")
|
| 98 |
+
|
| 99 |
+
# Global model initialization
|
| 100 |
+
print("🚀 Initializing local models...")
|
| 101 |
+
tracker_model, _ = get_tracker_predictor(".", vo_points=756)
|
| 102 |
+
predictor = get_sam_predictor()
|
| 103 |
+
print("✅ Models loaded successfully!")
|
| 104 |
+
|
| 105 |
+
gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
|
| 106 |
+
|
| 107 |
+
@spaces.GPU
|
| 108 |
+
def gpu_run_inference(predictor_arg, image, points, boxes):
|
| 109 |
+
"""GPU-accelerated SAM inference"""
|
| 110 |
+
if predictor_arg is None:
|
| 111 |
+
print("Initializing SAM predictor inside GPU function...")
|
| 112 |
+
predictor_arg = get_sam_predictor(predictor=predictor)
|
| 113 |
+
|
| 114 |
+
# Ensure predictor is on GPU
|
| 115 |
+
try:
|
| 116 |
+
if hasattr(predictor_arg, 'model'):
|
| 117 |
+
predictor_arg.model = predictor_arg.model.cuda()
|
| 118 |
+
elif hasattr(predictor_arg, 'sam'):
|
| 119 |
+
predictor_arg.sam = predictor_arg.sam.cuda()
|
| 120 |
+
elif hasattr(predictor_arg, 'to'):
|
| 121 |
+
predictor_arg = predictor_arg.to('cuda')
|
| 122 |
+
|
| 123 |
+
if hasattr(image, 'cuda'):
|
| 124 |
+
image = image.cuda()
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
print(f"Warning: Could not move predictor to GPU: {e}")
|
| 128 |
+
|
| 129 |
+
return run_inference(predictor_arg, image, points, boxes)
|
| 130 |
+
|
| 131 |
+
@spaces.GPU
|
| 132 |
+
def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name, grid_size, vo_points, fps):
|
| 133 |
+
"""GPU-accelerated tracking"""
|
| 134 |
+
import torchvision.transforms as T
|
| 135 |
+
import decord
|
| 136 |
+
|
| 137 |
+
if tracker_model_arg is None or tracker_viser_arg is None:
|
| 138 |
+
print("Initializing tracker models inside GPU function...")
|
| 139 |
+
out_dir = os.path.join(temp_dir, "results")
|
| 140 |
+
os.makedirs(out_dir, exist_ok=True)
|
| 141 |
+
tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points, tracker_model=tracker_model)
|
| 142 |
+
|
| 143 |
+
# Setup paths
|
| 144 |
+
video_path = os.path.join(temp_dir, f"{video_name}.mp4")
|
| 145 |
+
mask_path = os.path.join(temp_dir, f"{video_name}.png")
|
| 146 |
+
out_dir = os.path.join(temp_dir, "results")
|
| 147 |
+
os.makedirs(out_dir, exist_ok=True)
|
| 148 |
+
|
| 149 |
+
# Load video using decord
|
| 150 |
+
video_reader = decord.VideoReader(video_path)
|
| 151 |
+
video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)
|
| 152 |
+
|
| 153 |
+
# Downscale (if needed) so the shorter side is at most 224
|
| 154 |
+
h, w = video_tensor.shape[2:]
|
| 155 |
+
scale = max(224 / h, 224 / w)
|
| 156 |
+
if scale < 1:
|
| 157 |
+
new_h, new_w = int(h * scale), int(w * scale)
|
| 158 |
+
video_tensor = T.Resize((new_h, new_w))(video_tensor)
|
| 159 |
+
video_tensor = video_tensor[::fps].float()[:MAX_FRAMES]
|
| 160 |
+
|
| 161 |
+
# Move to GPU
|
| 162 |
+
video_tensor = video_tensor.cuda()
|
| 163 |
+
print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")
|
| 164 |
+
|
| 165 |
+
depth_tensor = None
|
| 166 |
+
intrs = None
|
| 167 |
+
extrs = None
|
| 168 |
+
data_npz_load = {}
|
| 169 |
+
|
| 170 |
+
# Run VGGT for depth and camera estimation
|
| 171 |
+
if os.environ.get("VGGT_DIR", None) is not None:
|
| 172 |
+
video_tensor = preprocess_image(video_tensor)[None]
|
| 173 |
+
with torch.no_grad():
|
| 174 |
+
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
|
| 175 |
+
|
| 176 |
+
#TODO: remove this
|
| 177 |
+
single_frame=False
|
| 178 |
+
if single_frame==True:
|
| 179 |
+
video_tensor = rearrange(video_tensor, "b s c h w -> (b s) 1 c h w")
|
| 180 |
+
|
| 181 |
+
aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
|
| 182 |
+
pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
|
| 183 |
+
extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, video_tensor.shape[-2:])
|
| 184 |
+
depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_tensor.cuda()/255, ps_idx)
|
| 185 |
+
|
| 186 |
+
#TODO: remove this
|
| 187 |
+
if single_frame==True:
|
| 188 |
+
video_tensor = rearrange(video_tensor, "(b s) 1 c h w -> b s c h w", b=1)
|
| 189 |
+
depth_map = rearrange(depth_map, "(b s) 1 h w c -> b s h w c", b=video_tensor.shape[0])
|
| 190 |
+
depth_conf = rearrange(depth_conf, "(b s) 1 h w -> b s h w", b=video_tensor.shape[0])
|
| 191 |
+
extrinsic = rearrange(extrinsic, "(b s) 1 e f -> b s e f", b=1)
|
| 192 |
+
intrinsic = rearrange(intrinsic, "(b s) 1 e f -> b s e f", b=1)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
depth_tensor = depth_map.squeeze().cpu().numpy()
|
| 196 |
+
extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
|
| 197 |
+
extrs[:, :3, :4] = extrinsic.squeeze().cpu().numpy()
|
| 198 |
+
intrs = intrinsic.squeeze().cpu().numpy()
|
| 199 |
+
video_tensor = video_tensor.squeeze()
|
| 200 |
+
threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
|
| 201 |
+
unc_metric = depth_conf.squeeze().cpu().numpy() > threshold
|
| 202 |
+
|
| 203 |
+
# Load and process mask
|
| 204 |
+
if os.path.exists(mask_path):
|
| 205 |
+
mask = cv2.imread(mask_path)
|
| 206 |
+
mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
|
| 207 |
+
mask = mask.sum(axis=-1)>0
|
| 208 |
+
else:
|
| 209 |
+
mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
|
| 210 |
+
grid_size = 10
|
| 211 |
+
|
| 212 |
+
# Get frame dimensions and create grid points
|
| 213 |
+
frame_H, frame_W = video_tensor.shape[2:]
|
| 214 |
+
grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")
|
| 215 |
+
|
| 216 |
+
# Sample mask values at grid points and filter
|
| 217 |
+
if os.path.exists(mask_path):
|
| 218 |
+
grid_pts_int = grid_pts[0].long()
|
| 219 |
+
mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
|
| 220 |
+
grid_pts = grid_pts[:, mask_values]
|
| 221 |
+
|
| 222 |
+
query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
|
| 223 |
+
print(f"Query points shape: {query_xyt.shape}")
|
| 224 |
+
|
| 225 |
+
# Run model inference
|
| 226 |
+
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
|
| 227 |
+
(
|
| 228 |
+
c2w_traj, intrs, point_map, conf_depth,
|
| 229 |
+
track3d_pred, track2d_pred, vis_pred, conf_pred, video
|
| 230 |
+
) = tracker_model_arg.forward(video_tensor, depth=depth_tensor,
|
| 231 |
+
intrs=intrs, extrs=extrs,
|
| 232 |
+
queries=query_xyt,
|
| 233 |
+
fps=1, full_point=False, iters_track=4,
|
| 234 |
+
query_no_BA=True, fixed_cam=False, stage=1,
|
| 235 |
+
support_frame=len(video_tensor)-1, replace_ratio=0.2)
|
| 236 |
+
|
| 237 |
+
# Resize results to avoid large I/O
|
| 238 |
+
max_size = 224
|
| 239 |
+
h, w = video.shape[2:]
|
| 240 |
+
scale = min(max_size / h, max_size / w)
|
| 241 |
+
if scale < 1:
|
| 242 |
+
new_h, new_w = int(h * scale), int(w * scale)
|
| 243 |
+
video = T.Resize((new_h, new_w))(video)
|
| 244 |
+
video_tensor = T.Resize((new_h, new_w))(video_tensor)
|
| 245 |
+
point_map = T.Resize((new_h, new_w))(point_map)
|
| 246 |
+
track2d_pred[...,:2] = track2d_pred[...,:2] * scale
|
| 247 |
+
intrs[:,:2,:] = intrs[:,:2,:] * scale
|
| 248 |
+
conf_depth = T.Resize((new_h, new_w))(conf_depth)
|
| 249 |
+
|
| 250 |
+
# Visualize tracks
|
| 251 |
+
tracker_viser_arg.visualize(video=video[None],
|
| 252 |
+
tracks=track2d_pred[None][...,:2],
|
| 253 |
+
visibility=vis_pred[None],filename="test")
|
| 254 |
+
|
| 255 |
+
# Save in tapip3d format
|
| 256 |
+
data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
|
| 257 |
+
data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
|
| 258 |
+
data_npz_load["intrinsics"] = intrs.cpu().numpy()
|
| 259 |
+
data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
|
| 260 |
+
data_npz_load["video"] = (video_tensor).cpu().numpy()/255
|
| 261 |
+
data_npz_load["visibs"] = vis_pred.cpu().numpy()
|
| 262 |
+
data_npz_load["confs"] = conf_pred.cpu().numpy()
|
| 263 |
+
data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
|
| 264 |
+
np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
|
| 265 |
+
|
| 266 |
+
return None
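
For reference, the `result.npz` written just above can be inspected offline; a small sketch (the path is illustrative, since results land in the per-session temp directory):

```python
import numpy as np

# Path is illustrative: results are written to <session_temp_dir>/results/result.npz
data = np.load("temp_local/session_xxxxxxxx/results/result.npz")
print(list(data.keys()))
# ['coords', 'extrinsics', 'intrinsics', 'depths', 'video', 'visibs', 'confs', 'confs_depth']

coords = data["coords"]          # (T, N, 3) 3D tracks in the world frame
extrinsics = data["extrinsics"]  # (T, 4, 4) world-to-camera matrices
depths = data["depths"]          # (T, H, W) per-frame depth maps
video = data["video"]            # (T, C, H, W) RGB, scaled to [0, 1]
```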
|
| 267 |
+
|
| 268 |
+
def compress_and_write(filename, header, blob):
|
| 269 |
+
header_bytes = json.dumps(header).encode("utf-8")
|
| 270 |
+
header_len = struct.pack("<I", len(header_bytes))
|
| 271 |
+
with open(filename, "wb") as f:
|
| 272 |
+
f.write(header_len)
|
| 273 |
+
f.write(header_bytes)
|
| 274 |
+
f.write(blob)
|
| 275 |
+
|
| 276 |
+
def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
|
| 277 |
+
fixed_size = (width, height)
|
| 278 |
+
|
| 279 |
+
data = np.load(npz_file)
|
| 280 |
+
extrinsics = data["extrinsics"]
|
| 281 |
+
intrinsics = data["intrinsics"]
|
| 282 |
+
trajs = data["coords"]
|
| 283 |
+
T, C, H, W = data["video"].shape
|
| 284 |
+
|
| 285 |
+
fx = intrinsics[0, 0, 0]
|
| 286 |
+
fy = intrinsics[0, 1, 1]
|
| 287 |
+
fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
|
| 288 |
+
fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
|
| 289 |
+
original_aspect_ratio = (W / fx) / (H / fy)
|
| 290 |
+
|
| 291 |
+
rgb_video = (rearrange(data["video"], "T C H W -> T H W C") * 255).astype(np.uint8)
|
| 292 |
+
rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
|
| 293 |
+
for frame in rgb_video])
|
| 294 |
+
|
| 295 |
+
depth_video = data["depths"].astype(np.float32)
|
| 296 |
+
if "confs_depth" in data.keys():
|
| 297 |
+
confs = (data["confs_depth"].astype(np.float32) > 0.5).astype(np.float32)
|
| 298 |
+
depth_video = depth_video * confs
|
| 299 |
+
depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
|
| 300 |
+
for frame in depth_video])
|
| 301 |
+
|
| 302 |
+
scale_x = fixed_size[0] / W
|
| 303 |
+
scale_y = fixed_size[1] / H
|
| 304 |
+
intrinsics = intrinsics.copy()
|
| 305 |
+
intrinsics[:, 0, :] *= scale_x
|
| 306 |
+
intrinsics[:, 1, :] *= scale_y
|
| 307 |
+
|
| 308 |
+
min_depth = float(depth_video.min()) * 0.8
|
| 309 |
+
max_depth = float(depth_video.max()) * 1.5
|
| 310 |
+
|
| 311 |
+
depth_normalized = (depth_video - min_depth) / (max_depth - min_depth)
|
| 312 |
+
depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)
|
| 313 |
+
|
| 314 |
+
depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
|
| 315 |
+
depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
|
| 316 |
+
depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)
|
| 317 |
+
|
| 318 |
+
first_frame_inv = np.linalg.inv(extrinsics[0])
|
| 319 |
+
normalized_extrinsics = np.array([first_frame_inv @ ext for ext in extrinsics])
|
| 320 |
+
|
| 321 |
+
normalized_trajs = np.zeros_like(trajs)
|
| 322 |
+
for t in range(T):
|
| 323 |
+
homogeneous_trajs = np.concatenate([trajs[t], np.ones((trajs.shape[1], 1))], axis=1)
|
| 324 |
+
transformed_trajs = (first_frame_inv @ homogeneous_trajs.T).T
|
| 325 |
+
normalized_trajs[t] = transformed_trajs[:, :3]
|
| 326 |
+
|
| 327 |
+
arrays = {
|
| 328 |
+
"rgb_video": rgb_video,
|
| 329 |
+
"depths_rgb": depths_rgb,
|
| 330 |
+
"intrinsics": intrinsics,
|
| 331 |
+
"extrinsics": normalized_extrinsics,
|
| 332 |
+
"inv_extrinsics": np.linalg.inv(normalized_extrinsics),
|
| 333 |
+
"trajectories": normalized_trajs.astype(np.float32),
|
| 334 |
+
"cameraZ": 0.0
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
header = {}
|
| 338 |
+
blob_parts = []
|
| 339 |
+
offset = 0
|
| 340 |
+
for key, arr in arrays.items():
|
| 341 |
+
arr = np.ascontiguousarray(arr)
|
| 342 |
+
arr_bytes = arr.tobytes()
|
| 343 |
+
header[key] = {
|
| 344 |
+
"dtype": str(arr.dtype),
|
| 345 |
+
"shape": arr.shape,
|
| 346 |
+
"offset": offset,
|
| 347 |
+
"length": len(arr_bytes)
|
| 348 |
+
}
|
| 349 |
+
blob_parts.append(arr_bytes)
|
| 350 |
+
offset += len(arr_bytes)
|
| 351 |
+
|
| 352 |
+
raw_blob = b"".join(blob_parts)
|
| 353 |
+
compressed_blob = zlib.compress(raw_blob, level=9)
|
| 354 |
+
|
| 355 |
+
header["meta"] = {
|
| 356 |
+
"depthRange": [min_depth, max_depth],
|
| 357 |
+
"totalFrames": int(T),
|
| 358 |
+
"resolution": fixed_size,
|
| 359 |
+
"baseFrameRate": fps,
|
| 360 |
+
"numTrajectoryPoints": normalized_trajs.shape[1],
|
| 361 |
+
"fov": float(fov_y),
|
| 362 |
+
"fov_x": float(fov_x),
|
| 363 |
+
"original_aspect_ratio": float(original_aspect_ratio),
|
| 364 |
+
"fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
compress_and_write('./_viz/data.bin', header, compressed_blob)
|
| 368 |
+
with open('./_viz/data.bin', "rb") as f:
|
| 369 |
+
encoded_blob = base64.b64encode(f.read()).decode("ascii")
|
| 370 |
+
os.unlink('./_viz/data.bin')
|
| 371 |
+
|
| 372 |
+
random_path = f'./_viz/_{time.time()}.html'
|
| 373 |
+
with open('./_viz/viz_template.html') as f:
|
| 374 |
+
html_template = f.read()
|
| 375 |
+
html_out = html_template.replace(
|
| 376 |
+
"<head>",
|
| 377 |
+
f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
|
| 378 |
+
)
|
| 379 |
+
with open(random_path,'w') as f:
|
| 380 |
+
f.write(html_out)
|
| 381 |
+
|
| 382 |
+
return random_path
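
The `data.bin` payload produced above is a simple container: a 4-byte little-endian header length, a JSON header describing each array (dtype, shape, offset, length), and a zlib-compressed blob of the concatenated array bytes, which is then base64-embedded into the viewer HTML. A sketch of a matching reader (reader-side code is not part of this commit):

```python
import json
import struct
import zlib

import numpy as np

def read_packed_arrays(path):
    """Decode a file written by compress_and_write(): length-prefixed JSON header
    followed by a zlib-compressed blob of concatenated array bytes."""
    with open(path, "rb") as f:
        header_len = struct.unpack("<I", f.read(4))[0]
        header = json.loads(f.read(header_len).decode("utf-8"))
        blob = zlib.decompress(f.read())

    arrays = {}
    for key, meta in header.items():
        if key == "meta":  # global metadata (depthRange, fov, ...), not an array
            continue
        chunk = blob[meta["offset"]:meta["offset"] + meta["length"]]
        arrays[key] = np.frombuffer(chunk, dtype=np.dtype(meta["dtype"])).reshape(meta["shape"])
    return arrays, header["meta"]
```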
|
| 383 |
+
|
| 384 |
+
def numpy_to_base64(arr):
|
| 385 |
+
"""Convert numpy array to base64 string"""
|
| 386 |
+
return base64.b64encode(arr.tobytes()).decode('utf-8')
|
| 387 |
+
|
| 388 |
+
def base64_to_numpy(b64_str, shape, dtype):
|
| 389 |
+
"""Convert base64 string back to numpy array"""
|
| 390 |
+
return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
|
| 391 |
+
|
| 392 |
+
def get_video_name(video_path):
|
| 393 |
+
"""Extract video name without extension"""
|
| 394 |
+
return os.path.splitext(os.path.basename(video_path))[0]
|
| 395 |
+
|
| 396 |
+
def extract_first_frame(video_path):
|
| 397 |
+
"""Extract first frame from video file"""
|
| 398 |
+
try:
|
| 399 |
+
cap = cv2.VideoCapture(video_path)
|
| 400 |
+
ret, frame = cap.read()
|
| 401 |
+
cap.release()
|
| 402 |
+
|
| 403 |
+
if ret:
|
| 404 |
+
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
| 405 |
+
return frame_rgb
|
| 406 |
+
else:
|
| 407 |
+
return None
|
| 408 |
+
except Exception as e:
|
| 409 |
+
print(f"Error extracting first frame: {e}")
|
| 410 |
+
return None
|
| 411 |
+
|
| 412 |
+
def handle_video_upload(video):
|
| 413 |
+
"""Handle video upload and extract first frame"""
|
| 414 |
+
if video is None:
|
| 415 |
+
return None, None, [], 50, 756, 3
|
| 416 |
+
|
| 417 |
+
# Create user-specific temporary directory
|
| 418 |
+
user_temp_dir = create_user_temp_dir()
|
| 419 |
+
|
| 420 |
+
# Get original video name and copy to temp directory
|
| 421 |
+
if isinstance(video, str):
|
| 422 |
+
video_name = get_video_name(video)
|
| 423 |
+
video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
|
| 424 |
+
shutil.copy(video, video_path)
|
| 425 |
+
else:
|
| 426 |
+
video_name = get_video_name(video.name)
|
| 427 |
+
video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
|
| 428 |
+
with open(video_path, 'wb') as f:
|
| 429 |
+
f.write(video.read())
|
| 430 |
+
|
| 431 |
+
print(f"📁 Video saved to: {video_path}")
|
| 432 |
+
|
| 433 |
+
# Extract first frame
|
| 434 |
+
frame = extract_first_frame(video_path)
|
| 435 |
+
if frame is None:
|
| 436 |
+
return None, None, [], 50, 756, 3
|
| 437 |
+
|
| 438 |
+
# Resize frame to have minimum side length of 336
|
| 439 |
+
h, w = frame.shape[:2]
|
| 440 |
+
scale = 336 / min(h, w)
|
| 441 |
+
new_h, new_w = int(h * scale)//2*2, int(w * scale)//2*2
|
| 442 |
+
frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
|
| 443 |
+
|
| 444 |
+
# Store frame data with temp directory info
|
| 445 |
+
frame_data = {
|
| 446 |
+
'data': numpy_to_base64(frame),
|
| 447 |
+
'shape': frame.shape,
|
| 448 |
+
'dtype': str(frame.dtype),
|
| 449 |
+
'temp_dir': user_temp_dir,
|
| 450 |
+
'video_name': video_name,
|
| 451 |
+
'video_path': video_path
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
# Get video-specific settings
|
| 455 |
+
print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
|
| 456 |
+
grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
|
| 457 |
+
print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
|
| 458 |
+
|
| 459 |
+
return (json.dumps(frame_data), frame, [],
|
| 460 |
+
gr.update(value=grid_size_val),
|
| 461 |
+
gr.update(value=vo_points_val),
|
| 462 |
+
gr.update(value=fps_val))
|
| 463 |
+
|
| 464 |
+
def save_masks(o_masks, video_name, temp_dir):
|
| 465 |
+
"""Save binary masks to files in user-specific temp directory"""
|
| 466 |
+
o_files = []
|
| 467 |
+
for mask, _ in o_masks:
|
| 468 |
+
o_mask = np.uint8(mask.squeeze() * 255)
|
| 469 |
+
o_file = os.path.join(temp_dir, f"{video_name}.png")
|
| 470 |
+
cv2.imwrite(o_file, o_mask)
|
| 471 |
+
o_files.append(o_file)
|
| 472 |
+
return o_files
|
| 473 |
+
|
| 474 |
+
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
|
| 475 |
+
"""Handle point selection for SAM"""
|
| 476 |
+
if original_img is None:
|
| 477 |
+
return None, []
|
| 478 |
+
|
| 479 |
+
try:
|
| 480 |
+
# Convert stored image data back to numpy array
|
| 481 |
+
frame_data = json.loads(original_img)
|
| 482 |
+
original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
|
| 483 |
+
temp_dir = frame_data.get('temp_dir', 'temp_local')
|
| 484 |
+
video_name = frame_data.get('video_name', 'video')
|
| 485 |
+
|
| 486 |
+
# Create a display image for visualization
|
| 487 |
+
display_img = original_img_array.copy()
|
| 488 |
+
new_sel_pix = sel_pix.copy() if sel_pix else []
|
| 489 |
+
new_sel_pix.append((evt.index, 1 if point_type == 'positive_point' else 0))
|
| 490 |
+
|
| 491 |
+
print(f"🎯 Running SAM inference for point: {evt.index}, type: {point_type}")
|
| 492 |
+
# Run SAM inference
|
| 493 |
+
o_masks = gpu_run_inference(None, original_img_array, new_sel_pix, [])
|
| 494 |
+
|
| 495 |
+
# Draw points on display image
|
| 496 |
+
for point, label in new_sel_pix:
|
| 497 |
+
cv2.drawMarker(display_img, point, COLORS[label], markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)
|
| 498 |
+
|
| 499 |
+
# Draw mask overlay on display image
|
| 500 |
+
if o_masks:
|
| 501 |
+
mask = o_masks[0][0]
|
| 502 |
+
overlay = display_img.copy()
|
| 503 |
+
overlay[mask.squeeze()!=0] = [20, 60, 200] # Light blue
|
| 504 |
+
display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
|
| 505 |
+
|
| 506 |
+
# Save mask for tracking
|
| 507 |
+
save_masks(o_masks, video_name, temp_dir)
|
| 508 |
+
print(f"✅ Mask saved for video: {video_name}")
|
| 509 |
+
|
| 510 |
+
return display_img, new_sel_pix
|
| 511 |
+
|
| 512 |
+
except Exception as e:
|
| 513 |
+
print(f"❌ Error in select_point: {e}")
|
| 514 |
+
return None, []
|
| 515 |
+
|
| 516 |
+
def reset_points(original_img: str, sel_pix):
|
| 517 |
+
"""Reset all points and clear the mask"""
|
| 518 |
+
if original_img is None:
|
| 519 |
+
return None, []
|
| 520 |
+
|
| 521 |
+
try:
|
| 522 |
+
# Convert stored image data back to numpy array
|
| 523 |
+
frame_data = json.loads(original_img)
|
| 524 |
+
original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
|
| 525 |
+
temp_dir = frame_data.get('temp_dir', 'temp_local')
|
| 526 |
+
|
| 527 |
+
# Create a display image (just the original image)
|
| 528 |
+
display_img = original_img_array.copy()
|
| 529 |
+
|
| 530 |
+
# Clear all points
|
| 531 |
+
new_sel_pix = []
|
| 532 |
+
|
| 533 |
+
# Clear any existing masks
|
| 534 |
+
for mask_file in glob.glob(os.path.join(temp_dir, "*.png")):
|
| 535 |
+
try:
|
| 536 |
+
os.remove(mask_file)
|
| 537 |
+
except Exception as e:
|
| 538 |
+
logger.warning(f"Failed to remove mask file {mask_file}: {e}")
|
| 539 |
+
|
| 540 |
+
print("🔄 Points and masks reset")
|
| 541 |
+
return display_img, new_sel_pix
|
| 542 |
+
|
| 543 |
+
except Exception as e:
|
| 544 |
+
print(f"❌ Error in reset_points: {e}")
|
| 545 |
+
return None, []
|
| 546 |
+
|
| 547 |
+
def launch_viz(grid_size, vo_points, fps, original_image_state):
|
| 548 |
+
"""Launch visualization with user-specific temp directory"""
|
| 549 |
+
if original_image_state is None:
|
| 550 |
+
return None, None
|
| 551 |
+
|
| 552 |
+
try:
|
| 553 |
+
# Get user's temp directory from stored frame data
|
| 554 |
+
frame_data = json.loads(original_image_state)
|
| 555 |
+
temp_dir = frame_data.get('temp_dir', 'temp_local')
|
| 556 |
+
video_name = frame_data.get('video_name', 'video')
|
| 557 |
+
|
| 558 |
+
print(f"🚀 Starting tracking for video: {video_name}")
|
| 559 |
+
print(f"📊 Parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
|
| 560 |
+
|
| 561 |
+
# Check for mask files
|
| 562 |
+
mask_files = glob.glob(os.path.join(temp_dir, "*.png"))
|
| 563 |
+
video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
|
| 564 |
+
|
| 565 |
+
if not video_files:
|
| 566 |
+
print("❌ No video file found")
|
| 567 |
+
return "❌ Error: No video file found", None
|
| 568 |
+
|
| 569 |
+
video_path = video_files[0]
|
| 570 |
+
mask_path = mask_files[0] if mask_files else None
|
| 571 |
+
|
| 572 |
+
# Run tracker
|
| 573 |
+
print("🎯 Running tracker...")
|
| 574 |
+
out_dir = os.path.join(temp_dir, "results")
|
| 575 |
+
os.makedirs(out_dir, exist_ok=True)
|
| 576 |
+
|
| 577 |
+
gpu_run_tracker(None, None, temp_dir, video_name, grid_size, vo_points, fps)
|
| 578 |
+
|
| 579 |
+
# Process results
|
| 580 |
+
npz_path = os.path.join(out_dir, "result.npz")
|
| 581 |
+
track2d_video = os.path.join(out_dir, "test_pred_track.mp4")
|
| 582 |
+
|
| 583 |
+
if os.path.exists(npz_path):
|
| 584 |
+
print("📊 Processing 3D visualization...")
|
| 585 |
+
html_path = process_point_cloud_data(npz_path)
|
| 586 |
+
|
| 587 |
+
# Schedule deletion of generated files
|
| 588 |
+
delete_later(html_path, delay=600)
|
| 589 |
+
if os.path.exists(track2d_video):
|
| 590 |
+
delete_later(track2d_video, delay=600)
|
| 591 |
+
delete_later(npz_path, delay=600)
|
| 592 |
+
|
| 593 |
+
# Create iframe HTML
|
| 594 |
+
iframe_html = f"""
|
| 595 |
+
<div style='border: 3px solid #667eea; border-radius: 10px; overflow: hidden; box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);'>
|
| 596 |
+
<iframe id="viz_iframe" src="/gradio_api/file={html_path}" width="100%" height="950px" style="border:none;"></iframe>
|
| 597 |
+
</div>
|
| 598 |
+
"""
|
| 599 |
+
|
| 600 |
+
print("✅ Tracking completed successfully!")
|
| 601 |
+
return iframe_html, track2d_video if os.path.exists(track2d_video) else None
|
| 602 |
+
else:
|
| 603 |
+
print("❌ Tracking failed - no results generated")
|
| 604 |
+
return "❌ Error: Tracking failed to generate results", None
|
| 605 |
+
|
| 606 |
+
except Exception as e:
|
| 607 |
+
print(f"❌ Error in launch_viz: {e}")
|
| 608 |
+
return f"❌ Error: {str(e)}", None
|
| 609 |
+
|
| 610 |
+
def clear_all():
|
| 611 |
+
"""Clear all buffers and temporary files"""
|
| 612 |
+
return (None, None, [],
|
| 613 |
+
gr.update(value=50),
|
| 614 |
+
gr.update(value=756),
|
| 615 |
+
gr.update(value=3))
|
| 616 |
+
|
| 617 |
+
def get_video_settings(video_name):
|
| 618 |
+
"""Get video-specific settings based on video name"""
|
| 619 |
+
video_settings = {
|
| 620 |
+
"kiss": (45, 700, 10),
|
| 621 |
+
"backpack": (40, 600, 2),
|
| 622 |
+
"kitchen": (60, 800, 3),
|
| 623 |
+
"pillow": (35, 500, 2),
|
| 624 |
+
"handwave": (35, 500, 8),
|
| 625 |
+
"hockey": (45, 700, 2),
|
| 626 |
+
"drifting": (35, 1000, 6),
|
| 627 |
+
"basketball": (45, 1500, 5),
|
| 628 |
+
"ken_block_0": (45, 700, 2),
|
| 629 |
+
"ego_kc1": (45, 500, 4),
|
| 630 |
+
"vertical_place": (45, 500, 3),
|
| 631 |
+
"ego_teaser": (45, 1200, 10),
|
| 632 |
+
"robot_unitree": (45, 500, 4),
|
| 633 |
+
"droid_robot": (35, 400, 5),
|
| 634 |
+
"robot_2": (45, 256, 5),
|
| 635 |
+
"cinema_0": (45, 356, 5),
|
| 636 |
+
"cinema_1": (45, 756, 3),
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
return video_settings.get(video_name, (50, 756, 3))
|
| 640 |
+
|
| 641 |
+
# Create the Gradio interface
|
| 642 |
+
print("🎨 Creating Gradio interface...")
|
| 643 |
+
|
| 644 |
+
with gr.Blocks(
|
| 645 |
+
theme=gr.themes.Soft(),
|
| 646 |
+
title="SpatialTracker V2 - Local",
|
| 647 |
+
css="""
|
| 648 |
+
.gradio-container {
|
| 649 |
+
max-width: 1200px !important;
|
| 650 |
+
margin: auto !important;
|
| 651 |
+
}
|
| 652 |
+
.gr-button {
|
| 653 |
+
margin: 5px;
|
| 654 |
+
}
|
| 655 |
+
.gr-form {
|
| 656 |
+
background: white;
|
| 657 |
+
border-radius: 10px;
|
| 658 |
+
padding: 20px;
|
| 659 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 660 |
+
}
|
| 661 |
+
.gr-video {
|
| 662 |
+
height: 300px !important;
|
| 663 |
+
min-height: 300px !important;
|
| 664 |
+
max-height: 300px !important;
|
| 665 |
+
}
|
| 666 |
+
.gr-video video {
|
| 667 |
+
height: 260px !important;
|
| 668 |
+
max-height: 260px !important;
|
| 669 |
+
object-fit: contain !important;
|
| 670 |
+
background: #f8f9fa;
|
| 671 |
+
}
|
| 672 |
+
.horizontal-examples .gr-examples {
|
| 673 |
+
overflow: visible !important;
|
| 674 |
+
}
|
| 675 |
+
.horizontal-examples .gr-examples .gr-table-wrapper {
|
| 676 |
+
overflow-x: auto !important;
|
| 677 |
+
overflow-y: hidden !important;
|
| 678 |
+
scrollbar-width: thin;
|
| 679 |
+
scrollbar-color: #667eea #f1f1f1;
|
| 680 |
+
padding: 10px 0;
|
| 681 |
+
}
|
| 682 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar {
|
| 683 |
+
height: 8px;
|
| 684 |
+
}
|
| 685 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
|
| 686 |
+
background: #f1f1f1;
|
| 687 |
+
border-radius: 4px;
|
| 688 |
+
}
|
| 689 |
+
.horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
|
| 690 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 691 |
+
border-radius: 4px;
|
| 692 |
+
}
|
| 693 |
+
.horizontal-examples .gr-examples .gr-table {
|
| 694 |
+
display: flex !important;
|
| 695 |
+
flex-wrap: nowrap !important;
|
| 696 |
+
min-width: max-content !important;
|
| 697 |
+
gap: 15px !important;
|
| 698 |
+
padding-bottom: 10px;
|
| 699 |
+
}
|
| 700 |
+
.horizontal-examples .gr-examples .gr-table tbody {
|
| 701 |
+
display: flex !important;
|
| 702 |
+
flex-direction: row !important;
|
| 703 |
+
flex-wrap: nowrap !important;
|
| 704 |
+
gap: 15px !important;
|
| 705 |
+
}
|
| 706 |
+
.horizontal-examples .gr-examples .gr-table tbody tr {
|
| 707 |
+
display: flex !important;
|
| 708 |
+
flex-direction: column !important;
|
| 709 |
+
min-width: 160px !important;
|
| 710 |
+
max-width: 160px !important;
|
| 711 |
+
margin: 0 !important;
|
| 712 |
+
background: white;
|
| 713 |
+
border-radius: 12px;
|
| 714 |
+
box-shadow: 0 3px 12px rgba(0,0,0,0.12);
|
| 715 |
+
transition: all 0.3s ease;
|
| 716 |
+
cursor: pointer;
|
| 717 |
+
overflow: hidden;
|
| 718 |
+
}
|
| 719 |
+
.horizontal-examples .gr-examples .gr-table tbody tr:hover {
|
| 720 |
+
transform: translateY(-4px);
|
| 721 |
+
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
|
| 722 |
+
}
|
| 723 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td {
|
| 724 |
+
text-align: center !important;
|
| 725 |
+
padding: 0 !important;
|
| 726 |
+
border: none !important;
|
| 727 |
+
}
|
| 728 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td:first-child {
|
| 729 |
+
padding: 0 !important;
|
| 730 |
+
}
|
| 731 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td video {
|
| 732 |
+
border-radius: 8px 8px 0 0 !important;
|
| 733 |
+
width: 100% !important;
|
| 734 |
+
height: 90px !important;
|
| 735 |
+
object-fit: cover !important;
|
| 736 |
+
}
|
| 737 |
+
.horizontal-examples .gr-examples .gr-table tbody tr td:last-child {
|
| 738 |
+
font-size: 11px !important;
|
| 739 |
+
font-weight: 600 !important;
|
| 740 |
+
color: #333 !important;
|
| 741 |
+
padding: 8px 12px !important;
|
| 742 |
+
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
|
| 743 |
+
border-radius: 0 0 8px 8px;
|
| 744 |
+
}
|
| 745 |
+
"""
|
| 746 |
+
) as demo:
|
| 747 |
+
|
| 748 |
+
gr.Markdown("""
|
| 749 |
+
# 🎯 SpatialTracker V2 - Local Version
|
| 750 |
+
|
| 751 |
+
Welcome to SpatialTracker V2! This interface allows you to track any pixels in 3D using our model.
|
| 752 |
+
|
| 753 |
+
**Instructions:**
|
| 754 |
+
1. Upload a video file or select from examples below
|
| 755 |
+
2. Click on the object you want to track in the first frame
|
| 756 |
+
3. Adjust tracking parameters if needed
|
| 757 |
+
4. Click "Launch Visualization" to start tracking
|
| 758 |
+
|
| 759 |
+
""")
|
| 760 |
+
|
| 761 |
+
# Status indicator
|
| 762 |
+
gr.Markdown("**Status:** 🟢 Local Processing Mode")
|
| 763 |
+
gr.Markdown("<small style='color: #666;'>All processing runs locally with GPU acceleration</small>")
|
| 764 |
+
|
| 765 |
+
# GitHub Star Reminder
|
| 766 |
+
gr.HTML("""
|
| 767 |
+
<div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
|
| 768 |
+
border-radius: 10px;
|
| 769 |
+
padding: 15px;
|
| 770 |
+
margin: 15px 0;
|
| 771 |
+
box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
|
| 772 |
+
border: 1px solid rgba(102, 126, 234, 0.15);'>
|
| 773 |
+
<div style='text-align: center; color: #4a5568;'>
|
| 774 |
+
<h3 style='margin: 0 0 10px 0; font-size: 18px; text-shadow: none; color: #2d3748;'>
|
| 775 |
+
⭐ Love SpatialTracker? Give us a Star! ⭐
|
| 776 |
+
</h3>
|
| 777 |
+
<p style='margin: 0 0 12px 0; font-size: 14px; opacity: 0.8; color: #4a5568;'>
|
| 778 |
+
Help us grow by starring our repository on GitHub! 🚀
|
| 779 |
+
</p>
|
| 780 |
+
<div style='display: flex; justify-content: center;'>
|
| 781 |
+
<a href="https://github.com/henry123-boy/SpaTrackerV2"
|
| 782 |
+
target="_blank"
|
| 783 |
+
style='display: inline-flex;
|
| 784 |
+
align-items: center;
|
| 785 |
+
gap: 6px;
|
| 786 |
+
background: rgba(102, 126, 234, 0.1);
|
| 787 |
+
color: #4a5568;
|
| 788 |
+
padding: 8px 16px;
|
| 789 |
+
border-radius: 20px;
|
| 790 |
+
text-decoration: none;
|
| 791 |
+
font-weight: bold;
|
| 792 |
+
font-size: 14px;
|
| 793 |
+
backdrop-filter: blur(5px);
|
| 794 |
+
border: 1px solid rgba(102, 126, 234, 0.2);
|
| 795 |
+
transition: all 0.3s ease;'
|
| 796 |
+
onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-1px)'"
|
| 797 |
+
onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
|
| 798 |
+
<span style='font-size: 16px;'>⭐</span>
|
| 799 |
+
Star on GitHub
|
| 800 |
+
</a>
|
| 801 |
+
</div>
|
| 802 |
+
</div>
|
| 803 |
+
</div>
|
| 804 |
+
""")
|
| 805 |
+
|
| 806 |
+
# Example videos section
|
| 807 |
+
with gr.Group():
|
| 808 |
+
gr.Markdown("### 📂 Example Videos")
|
| 809 |
+
gr.Markdown("Try these example videos to get started quickly:")
|
| 810 |
+
|
| 811 |
+
gr.HTML("""
|
| 812 |
+
<div style='background-color: #f8f9ff; border-radius: 8px; padding: 10px; margin: 10px 0; border-left: 4px solid #667eea;'>
|
| 813 |
+
<p style='margin: 0; font-size: 13px; color: #666; display: flex; align-items: center; gap: 8px;'>
|
| 814 |
+
<span style='font-size: 16px;'>💡</span>
|
| 815 |
+
<strong>Tip:</strong> Scroll horizontally below to see all example videos
|
| 816 |
+
</p>
|
| 817 |
+
</div>
|
| 818 |
+
""")
|
| 819 |
+
|
| 820 |
+
video_input = gr.Video(
|
| 821 |
+
label="Upload Video or Select Example",
|
| 822 |
+
format="mp4",
|
| 823 |
+
height=300
|
| 824 |
+
)
|
| 825 |
+
|
| 826 |
+
with gr.Group(elem_classes=["horizontal-examples"]):
|
| 827 |
+
gr.Examples(
|
| 828 |
+
examples=[
|
| 829 |
+
["examples/kiss.mp4"],
|
| 830 |
+
["examples/backpack.mp4"],
|
| 831 |
+
["examples/pillow.mp4"],
|
| 832 |
+
["examples/handwave.mp4"],
|
| 833 |
+
["examples/hockey.mp4"],
|
| 834 |
+
["examples/drifting.mp4"],
|
| 835 |
+
["examples/ken_block_0.mp4"],
|
| 836 |
+
["examples/kitchen.mp4"],
|
| 837 |
+
["examples/basketball.mp4"],
|
| 838 |
+
["examples/ego_kc1.mp4"],
|
| 839 |
+
["examples/vertical_place.mp4"],
|
| 840 |
+
["examples/ego_teaser.mp4"],
|
| 841 |
+
["examples/robot_unitree.mp4"],
|
| 842 |
+
["examples/droid_robot.mp4"],
|
| 843 |
+
["examples/robot_2.mp4"],
|
| 844 |
+
["examples/cinema_0.mp4"],
|
| 845 |
+
["examples/cinema_1.mp4"],
|
| 846 |
+
],
|
| 847 |
+
inputs=video_input,
|
| 848 |
+
label="🎬 Click on any example to load it",
|
| 849 |
+
examples_per_page=16
|
| 850 |
+
)
|
| 851 |
+
|
| 852 |
+
with gr.Row():
|
| 853 |
+
with gr.Column(scale=1):
|
| 854 |
+
# Interactive frame display
|
| 855 |
+
with gr.Group():
|
| 856 |
+
gr.Markdown("### 🎯 Point Selection")
|
| 857 |
+
gr.Markdown("Click on the object you want to track in the frame below:")
|
| 858 |
+
|
| 859 |
+
interactive_frame = gr.Image(
|
| 860 |
+
label="Click to select tracking points",
|
| 861 |
+
type="numpy",
|
| 862 |
+
interactive=True
|
| 863 |
+
)
|
| 864 |
+
|
| 865 |
+
with gr.Row():
|
| 866 |
+
point_type = gr.Radio(
|
| 867 |
+
choices=["positive_point", "negative_point"],
|
| 868 |
+
value="positive_point",
|
| 869 |
+
label="Point Type",
|
| 870 |
+
info="Positive points indicate the object to track, negative points indicate areas to avoid"
|
| 871 |
+
)
|
| 872 |
+
|
| 873 |
+
with gr.Row():
|
| 874 |
+
reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
|
| 875 |
+
clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
|
| 876 |
+
|
| 877 |
+
with gr.Column(scale=1):
|
| 878 |
+
# Tracking results
|
| 879 |
+
with gr.Group():
|
| 880 |
+
gr.Markdown("### 🎬 Tracking Results")
|
| 881 |
+
tracking_result_video = gr.Video(
|
| 882 |
+
label="Tracking Result Video",
|
| 883 |
+
interactive=False,
|
| 884 |
+
height=300
|
| 885 |
+
)
|
| 886 |
+
|
| 887 |
+
# 3D Visualization - Make it larger and more prominent
|
| 888 |
+
with gr.Row():
|
| 889 |
+
with gr.Column():
|
| 890 |
+
with gr.Group():
|
| 891 |
+
gr.Markdown("### 🌐 3D Trajectory Visualization")
|
| 892 |
+
gr.Markdown("Interactive 3D visualization of 3D point tracking and camera motion:")
|
| 893 |
+
viz_html = gr.HTML(
|
| 894 |
+
label="3D Trajectory Visualization",
|
| 895 |
+
value="""
|
| 896 |
+
<div style='border: 3px solid #667eea; border-radius: 15px; padding: 40px;
|
| 897 |
+
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
|
| 898 |
+
text-align: center; min-height: 600px; display: flex;
|
| 899 |
+
flex-direction: column; justify-content: center; align-items: center;
|
| 900 |
+
box-shadow: 0 8px 32px rgba(102, 126, 234, 0.2);'>
|
| 901 |
+
<div style='font-size: 48px; margin-bottom: 20px;'>🌐</div>
|
| 902 |
+
<h2 style='color: #667eea; margin-bottom: 15px; font-size: 28px; font-weight: 600;'>
|
| 903 |
+
3D Trajectory Visualization
|
| 904 |
+
</h2>
|
| 905 |
+
<p style='color: #666; font-size: 16px; line-height: 1.6; max-width: 500px; margin-bottom: 25px;'>
|
| 906 |
+
Perceive the world with Pixel-wise 3D Motions!
|
| 907 |
+
</p>
|
| 908 |
+
<div style='background: rgba(102, 126, 234, 0.1); border-radius: 25px;
|
| 909 |
+
padding: 12px 24px; border: 2px solid rgba(102, 126, 234, 0.2);'>
|
| 910 |
+
<span style='color: #667eea; font-weight: 600; font-size: 14px;'>
|
| 911 |
+
⚡ Powered by SpatialTracker V2
|
| 912 |
+
</span>
|
| 913 |
+
</div>
|
| 914 |
+
</div>
|
| 915 |
+
""",
|
| 916 |
+
elem_id="viz_container"
|
| 917 |
+
)
|
| 918 |
+
|
| 919 |
+
# Advanced settings section
|
| 920 |
+
with gr.Accordion("⚙️ Advanced Settings", open=True):
|
| 921 |
+
gr.Markdown("Adjust these parameters to optimize tracking performance:")
|
| 922 |
+
with gr.Row():
|
| 923 |
+
grid_size = gr.Slider(
|
| 924 |
+
minimum=10,
|
| 925 |
+
maximum=100,
|
| 926 |
+
step=10,
|
| 927 |
+
value=50,
|
| 928 |
+
label="Grid Size",
|
| 929 |
+
info="Size of the tracking grid (larger = more detailed)"
|
| 930 |
+
)
|
| 931 |
+
vo_points = gr.Slider(
|
| 932 |
+
minimum=100,
|
| 933 |
+
maximum=2000,
|
| 934 |
+
step=50,
|
| 935 |
+
value=756,
|
| 936 |
+
label="VO Points",
|
| 937 |
+
info="Number of visual odometry points (more = better accuracy)"
|
| 938 |
+
)
|
| 939 |
+
fps = gr.Slider(
|
| 940 |
+
minimum=1,
|
| 941 |
+
maximum=30,
|
| 942 |
+
step=1,
|
| 943 |
+
value=3,
|
| 944 |
+
label="FPS",
|
| 945 |
+
info="Frames per second for processing (higher = smoother but slower)"
|
| 946 |
+
)
|
| 947 |
+
|
| 948 |
+
# Launch button
|
| 949 |
+
with gr.Row():
|
| 950 |
+
launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")
|
| 951 |
+
|
| 952 |
+
# Hidden state variables
|
| 953 |
+
original_image_state = gr.State(None)
|
| 954 |
+
selected_points = gr.State([])
|
| 955 |
+
|
| 956 |
+
# Event handlers
|
| 957 |
+
video_input.change(
|
| 958 |
+
fn=handle_video_upload,
|
| 959 |
+
inputs=[video_input],
|
| 960 |
+
outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
|
| 961 |
+
)
|
| 962 |
+
|
| 963 |
+
interactive_frame.select(
|
| 964 |
+
fn=select_point,
|
| 965 |
+
inputs=[original_image_state, selected_points, point_type],
|
| 966 |
+
outputs=[interactive_frame, selected_points]
|
| 967 |
+
)
|
| 968 |
+
|
| 969 |
+
reset_points_btn.click(
|
| 970 |
+
fn=reset_points,
|
| 971 |
+
inputs=[original_image_state, selected_points],
|
| 972 |
+
outputs=[interactive_frame, selected_points]
|
| 973 |
+
)
|
| 974 |
+
|
| 975 |
+
clear_all_btn.click(
|
| 976 |
+
fn=clear_all,
|
| 977 |
+
outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
|
| 978 |
+
)
|
| 979 |
+
|
| 980 |
+
launch_btn.click(
|
| 981 |
+
fn=launch_viz,
|
| 982 |
+
inputs=[grid_size, vo_points, fps, original_image_state],
|
| 983 |
+
outputs=[viz_html, tracking_result_video]
|
| 984 |
+
)
|
| 985 |
+
|
| 986 |
+
# Acknowledgment section for TAPIP3D - moved to the end
|
| 987 |
+
gr.HTML("""
|
| 988 |
+
<div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
|
| 989 |
+
border-radius: 8px;
|
| 990 |
+
padding: 12px;
|
| 991 |
+
margin: 15px 0;
|
| 992 |
+
box-shadow: 0 1px 4px rgba(255, 193, 7, 0.1);
|
| 993 |
+
border: 1px solid rgba(255, 193, 7, 0.2);'>
|
| 994 |
+
<div style='text-align: center; color: #5d4037;'>
|
| 995 |
+
<h5 style='margin: 0 0 6px 0; font-size: 14px; color: #5d4037;'>
|
| 996 |
+
Acknowledgments
|
| 997 |
+
</h5>
|
| 998 |
+
<p style='margin: 0; font-size: 12px; opacity: 0.9; color: #5d4037; line-height: 1.3;'>
|
| 999 |
+
Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work!
|
| 1000 |
+
</p>
|
| 1001 |
+
<div style='margin-top: 6px;'>
|
| 1002 |
+
<a href="https://github.com/zbw001/TAPIP3D"
|
| 1003 |
+
target="_blank"
|
| 1004 |
+
style='display: inline-flex;
|
| 1005 |
+
align-items: center;
|
| 1006 |
+
gap: 3px;
|
| 1007 |
+
background: rgba(255, 193, 7, 0.15);
|
| 1008 |
+
color: #5d4037;
|
| 1009 |
+
padding: 3px 10px;
|
| 1010 |
+
border-radius: 12px;
|
| 1011 |
+
text-decoration: none;
|
| 1012 |
+
font-weight: 500;
|
| 1013 |
+
font-size: 11px;
|
| 1014 |
+
border: 1px solid rgba(255, 193, 7, 0.3);
|
| 1015 |
+
transition: all 0.3s ease;'
|
| 1016 |
+
onmouseover="this.style.background='rgba(255, 193, 7, 0.2)'"
|
| 1017 |
+
onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'">
|
| 1018 |
+
📚 TAPIP3D Repository
|
| 1019 |
+
</a>
|
| 1020 |
+
</div>
|
| 1021 |
+
</div>
|
| 1022 |
+
</div>
|
| 1023 |
+
""")
|
| 1024 |
+
|
| 1025 |
+
# Launch the interface
|
| 1026 |
+
if __name__ == "__main__":
|
| 1027 |
+
print("🌟 Launching SpatialTracker V2 Local Version...")
|
| 1028 |
+
print("🔗 Running in Local Processing Mode")
|
| 1029 |
+
|
| 1030 |
+
demo.launch(
|
| 1031 |
+
server_name="0.0.0.0",
|
| 1032 |
+
server_port=7860,
|
| 1033 |
+
share=True,
|
| 1034 |
+
debug=True,
|
| 1035 |
+
show_error=True
|
| 1036 |
+
)
|
models/SpaTrackV2/models/SpaTrack.py
ADDED
|
@@ -0,0 +1,758 @@
|
|
|
|
|
|
#python
"""
SpaTrackerV2, a unified model to estimate 'intrinsic',
'video depth', 'extrinsic' and '3D Tracking' from casual video frames.

Contact: DM yuxixiao@zju.edu.cn
"""

import os
import numpy as np
from typing import Literal, Union, List, Tuple, Dict
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
# from depth anything v2
from einops import rearrange
from models.monoD.depth_anything_v2.dpt import DepthAnythingV2
from models.moge.model.v1 import MoGeModel
import copy
from functools import partial
from models.SpaTrackV2.models.tracker3D.TrackRefiner import TrackRefiner3D
import kornia
from models.SpaTrackV2.utils.model_utils import sample_features5d
import utils3d
from models.SpaTrackV2.models.tracker3D.spatrack_modules.utils import depth_to_points_colmap, get_nth_visible_time_index
from models.SpaTrackV2.models.utils import pose_enc2mat, matrix_to_quaternion, get_track_points, normalize_rgb
import random

class SpaTrack2(nn.Module):
    def __init__(
        self,
        loggers: list, # include [ viz, logger_tf, logger]
        backbone_cfg,
        Track_cfg=None,
        chunk_size=24,
        ckpt_fwd: bool = False,
        ft_cfg=None,
        resolution=518,
        max_len=600, # the maximum video length we can preprocess,
        track_num=768,
    ):

        self.chunk_size = chunk_size
        self.max_len = max_len
        self.resolution = resolution
        # config the T-Lora Dinov2
        # NOTE: initialize the base model
        base_cfg = copy.deepcopy(backbone_cfg)
        backbone_ckpt_dir = base_cfg.pop('ckpt_dir', None)

        super(SpaTrack2, self).__init__()
        if os.path.exists(backbone_ckpt_dir)==False:
            base_model = MoGeModel.from_pretrained('Ruicheng/moge-vitl')
        else:
            checkpoint = torch.load(backbone_ckpt_dir, map_location='cpu', weights_only=True)
            base_model = MoGeModel(**checkpoint["model_config"])
            base_model.load_state_dict(checkpoint['model'])
        # avoid registering base_model as a submodule of SpaTrack2
        object.__setattr__(self, 'base_model', base_model)

        # Tracker model
        self.Track3D = TrackRefiner3D(Track_cfg)
        track_base_ckpt_dir = Track_cfg.base_ckpt
        if os.path.exists(track_base_ckpt_dir):
            track_pretrain = torch.load(track_base_ckpt_dir)
            self.Track3D.load_state_dict(track_pretrain, strict=False)

        # wrap the function that makes the lora parameters trainable
        self.make_paras_trainable = partial(self.make_paras_trainable,
                                            mode=ft_cfg.mode,
                                            paras_name=ft_cfg.paras_name)
        self.track_num = track_num

    def make_paras_trainable(self, mode: str = 'fix', paras_name: List[str] = []):
        # gradient required for the lora_experts and gate
        for name, param in self.named_parameters():
            if any(x in name for x in paras_name):
                if mode == 'fix':
                    param.requires_grad = False
                else:
                    param.requires_grad = True
            else:
                if mode == 'fix':
                    param.requires_grad = True
                else:
                    param.requires_grad = False
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params}")
        print(f"Trainable parameters: {trainable_params/total_params*100:.2f}%")

    def ProcVid(self,
                x: torch.Tensor):
        """
        split the video into several overlapped windows.

        args:
            x: the input video frames. [B, T, C, H, W]
        outputs:
            patch_size: the patch size of the video features
        raises:
            ValueError: if the input video is longer than `max_len`.

        """
        # normalize the input images
        num_types = x.dtype
        x = normalize_rgb(x, input_size=self.resolution)
        x = x.to(num_types)
        # get the video features
        B, T, C, H, W = x.size()
        if T > self.max_len:
            raise ValueError(f"the video length should be no more than {self.max_len}.")
        # get the video features
        patch_h, patch_w = H // 14, W // 14
        patch_size = (patch_h, patch_w)
        # resize and get the video features
        x = x.view(B * T, C, H, W)
        # operate the temporal encoding
        return patch_size, x

    def forward_stream(
        self,
        video: torch.Tensor,
        queries: torch.Tensor = None,
        T_org: int = None,
        depth: torch.Tensor|np.ndarray|str=None,
        unc_metric_in: torch.Tensor|np.ndarray|str=None,
        intrs: torch.Tensor|np.ndarray|str=None,
        extrs: torch.Tensor|np.ndarray|str=None,
        queries_3d: torch.Tensor = None,
        window_len: int = 16,
        overlap_len: int = 4,
        full_point: bool = False,
        track2d_gt: torch.Tensor = None,
        fixed_cam: bool = False,
        query_no_BA: bool = False,
        stage: int = 0,
        support_frame: int = 0,
        replace_ratio: float = 0.6,
        annots_train: Dict = None,
        iters_track=4,
        **kwargs,
    ):
        # step 1: allocate the query points on the grid
        T, C, H, W = video.shape

        if annots_train is not None:
            vis_gt = annots_train["vis"]
            _, _, N = vis_gt.shape
            number_visible = vis_gt.sum(dim=1)
            ratio_rand = torch.rand(1, N, device=vis_gt.device)
            first_positive_inds = get_nth_visible_time_index(vis_gt, (number_visible*ratio_rand).long().clamp(min=1, max=T))
            assert (torch.gather(vis_gt, 1, first_positive_inds[:, None, :].repeat(1, T, 1)) < 0).sum() == 0

            first_positive_inds = first_positive_inds.long()
            gather = torch.gather(
                annots_train["traj_3d"][...,:2], 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 2)
            )
            xys = torch.diagonal(gather, dim1=1, dim2=2).permute(0, 2, 1)
            queries = torch.cat([first_positive_inds[:, :, None], xys], dim=-1)[0].cpu().numpy()

        # Unfold video into segments of window_len with overlap_len
        step_slide = window_len - overlap_len
        if T < window_len:
            video_unf = video.unsqueeze(0)
            if depth is not None:
                depth_unf = depth.unsqueeze(0)
            else:
                depth_unf = None
            if unc_metric_in is not None:
                unc_metric_unf = unc_metric_in.unsqueeze(0)
            else:
                unc_metric_unf = None
            if intrs is not None:
                intrs_unf = intrs.unsqueeze(0)
            else:
                intrs_unf = None
            if extrs is not None:
                extrs_unf = extrs.unsqueeze(0)
            else:
                extrs_unf = None
        else:
            video_unf = video.unfold(0, window_len, step_slide).permute(0, 4, 1, 2, 3) # [B, S, C, H, W]
            if depth is not None:
                depth_unf = depth.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
                intrs_unf = intrs.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
            else:
                depth_unf = None
                intrs_unf = None
            if extrs is not None:
                extrs_unf = extrs.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
            else:
                extrs_unf = None
            if unc_metric_in is not None:
                unc_metric_unf = unc_metric_in.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
            else:
                unc_metric_unf = None

        # parallel
        # Get number of segments
        B = video_unf.shape[0]
        #TODO: Process each segment in parallel using torch.nn.DataParallel
        c2w_traj = torch.eye(4, 4)[None].repeat(T, 1, 1)
        intrs_out = torch.eye(3, 3)[None].repeat(T, 1, 1)
        point_map = torch.zeros(T, 3, H, W).cuda()
        unc_metric = torch.zeros(T, H, W).cuda()
        # set the queries
        N, _ = queries.shape
        track3d_pred = torch.zeros(T, N, 6).cuda()
        track2d_pred = torch.zeros(T, N, 3).cuda()
        vis_pred = torch.zeros(T, N, 1).cuda()
        conf_pred = torch.zeros(T, N, 1).cuda()
        dyn_preds = torch.zeros(T, N, 1).cuda()
        # sort the queries by time
        sorted_indices = np.argsort(queries[...,0])
        sorted_inv_indices = np.argsort(sorted_indices)
        sort_query = queries[sorted_indices]
        sort_query = torch.from_numpy(sort_query).cuda()
        if queries_3d is not None:
            sort_query_3d = queries_3d[sorted_indices]
            sort_query_3d = torch.from_numpy(sort_query_3d).cuda()

        queries_len = 0
        overlap_d = None
        cache = None
        loss = 0.0

        for i in range(B):
            segment = video_unf[i:i+1].cuda()
            # Forward pass through model
            # detect the key points for each frame

            queries_new_mask = (sort_query[...,0] < i * step_slide + window_len) * (sort_query[...,0] >= (i * step_slide + overlap_len if i > 0 else 0))
            if queries_3d is not None:
                queries_new_3d = sort_query_3d[queries_new_mask]
                queries_new_3d = queries_new_3d.float()
            else:
                queries_new_3d = None
            queries_new = sort_query[queries_new_mask.bool()]
            queries_new = queries_new.float()
            if i > 0:
                overlap2d = track2d_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
                overlapvis = vis_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
                overlapconf = conf_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
                overlap_query = (overlapvis * overlapconf).max(dim=0)[1][None, ...]
                overlap_xy = torch.gather(overlap2d, 0, overlap_query.repeat(1,1,2))
                overlap_d = torch.gather(overlap2d, 0, overlap_query.repeat(1,1,3))[...,2].detach()
                overlap_query = torch.cat([overlap_query[...,:1], overlap_xy], dim=-1)[0]
                queries_new[...,0] -= i*step_slide
                queries_new = torch.cat([overlap_query, queries_new], dim=0).detach()

            if annots_train is None:
                annots = {}
            else:
                annots = copy.deepcopy(annots_train)
                annots["traj_3d"] = annots["traj_3d"][:, i*step_slide:i*step_slide+window_len, sorted_indices,:][...,:len(queries_new),:]
                annots["vis"] = annots["vis"][:, i*step_slide:i*step_slide+window_len, sorted_indices][...,:len(queries_new)]
                annots["poses_gt"] = annots["poses_gt"][:, i*step_slide:i*step_slide+window_len]
                annots["depth_gt"] = annots["depth_gt"][:, i*step_slide:i*step_slide+window_len]
                annots["intrs"] = annots["intrs"][:, i*step_slide:i*step_slide+window_len]
                annots["traj_mat"] = annots["traj_mat"][:,i*step_slide:i*step_slide+window_len]

            if depth is not None:
                annots["depth_gt"] = depth_unf[i:i+1].to(segment.device).to(segment.dtype)
            if unc_metric_in is not None:
                annots["unc_metric"] = unc_metric_unf[i:i+1].to(segment.device).to(segment.dtype)
            if intrs is not None:
                intr_seg = intrs_unf[i:i+1].to(segment.device).to(segment.dtype)[0].clone()
                focal = (intr_seg[:,0,0] / segment.shape[-1] + intr_seg[:,1,1]/segment.shape[-2]) / 2
                pose_fake = torch.zeros(1, 8).to(depth.device).to(depth.dtype).repeat(segment.shape[1], 1)
                pose_fake[:, -1] = focal
                pose_fake[:,3]=1
                annots["intrs_gt"] = intr_seg
            if extrs is not None:
                extrs_unf_norm = extrs_unf[i:i+1][0].clone()
                extrs_unf_norm = torch.inverse(extrs_unf_norm[:1,...]) @ extrs_unf[i:i+1][0]
                rot_vec = matrix_to_quaternion(extrs_unf_norm[:,:3,:3])
                annots["poses_gt"] = torch.zeros(1, rot_vec.shape[0], 7).to(segment.device).to(segment.dtype)
                annots["poses_gt"][:, :, 3:7] = rot_vec.to(segment.device).to(segment.dtype)[None]
                annots["poses_gt"][:, :, :3] = extrs_unf_norm[:,:3,3].to(segment.device).to(segment.dtype)[None]
                annots["use_extr"] = True

            kwargs.update({"stage": stage})

            #TODO: DEBUG
            out = self.forward(segment, pts_q=queries_new,
                               pts_q_3d=queries_new_3d, overlap_d=overlap_d,
                               full_point=full_point,
                               fixed_cam=fixed_cam, query_no_BA=query_no_BA,
                               support_frame=segment.shape[1]-1,
                               cache=cache, replace_ratio=replace_ratio,
                               iters_track=iters_track,
                               **kwargs, annots=annots)
            if self.training:
                loss += out["loss"].squeeze()
            # from models.SpaTrackV2.utils.visualizer import Visualizer
            # vis_track = Visualizer(grayscale=False,
            #                        fps=10, pad_value=50, tracks_leave_trace=0)
            # vis_track.visualize(video=segment,
            #                     tracks=out["traj_est"][...,:2],
            #                     visibility=out["vis_est"],
            #                     save_video=True)
            # # visualize 4d
            # import os, json
            # import os.path as osp
            # viser4d_dir = os.path.join("viser_4d_results")
            # os.makedirs(viser4d_dir, exist_ok=True)
            # depth_est = annots["depth_gt"][0]
            # unc_metric = out["unc_metric"]
            # mask = (unc_metric > 0.5).squeeze(1)
            # # pose_est = out["poses_pred"].squeeze(0)
            # pose_est = annots["traj_mat"][0]
            # rgb_tracks = out["rgb_tracks"].squeeze(0)
            # intrinsics = out["intrs"].squeeze(0)
            # for i_k in range(out["depth"].shape[0]):
            #     img_i = out["imgs_raw"][0][i_k].permute(1, 2, 0).cpu().numpy()
            #     img_i = cv2.cvtColor(img_i, cv2.COLOR_BGR2RGB)
            #     cv2.imwrite(osp.join(viser4d_dir, f'frame_{i_k:04d}.png'), img_i)
            #     if stage == 1:
            #         depth = depth_est[i_k].squeeze().cpu().numpy()
            #         np.save(osp.join(viser4d_dir, f'frame_{i_k:04d}.npy'), depth)
            #     else:
            #         point_map_vis = out["points_map"][i_k].cpu().numpy()
            #         np.save(osp.join(viser4d_dir, f'point_{i_k:04d}.npy'), point_map_vis)
            # np.save(os.path.join(viser4d_dir, f'intrinsics.npy'), intrinsics.cpu().numpy())
            # np.save(os.path.join(viser4d_dir, f'extrinsics.npy'), pose_est.cpu().numpy())
            # np.save(os.path.join(viser4d_dir, f'conf.npy'), mask.float().cpu().numpy())
            # np.save(os.path.join(viser4d_dir, f'colored_track3d.npy'), rgb_tracks.cpu().numpy())

            queries_len = len(queries_new)
            # update the track3d and track2d
            left_len = len(track3d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :])
            track3d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["rgb_tracks"][0,:left_len,:queries_len,:]
            track2d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["traj_est"][0,:left_len,:queries_len,:3]
            vis_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["vis_est"][0,:left_len,:queries_len,None]
            conf_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["conf_pred"][0,:left_len,:queries_len,None]
            dyn_preds[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["dyn_preds"][0,:left_len,:queries_len,None]

            # process the output for each segment
            seg_c2w = out["poses_pred"][0]
            seg_intrs = out["intrs"][0]
            seg_point_map = out["points_map"]
            seg_conf_depth = out["unc_metric"]

            # cache management
            cache = out["cache"]
            for k in cache.keys():
                if "_pyramid" in k:
                    for j in range(len(cache[k])):
                        if len(cache[k][j].shape) == 5:
                            cache[k][j] = cache[k][j][:,:,:,:queries_len,:]
                        elif len(cache[k][j].shape) == 4:
                            cache[k][j] = cache[k][j][:,:1,:queries_len,:]
                elif "_pred_cache" in k:
                    cache[k] = cache[k][-overlap_len:,:queries_len,:]
                else:
                    cache[k] = cache[k][-overlap_len:]

            # update the results
            idx_glob = i * step_slide
            # refine part
            # mask_update = sort_query[..., 0] < i * step_slide + window_len
            # sort_query_pick = sort_query[mask_update]
            intrs_out[idx_glob:idx_glob+window_len] = seg_intrs
            point_map[idx_glob:idx_glob+window_len] = seg_point_map
            unc_metric[idx_glob:idx_glob+window_len] = seg_conf_depth
            # update the camera poses

            # if using the ground truth pose
            # if extrs_unf is not None:
            #     c2w_traj[idx_glob:idx_glob+window_len] = extrs_unf[i:i+1][0].to(c2w_traj.device).to(c2w_traj.dtype)
            # else:
            prev_c2w = c2w_traj[idx_glob:idx_glob+window_len][:1]
            c2w_traj[idx_glob:idx_glob+window_len] = prev_c2w@seg_c2w.to(c2w_traj.device).to(c2w_traj.dtype)

        track2d_pred = track2d_pred[:T_org,sorted_inv_indices,:]
        track3d_pred = track3d_pred[:T_org,sorted_inv_indices,:]
        vis_pred = vis_pred[:T_org,sorted_inv_indices,:]
        conf_pred = conf_pred[:T_org,sorted_inv_indices,:]
        dyn_preds = dyn_preds[:T_org,sorted_inv_indices,:]
        unc_metric = unc_metric[:T_org,:]
        point_map = point_map[:T_org,:]
        intrs_out = intrs_out[:T_org,:]
        c2w_traj = c2w_traj[:T_org,:]
        if self.training:
            ret = {
                "loss": loss,
                "depth_loss": 0.0,
                "ab_loss": 0.0,
                "vis_loss": out["vis_loss"],
                "track_loss": out["track_loss"],
                "conf_loss": out["conf_loss"],
                "dyn_loss": out["dyn_loss"],
                "sync_loss": out["sync_loss"],
                "poses_pred": c2w_traj[None],
                "intrs": intrs_out[None],
                "points_map": point_map,
                "track3d_pred": track3d_pred[None],
                "rgb_tracks": track3d_pred[None],
                "track2d_pred": track2d_pred[None],
                "traj_est": track2d_pred[None],
                "vis_est": vis_pred[None], "conf_pred": conf_pred[None],
                "dyn_preds": dyn_preds[None],
                "imgs_raw": video[None],
                "unc_metric": unc_metric,
            }

            return ret
        else:
            return c2w_traj, intrs_out, point_map, unc_metric, track3d_pred, track2d_pred, vis_pred, conf_pred
    def forward(self,
                x: torch.Tensor,
                annots: Dict = {},
                pts_q: torch.Tensor = None,
                pts_q_3d: torch.Tensor = None,
                overlap_d: torch.Tensor = None,
                full_point = False,
                fixed_cam = False,
                support_frame = 0,
                query_no_BA = False,
                cache = None,
                replace_ratio = 0.6,
                iters_track=4,
                **kwargs):
        """
        forward the video camera model, which predicts (
            `intr` `camera poses` `video depth`
        )

        args:
            x: the input video frames. [B, T, C, H, W]
            annots: the annotations for video frames.
                {
                    "poses_gt": the pose encoding for the video frames. [B, T, 7]
                    "depth_gt": the ground truth depth for the video frames. [B, T, 1, H, W],
                    "metric": bool, whether to calculate the metric for the video frames.
                }
        """
        self.support_frame = support_frame

        #TODO: to adjust a little bit
        track_loss=ab_loss=vis_loss=track_loss=conf_loss=dyn_loss=0.0
        B, T, _, H, W = x.shape
        imgs_raw = x.clone()
        # get the video split and features for each segment
        patch_size, x_resize = self.ProcVid(x)
        x_resize = rearrange(x_resize, "(b t) c h w -> b t c h w", b=B)
        H_resize, W_resize = x_resize.shape[-2:]

        prec_fx = W / W_resize
        prec_fy = H / H_resize
        # get patch size
        P_H, P_W = patch_size

        # get the depth, pointmap and mask
        #TODO: Release DepthAnything Version
        points_map_gt = None
        with torch.no_grad():
            if_gt_depth = (("depth_gt" in annots.keys())) and (kwargs.get('stage', 0)==1 or kwargs.get('stage', 0)==3)
            if if_gt_depth==False:
                if cache is not None:
                    T_cache = cache["points_map"].shape[0]
                    T_new = T - T_cache
                    x_resize_new = x_resize[:, T_cache:]
                else:
                    T_new = T
                    x_resize_new = x_resize
                # infer with chunk
                chunk_size = self.chunk_size
                metric_depth = []
                intrs = []
                unc_metric = []
                mask = []
                points_map = []
                normals = []
                normals_mask = []
                for i in range(0, B*T_new, chunk_size):
                    output = self.base_model.infer(x_resize_new.view(B*T_new, -1, H_resize, W_resize)[i:i+chunk_size])
                    metric_depth.append(output['depth'])
                    intrs.append(output['intrinsics'])
                    unc_metric.append(output['mask_prob'])
                    mask.append(output['mask'])
                    points_map.append(output['points'])
                    normals_i, normals_mask_i = utils3d.torch.points_to_normals(output['points'], mask=output['mask'])
                    normals.append(normals_i)
                    normals_mask.append(normals_mask_i)

                metric_depth = torch.cat(metric_depth, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
                intrs = torch.cat(intrs, dim=0).view(B, T_new, 3, 3).to(x.dtype)
                intrs[:,:,0,:] *= W_resize
                intrs[:,:,1,:] *= H_resize
                # points_map = torch.cat(points_map, dim=0)
                mask = torch.cat(mask, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
                # cat the normals
                normals = torch.cat(normals, dim=0)
                normals_mask = torch.cat(normals_mask, dim=0)

                metric_depth = metric_depth.clone()
                metric_depth[metric_depth == torch.inf] = 0
                _depths = metric_depth[metric_depth > 0].reshape(-1)
                q25 = torch.kthvalue(_depths, int(0.25 * len(_depths))).values
                q75 = torch.kthvalue(_depths, int(0.75 * len(_depths))).values
                iqr = q75 - q25
                upper_bound = (q75 + 0.8*iqr).clamp(min=1e-6, max=10*q25)
                _depth_roi = torch.tensor(
                    [1e-1, upper_bound.item()],
                    dtype=metric_depth.dtype,
                    device=metric_depth.device
                )
                mask_roi = (metric_depth > _depth_roi[0]) & (metric_depth < _depth_roi[1])
                mask = mask * mask_roi
                mask = mask * (~(utils3d.torch.depth_edge(metric_depth, rtol=0.03, mask=mask.bool()))) * normals_mask[:,None,...]
                points_map = depth_to_points_colmap(metric_depth.squeeze(1), intrs.view(B*T_new, 3, 3))
                unc_metric = torch.cat(unc_metric, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
                unc_metric *= mask
                if full_point:
                    unc_metric = (~(utils3d.torch.depth_edge(metric_depth, rtol=0.1, mask=torch.ones_like(metric_depth).bool()))).float() * (metric_depth != 0)
                if cache is not None:
                    assert B==1, "only support batch size 1 right now."
                    unc_metric = torch.cat([cache["unc_metric"], unc_metric], dim=0)
                    intrs = torch.cat([cache["intrs"][None], intrs], dim=1)
                    points_map = torch.cat([cache["points_map"].permute(0,2,3,1), points_map], dim=0)
                    metric_depth = torch.cat([cache["metric_depth"], metric_depth], dim=0)

            if "poses_gt" in annots.keys():
                intrs, c2w_traj_gt = pose_enc2mat(annots["poses_gt"],
                                                  H_resize, W_resize, self.resolution)
            else:
                c2w_traj_gt = None

            if "intrs_gt" in annots.keys():
                intrs = annots["intrs_gt"].view(B, T, 3, 3)
                fx_factor = W_resize / W
                fy_factor = H_resize / H
                intrs[:,:,0,:] *= fx_factor
                intrs[:,:,1,:] *= fy_factor

            if "depth_gt" in annots.keys():

                metric_depth_gt = annots['depth_gt'].view(B*T, 1, H, W)
                metric_depth_gt = F.interpolate(metric_depth_gt,
                                                size=(H_resize, W_resize), mode='nearest')

                _depths = metric_depth_gt[metric_depth_gt > 0].reshape(-1)
                q25 = torch.kthvalue(_depths, int(0.25 * len(_depths))).values
                q75 = torch.kthvalue(_depths, int(0.75 * len(_depths))).values
                iqr = q75 - q25
                upper_bound = (q75 + 0.8*iqr).clamp(min=1e-6, max=10*q25)
                _depth_roi = torch.tensor(
                    [1e-1, upper_bound.item()],
                    dtype=metric_depth_gt.dtype,
                    device=metric_depth_gt.device
                )
                mask_roi = (metric_depth_gt > _depth_roi[0]) & (metric_depth_gt < _depth_roi[1])
                # if (upper_bound > 200).any():
                #     import pdb; pdb.set_trace()
                if (kwargs.get('stage', 0) == 2):
                    unc_metric = ((metric_depth_gt > 0)*(mask_roi) * (unc_metric > 0.5)).float()
                    metric_depth_gt[metric_depth_gt > 10*q25] = 0
                else:
                    unc_metric = ((metric_depth_gt > 0)*(mask_roi)).float()
                    unc_metric *= (~(utils3d.torch.depth_edge(metric_depth_gt, rtol=0.03, mask=mask_roi.bool()))).float()
                # filter the sky
                metric_depth_gt[metric_depth_gt > 10*q25] = 0
                if "unc_metric" in annots.keys():
                    unc_metric_ = F.interpolate(annots["unc_metric"].permute(1,0,2,3),
                                                size=(H_resize, W_resize), mode='nearest')
                    unc_metric = unc_metric * unc_metric_
                if if_gt_depth:
                    points_map = depth_to_points_colmap(metric_depth_gt.squeeze(1), intrs.view(B*T, 3, 3))
                    metric_depth = metric_depth_gt
                    points_map_gt = points_map
                else:
                    points_map_gt = depth_to_points_colmap(metric_depth_gt.squeeze(1), intrs.view(B*T, 3, 3))

        # track the 3d points
        ret_track = None
        regular_track = True
        dyn_preds, final_tracks = None, None

        if "use_extr" in annots.keys():
            init_pose = True
        else:
            init_pose = False
        # set the custom vid and valid only
        custom_vid = annots.get("custom_vid", False)
        valid_only = annots.get("data_dir", [""])[0] == "stereo4d"
        if self.training:
            if (annots["vis"].sum() > 0) and (kwargs.get('stage', 0)==1 or kwargs.get('stage', 0)==3):
                traj3d = annots['traj_3d']
                if (kwargs.get('stage', 0)==1) and (annots.get("custom_vid", False)==False):
                    support_pts_q = get_track_points(H_resize, W_resize,
                                                     T, x.device, query_size=self.track_num // 2,
                                                     support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental")[None]
                else:
                    support_pts_q = get_track_points(H_resize, W_resize,
                                                     T, x.device, query_size=random.randint(32, 256),
                                                     support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental")[None]
                if pts_q is not None:
                    pts_q = pts_q[None,None]
                    ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
                                        metric_depth,
                                        unc_metric.detach(), points_map, pts_q,
                                        intrs=intrs.clone(), cache=cache,
                                        prec_fx=prec_fx, prec_fy=prec_fy, overlap_d=overlap_d,
                                        vis_gt=annots['vis'], traj3d_gt=traj3d, iters=iters_track,
                                        cam_gt=c2w_traj_gt, support_pts_q=support_pts_q, custom_vid=custom_vid,
                                        init_pose=init_pose, fixed_cam=fixed_cam, stage=kwargs.get('stage', 0),
                                        points_map_gt=points_map_gt, valid_only=valid_only, replace_ratio=replace_ratio)
                else:
                    ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
                                        metric_depth,
                                        unc_metric.detach(), points_map, traj3d[..., :2],
                                        intrs=intrs.clone(), cache=cache,
                                        prec_fx=prec_fx, prec_fy=prec_fy, overlap_d=overlap_d,
                                        vis_gt=annots['vis'], traj3d_gt=traj3d, iters=iters_track,
                                        cam_gt=c2w_traj_gt, support_pts_q=support_pts_q, custom_vid=custom_vid,
                                        init_pose=init_pose, fixed_cam=fixed_cam, stage=kwargs.get('stage', 0),
                                        points_map_gt=points_map_gt, valid_only=valid_only, replace_ratio=replace_ratio)
                regular_track = False

        if regular_track:
            if pts_q is None:
                pts_q = get_track_points(H_resize, W_resize,
                                         T, x.device, query_size=self.track_num,
                                         support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental" if self.training else "incremental")[None]
                support_pts_q = None
            else:
                pts_q = pts_q[None,None]
                # resize the query points
                pts_q[...,1] *= W_resize / W
                pts_q[...,2] *= H_resize / H

                if pts_q_3d is not None:
                    pts_q_3d = pts_q_3d[None,None]
                    # resize the query points
                    pts_q_3d[...,1] *= W_resize / W
                    pts_q_3d[...,2] *= H_resize / H
                else:
                    # adjust the query with uncertainty
                    if (full_point==False) and (overlap_d is None):
                        pts_q_unc = sample_features5d(unc_metric[None], pts_q).squeeze()
                        pts_q = pts_q[:,:,pts_q_unc>0.5,:]
                        if (pts_q_unc<0.5).sum() > 0:
                            # pad the query points
                            pad_num = pts_q_unc.shape[0] - pts_q.shape[2]
                            # pick the random indices
                            indices = torch.randint(0, pts_q.shape[2], (pad_num,), device=pts_q.device)
                            pad_pts = indices
                            pts_q = torch.cat([pts_q, pts_q[:,:,pad_pts,:]], dim=-2)

                support_pts_q = get_track_points(H_resize, W_resize,
                                                 T, x.device, query_size=self.track_num,
                                                 support_frame=self.support_frame,
                                                 unc_metric=unc_metric, mode="mixed")[None]

            points_map[points_map>1e3] = 0
            points_map = depth_to_points_colmap(metric_depth.squeeze(1), intrs.view(B*T, 3, 3))
            ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
                                metric_depth,
                                unc_metric.detach(), points_map, pts_q,
                                pts_q_3d=pts_q_3d, intrs=intrs.clone(),cache=cache,
                                overlap_d=overlap_d, cam_gt=c2w_traj_gt if kwargs.get('stage', 0)==1 else None,
                                prec_fx=prec_fx, prec_fy=prec_fy, support_pts_q=support_pts_q, custom_vid=custom_vid, valid_only=valid_only,
                                fixed_cam=fixed_cam, query_no_BA=query_no_BA, init_pose=init_pose, iters=iters_track,
                                stage=kwargs.get('stage', 0), points_map_gt=points_map_gt, replace_ratio=replace_ratio)
        intrs = intrs_org
        points_map = point_map_org_refined
        c2w_traj = ret_track["cam_pred"]

        if ret_track is not None:
            if ret_track["loss"] is not None:
                track_loss, conf_loss, dyn_loss, vis_loss, point_map_loss, scale_loss, shift_loss, sync_loss= ret_track["loss"]

        # update the cache
        cache.update({"metric_depth": metric_depth, "unc_metric": unc_metric, "points_map": points_map, "intrs": intrs[0]})
        # output
        depth = F.interpolate(metric_depth,
                              size=(H, W), mode='bilinear', align_corners=True).squeeze(1)
        points_map = F.interpolate(points_map,
                                   size=(H, W), mode='bilinear', align_corners=True).squeeze(1)
        unc_metric = F.interpolate(unc_metric,
                                   size=(H, W), mode='bilinear', align_corners=True).squeeze(1)

        if self.training:

            loss = track_loss + conf_loss + dyn_loss + sync_loss + vis_loss + point_map_loss + (scale_loss + shift_loss)*50
            ret = {"loss": loss,
                   "depth_loss": point_map_loss,
                   "ab_loss": (scale_loss + shift_loss)*50,
                   "vis_loss": vis_loss, "track_loss": track_loss,
                   "poses_pred": c2w_traj, "dyn_preds": dyn_preds, "traj_est": final_tracks, "conf_loss": conf_loss,
                   "imgs_raw": imgs_raw, "rgb_tracks": rgb_tracks, "vis_est": ret_track['vis_pred'],
                   "depth": depth, "points_map": points_map, "unc_metric": unc_metric, "intrs": intrs, "dyn_loss": dyn_loss,
                   "sync_loss": sync_loss, "conf_pred": ret_track['conf_pred'], "cache": cache,
                   }

        else:

            if ret_track is not None:
                traj_est = ret_track['preds']
                traj_est[..., 0] *= W / W_resize
                traj_est[..., 1] *= H / H_resize
                vis_est = ret_track['vis_pred']
            else:
                traj_est = torch.zeros(B, self.track_num // 2, 3).to(x.device)
                vis_est = torch.zeros(B, self.track_num // 2).to(x.device)

            if intrs is not None:
                intrs[..., 0, :] *= W / W_resize
                intrs[..., 1, :] *= H / H_resize
            ret = {"poses_pred": c2w_traj, "dyn_preds": dyn_preds,
                   "depth": depth, "traj_est": traj_est, "vis_est": vis_est, "imgs_raw": imgs_raw,
                   "rgb_tracks": rgb_tracks, "intrs": intrs, "unc_metric": unc_metric, "points_map": points_map,
                   "conf_pred": ret_track['conf_pred'], "cache": cache,
                   }

        return ret




# three stages of training

# stage 1:
# gt depth and intrinsics, synthetic (includes Dynamic Replica, Kubric, Pointodyssey, Vkitti, TartanAir and Indoor() ) Motion Pattern (tapvid3d)
# Tracking and Pose as well -> based on gt depth and intrinsics
# (Finished) -> (megasam + base model) vs. tapip3d. (use depth from megasam or pose, which keeps the same setting as tapip3d.)

# stage 2: fixed 3D tracking
# Joint depth refiner
# input depth from whatever + rgb -> temporal module + scale and shift token -> coarse alignment -> scale and shift
# estimate the 3D tracks -> 3D tracks combine with pointmap -> update for pointmap (iteratively) -> residual map B T 3 H W
# ongoing two days

# stage 3: train multi windows by propagation
# 4 frames overlapped -> train on 64 -> frozen image encoder and finetuning the transformer (learnable parameters pretty small)

# types of scenarios:
# 1. auto driving (waymo open dataset)
# 2. robot
# 3. internet ego video



# Iterative Transformer -- Solver -- General Neural MegaSAM + Tracks
# Update Variables:
# 1. 3D tracks B T N 3 xyz.
# 2. 2D tracks B T N 2 x y.
# 3. Dynamic Mask B T H W.
# 4. Camera Pose B T 4 4.
# 5. Video Depth.

# (RGB, RGBD, RGBD+Pose) x (Static, Dynamic)
# Compatibility by product.
models/SpaTrackV2/models/__init__.py
ADDED
File without changes
models/SpaTrackV2/models/blocks.py
ADDED
@@ -0,0 +1,519 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast
from einops import rearrange
import collections
from functools import partial
from itertools import repeat
import torchvision.models as tvm
from torch.utils.checkpoint import checkpoint
from models.monoD.depth_anything.dpt import DPTHeadEnc, DPTHead
from typing import Union, Tuple
from torch import Tensor

# From PyTorch internals
def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))

    return parse


def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


to_2tuple = _ntuple(2)

class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma

class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)
        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
        self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x

class Attention(nn.Module):
    def __init__(self, query_dim, context_dim=None,
                 num_heads=8, dim_head=48, qkv_bias=False, flash=False):
        super().__init__()
        inner_dim = self.inner_dim = dim_head * num_heads
        context_dim = default(context_dim, query_dim)
        self.scale = dim_head**-0.5
        self.heads = num_heads
        self.flash = flash

        self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
        self.to_out = nn.Linear(inner_dim, query_dim)

    def forward(self, x, context=None, attn_bias=None):
        B, N1, _ = x.shape
        C = self.inner_dim
        h = self.heads
        q = self.to_q(x).reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
        context = default(context, x)
        k, v = self.to_kv(context).chunk(2, dim=-1)

        N2 = context.shape[1]
        k = k.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
        v = v.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)

        with torch.autocast("cuda", enabled=True, dtype=torch.bfloat16):
            if self.flash==False:
                sim = (q @ k.transpose(-2, -1)) * self.scale
                if attn_bias is not None:
                    sim = sim + attn_bias
                if sim.abs().max()>1e2:
                    import pdb; pdb.set_trace()
                attn = sim.softmax(dim=-1)
                x = (attn @ v).transpose(1, 2).reshape(B, N1, C)
            else:
                input_args = [x.contiguous() for x in [q, k, v]]
                x = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).reshape(B,N1,-1) # type: ignore

        if self.to_out.bias.dtype != x.dtype:
            x = x.to(self.to_out.bias.dtype)

        return self.to_out(x)


class VGG19(nn.Module):
    def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
        super().__init__()
        self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
        self.amp = amp
        self.amp_dtype = amp_dtype

    def forward(self, x, **kwargs):
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
            feats = {}
            scale = 1
            for layer in self.layers:
                if isinstance(layer, nn.MaxPool2d):
                    feats[scale] = x
                    scale = scale*2
                x = layer(x)
            return feats

class CNNandDinov2(nn.Module):
    def __init__(self, cnn_kwargs = None, amp = True, amp_dtype = torch.float16):
        super().__init__()
        # in case the Internet connection is not stable, please load the DINOv2 locally
        self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
                                            'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)

        state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
        self.dinov2_vitl14.load_state_dict(state_dict, strict=True)


        cnn_kwargs = cnn_kwargs if cnn_kwargs is not None else {}
        self.cnn = VGG19(**cnn_kwargs)
        self.amp = amp
        self.amp_dtype = amp_dtype
        if self.amp:
            dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
            self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP


    def train(self, mode: bool = True):
        return self.cnn.train(mode)

    def forward(self, x, upsample = False):
        B,C,H,W = x.shape
        feature_pyramid = self.cnn(x)

        if not upsample:
            with torch.no_grad():
                if self.dinov2_vitl14[0].device != x.device:
                    self.dinov2_vitl14[0] = self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
                dinov2_features_16 = self.dinov2_vitl14[0].forward_features(x.to(self.amp_dtype))
                features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,H//14, W//14)
                del dinov2_features_16
                feature_pyramid[16] = features_16
        return feature_pyramid

class Dinov2(nn.Module):
    def __init__(self, amp = True, amp_dtype = torch.float16):
        super().__init__()
        # in case the Internet connection is not stable, please load the DINOv2 locally
        self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
                                            'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)

        state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
        self.dinov2_vitl14.load_state_dict(state_dict, strict=True)

        self.amp = amp
        self.amp_dtype = amp_dtype
        if self.amp:
            self.dinov2_vitl14 = self.dinov2_vitl14.to(self.amp_dtype)

    def forward(self, x, upsample = False):
        B,C,H,W = x.shape
        mean_ = torch.tensor([0.485, 0.456, 0.406],
                             device=x.device).view(1, 3, 1, 1)
        std_ = torch.tensor([0.229, 0.224, 0.225],
                            device=x.device).view(1, 3, 1, 1)
        x = (x+1)/2
        x = (x - mean_)/std_
        h_re, w_re = 560, 560
        x_resize = F.interpolate(x, size=(h_re, w_re),
                                 mode='bilinear', align_corners=True)
        if not upsample:
            with torch.no_grad():
                dinov2_features_16 = self.dinov2_vitl14.forward_features(x_resize.to(self.amp_dtype))
                features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,h_re//14, w_re//14)
                del dinov2_features_16
        features_16 = F.interpolate(features_16, size=(H//8, W//8), mode="bilinear", align_corners=True)
        return features_16

class AttnBlock(nn.Module):
    """
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0,
                 flash=False, ckpt_fwd=False, debug=False, **block_kwargs):
        super().__init__()
        self.debug=debug
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.flash=flash

        self.attn = Attention(
            hidden_size, num_heads=num_heads, qkv_bias=True, flash=flash,
            **block_kwargs
        )
        self.ls = LayerScale(hidden_size, init_values=0.005)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=mlp_hidden_dim,
            act_layer=approx_gelu,
        )
        self.ckpt_fwd = ckpt_fwd
    def forward(self, x):
        if self.debug:
            print(x.max(), x.min(), x.mean())
        if self.ckpt_fwd:
            x = x + checkpoint(self.attn, self.norm1(x), use_reentrant=False)
        else:
            x = x + self.attn(self.norm1(x))

        x = x + self.ls(self.mlp(self.norm2(x)))
        return x

class CrossAttnBlock(nn.Module):
    def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, head_dim=48,
                 flash=False, ckpt_fwd=False, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.norm_context = nn.LayerNorm(hidden_size)

        self.cross_attn = Attention(
            hidden_size, context_dim=context_dim, dim_head=head_dim,
            num_heads=num_heads, qkv_bias=True, **block_kwargs, flash=flash,
        )

        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=mlp_hidden_dim,
            act_layer=approx_gelu,
            drop=0,
        )
        self.ckpt_fwd = ckpt_fwd
    def forward(self, x, context):
        if self.ckpt_fwd:
            with autocast():
                x = x + checkpoint(self.cross_attn,
                                   self.norm1(x), self.norm_context(context), use_reentrant=False)
        else:
            with autocast():
                x = x + self.cross_attn(
                    self.norm1(x), self.norm_context(context)
                )
        x = x + self.mlp(self.norm2(x))
        return x


def bilinear_sampler(img, coords, mode="bilinear", mask=False):
    """Wrapper for grid_sample, uses pixel coordinates"""
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    # go to 0,1 then 0,2 then -1,1
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True, mode=mode)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


class CorrBlock:
    def __init__(self, fmaps, num_levels=4, radius=4, depths_dnG=None):
        B, S, C, H_prev, W_prev = fmaps.shape
        self.S, self.C, self.H, self.W = S, C, H_prev, W_prev

        self.num_levels = num_levels
        self.radius = radius
        self.fmaps_pyramid = []
        self.depth_pyramid = []
        self.fmaps_pyramid.append(fmaps)
        if depths_dnG is not None:
            self.depth_pyramid.append(depths_dnG)
        for i in range(self.num_levels - 1):
            if depths_dnG is not None:
                depths_dnG_ = depths_dnG.reshape(B * S, 1, H_prev, W_prev)
                depths_dnG_ = F.avg_pool2d(depths_dnG_, 2, stride=2)
                _, _, H, W = depths_dnG_.shape
                depths_dnG = depths_dnG_.reshape(B, S, 1, H, W)
                self.depth_pyramid.append(depths_dnG)
            fmaps_ = fmaps.reshape(B * S, C, H_prev, W_prev)
            fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
            _, _, H, W = fmaps_.shape
            fmaps = fmaps_.reshape(B, S, C, H, W)
            H_prev = H
            W_prev = W
            self.fmaps_pyramid.append(fmaps)

    def sample(self, coords):
        r = self.radius
        B, S, N, D = coords.shape
        assert D == 2

        H, W = self.H, self.W
        out_pyramid = []
        for i in range(self.num_levels):
            corrs = self.corrs_pyramid[i] # B, S, N, H, W
            _, _, _, H, W = corrs.shape

            dx = torch.linspace(-r, r, 2 * r + 1)
            dy = torch.linspace(-r, r, 2 * r + 1)
            delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
                coords.device
            )
            centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
            coords_lvl = centroid_lvl + delta_lvl
            corrs = bilinear_sampler(corrs.reshape(B * S * N, 1, H, W), coords_lvl)
            corrs = corrs.view(B, S, N, -1)
            out_pyramid.append(corrs)

        out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
        return out.contiguous().float()

    def corr(self, targets):
        B, S, N, C = targets.shape
        assert C == self.C
        assert S == self.S

        fmap1 = targets

        self.corrs_pyramid = []
        for fmaps in self.fmaps_pyramid:
            _, _, _, H, W = fmaps.shape
            fmap2s = fmaps.view(B, S, C, H * W)
            corrs = torch.matmul(fmap1, fmap2s)
            corrs = corrs.view(B, S, N, H, W)
            corrs = corrs / torch.sqrt(torch.tensor(C).float())
            self.corrs_pyramid.append(corrs)

    def corr_sample(self, targets, coords, coords_dp=None):
        B, S, N, C = targets.shape
        r = self.radius
        Dim_c = (2*r+1)**2
        assert C == self.C
        assert S == self.S

        out_pyramid = []
        out_pyramid_dp = []
        for i in range(self.num_levels):
            dx = torch.linspace(-r, r, 2 * r + 1)
            dy = torch.linspace(-r, r, 2 * r + 1)
            delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
                coords.device
            )
            centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
            coords_lvl = centroid_lvl + delta_lvl
            fmaps = self.fmaps_pyramid[i]
            _, _, _, H, W = fmaps.shape
            fmap2s = fmaps.view(B*S, C, H, W)
            if len(self.depth_pyramid)>0:
                depths_dnG_i = self.depth_pyramid[i]
                depths_dnG_i = depths_dnG_i.view(B*S, 1, H, W)
                dnG_sample = bilinear_sampler(depths_dnG_i, coords_lvl.view(B*S,1,N*Dim_c,2))
                dp_corrs = (dnG_sample.view(B*S,N,-1) - coords_dp[0]).abs()/coords_dp[0]
                out_pyramid_dp.append(dp_corrs)
            fmap2s_sample = bilinear_sampler(fmap2s, coords_lvl.view(B*S,1,N*Dim_c,2))
            fmap2s_sample = fmap2s_sample.permute(0, 3, 1, 2) # B*S, N*Dim_c, C, -1
            corrs = torch.matmul(targets.reshape(B*S*N, 1, -1), fmap2s_sample.reshape(B*S*N, Dim_c, -1).permute(0, 2, 1))
            corrs = corrs / torch.sqrt(torch.tensor(C).float())
            corrs = corrs.view(B, S, N, -1)
            out_pyramid.append(corrs)
|
| 418 |
+
|
| 419 |
+
out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
|
| 420 |
+
if len(self.depth_pyramid)>0:
|
| 421 |
+
out_dp = torch.cat(out_pyramid_dp, dim=-1)
|
| 422 |
+
self.fcorrD = out_dp.contiguous().float()
|
| 423 |
+
else:
|
| 424 |
+
self.fcorrD = torch.zeros_like(out).contiguous().float()
|
| 425 |
+
return out.contiguous().float()
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
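# Illustrative usage sketch (shapes assumed): build the correlation pyramid
# once per window of feature maps, then query local correlation costs around
# the current track estimates.
#
#   fmaps = torch.randn(1, 8, 128, 64, 64)          # (B, S, C, H, W)
#   corr_block = CorrBlock(fmaps, num_levels=4, radius=3)
#   targets = torch.randn(1, 8, 256, 128)           # (B, S, N, C) track features
#   coords = torch.rand(1, 8, 256, 2) * 63          # (B, S, N, 2) pixel positions
#   corr_feat = corr_block.corr_sample(targets, coords)
#   # corr_feat: (B, S, N, num_levels * (2*radius+1)**2) local cost volume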
class EUpdateFormer(nn.Module):
|
| 429 |
+
"""
|
| 430 |
+
Transformer model that updates track estimates.
|
| 431 |
+
"""
|
| 432 |
+
|
| 433 |
+
def __init__(
|
| 434 |
+
self,
|
| 435 |
+
space_depth=12,
|
| 436 |
+
time_depth=12,
|
| 437 |
+
input_dim=320,
|
| 438 |
+
hidden_size=384,
|
| 439 |
+
num_heads=8,
|
| 440 |
+
output_dim=130,
|
| 441 |
+
mlp_ratio=4.0,
|
| 442 |
+
vq_depth=3,
|
| 443 |
+
add_space_attn=True,
|
| 444 |
+
add_time_attn=True,
|
| 445 |
+
flash=True
|
| 446 |
+
):
|
| 447 |
+
super().__init__()
|
| 448 |
+
self.out_channels = 2
|
| 449 |
+
self.num_heads = num_heads
|
| 450 |
+
self.hidden_size = hidden_size
|
| 451 |
+
self.add_space_attn = add_space_attn
|
| 452 |
+
self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
|
| 453 |
+
self.flash = flash
|
| 454 |
+
self.flow_head = nn.Sequential(
|
| 455 |
+
nn.Linear(hidden_size, output_dim, bias=True),
|
| 456 |
+
nn.ReLU(inplace=True),
|
| 457 |
+
nn.Linear(output_dim, output_dim, bias=True),
|
| 458 |
+
nn.ReLU(inplace=True),
|
| 459 |
+
nn.Linear(output_dim, output_dim, bias=True)
|
| 460 |
+
)
|
| 461 |
+
self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 462 |
+
cfg = xLSTMBlockStackConfig(
|
| 463 |
+
mlstm_block=mLSTMBlockConfig(
|
| 464 |
+
mlstm=mLSTMLayerConfig(
|
| 465 |
+
conv1d_kernel_size=4, qkv_proj_blocksize=4, num_heads=4
|
| 466 |
+
)
|
| 467 |
+
),
|
| 468 |
+
slstm_block=sLSTMBlockConfig(
|
| 469 |
+
slstm=sLSTMLayerConfig(
|
| 470 |
+
backend="cuda",
|
| 471 |
+
num_heads=4,
|
| 472 |
+
conv1d_kernel_size=4,
|
| 473 |
+
bias_init="powerlaw_blockdependent",
|
| 474 |
+
),
|
| 475 |
+
feedforward=FeedForwardConfig(proj_factor=1.3, act_fn="gelu"),
|
| 476 |
+
),
|
| 477 |
+
context_length=50,
|
| 478 |
+
num_blocks=7,
|
| 479 |
+
embedding_dim=384,
|
| 480 |
+
slstm_at=[1],
|
| 481 |
+
|
| 482 |
+
)
|
| 483 |
+
self.xlstm_fwd = xLSTMBlockStack(cfg)
|
| 484 |
+
self.xlstm_bwd = xLSTMBlockStack(cfg)
|
| 485 |
+
|
| 486 |
+
self.initialize_weights()
|
| 487 |
+
|
| 488 |
+
def initialize_weights(self):
|
| 489 |
+
def _basic_init(module):
|
| 490 |
+
if isinstance(module, nn.Linear):
|
| 491 |
+
torch.nn.init.xavier_uniform_(module.weight)
|
| 492 |
+
if module.bias is not None:
|
| 493 |
+
nn.init.constant_(module.bias, 0)
|
| 494 |
+
|
| 495 |
+
self.apply(_basic_init)
|
| 496 |
+
|
| 497 |
+
def forward(self,
|
| 498 |
+
input_tensor,
|
| 499 |
+
track_mask=None):
|
| 500 |
+
""" Updating with Transformer
|
| 501 |
+
|
| 502 |
+
Args:
|
| 503 |
+
input_tensor: B, N, T, C
|
| 504 |
+
arap_embed: B, N, T, C
|
| 505 |
+
"""
|
| 506 |
+
B, N, T, C = input_tensor.shape
|
| 507 |
+
x = self.input_transform(input_tensor)
|
| 508 |
+
|
| 509 |
+
track_mask = track_mask.permute(0,2,1,3).float()
|
| 510 |
+
fwd_x = x*track_mask
|
| 511 |
+
bwd_x = x.flip(2)*track_mask.flip(2)
|
| 512 |
+
feat_fwd = self.xlstm_fwd(self.norm(fwd_x.view(B*N, T, -1)))
|
| 513 |
+
feat_bwd = self.xlstm_bwd(self.norm(bwd_x.view(B*N, T, -1)))
|
| 514 |
+
feat = (feat_bwd.flip(1) + feat_fwd).view(B, N, T, -1)
|
| 515 |
+
|
| 516 |
+
flow = self.flow_head(feat)
|
| 517 |
+
|
| 518 |
+
return flow[..., :2], flow[..., 2:]
|
| 519 |
+
|
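# Illustrative usage sketch (shapes assumed; instantiating the xLSTM stacks
# requires the `xlstm` package and, for the sLSTM block configured above with
# backend="cuda", a CUDA device). The forward and time-reversed stacks are
# fused by flipping the backward features back before the flow head:
#
#   model = EUpdateFormer(input_dim=320, hidden_size=384, output_dim=130)
#   x = torch.randn(1, 256, 50, 320)       # (B, N_tracks, T, C_in), T = context_length
#   mask = torch.ones(1, 50, 256, 1)       # (B, T, N, 1) track visibility
#   delta_xy, delta_feat = model(x, track_mask=mask)
#   # delta_xy: (1, 256, 50, 2) 2-D updates; delta_feat: the remaining channels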
models/SpaTrackV2/models/camera_transform.py
ADDED
|
@@ -0,0 +1,248 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Adapted from https://github.com/amyxlase/relpose-plus-plus
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import numpy as np
|
| 12 |
+
import math
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def bbox_xyxy_to_xywh(xyxy):
|
| 18 |
+
wh = xyxy[2:] - xyxy[:2]
|
| 19 |
+
xywh = np.concatenate([xyxy[:2], wh])
|
| 20 |
+
return xywh
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def adjust_camera_to_bbox_crop_(fl, pp, image_size_wh: torch.Tensor, clamp_bbox_xywh: torch.Tensor):
|
| 24 |
+
focal_length_px, principal_point_px = _convert_ndc_to_pixels(fl, pp, image_size_wh)
|
| 25 |
+
|
| 26 |
+
principal_point_px_cropped = principal_point_px - clamp_bbox_xywh[:2]
|
| 27 |
+
|
| 28 |
+
focal_length, principal_point_cropped = _convert_pixels_to_ndc(
|
| 29 |
+
focal_length_px, principal_point_px_cropped, clamp_bbox_xywh[2:]
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
return focal_length, principal_point_cropped
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def adjust_camera_to_image_scale_(fl, pp, original_size_wh: torch.Tensor, new_size_wh: torch.LongTensor):
|
| 36 |
+
focal_length_px, principal_point_px = _convert_ndc_to_pixels(fl, pp, original_size_wh)
|
| 37 |
+
|
| 38 |
+
# now scale and convert from pixels to NDC
|
| 39 |
+
image_size_wh_output = new_size_wh.float()
|
| 40 |
+
scale = (image_size_wh_output / original_size_wh).min(dim=-1, keepdim=True).values
|
| 41 |
+
focal_length_px_scaled = focal_length_px * scale
|
| 42 |
+
principal_point_px_scaled = principal_point_px * scale
|
| 43 |
+
|
| 44 |
+
focal_length_scaled, principal_point_scaled = _convert_pixels_to_ndc(
|
| 45 |
+
focal_length_px_scaled, principal_point_px_scaled, image_size_wh_output
|
| 46 |
+
)
|
| 47 |
+
return focal_length_scaled, principal_point_scaled
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _convert_ndc_to_pixels(focal_length: torch.Tensor, principal_point: torch.Tensor, image_size_wh: torch.Tensor):
    half_image_size = image_size_wh / 2
    rescale = half_image_size.min()
    principal_point_px = half_image_size - principal_point * rescale
    focal_length_px = focal_length * rescale
    return focal_length_px, principal_point_px


def _convert_pixels_to_ndc(
    focal_length_px: torch.Tensor, principal_point_px: torch.Tensor, image_size_wh: torch.Tensor
):
    half_image_size = image_size_wh / 2
    rescale = half_image_size.min()
    principal_point = (half_image_size - principal_point_px) / rescale
    focal_length = focal_length_px / rescale
    return focal_length, principal_point

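# Illustrative check (values assumed): the two helpers above are inverses
# under the min-half-size NDC convention used here.
#
#   image_size_wh = torch.tensor([640.0, 480.0])
#   fl_px, pp_px = _convert_ndc_to_pixels(torch.tensor([2.0, 2.0]),
#                                         torch.tensor([0.1, -0.05]),
#                                         image_size_wh)
#   # fl_px = (480, 480), pp_px = (296, 252)
#   fl_ndc, pp_ndc = _convert_pixels_to_ndc(fl_px, pp_px, image_size_wh)
#   # recovers (2.0, 2.0) and (0.1, -0.05)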
def normalize_cameras(
    cameras, compute_optical=True, first_camera=True, normalize_trans=True, scale=1.0, points=None, max_norm=False,
    pose_mode="C2W"
):
    """
    Normalizes cameras such that
    (1) the optical axes point to the origin and the average distance to the origin is 1
    (2) the first camera is the origin
    (3) the translation vector is normalized

    TODO: some transforms overlap with others. no need to do so many transforms
    Args:
        cameras (List[camera]).
    """
    # Let distance from first camera to origin be unit
    new_cameras = cameras.clone()
    scale = 1.0

    if compute_optical:
        new_cameras, points = compute_optical_transform(new_cameras, points=points)
    if first_camera:
        new_cameras, points = first_camera_transform(new_cameras, points=points, pose_mode=pose_mode)
    if normalize_trans:
        new_cameras, points, scale = normalize_translation(new_cameras,
                                                           points=points, max_norm=max_norm)
    return new_cameras, points, scale

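# Illustrative sketch (values assumed) of the first-camera step invoked above:
# for camera-to-world matrices M_i, first_camera_transform applies M_0^{-1} @ M_i,
# so camera 0 becomes the identity and every other pose is relative to it.
#
#   M = torch.eye(4).repeat(3, 1, 1)                # three C2W poses
#   M[1, :3, 3] = torch.tensor([1.0, 0.0, 0.0])
#   M[2, :3, 3] = torch.tensor([0.0, 2.0, 0.0])
#   M_rel = torch.linalg.inv(M[:1]) @ M
#   # M_rel[0] == I; M_rel[1], M_rel[2] keep their translations w.r.t. camera 0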
def compute_optical_transform(new_cameras, points=None):
|
| 97 |
+
"""
|
| 98 |
+
adapted from https://github.com/amyxlase/relpose-plus-plus
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
new_transform = new_cameras.get_world_to_view_transform()
|
| 102 |
+
p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection(new_cameras)
|
| 103 |
+
t = Translate(p_intersect)
|
| 104 |
+
scale = dist.squeeze()[0]
|
| 105 |
+
|
| 106 |
+
if points is not None:
|
| 107 |
+
points = t.inverse().transform_points(points)
|
| 108 |
+
points = points / scale
|
| 109 |
+
|
| 110 |
+
# Degenerate case
|
| 111 |
+
if scale == 0:
|
| 112 |
+
scale = torch.norm(new_cameras.T, dim=(0, 1))
|
| 113 |
+
scale = torch.sqrt(scale)
|
| 114 |
+
new_cameras.T = new_cameras.T / scale
|
| 115 |
+
else:
|
| 116 |
+
new_matrix = t.compose(new_transform).get_matrix()
|
| 117 |
+
new_cameras.R = new_matrix[:, :3, :3]
|
| 118 |
+
new_cameras.T = new_matrix[:, 3, :3] / scale
|
| 119 |
+
|
| 120 |
+
return new_cameras, points
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def compute_optical_axis_intersection(cameras):
|
| 124 |
+
centers = cameras.get_camera_center()
|
| 125 |
+
principal_points = cameras.principal_point
|
| 126 |
+
|
| 127 |
+
one_vec = torch.ones((len(cameras), 1))
|
| 128 |
+
optical_axis = torch.cat((principal_points, one_vec), -1)
|
| 129 |
+
|
| 130 |
+
pp = cameras.unproject_points(optical_axis, from_ndc=True, world_coordinates=True)
|
| 131 |
+
|
| 132 |
+
pp2 = pp[torch.arange(pp.shape[0]), torch.arange(pp.shape[0])]
|
| 133 |
+
|
| 134 |
+
directions = pp2 - centers
|
| 135 |
+
centers = centers.unsqueeze(0).unsqueeze(0)
|
| 136 |
+
directions = directions.unsqueeze(0).unsqueeze(0)
|
| 137 |
+
|
| 138 |
+
p_intersect, p_line_intersect, _, r = intersect_skew_line_groups(p=centers, r=directions, mask=None)
|
| 139 |
+
|
| 140 |
+
p_intersect = p_intersect.squeeze().unsqueeze(0)
|
| 141 |
+
dist = (p_intersect - centers).norm(dim=-1)
|
| 142 |
+
|
| 143 |
+
return p_intersect, dist, p_line_intersect, pp2, r
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def intersect_skew_line_groups(p, r, mask):
|
| 147 |
+
# p, r both of shape (B, N, n_intersected_lines, 3)
|
| 148 |
+
# mask of shape (B, N, n_intersected_lines)
|
| 149 |
+
p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask)
|
| 150 |
+
_, p_line_intersect = _point_line_distance(p, r, p_intersect[..., None, :].expand_as(p))
|
| 151 |
+
intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum(dim=-1)
|
| 152 |
+
return p_intersect, p_line_intersect, intersect_dist_squared, r
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def intersect_skew_lines_high_dim(p, r, mask=None):
|
| 156 |
+
# Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions
|
| 157 |
+
dim = p.shape[-1]
|
| 158 |
+
# make sure the heading vectors are l2-normed
|
| 159 |
+
if mask is None:
|
| 160 |
+
mask = torch.ones_like(p[..., 0])
|
| 161 |
+
r = torch.nn.functional.normalize(r, dim=-1)
|
| 162 |
+
|
| 163 |
+
eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None]
|
| 164 |
+
I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None]
|
| 165 |
+
sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3)
|
| 166 |
+
p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]
|
| 167 |
+
|
| 168 |
+
if torch.any(torch.isnan(p_intersect)):
|
| 169 |
+
print(p_intersect)
|
| 170 |
+
raise ValueError(f"p_intersect is NaN")
|
| 171 |
+
|
| 172 |
+
return p_intersect, r
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
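# Illustrative check (values assumed): intersect_skew_lines_high_dim solves the
# normal equations (sum_i (I - r_i r_i^T)) p = sum_i (I - r_i r_i^T) p_i, i.e.
# the least-squares point closest to all lines. For two lines that actually
# intersect, that point is the intersection:
#
#   p = torch.tensor([[[[0.0, 0.0, 0.0], [1.0, 1.0, 0.0]]]])   # (B, N, n_lines, 3)
#   r = torch.tensor([[[[1.0, 1.0, 0.0], [1.0, -1.0, 0.0]]]])  # line directions
#   p_star, _ = intersect_skew_lines_high_dim(p, r)
#   # p_star ~= (1, 1, 0), the common point of the two lines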
def _point_line_distance(p1, r1, p2):
|
| 176 |
+
df = p2 - p1
|
| 177 |
+
proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1)
|
| 178 |
+
line_pt_nearest = p2 - proj_vector
|
| 179 |
+
d = (proj_vector).norm(dim=-1)
|
| 180 |
+
return d, line_pt_nearest
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def first_camera_transform(cameras, rotation_only=False,
|
| 184 |
+
points=None, pose_mode="C2W"):
|
| 185 |
+
"""
|
| 186 |
+
Transform so that the first camera is the origin
|
| 187 |
+
"""
|
| 188 |
+
|
| 189 |
+
new_cameras = cameras.clone()
|
| 190 |
+
# new_transform = new_cameras.get_world_to_view_transform()
|
| 191 |
+
|
| 192 |
+
R = cameras.R
|
| 193 |
+
T = cameras.T
|
| 194 |
+
Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B, 3, 4]
|
| 195 |
+
Tran_M = torch.cat([Tran_M,
|
| 196 |
+
torch.tensor([[[0, 0, 0, 1]]], device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)], dim=1)
|
| 197 |
+
if pose_mode == "C2W":
|
| 198 |
+
Tran_M_new = (Tran_M[:1,...].inverse())@Tran_M
|
| 199 |
+
elif pose_mode == "W2C":
|
| 200 |
+
Tran_M_new = Tran_M@(Tran_M[:1,...].inverse())
|
| 201 |
+
|
| 202 |
+
if False:
|
| 203 |
+
tR = Rotate(new_cameras.R[0].unsqueeze(0))
|
| 204 |
+
if rotation_only:
|
| 205 |
+
t = tR.inverse()
|
| 206 |
+
else:
|
| 207 |
+
tT = Translate(new_cameras.T[0].unsqueeze(0))
|
| 208 |
+
t = tR.compose(tT).inverse()
|
| 209 |
+
|
| 210 |
+
if points is not None:
|
| 211 |
+
points = t.inverse().transform_points(points)
|
| 212 |
+
|
| 213 |
+
if pose_mode == "C2W":
|
| 214 |
+
new_matrix = new_transform.compose(t).get_matrix()
|
| 215 |
+
else:
|
| 216 |
+
import ipdb; ipdb.set_trace()
|
| 217 |
+
new_matrix = t.compose(new_transform).get_matrix()
|
| 218 |
+
|
| 219 |
+
new_cameras.R = Tran_M_new[:, :3, :3]
|
| 220 |
+
new_cameras.T = Tran_M_new[:, :3, 3]
|
| 221 |
+
|
| 222 |
+
return new_cameras, points
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def normalize_translation(new_cameras, points=None, max_norm=False):
|
| 226 |
+
t_gt = new_cameras.T.clone()
|
| 227 |
+
t_gt = t_gt[1:, :]
|
| 228 |
+
|
| 229 |
+
if max_norm:
|
| 230 |
+
t_gt_norm = torch.norm(t_gt, dim=(-1))
|
| 231 |
+
t_gt_scale = t_gt_norm.max()
|
| 232 |
+
if t_gt_norm.max() < 0.001:
|
| 233 |
+
t_gt_scale = torch.ones_like(t_gt_scale)
|
| 234 |
+
t_gt_scale = t_gt_scale.clamp(min=0.01, max=1e5)
|
| 235 |
+
else:
|
| 236 |
+
t_gt_norm = torch.norm(t_gt, dim=(0, 1))
|
| 237 |
+
t_gt_scale = t_gt_norm / math.sqrt(len(t_gt))
|
| 238 |
+
t_gt_scale = t_gt_scale / 2
|
| 239 |
+
if t_gt_norm.max() < 0.001:
|
| 240 |
+
t_gt_scale = torch.ones_like(t_gt_scale)
|
| 241 |
+
t_gt_scale = t_gt_scale.clamp(min=0.01, max=1e5)
|
| 242 |
+
|
| 243 |
+
new_cameras.T = new_cameras.T / t_gt_scale
|
| 244 |
+
|
| 245 |
+
if points is not None:
|
| 246 |
+
points = points / t_gt_scale
|
| 247 |
+
|
| 248 |
+
return new_cameras, points, t_gt_scale
|
models/SpaTrackV2/models/depth_refiner/backbone.py
ADDED
|
@@ -0,0 +1,472 @@
| 1 |
+
# ---------------------------------------------------------------
|
| 2 |
+
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This work is licensed under the NVIDIA Source Code License
|
| 5 |
+
# ---------------------------------------------------------------
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from functools import partial
|
| 10 |
+
|
| 11 |
+
from timm.layers import DropPath, to_2tuple, trunc_normal_
|
| 12 |
+
from timm.models import register_model
|
| 13 |
+
from timm.models.vision_transformer import _cfg
|
| 14 |
+
import math
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Mlp(nn.Module):
|
| 18 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
| 19 |
+
super().__init__()
|
| 20 |
+
out_features = out_features or in_features
|
| 21 |
+
hidden_features = hidden_features or in_features
|
| 22 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 23 |
+
self.dwconv = DWConv(hidden_features)
|
| 24 |
+
self.act = act_layer()
|
| 25 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 26 |
+
self.drop = nn.Dropout(drop)
|
| 27 |
+
|
| 28 |
+
self.apply(self._init_weights)
|
| 29 |
+
|
| 30 |
+
def _init_weights(self, m):
|
| 31 |
+
if isinstance(m, nn.Linear):
|
| 32 |
+
trunc_normal_(m.weight, std=.02)
|
| 33 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 34 |
+
nn.init.constant_(m.bias, 0)
|
| 35 |
+
elif isinstance(m, nn.LayerNorm):
|
| 36 |
+
nn.init.constant_(m.bias, 0)
|
| 37 |
+
nn.init.constant_(m.weight, 1.0)
|
| 38 |
+
elif isinstance(m, nn.Conv2d):
|
| 39 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 40 |
+
fan_out //= m.groups
|
| 41 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 42 |
+
if m.bias is not None:
|
| 43 |
+
m.bias.data.zero_()
|
| 44 |
+
|
| 45 |
+
def forward(self, x, H, W):
|
| 46 |
+
x = self.fc1(x)
|
| 47 |
+
x = self.dwconv(x, H, W)
|
| 48 |
+
x = self.act(x)
|
| 49 |
+
x = self.drop(x)
|
| 50 |
+
x = self.fc2(x)
|
| 51 |
+
x = self.drop(x)
|
| 52 |
+
return x
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class Attention(nn.Module):
|
| 56 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
|
| 57 |
+
super().__init__()
|
| 58 |
+
assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
|
| 59 |
+
|
| 60 |
+
self.dim = dim
|
| 61 |
+
self.num_heads = num_heads
|
| 62 |
+
head_dim = dim // num_heads
|
| 63 |
+
self.scale = qk_scale or head_dim ** -0.5
|
| 64 |
+
|
| 65 |
+
self.q = nn.Linear(dim, dim, bias=qkv_bias)
|
| 66 |
+
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
|
| 67 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 68 |
+
self.proj = nn.Linear(dim, dim)
|
| 69 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 70 |
+
|
| 71 |
+
self.sr_ratio = sr_ratio
|
| 72 |
+
if sr_ratio > 1:
|
| 73 |
+
self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
|
| 74 |
+
self.norm = nn.LayerNorm(dim)
|
| 75 |
+
|
| 76 |
+
self.apply(self._init_weights)
|
| 77 |
+
|
| 78 |
+
def _init_weights(self, m):
|
| 79 |
+
if isinstance(m, nn.Linear):
|
| 80 |
+
trunc_normal_(m.weight, std=.02)
|
| 81 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 82 |
+
nn.init.constant_(m.bias, 0)
|
| 83 |
+
elif isinstance(m, nn.LayerNorm):
|
| 84 |
+
nn.init.constant_(m.bias, 0)
|
| 85 |
+
nn.init.constant_(m.weight, 1.0)
|
| 86 |
+
elif isinstance(m, nn.Conv2d):
|
| 87 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 88 |
+
fan_out //= m.groups
|
| 89 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 90 |
+
if m.bias is not None:
|
| 91 |
+
m.bias.data.zero_()
|
| 92 |
+
|
| 93 |
+
def forward(self, x, H, W):
|
| 94 |
+
B, N, C = x.shape
|
| 95 |
+
q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
| 96 |
+
|
| 97 |
+
if self.sr_ratio > 1:
|
| 98 |
+
x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
|
| 99 |
+
x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
|
| 100 |
+
x_ = self.norm(x_)
|
| 101 |
+
kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 102 |
+
else:
|
| 103 |
+
kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 104 |
+
k, v = kv[0], kv[1]
|
| 105 |
+
|
| 106 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
| 107 |
+
attn = attn.softmax(dim=-1)
|
| 108 |
+
attn = self.attn_drop(attn)
|
| 109 |
+
|
| 110 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 111 |
+
x = self.proj(x)
|
| 112 |
+
x = self.proj_drop(x)
|
| 113 |
+
|
| 114 |
+
return x
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
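# Illustrative note (shapes assumed): with sr_ratio > 1 the keys/values are
# computed from a spatially reduced map, cutting attention cost roughly by
# sr_ratio**2 while queries keep full resolution.
#
#   attn = Attention(dim=64, num_heads=1, sr_ratio=8)
#   x = torch.randn(2, 56 * 56, 64)      # tokens of a 56x56 feature map
#   y = attn(x, H=56, W=56)              # keys/values come from a 7x7 map (49 tokens)
#   # y: (2, 3136, 64)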
class Block(nn.Module):
|
| 118 |
+
|
| 119 |
+
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
| 120 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
|
| 121 |
+
super().__init__()
|
| 122 |
+
self.norm1 = norm_layer(dim)
|
| 123 |
+
self.attn = Attention(
|
| 124 |
+
dim,
|
| 125 |
+
num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
| 126 |
+
attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
|
| 127 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
| 128 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
| 129 |
+
self.norm2 = norm_layer(dim)
|
| 130 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 131 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
| 132 |
+
|
| 133 |
+
self.apply(self._init_weights)
|
| 134 |
+
|
| 135 |
+
def _init_weights(self, m):
|
| 136 |
+
if isinstance(m, nn.Linear):
|
| 137 |
+
trunc_normal_(m.weight, std=.02)
|
| 138 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 139 |
+
nn.init.constant_(m.bias, 0)
|
| 140 |
+
elif isinstance(m, nn.LayerNorm):
|
| 141 |
+
nn.init.constant_(m.bias, 0)
|
| 142 |
+
nn.init.constant_(m.weight, 1.0)
|
| 143 |
+
elif isinstance(m, nn.Conv2d):
|
| 144 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 145 |
+
fan_out //= m.groups
|
| 146 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 147 |
+
if m.bias is not None:
|
| 148 |
+
m.bias.data.zero_()
|
| 149 |
+
|
| 150 |
+
def forward(self, x, H, W):
|
| 151 |
+
x = x + self.drop_path(self.attn(self.norm1(x), H, W))
|
| 152 |
+
x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
|
| 153 |
+
|
| 154 |
+
return x
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class OverlapPatchEmbed(nn.Module):
|
| 158 |
+
""" Image to Patch Embedding
|
| 159 |
+
"""
|
| 160 |
+
|
| 161 |
+
def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
|
| 162 |
+
super().__init__()
|
| 163 |
+
img_size = to_2tuple(img_size)
|
| 164 |
+
patch_size = to_2tuple(patch_size)
|
| 165 |
+
|
| 166 |
+
self.img_size = img_size
|
| 167 |
+
self.patch_size = patch_size
|
| 168 |
+
self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
|
| 169 |
+
self.num_patches = self.H * self.W
|
| 170 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
|
| 171 |
+
padding=(patch_size[0] // 2, patch_size[1] // 2))
|
| 172 |
+
self.norm = nn.LayerNorm(embed_dim)
|
| 173 |
+
|
| 174 |
+
self.apply(self._init_weights)
|
| 175 |
+
|
| 176 |
+
def _init_weights(self, m):
|
| 177 |
+
if isinstance(m, nn.Linear):
|
| 178 |
+
trunc_normal_(m.weight, std=.02)
|
| 179 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 180 |
+
nn.init.constant_(m.bias, 0)
|
| 181 |
+
elif isinstance(m, nn.LayerNorm):
|
| 182 |
+
nn.init.constant_(m.bias, 0)
|
| 183 |
+
nn.init.constant_(m.weight, 1.0)
|
| 184 |
+
elif isinstance(m, nn.Conv2d):
|
| 185 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 186 |
+
fan_out //= m.groups
|
| 187 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 188 |
+
if m.bias is not None:
|
| 189 |
+
m.bias.data.zero_()
|
| 190 |
+
|
| 191 |
+
def forward(self, x):
|
| 192 |
+
x = self.proj(x)
|
| 193 |
+
_, _, H, W = x.shape
|
| 194 |
+
x = x.flatten(2).transpose(1, 2)
|
| 195 |
+
x = self.norm(x)
|
| 196 |
+
|
| 197 |
+
return x, H, W
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class OverlapPatchEmbed43(nn.Module):
|
| 203 |
+
""" Image to Patch Embedding
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
|
| 207 |
+
super().__init__()
|
| 208 |
+
img_size = to_2tuple(img_size)
|
| 209 |
+
patch_size = to_2tuple(patch_size)
|
| 210 |
+
|
| 211 |
+
self.img_size = img_size
|
| 212 |
+
self.patch_size = patch_size
|
| 213 |
+
self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
|
| 214 |
+
self.num_patches = self.H * self.W
|
| 215 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
|
| 216 |
+
padding=(patch_size[0] // 2, patch_size[1] // 2))
|
| 217 |
+
self.norm = nn.LayerNorm(embed_dim)
|
| 218 |
+
|
| 219 |
+
self.apply(self._init_weights)
|
| 220 |
+
|
| 221 |
+
def _init_weights(self, m):
|
| 222 |
+
if isinstance(m, nn.Linear):
|
| 223 |
+
trunc_normal_(m.weight, std=.02)
|
| 224 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 225 |
+
nn.init.constant_(m.bias, 0)
|
| 226 |
+
elif isinstance(m, nn.LayerNorm):
|
| 227 |
+
nn.init.constant_(m.bias, 0)
|
| 228 |
+
nn.init.constant_(m.weight, 1.0)
|
| 229 |
+
elif isinstance(m, nn.Conv2d):
|
| 230 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 231 |
+
fan_out //= m.groups
|
| 232 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 233 |
+
if m.bias is not None:
|
| 234 |
+
m.bias.data.zero_()
|
| 235 |
+
|
| 236 |
+
def forward(self, x):
|
| 237 |
+
if x.shape[1]==4:
|
| 238 |
+
x = self.proj_4c(x)
|
| 239 |
+
else:
|
| 240 |
+
x = self.proj(x)
|
| 241 |
+
_, _, H, W = x.shape
|
| 242 |
+
x = x.flatten(2).transpose(1, 2)
|
| 243 |
+
x = self.norm(x)
|
| 244 |
+
|
| 245 |
+
return x, H, W
|
| 246 |
+
|
| 247 |
+
class MixVisionTransformer(nn.Module):
|
| 248 |
+
def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
|
| 249 |
+
num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
|
| 250 |
+
attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
|
| 251 |
+
depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
|
| 252 |
+
super().__init__()
|
| 253 |
+
self.num_classes = num_classes
|
| 254 |
+
self.depths = depths
|
| 255 |
+
|
| 256 |
+
# patch_embed 43
|
| 257 |
+
self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
|
| 258 |
+
embed_dim=embed_dims[0])
|
| 259 |
+
self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
|
| 260 |
+
embed_dim=embed_dims[1])
|
| 261 |
+
self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
|
| 262 |
+
embed_dim=embed_dims[2])
|
| 263 |
+
self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
|
| 264 |
+
embed_dim=embed_dims[3])
|
| 265 |
+
|
| 266 |
+
# transformer encoder
|
| 267 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
|
| 268 |
+
cur = 0
|
| 269 |
+
self.block1 = nn.ModuleList([Block(
|
| 270 |
+
dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
|
| 271 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
|
| 272 |
+
sr_ratio=sr_ratios[0])
|
| 273 |
+
for i in range(depths[0])])
|
| 274 |
+
self.norm1 = norm_layer(embed_dims[0])
|
| 275 |
+
|
| 276 |
+
cur += depths[0]
|
| 277 |
+
self.block2 = nn.ModuleList([Block(
|
| 278 |
+
dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
|
| 279 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
|
| 280 |
+
sr_ratio=sr_ratios[1])
|
| 281 |
+
for i in range(depths[1])])
|
| 282 |
+
self.norm2 = norm_layer(embed_dims[1])
|
| 283 |
+
|
| 284 |
+
cur += depths[1]
|
| 285 |
+
self.block3 = nn.ModuleList([Block(
|
| 286 |
+
dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
|
| 287 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
|
| 288 |
+
sr_ratio=sr_ratios[2])
|
| 289 |
+
for i in range(depths[2])])
|
| 290 |
+
self.norm3 = norm_layer(embed_dims[2])
|
| 291 |
+
|
| 292 |
+
cur += depths[2]
|
| 293 |
+
self.block4 = nn.ModuleList([Block(
|
| 294 |
+
dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
|
| 295 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
|
| 296 |
+
sr_ratio=sr_ratios[3])
|
| 297 |
+
for i in range(depths[3])])
|
| 298 |
+
self.norm4 = norm_layer(embed_dims[3])
|
| 299 |
+
|
| 300 |
+
# classification head
|
| 301 |
+
# self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
|
| 302 |
+
|
| 303 |
+
self.apply(self._init_weights)
|
| 304 |
+
|
| 305 |
+
def _init_weights(self, m):
|
| 306 |
+
if isinstance(m, nn.Linear):
|
| 307 |
+
trunc_normal_(m.weight, std=.02)
|
| 308 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 309 |
+
nn.init.constant_(m.bias, 0)
|
| 310 |
+
elif isinstance(m, nn.LayerNorm):
|
| 311 |
+
nn.init.constant_(m.bias, 0)
|
| 312 |
+
nn.init.constant_(m.weight, 1.0)
|
| 313 |
+
elif isinstance(m, nn.Conv2d):
|
| 314 |
+
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 315 |
+
fan_out //= m.groups
|
| 316 |
+
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
|
| 317 |
+
if m.bias is not None:
|
| 318 |
+
m.bias.data.zero_()
|
| 319 |
+
|
| 320 |
+
def init_weights(self, pretrained=None):
|
| 321 |
+
if isinstance(pretrained, str):
|
| 322 |
+
logger = get_root_logger()
|
| 323 |
+
load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
|
| 324 |
+
|
| 325 |
+
def reset_drop_path(self, drop_path_rate):
|
| 326 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
|
| 327 |
+
cur = 0
|
| 328 |
+
for i in range(self.depths[0]):
|
| 329 |
+
self.block1[i].drop_path.drop_prob = dpr[cur + i]
|
| 330 |
+
|
| 331 |
+
cur += self.depths[0]
|
| 332 |
+
for i in range(self.depths[1]):
|
| 333 |
+
self.block2[i].drop_path.drop_prob = dpr[cur + i]
|
| 334 |
+
|
| 335 |
+
cur += self.depths[1]
|
| 336 |
+
for i in range(self.depths[2]):
|
| 337 |
+
self.block3[i].drop_path.drop_prob = dpr[cur + i]
|
| 338 |
+
|
| 339 |
+
cur += self.depths[2]
|
| 340 |
+
for i in range(self.depths[3]):
|
| 341 |
+
self.block4[i].drop_path.drop_prob = dpr[cur + i]
|
| 342 |
+
|
| 343 |
+
def freeze_patch_emb(self):
|
| 344 |
+
self.patch_embed1.requires_grad = False
|
| 345 |
+
|
| 346 |
+
@torch.jit.ignore
|
| 347 |
+
def no_weight_decay(self):
|
| 348 |
+
return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
|
| 349 |
+
|
| 350 |
+
def get_classifier(self):
|
| 351 |
+
return self.head
|
| 352 |
+
|
| 353 |
+
def reset_classifier(self, num_classes, global_pool=''):
|
| 354 |
+
self.num_classes = num_classes
|
| 355 |
+
self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
|
| 356 |
+
|
| 357 |
+
def forward_features(self, x):
|
| 358 |
+
B = x.shape[0]
|
| 359 |
+
outs = []
|
| 360 |
+
|
| 361 |
+
# stage 1
|
| 362 |
+
x, H, W = self.patch_embed1(x)
|
| 363 |
+
for i, blk in enumerate(self.block1):
|
| 364 |
+
x = blk(x, H, W)
|
| 365 |
+
x = self.norm1(x)
|
| 366 |
+
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
|
| 367 |
+
outs.append(x)
|
| 368 |
+
|
| 369 |
+
# stage 2
|
| 370 |
+
x, H, W = self.patch_embed2(x)
|
| 371 |
+
for i, blk in enumerate(self.block2):
|
| 372 |
+
x = blk(x, H, W)
|
| 373 |
+
x = self.norm2(x)
|
| 374 |
+
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
|
| 375 |
+
outs.append(x)
|
| 376 |
+
|
| 377 |
+
# stage 3
|
| 378 |
+
x, H, W = self.patch_embed3(x)
|
| 379 |
+
for i, blk in enumerate(self.block3):
|
| 380 |
+
x = blk(x, H, W)
|
| 381 |
+
x = self.norm3(x)
|
| 382 |
+
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
|
| 383 |
+
outs.append(x)
|
| 384 |
+
|
| 385 |
+
# stage 4
|
| 386 |
+
x, H, W = self.patch_embed4(x)
|
| 387 |
+
for i, blk in enumerate(self.block4):
|
| 388 |
+
x = blk(x, H, W)
|
| 389 |
+
x = self.norm4(x)
|
| 390 |
+
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
|
| 391 |
+
outs.append(x)
|
| 392 |
+
|
| 393 |
+
return outs
|
| 394 |
+
|
| 395 |
+
def forward(self, x):
|
| 396 |
+
if x.dim() == 5:
|
| 397 |
+
x = x.reshape(x.shape[0]*x.shape[1],x.shape[2],x.shape[3],x.shape[4])
|
| 398 |
+
x = self.forward_features(x)
|
| 399 |
+
# x = self.head(x)
|
| 400 |
+
|
| 401 |
+
return x
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)

        return x


#@BACKBONES.register_module()
|
| 420 |
+
class mit_b0(MixVisionTransformer):
|
| 421 |
+
def __init__(self, **kwargs):
|
| 422 |
+
super(mit_b0, self).__init__(
|
| 423 |
+
patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 424 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
|
| 425 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
#@BACKBONES.register_module()
|
| 429 |
+
class mit_b1(MixVisionTransformer):
|
| 430 |
+
def __init__(self, **kwargs):
|
| 431 |
+
super(mit_b1, self).__init__(
|
| 432 |
+
patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 433 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
|
| 434 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
#@BACKBONES.register_module()
|
| 438 |
+
class mit_b2(MixVisionTransformer):
|
| 439 |
+
def __init__(self, **kwargs):
|
| 440 |
+
super(mit_b2, self).__init__(
|
| 441 |
+
patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 442 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
|
| 443 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
#@BACKBONES.register_module()
|
| 447 |
+
class mit_b3(MixVisionTransformer):
|
| 448 |
+
def __init__(self, **kwargs):
|
| 449 |
+
super(mit_b3, self).__init__(
|
| 450 |
+
patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 451 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
|
| 452 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
#@BACKBONES.register_module()
|
| 456 |
+
class mit_b4(MixVisionTransformer):
|
| 457 |
+
def __init__(self, **kwargs):
|
| 458 |
+
super(mit_b4, self).__init__(
|
| 459 |
+
patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 460 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
|
| 461 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
#@BACKBONES.register_module()
|
| 465 |
+
class mit_b5(MixVisionTransformer):
|
| 466 |
+
def __init__(self, **kwargs):
|
| 467 |
+
super(mit_b5, self).__init__(
|
| 468 |
+
patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
|
| 469 |
+
qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
|
| 470 |
+
drop_rate=0.0, drop_path_rate=0.1)
|
| 471 |
+
|
| 472 |
+
|
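# Illustrative usage sketch (shapes assumed for a 224x224 input): every mit_bX
# variant above returns a 4-level feature pyramid at strides 4 / 8 / 16 / 32.
#
#   backbone = mit_b2()
#   feats = backbone(torch.randn(1, 3, 224, 224))
#   # feats[0]: (1, 64, 56, 56)    stride 4
#   # feats[1]: (1, 128, 28, 28)   stride 8
#   # feats[2]: (1, 320, 14, 14)   stride 16
#   # feats[3]: (1, 512, 7, 7)     stride 32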
models/SpaTrackV2/models/depth_refiner/decode_head.py
ADDED
|
@@ -0,0 +1,619 @@
| 1 |
+
from abc import ABCMeta, abstractmethod
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
# from mmcv.cnn import normal_init
|
| 7 |
+
# from mmcv.runner import auto_fp16, force_fp32
|
| 8 |
+
|
| 9 |
+
# from mmseg.core import build_pixel_sampler
|
| 10 |
+
# from mmseg.ops import resize
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
|
| 14 |
+
"""Base class for BaseDecodeHead.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
in_channels (int|Sequence[int]): Input channels.
|
| 18 |
+
channels (int): Channels after modules, before conv_seg.
|
| 19 |
+
num_classes (int): Number of classes.
|
| 20 |
+
dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
|
| 21 |
+
conv_cfg (dict|None): Config of conv layers. Default: None.
|
| 22 |
+
norm_cfg (dict|None): Config of norm layers. Default: None.
|
| 23 |
+
act_cfg (dict): Config of activation layers.
|
| 24 |
+
Default: dict(type='ReLU')
|
| 25 |
+
in_index (int|Sequence[int]): Input feature index. Default: -1
|
| 26 |
+
input_transform (str|None): Transformation type of input features.
|
| 27 |
+
Options: 'resize_concat', 'multiple_select', None.
|
| 28 |
+
'resize_concat': Multiple feature maps will be resize to the
|
| 29 |
+
same size as first one and than concat together.
|
| 30 |
+
Usually used in FCN head of HRNet.
|
| 31 |
+
'multiple_select': Multiple feature maps will be bundle into
|
| 32 |
+
a list and passed into decode head.
|
| 33 |
+
None: Only one select feature map is allowed.
|
| 34 |
+
Default: None.
|
| 35 |
+
loss_decode (dict): Config of decode loss.
|
| 36 |
+
Default: dict(type='CrossEntropyLoss').
|
| 37 |
+
ignore_index (int | None): The label index to be ignored. When using
|
| 38 |
+
masked BCE loss, ignore_index should be set to None. Default: 255
|
| 39 |
+
sampler (dict|None): The config of segmentation map sampler.
|
| 40 |
+
Default: None.
|
| 41 |
+
align_corners (bool): align_corners argument of F.interpolate.
|
| 42 |
+
Default: False.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(self,
|
| 46 |
+
in_channels,
|
| 47 |
+
channels,
|
| 48 |
+
*,
|
| 49 |
+
num_classes,
|
| 50 |
+
dropout_ratio=0.1,
|
| 51 |
+
conv_cfg=None,
|
| 52 |
+
norm_cfg=None,
|
| 53 |
+
act_cfg=dict(type='ReLU'),
|
| 54 |
+
in_index=-1,
|
| 55 |
+
input_transform=None,
|
| 56 |
+
loss_decode=dict(
|
| 57 |
+
type='CrossEntropyLoss',
|
| 58 |
+
use_sigmoid=False,
|
| 59 |
+
loss_weight=1.0),
|
| 60 |
+
decoder_params=None,
|
| 61 |
+
ignore_index=255,
|
| 62 |
+
sampler=None,
|
| 63 |
+
align_corners=False):
|
| 64 |
+
super(BaseDecodeHead, self).__init__()
|
| 65 |
+
self._init_inputs(in_channels, in_index, input_transform)
|
| 66 |
+
self.channels = channels
|
| 67 |
+
self.num_classes = num_classes
|
| 68 |
+
self.dropout_ratio = dropout_ratio
|
| 69 |
+
self.conv_cfg = conv_cfg
|
| 70 |
+
self.norm_cfg = norm_cfg
|
| 71 |
+
self.act_cfg = act_cfg
|
| 72 |
+
self.in_index = in_index
|
| 73 |
+
self.ignore_index = ignore_index
|
| 74 |
+
self.align_corners = align_corners
|
| 75 |
+
|
| 76 |
+
if sampler is not None:
|
| 77 |
+
self.sampler = build_pixel_sampler(sampler, context=self)
|
| 78 |
+
else:
|
| 79 |
+
self.sampler = None
|
| 80 |
+
|
| 81 |
+
self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
|
| 82 |
+
if dropout_ratio > 0:
|
| 83 |
+
self.dropout = nn.Dropout2d(dropout_ratio)
|
| 84 |
+
else:
|
| 85 |
+
self.dropout = None
|
| 86 |
+
self.fp16_enabled = False
|
| 87 |
+
|
| 88 |
+
def extra_repr(self):
|
| 89 |
+
"""Extra repr."""
|
| 90 |
+
s = f'input_transform={self.input_transform}, ' \
|
| 91 |
+
f'ignore_index={self.ignore_index}, ' \
|
| 92 |
+
f'align_corners={self.align_corners}'
|
| 93 |
+
return s
|
| 94 |
+
|
| 95 |
+
def _init_inputs(self, in_channels, in_index, input_transform):
|
| 96 |
+
"""Check and initialize input transforms.
|
| 97 |
+
|
| 98 |
+
The in_channels, in_index and input_transform must match.
|
| 99 |
+
Specifically, when input_transform is None, only single feature map
|
| 100 |
+
will be selected. So in_channels and in_index must be of type int.
|
| 101 |
+
When input_transform
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
in_channels (int|Sequence[int]): Input channels.
|
| 105 |
+
in_index (int|Sequence[int]): Input feature index.
|
| 106 |
+
input_transform (str|None): Transformation type of input features.
|
| 107 |
+
Options: 'resize_concat', 'multiple_select', None.
|
| 108 |
+
'resize_concat': Multiple feature maps will be resize to the
|
| 109 |
+
same size as first one and than concat together.
|
| 110 |
+
Usually used in FCN head of HRNet.
|
| 111 |
+
'multiple_select': Multiple feature maps will be bundle into
|
| 112 |
+
a list and passed into decode head.
|
| 113 |
+
None: Only one select feature map is allowed.
|
| 114 |
+
"""
|
| 115 |
+
|
| 116 |
+
if input_transform is not None:
|
| 117 |
+
assert input_transform in ['resize_concat', 'multiple_select']
|
| 118 |
+
self.input_transform = input_transform
|
| 119 |
+
self.in_index = in_index
|
| 120 |
+
if input_transform is not None:
|
| 121 |
+
assert isinstance(in_channels, (list, tuple))
|
| 122 |
+
assert isinstance(in_index, (list, tuple))
|
| 123 |
+
assert len(in_channels) == len(in_index)
|
| 124 |
+
if input_transform == 'resize_concat':
|
| 125 |
+
self.in_channels = sum(in_channels)
|
| 126 |
+
else:
|
| 127 |
+
self.in_channels = in_channels
|
| 128 |
+
else:
|
| 129 |
+
assert isinstance(in_channels, int)
|
| 130 |
+
assert isinstance(in_index, int)
|
| 131 |
+
self.in_channels = in_channels
|
| 132 |
+
|
| 133 |
+
def init_weights(self):
|
| 134 |
+
"""Initialize weights of classification layer."""
|
| 135 |
+
normal_init(self.conv_seg, mean=0, std=0.01)
|
| 136 |
+
|
| 137 |
+
def _transform_inputs(self, inputs):
|
| 138 |
+
"""Transform inputs for decoder.
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 142 |
+
|
| 143 |
+
Returns:
|
| 144 |
+
Tensor: The transformed inputs
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
if self.input_transform == 'resize_concat':
|
| 148 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 149 |
+
upsampled_inputs = [
|
| 150 |
+
resize(
|
| 151 |
+
input=x,
|
| 152 |
+
size=inputs[0].shape[2:],
|
| 153 |
+
mode='bilinear',
|
| 154 |
+
align_corners=self.align_corners) for x in inputs
|
| 155 |
+
]
|
| 156 |
+
inputs = torch.cat(upsampled_inputs, dim=1)
|
| 157 |
+
elif self.input_transform == 'multiple_select':
|
| 158 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 159 |
+
else:
|
| 160 |
+
inputs = inputs[self.in_index]
|
| 161 |
+
|
| 162 |
+
return inputs
|
| 163 |
+
|
| 164 |
+
# @auto_fp16()
|
| 165 |
+
@abstractmethod
|
| 166 |
+
def forward(self, inputs):
|
| 167 |
+
"""Placeholder of forward function."""
|
| 168 |
+
pass
|
| 169 |
+
|
| 170 |
+
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
|
| 171 |
+
"""Forward function for training.
|
| 172 |
+
Args:
|
| 173 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 174 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 175 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 176 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 177 |
+
For details on the values of these keys see
|
| 178 |
+
`mmseg/datasets/pipelines/formatting.py:Collect`.
|
| 179 |
+
gt_semantic_seg (Tensor): Semantic segmentation masks
|
| 180 |
+
used if the architecture supports semantic segmentation task.
|
| 181 |
+
train_cfg (dict): The training config.
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
dict[str, Tensor]: a dictionary of loss components
|
| 185 |
+
"""
|
| 186 |
+
seg_logits = self.forward(inputs)
|
| 187 |
+
losses = self.losses(seg_logits, gt_semantic_seg)
|
| 188 |
+
return losses
|
| 189 |
+
|
| 190 |
+
def forward_test(self, inputs, img_metas, test_cfg):
|
| 191 |
+
"""Forward function for testing.
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 195 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 196 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 197 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 198 |
+
For details on the values of these keys see
|
| 199 |
+
`mmseg/datasets/pipelines/formatting.py:Collect`.
|
| 200 |
+
test_cfg (dict): The testing config.
|
| 201 |
+
|
| 202 |
+
Returns:
|
| 203 |
+
Tensor: Output segmentation map.
|
| 204 |
+
"""
|
| 205 |
+
return self.forward(inputs)
|
| 206 |
+
|
| 207 |
+
def cls_seg(self, feat):
|
| 208 |
+
"""Classify each pixel."""
|
| 209 |
+
if self.dropout is not None:
|
| 210 |
+
feat = self.dropout(feat)
|
| 211 |
+
output = self.conv_seg(feat)
|
| 212 |
+
return output
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
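# Illustrative sketch (shapes assumed): what the 'resize_concat' input
# transform amounts to, written with plain F.interpolate in place of mmseg's
# resize(): upsample every selected level to the first level's size, then
# concatenate along channels before cls_seg.
#
#   feats = [torch.randn(1, 64, 56, 56), torch.randn(1, 128, 28, 28),
#            torch.randn(1, 320, 14, 14)]
#   up = [F.interpolate(f, size=feats[0].shape[2:], mode='bilinear',
#                       align_corners=False) for f in feats]
#   fused = torch.cat(up, dim=1)          # (1, 64 + 128 + 320, 56, 56)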
class BaseDecodeHead_clips(nn.Module, metaclass=ABCMeta):
|
| 216 |
+
"""Base class for BaseDecodeHead_clips.
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
in_channels (int|Sequence[int]): Input channels.
|
| 220 |
+
channels (int): Channels after modules, before conv_seg.
|
| 221 |
+
num_classes (int): Number of classes.
|
| 222 |
+
dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
|
| 223 |
+
conv_cfg (dict|None): Config of conv layers. Default: None.
|
| 224 |
+
norm_cfg (dict|None): Config of norm layers. Default: None.
|
| 225 |
+
act_cfg (dict): Config of activation layers.
|
| 226 |
+
Default: dict(type='ReLU')
|
| 227 |
+
in_index (int|Sequence[int]): Input feature index. Default: -1
|
| 228 |
+
input_transform (str|None): Transformation type of input features.
|
| 229 |
+
Options: 'resize_concat', 'multiple_select', None.
|
| 230 |
+
'resize_concat': Multiple feature maps will be resize to the
|
| 231 |
+
same size as first one and than concat together.
|
| 232 |
+
Usually used in FCN head of HRNet.
|
| 233 |
+
'multiple_select': Multiple feature maps will be bundle into
|
| 234 |
+
a list and passed into decode head.
|
| 235 |
+
None: Only one select feature map is allowed.
|
| 236 |
+
Default: None.
|
| 237 |
+
loss_decode (dict): Config of decode loss.
|
| 238 |
+
Default: dict(type='CrossEntropyLoss').
|
| 239 |
+
ignore_index (int | None): The label index to be ignored. When using
|
| 240 |
+
masked BCE loss, ignore_index should be set to None. Default: 255
|
| 241 |
+
sampler (dict|None): The config of segmentation map sampler.
|
| 242 |
+
Default: None.
|
| 243 |
+
align_corners (bool): align_corners argument of F.interpolate.
|
| 244 |
+
Default: False.
|
| 245 |
+
"""
|
| 246 |
+
|
| 247 |
+
def __init__(self,
|
| 248 |
+
in_channels,
|
| 249 |
+
channels,
|
| 250 |
+
*,
|
| 251 |
+
num_classes,
|
| 252 |
+
dropout_ratio=0.1,
|
| 253 |
+
conv_cfg=None,
|
| 254 |
+
norm_cfg=None,
|
| 255 |
+
act_cfg=dict(type='ReLU'),
|
| 256 |
+
in_index=-1,
|
| 257 |
+
input_transform=None,
|
| 258 |
+
loss_decode=dict(
|
| 259 |
+
type='CrossEntropyLoss',
|
| 260 |
+
use_sigmoid=False,
|
| 261 |
+
loss_weight=1.0),
|
| 262 |
+
decoder_params=None,
|
| 263 |
+
ignore_index=255,
|
| 264 |
+
sampler=None,
|
| 265 |
+
align_corners=False,
|
| 266 |
+
num_clips=5):
|
| 267 |
+
super(BaseDecodeHead_clips, self).__init__()
|
| 268 |
+
self._init_inputs(in_channels, in_index, input_transform)
|
| 269 |
+
self.channels = channels
|
| 270 |
+
self.num_classes = num_classes
|
| 271 |
+
self.dropout_ratio = dropout_ratio
|
| 272 |
+
self.conv_cfg = conv_cfg
|
| 273 |
+
self.norm_cfg = norm_cfg
|
| 274 |
+
self.act_cfg = act_cfg
|
| 275 |
+
self.in_index = in_index
|
| 276 |
+
self.ignore_index = ignore_index
|
| 277 |
+
self.align_corners = align_corners
|
| 278 |
+
self.num_clips=num_clips
|
| 279 |
+
|
| 280 |
+
if sampler is not None:
|
| 281 |
+
self.sampler = build_pixel_sampler(sampler, context=self)
|
| 282 |
+
else:
|
| 283 |
+
self.sampler = None
|
| 284 |
+
|
| 285 |
+
self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
|
| 286 |
+
if dropout_ratio > 0:
|
| 287 |
+
self.dropout = nn.Dropout2d(dropout_ratio)
|
| 288 |
+
else:
|
| 289 |
+
self.dropout = None
|
| 290 |
+
self.fp16_enabled = False
|
| 291 |
+
|
| 292 |
+
def extra_repr(self):
|
| 293 |
+
"""Extra repr."""
|
| 294 |
+
s = f'input_transform={self.input_transform}, ' \
|
| 295 |
+
f'ignore_index={self.ignore_index}, ' \
|
| 296 |
+
f'align_corners={self.align_corners}'
|
| 297 |
+
return s
|
| 298 |
+
|
| 299 |
+
def _init_inputs(self, in_channels, in_index, input_transform):
|
| 300 |
+
"""Check and initialize input transforms.
|
| 301 |
+
|
| 302 |
+
The in_channels, in_index and input_transform must match.
|
| 303 |
+
Specifically, when input_transform is None, only single feature map
|
| 304 |
+
will be selected. So in_channels and in_index must be of type int.
|
| 305 |
+
When input_transform
|
| 306 |
+
|
| 307 |
+
Args:
|
| 308 |
+
in_channels (int|Sequence[int]): Input channels.
|
| 309 |
+
in_index (int|Sequence[int]): Input feature index.
|
| 310 |
+
input_transform (str|None): Transformation type of input features.
|
| 311 |
+
Options: 'resize_concat', 'multiple_select', None.
|
| 312 |
+
'resize_concat': Multiple feature maps will be resize to the
|
| 313 |
+
same size as first one and than concat together.
|
| 314 |
+
Usually used in FCN head of HRNet.
|
| 315 |
+
'multiple_select': Multiple feature maps will be bundle into
|
| 316 |
+
a list and passed into decode head.
|
| 317 |
+
None: Only one select feature map is allowed.
|
| 318 |
+
"""
|
| 319 |
+
|
| 320 |
+
if input_transform is not None:
|
| 321 |
+
assert input_transform in ['resize_concat', 'multiple_select']
|
| 322 |
+
self.input_transform = input_transform
|
| 323 |
+
self.in_index = in_index
|
| 324 |
+
if input_transform is not None:
|
| 325 |
+
assert isinstance(in_channels, (list, tuple))
|
| 326 |
+
assert isinstance(in_index, (list, tuple))
|
| 327 |
+
assert len(in_channels) == len(in_index)
|
| 328 |
+
if input_transform == 'resize_concat':
|
| 329 |
+
self.in_channels = sum(in_channels)
|
| 330 |
+
else:
|
| 331 |
+
self.in_channels = in_channels
|
| 332 |
+
else:
|
| 333 |
+
assert isinstance(in_channels, int)
|
| 334 |
+
assert isinstance(in_index, int)
|
| 335 |
+
self.in_channels = in_channels
|
| 336 |
+
|
| 337 |
+
def init_weights(self):
|
| 338 |
+
"""Initialize weights of classification layer."""
|
| 339 |
+
normal_init(self.conv_seg, mean=0, std=0.01)
|
| 340 |
+
|
| 341 |
+
def _transform_inputs(self, inputs):
|
| 342 |
+
"""Transform inputs for decoder.
|
| 343 |
+
|
| 344 |
+
Args:
|
| 345 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 346 |
+
|
| 347 |
+
Returns:
|
| 348 |
+
Tensor: The transformed inputs
|
| 349 |
+
"""
|
| 350 |
+
|
| 351 |
+
if self.input_transform == 'resize_concat':
|
| 352 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 353 |
+
upsampled_inputs = [
|
| 354 |
+
resize(
|
| 355 |
+
input=x,
|
| 356 |
+
size=inputs[0].shape[2:],
|
| 357 |
+
mode='bilinear',
|
| 358 |
+
align_corners=self.align_corners) for x in inputs
|
| 359 |
+
]
|
| 360 |
+
inputs = torch.cat(upsampled_inputs, dim=1)
|
| 361 |
+
elif self.input_transform == 'multiple_select':
|
| 362 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 363 |
+
else:
|
| 364 |
+
inputs = inputs[self.in_index]
|
| 365 |
+
|
| 366 |
+
return inputs
|
| 367 |
+
|
| 368 |
+
# @auto_fp16()
|
| 369 |
+
@abstractmethod
|
| 370 |
+
def forward(self, inputs):
|
| 371 |
+
"""Placeholder of forward function."""
|
| 372 |
+
pass
|
| 373 |
+
|
| 374 |
+
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg,batch_size, num_clips):
|
| 375 |
+
"""Forward function for training.
|
| 376 |
+
Args:
|
| 377 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 378 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 379 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 380 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 381 |
+
For details on the values of these keys see
|
| 382 |
+
`mmseg/datasets/pipelines/formatting.py:Collect`.
|
| 383 |
+
gt_semantic_seg (Tensor): Semantic segmentation masks
|
| 384 |
+
used if the architecture supports semantic segmentation task.
|
| 385 |
+
train_cfg (dict): The training config.
|
| 386 |
+
|
| 387 |
+
Returns:
|
| 388 |
+
dict[str, Tensor]: a dictionary of loss components
|
| 389 |
+
"""
|
| 390 |
+
seg_logits = self.forward(inputs,batch_size, num_clips)
|
| 391 |
+
losses = self.losses(seg_logits, gt_semantic_seg)
|
| 392 |
+
return losses
|
| 393 |
+
|
| 394 |
+
def forward_test(self, inputs, img_metas, test_cfg, batch_size, num_clips):
|
| 395 |
+
"""Forward function for testing.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 399 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 400 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 401 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 402 |
+
For details on the values of these keys see
|
| 403 |
+
`mmseg/datasets/pipelines/formatting.py:Collect`.
|
| 404 |
+
test_cfg (dict): The testing config.
|
| 405 |
+
|
| 406 |
+
Returns:
|
| 407 |
+
Tensor: Output segmentation map.
|
| 408 |
+
"""
|
| 409 |
+
return self.forward(inputs, batch_size, num_clips)
|
| 410 |
+
|
| 411 |
+
def cls_seg(self, feat):
|
| 412 |
+
"""Classify each pixel."""
|
| 413 |
+
if self.dropout is not None:
|
| 414 |
+
feat = self.dropout(feat)
|
| 415 |
+
output = self.conv_seg(feat)
|
| 416 |
+
return output
|
| 417 |
+
|
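The clips head above is abstract: a concrete subclass must implement forward() so that it accepts the extra (batch_size, num_clips) arguments that forward_train/forward_test thread through. A minimal sketch of how such a subclass is driven; ToyClipsHead, the channel sizes, and the mean-pooling over clips are illustrative assumptions, not the repo's actual head:

import torch
import torch.nn as nn

class ToyClipsHead(BaseDecodeHead_clips):
    # Hypothetical example head: reduce channels, average the clip axis, classify.
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.squeeze = nn.Conv2d(self.in_channels, self.channels, kernel_size=1)

    def forward(self, inputs, batch_size, num_clips):
        x = self._transform_inputs(inputs)                             # pick/concat feature levels
        x = self.squeeze(x)                                            # in_channels -> channels
        x = x.view(batch_size, num_clips, *x.shape[1:]).mean(dim=1)    # fuse the clip axis
        return self.cls_seg(x)                                         # per-pixel class logits

head = ToyClipsHead(in_channels=64, channels=32, num_classes=2, num_clips=4)
feats = torch.randn(2 * 4, 64, 32, 32)                                 # (B*T, C, H, W) backbone features
logits = head.forward_test([feats], img_metas=None, test_cfg=None, batch_size=2, num_clips=4)
print(logits.shape)                                                    # torch.Size([2, 2, 32, 32])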
class BaseDecodeHead_clips_flow(nn.Module, metaclass=ABCMeta):
    """Base class for BaseDecodeHead_clips_flow.

    Args:
        in_channels (int|Sequence[int]): Input channels.
        channels (int): Channels after modules, before conv_seg.
        num_classes (int): Number of classes.
        dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
        conv_cfg (dict|None): Config of conv layers. Default: None.
        norm_cfg (dict|None): Config of norm layers. Default: None.
        act_cfg (dict): Config of activation layers.
            Default: dict(type='ReLU')
        in_index (int|Sequence[int]): Input feature index. Default: -1
        input_transform (str|None): Transformation type of input features.
            Options: 'resize_concat', 'multiple_select', None.
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed into decode head.
            None: Only one select feature map is allowed.
            Default: None.
        loss_decode (dict): Config of decode loss.
            Default: dict(type='CrossEntropyLoss').
        ignore_index (int | None): The label index to be ignored. When using
            masked BCE loss, ignore_index should be set to None. Default: 255
        sampler (dict|None): The config of segmentation map sampler.
            Default: None.
        align_corners (bool): align_corners argument of F.interpolate.
            Default: False.
    """

    def __init__(self,
                 in_channels,
                 channels,
                 *,
                 num_classes,
                 dropout_ratio=0.1,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 in_index=-1,
                 input_transform=None,
                 loss_decode=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0),
                 decoder_params=None,
                 ignore_index=255,
                 sampler=None,
                 align_corners=False,
                 num_clips=5):
        super(BaseDecodeHead_clips_flow, self).__init__()
        self._init_inputs(in_channels, in_index, input_transform)
        self.channels = channels
        self.num_classes = num_classes
        self.dropout_ratio = dropout_ratio
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.in_index = in_index
        self.ignore_index = ignore_index
        self.align_corners = align_corners
        self.num_clips = num_clips

        if sampler is not None:
            self.sampler = build_pixel_sampler(sampler, context=self)
        else:
            self.sampler = None

        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
        if dropout_ratio > 0:
            self.dropout = nn.Dropout2d(dropout_ratio)
        else:
            self.dropout = None
        self.fp16_enabled = False

    def extra_repr(self):
        """Extra repr."""
        s = f'input_transform={self.input_transform}, ' \
            f'ignore_index={self.ignore_index}, ' \
            f'align_corners={self.align_corners}'
        return s

    def _init_inputs(self, in_channels, in_index, input_transform):
        """Check and initialize input transforms.

        The in_channels, in_index and input_transform must match.
        Specifically, when input_transform is None, only a single feature map
        will be selected. So in_channels and in_index must be of type int.

        Args:
            in_channels (int|Sequence[int]): Input channels.
            in_index (int|Sequence[int]): Input feature index.
            input_transform (str|None): Transformation type of input features.
                Options: 'resize_concat', 'multiple_select', None.
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed into decode head.
                None: Only one select feature map is allowed.
        """

        if input_transform is not None:
            assert input_transform in ['resize_concat', 'multiple_select']
        self.input_transform = input_transform
        self.in_index = in_index
        if input_transform is not None:
            assert isinstance(in_channels, (list, tuple))
            assert isinstance(in_index, (list, tuple))
            assert len(in_channels) == len(in_index)
            if input_transform == 'resize_concat':
                self.in_channels = sum(in_channels)
            else:
                self.in_channels = in_channels
        else:
            assert isinstance(in_channels, int)
            assert isinstance(in_index, int)
            self.in_channels = in_channels

    def init_weights(self):
        """Initialize weights of classification layer."""
        normal_init(self.conv_seg, mean=0, std=0.01)

    def _transform_inputs(self, inputs):
        """Transform inputs for decoder.

        Args:
            inputs (list[Tensor]): List of multi-level img features.

        Returns:
            Tensor: The transformed inputs
        """

        if self.input_transform == 'resize_concat':
            inputs = [inputs[i] for i in self.in_index]
            upsampled_inputs = [
                resize(
                    input=x,
                    size=inputs[0].shape[2:],
                    mode='bilinear',
                    align_corners=self.align_corners) for x in inputs
            ]
            inputs = torch.cat(upsampled_inputs, dim=1)
        elif self.input_transform == 'multiple_select':
            inputs = [inputs[i] for i in self.in_index]
        else:
            inputs = inputs[self.in_index]

        return inputs

    # @auto_fp16()
    @abstractmethod
    def forward(self, inputs):
        """Placeholder of forward function."""
        pass

    def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg, batch_size, num_clips, img=None):
        """Forward function for training.
        Args:
            inputs (list[Tensor]): List of multi-level img features.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmseg/datasets/pipelines/formatting.py:Collect`.
            gt_semantic_seg (Tensor): Semantic segmentation masks
                used if the architecture supports semantic segmentation task.
            train_cfg (dict): The training config.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        seg_logits = self.forward(inputs, batch_size, num_clips, img)
        losses = self.losses(seg_logits, gt_semantic_seg)
        return losses

    def forward_test(self, inputs, img_metas, test_cfg, batch_size=None, num_clips=None, img=None):
        """Forward function for testing.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmseg/datasets/pipelines/formatting.py:Collect`.
            test_cfg (dict): The testing config.

        Returns:
            Tensor: Output segmentation map.
        """
        return self.forward(inputs, batch_size, num_clips, img)

    def cls_seg(self, feat):
        """Classify each pixel."""
        if self.dropout is not None:
            feat = self.dropout(feat)
        output = self.conv_seg(feat)
        return output
models/SpaTrackV2/models/depth_refiner/depth_refiner.py
ADDED
@@ -0,0 +1,115 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.monoD.depth_anything_v2.dinov2_layers.patch_embed import PatchEmbed
from models.SpaTrackV2.models.depth_refiner.backbone import mit_b3
from models.SpaTrackV2.models.depth_refiner.stablizer import Stabilization_Network_Cross_Attention
from einops import rearrange

class TrackStablizer(nn.Module):
    def __init__(self):
        super().__init__()

        self.backbone = mit_b3()

        old_conv = self.backbone.patch_embed1.proj
        new_conv = nn.Conv2d(old_conv.in_channels + 4, old_conv.out_channels,
                             kernel_size=old_conv.kernel_size, stride=old_conv.stride, padding=old_conv.padding)

        new_conv.weight[:, :3, :, :].data.copy_(old_conv.weight.clone())
        self.backbone.patch_embed1.proj = new_conv

        self.Track_Stabilizer = Stabilization_Network_Cross_Attention(in_channels=[64, 128, 320, 512],
                                                                      in_index=[0, 1, 2, 3],
                                                                      feature_strides=[4, 8, 16, 32],
                                                                      channels=128,
                                                                      dropout_ratio=0.1,
                                                                      num_classes=1,
                                                                      align_corners=False,
                                                                      decoder_params=dict(embed_dim=256, depths=4),
                                                                      num_clips=16,
                                                                      norm_cfg=dict(type='SyncBN', requires_grad=True))

        self.edge_conv = nn.Sequential(nn.Conv2d(in_channels=4, out_channels=64, kernel_size=3, padding=1, stride=1, bias=True),
                                       nn.ReLU(inplace=True))
        self.edge_conv1 = nn.Sequential(nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1, stride=2, bias=True),
                                        nn.ReLU(inplace=True))
        self.success = False
        self.x = None

    def buffer_forward(self, inputs, num_clips=16):
        """
        buffer forward for getting the pointmap and image features
        """
        B, T, C, H, W = inputs.shape
        self.x = self.backbone(inputs)
        scale, shift = self.Track_Stabilizer.buffer_forward(self.x, num_clips=num_clips)
        self.success = True
        return scale, shift

    def forward(self, inputs, tracks, tracks_uvd, num_clips=16, imgs=None, vis_track=None):

        """
        Args:
            inputs: [B, T, C, H, W], RGB + PointMap + Mask
            tracks: [B, T, N, 4], 3D tracks in camera coordinate + visibility
            num_clips: int, number of clips to use
        """
        B, T, C, H, W = inputs.shape
        edge_feat = self.edge_conv(inputs.view(B*T, 4, H, W))
        edge_feat1 = self.edge_conv1(edge_feat)

        if not self.success:
            scale, shift = self.Track_Stabilizer.buffer_forward(self.x, num_clips=num_clips)
            self.success = True
            update = self.Track_Stabilizer(self.x, edge_feat, edge_feat1, tracks, tracks_uvd,
                                           num_clips=num_clips, imgs=imgs, vis_track=vis_track)
        else:
            update = self.Track_Stabilizer(self.x, edge_feat, edge_feat1, tracks, tracks_uvd,
                                           num_clips=num_clips, imgs=imgs, vis_track=vis_track)

        return update

    def reset_success(self):
        self.success = False
        self.x = None
        self.Track_Stabilizer.reset_success()


if __name__ == "__main__":
    # Create test input tensors
    batch_size = 1
    seq_len = 16
    channels = 7  # 3 for RGB + 3 for PointMap + 1 for Mask
    height = 384
    width = 512

    # Create random input tensor with shape [B, T, C, H, W]
    inputs = torch.randn(batch_size, seq_len, channels, height, width)

    # Create random tracks
    tracks = torch.randn(batch_size, seq_len, 1024, 4)

    # Create random test images
    test_imgs = torch.randn(batch_size, seq_len, 3, height, width)

    # Initialize model and move to GPU
    model = TrackStablizer().cuda()

    # Move inputs to GPU and run forward pass
    inputs = inputs.cuda()
    tracks = tracks.cuda()
    outputs = model.buffer_forward(inputs, num_clips=seq_len)
    import time
    start_time = time.time()
    outputs = model(inputs, tracks, num_clips=seq_len)
    end_time = time.time()
    print(f"Time taken: {end_time - start_time} seconds")
    import pdb; pdb.set_trace()
    # # Print shapes for verification
    # print(f"Input shape: {inputs.shape}")
    # print(f"Output shape: {outputs.shape}")

    # # Basic tests
    # assert outputs.shape[0] == batch_size, "Batch size mismatch"
    # assert len(outputs.shape) == 4, "Output should be 4D: [B,C,H,W]"
    # assert torch.all(outputs >= 0), "Output should be non-negative after ReLU"

    # print("All tests passed!")
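The patch-embed surgery at the top of TrackStablizer (widening the first conv from 3 to 3+4 input channels and copying over the pretrained RGB filters) is a standard channel-inflation trick. Below is a self-contained sketch of the idea with assumed kernel/stride values rather than the mit_b3 ones; unlike the repo code above, it also zeroes the new channels and copies the bias, which is one common choice for keeping the initial outputs unchanged:

import torch
import torch.nn as nn

old = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)      # stand-in for a pretrained patch embed
new = nn.Conv2d(3 + 4, 64, kernel_size=7, stride=4, padding=3)  # RGB + PointMap/Mask channels
with torch.no_grad():
    new.weight[:, :3].copy_(old.weight)   # keep the pretrained RGB filters
    new.weight[:, 3:].zero_()             # extra channels start at zero (a choice, not what the repo does)
    new.bias.copy_(old.bias)

x = torch.randn(1, 7, 64, 64)             # 7-channel input: RGB + pointmap + mask
print(new(x).shape)                        # torch.Size([1, 64, 16, 16])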
models/SpaTrackV2/models/depth_refiner/network.py
ADDED
@@ -0,0 +1,429 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Author: Ke Xian
Email: kexian@hust.edu.cn
Date: 2020/07/20
'''

import torch
import torch.nn as nn
import torch.nn.init as init

# ==============================================================================================================

class FTB(nn.Module):
    def __init__(self, inchannels, midchannels=512):
        super(FTB, self).__init__()
        self.in1 = inchannels
        self.mid = midchannels

        self.conv1 = nn.Conv2d(in_channels=self.in1, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True)
        self.conv_branch = nn.Sequential(nn.ReLU(inplace=True),
                                         nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True),
                                         # nn.BatchNorm2d(num_features=self.mid),
                                         nn.ReLU(inplace=True),
                                         nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True))
        self.relu = nn.ReLU(inplace=True)

        self.init_params()

    def forward(self, x):
        x = self.conv1(x)
        x = x + self.conv_branch(x)
        x = self.relu(x)

        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

class ATA(nn.Module):
    def __init__(self, inchannels, reduction=8):
        super(ATA, self).__init__()
        self.inchannels = inchannels
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(nn.Linear(self.inchannels*2, self.inchannels // reduction),
                                nn.ReLU(inplace=True),
                                nn.Linear(self.inchannels // reduction, self.inchannels),
                                nn.Sigmoid())
        self.init_params()

    def forward(self, low_x, high_x):
        n, c, _, _ = low_x.size()
        x = torch.cat([low_x, high_x], 1)
        x = self.avg_pool(x)
        x = x.view(n, -1)
        x = self.fc(x).view(n, c, 1, 1)
        x = low_x * x + high_x

        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                # init.normal(m.weight, std=0.01)
                init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                # init.normal_(m.weight, std=0.01)
                init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)


class FFM(nn.Module):
    def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
        super(FFM, self).__init__()
        self.inchannels = inchannels
        self.midchannels = midchannels
        self.outchannels = outchannels
        self.upfactor = upfactor

        self.ftb1 = FTB(inchannels=self.inchannels, midchannels=self.midchannels)
        self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)

        self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)

        self.init_params()
        # self.p1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
        # self.p2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
        # self.p3 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)

    def forward(self, low_x, high_x):
        x = self.ftb1(low_x)

        '''
        x = torch.cat((x,high_x),1)
        if x.shape[2] == 12:
            x = self.p1(x)
        elif x.shape[2] == 24:
            x = self.p2(x)
        elif x.shape[2] == 48:
            x = self.p3(x)
        '''
        x = x + high_x  ### high_x
        x = self.ftb2(x)
        x = self.upsample(x)

        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)



class noFFM(nn.Module):
    def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
        super(noFFM, self).__init__()
        self.inchannels = inchannels
        self.midchannels = midchannels
        self.outchannels = outchannels
        self.upfactor = upfactor

        self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)

        self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)

        self.init_params()
        # self.p1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
        # self.p2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
        # self.p3 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)

    def forward(self, low_x, high_x):

        # x = self.ftb1(low_x)
        x = high_x  ### high_x
        x = self.ftb2(x)
        x = self.upsample(x)

        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)




class AO(nn.Module):
    # Adaptive output module
    def __init__(self, inchannels, outchannels, upfactor=2):
        super(AO, self).__init__()
        self.inchannels = inchannels
        self.outchannels = outchannels
        self.upfactor = upfactor

        """
        self.adapt_conv = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels//2, kernel_size=3, padding=1, stride=1, bias=True),
                                        nn.BatchNorm2d(num_features=self.inchannels//2),
                                        nn.ReLU(inplace=True),
                                        nn.Conv2d(in_channels=self.inchannels//2, out_channels=self.outchannels, kernel_size=3, padding=1, stride=1, bias=True),
                                        nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True))  # ,
                                        # nn.ReLU(inplace=True))  ## get positive values
        """
        self.adapt_conv = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels//2, kernel_size=3, padding=1, stride=1, bias=True),
                                        # nn.BatchNorm2d(num_features=self.inchannels//2),
                                        nn.ReLU(inplace=True),
                                        nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True),
                                        nn.Conv2d(in_channels=self.inchannels//2, out_channels=self.outchannels, kernel_size=1, padding=0, stride=1))

        # nn.ReLU(inplace=True))  ## get positive values

        self.init_params()

    def forward(self, x):
        x = self.adapt_conv(x)
        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

class ASPP(nn.Module):
    def __init__(self, inchannels=256, planes=128, rates=[1, 6, 12, 18]):
        super(ASPP, self).__init__()
        self.inchannels = inchannels
        self.planes = planes
        self.rates = rates
        self.kernel_sizes = []
        self.paddings = []
        for rate in self.rates:
            if rate == 1:
                self.kernel_sizes.append(1)
                self.paddings.append(0)
            else:
                self.kernel_sizes.append(3)
                self.paddings.append(rate)
        self.atrous_0 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[0],
                                                stride=1, padding=self.paddings[0], dilation=self.rates[0], bias=True),
                                      nn.ReLU(inplace=True),
                                      nn.BatchNorm2d(num_features=self.planes)
                                      )
        self.atrous_1 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[1],
                                                stride=1, padding=self.paddings[1], dilation=self.rates[1], bias=True),
                                      nn.ReLU(inplace=True),
                                      nn.BatchNorm2d(num_features=self.planes),
                                      )
        self.atrous_2 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[2],
                                                stride=1, padding=self.paddings[2], dilation=self.rates[2], bias=True),
                                      nn.ReLU(inplace=True),
                                      nn.BatchNorm2d(num_features=self.planes),
                                      )
        self.atrous_3 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[3],
                                                stride=1, padding=self.paddings[3], dilation=self.rates[3], bias=True),
                                      nn.ReLU(inplace=True),
                                      nn.BatchNorm2d(num_features=self.planes),
                                      )

        # self.conv = nn.Conv2d(in_channels=self.planes * 4, out_channels=self.inchannels, kernel_size=3, padding=1, stride=1, bias=True)
    def forward(self, x):
        x = torch.cat([self.atrous_0(x), self.atrous_1(x), self.atrous_2(x), self.atrous_3(x)], 1)
        # x = self.conv(x)

        return x

# ==============================================================================================================


class ResidualConv(nn.Module):
    def __init__(self, inchannels):
        super(ResidualConv, self).__init__()
        # nn.BatchNorm2d
        self.conv = nn.Sequential(
            # nn.BatchNorm2d(num_features=inchannels),
            nn.ReLU(inplace=False),
            # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=3, padding=1, stride=1, groups=inchannels, bias=True),
            # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
            nn.Conv2d(in_channels=inchannels, out_channels=inchannels//2, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(num_features=inchannels//2),
            nn.ReLU(inplace=False),
            nn.Conv2d(in_channels=inchannels//2, out_channels=inchannels, kernel_size=3, padding=1, stride=1, bias=False)
        )
        self.init_params()

    def forward(self, x):
        x = self.conv(x) + x
        return x

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)


class FeatureFusion(nn.Module):
    def __init__(self, inchannels, outchannels):
        super(FeatureFusion, self).__init__()
        self.conv = ResidualConv(inchannels=inchannels)
        # nn.BatchNorm2d
        self.up = nn.Sequential(ResidualConv(inchannels=inchannels),
                                nn.ConvTranspose2d(in_channels=inchannels, out_channels=outchannels, kernel_size=3, stride=2, padding=1, output_padding=1),
                                nn.BatchNorm2d(num_features=outchannels),
                                nn.ReLU(inplace=True))

    def forward(self, lowfeat, highfeat):
        return self.up(highfeat + self.conv(lowfeat))

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # init.kaiming_normal_(m.weight, mode='fan_out')
                init.normal_(m.weight, std=0.01)
                # init.xavier_normal_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):  # nn.BatchNorm2d
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    init.constant_(m.bias, 0)


class SenceUnderstand(nn.Module):
    def __init__(self, channels):
        super(SenceUnderstand, self).__init__()
        self.channels = channels
        self.conv1 = nn.Sequential(nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
                                   nn.ReLU(inplace=True))
        self.pool = nn.AdaptiveAvgPool2d(8)
        self.fc = nn.Sequential(nn.Linear(512*8*8, self.channels),
                                nn.ReLU(inplace=True))
        self.conv2 = nn.Sequential(nn.Conv2d(in_channels=self.channels, out_channels=self.channels, kernel_size=1, padding=0),
                                   nn.ReLU(inplace=True))
        self.initial_params()

    def forward(self, x):
        n, c, h, w = x.size()
        x = self.conv1(x)
        x = self.pool(x)
        x = x.view(n, -1)
        x = self.fc(x)
        x = x.view(n, self.channels, 1, 1)
        x = self.conv2(x)
        x = x.repeat(1, 1, h, w)
        return x

    def initial_params(self, dev=0.01):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # print torch.sum(m.weight)
                m.weight.data.normal_(0, dev)
                if m.bias is not None:
                    m.bias.data.fill_(0)
            elif isinstance(m, nn.ConvTranspose2d):
                # print torch.sum(m.weight)
                m.weight.data.normal_(0, dev)
                if m.bias is not None:
                    m.bias.data.fill_(0)
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, dev)
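As a quick orientation for the decoder blocks defined above, here is a hedged shape walk-through of FFM and AO on dummy tensors; the channel sizes are illustrative assumptions, not the ones SpaTrackV2 actually configures:

import torch

ffm = FFM(inchannels=256, midchannels=128, outchannels=64, upfactor=2)
ao = AO(inchannels=64, outchannels=1, upfactor=2)

low = torch.randn(1, 256, 24, 32)    # lower-level feature (inchannels)
high = torch.randn(1, 128, 24, 32)   # running decoder state (midchannels)

fused = ffm(low, high)               # FTB(256->128), add high, FTB(128->64), x2 upsample
print(fused.shape)                    # torch.Size([1, 64, 48, 64])

depth = ao(fused)                     # conv 64->32, ReLU, x2 upsample, 1x1 conv -> 1 channel
print(depth.shape)                    # torch.Size([1, 1, 96, 128])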
models/SpaTrackV2/models/depth_refiner/stablilization_attention.py
ADDED
@@ -0,0 +1,1187 @@
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.layers import DropPath, to_2tuple, trunc_normal_
from einops import rearrange

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows

def window_partition_noreshape(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (B, num_windows_h, num_windows_w, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
    return windows

def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
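window_partition and window_reverse are exact inverses as long as H and W are divisible by window_size; a small sanity check with arbitrarily chosen shapes:

import torch

x = torch.randn(2, 8, 8, 96)                      # (B, H, W, C)
win = window_partition(x, window_size=4)          # (B * num_windows, 4, 4, 96) == (8, 4, 4, 96)
y = window_reverse(win, window_size=4, H=8, W=8)  # back to (2, 8, 8, 96)
assert torch.equal(x, y)                          # the round trip is lossless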
| 72 |
+
|
| 73 |
+
def get_roll_masks(H, W, window_size, shift_size):
|
| 74 |
+
#####################################
|
| 75 |
+
# move to top-left
|
| 76 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 77 |
+
h_slices = (slice(0, H-window_size),
|
| 78 |
+
slice(H-window_size, H-shift_size),
|
| 79 |
+
slice(H-shift_size, H))
|
| 80 |
+
w_slices = (slice(0, W-window_size),
|
| 81 |
+
slice(W-window_size, W-shift_size),
|
| 82 |
+
slice(W-shift_size, W))
|
| 83 |
+
cnt = 0
|
| 84 |
+
for h in h_slices:
|
| 85 |
+
for w in w_slices:
|
| 86 |
+
img_mask[:, h, w, :] = cnt
|
| 87 |
+
cnt += 1
|
| 88 |
+
|
| 89 |
+
mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
|
| 90 |
+
mask_windows = mask_windows.view(-1, window_size * window_size)
|
| 91 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 92 |
+
attn_mask_tl = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
| 93 |
+
|
| 94 |
+
####################################
|
| 95 |
+
# move to top right
|
| 96 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 97 |
+
h_slices = (slice(0, H-window_size),
|
| 98 |
+
slice(H-window_size, H-shift_size),
|
| 99 |
+
slice(H-shift_size, H))
|
| 100 |
+
w_slices = (slice(0, shift_size),
|
| 101 |
+
slice(shift_size, window_size),
|
| 102 |
+
slice(window_size, W))
|
| 103 |
+
cnt = 0
|
| 104 |
+
for h in h_slices:
|
| 105 |
+
for w in w_slices:
|
| 106 |
+
img_mask[:, h, w, :] = cnt
|
| 107 |
+
cnt += 1
|
| 108 |
+
|
| 109 |
+
mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
|
| 110 |
+
mask_windows = mask_windows.view(-1, window_size * window_size)
|
| 111 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 112 |
+
attn_mask_tr = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
| 113 |
+
|
| 114 |
+
####################################
|
| 115 |
+
# move to bottom left
|
| 116 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 117 |
+
h_slices = (slice(0, shift_size),
|
| 118 |
+
slice(shift_size, window_size),
|
| 119 |
+
slice(window_size, H))
|
| 120 |
+
w_slices = (slice(0, W-window_size),
|
| 121 |
+
slice(W-window_size, W-shift_size),
|
| 122 |
+
slice(W-shift_size, W))
|
| 123 |
+
cnt = 0
|
| 124 |
+
for h in h_slices:
|
| 125 |
+
for w in w_slices:
|
| 126 |
+
img_mask[:, h, w, :] = cnt
|
| 127 |
+
cnt += 1
|
| 128 |
+
|
| 129 |
+
mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
|
| 130 |
+
mask_windows = mask_windows.view(-1, window_size * window_size)
|
| 131 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 132 |
+
attn_mask_bl = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
| 133 |
+
|
| 134 |
+
####################################
|
| 135 |
+
# move to bottom right
|
| 136 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 137 |
+
h_slices = (slice(0, shift_size),
|
| 138 |
+
slice(shift_size, window_size),
|
| 139 |
+
slice(window_size, H))
|
| 140 |
+
w_slices = (slice(0, shift_size),
|
| 141 |
+
slice(shift_size, window_size),
|
| 142 |
+
slice(window_size, W))
|
| 143 |
+
cnt = 0
|
| 144 |
+
for h in h_slices:
|
| 145 |
+
for w in w_slices:
|
| 146 |
+
img_mask[:, h, w, :] = cnt
|
| 147 |
+
cnt += 1
|
| 148 |
+
|
| 149 |
+
mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
|
| 150 |
+
mask_windows = mask_windows.view(-1, window_size * window_size)
|
| 151 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 152 |
+
attn_mask_br = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
| 153 |
+
|
| 154 |
+
# append all
|
| 155 |
+
attn_mask_all = torch.cat((attn_mask_tl, attn_mask_tr, attn_mask_bl, attn_mask_br), -1)
|
| 156 |
+
return attn_mask_all
|
| 157 |
+
|
def get_relative_position_index(q_windows, k_windows):
    """
    Args:
        q_windows: tuple (query_window_height, query_window_width)
        k_windows: tuple (key_window_height, key_window_width)

    Returns:
        relative_position_index: query_window_height*query_window_width, key_window_height*key_window_width
    """
    # get pair-wise relative position index for each token inside the window
    coords_h_q = torch.arange(q_windows[0])
    coords_w_q = torch.arange(q_windows[1])
    coords_q = torch.stack(torch.meshgrid([coords_h_q, coords_w_q]))  # 2, Wh_q, Ww_q

    coords_h_k = torch.arange(k_windows[0])
    coords_w_k = torch.arange(k_windows[1])
    coords_k = torch.stack(torch.meshgrid([coords_h_k, coords_w_k]))  # 2, Wh, Ww

    coords_flatten_q = torch.flatten(coords_q, 1)  # 2, Wh_q*Ww_q
    coords_flatten_k = torch.flatten(coords_k, 1)  # 2, Wh_k*Ww_k

    relative_coords = coords_flatten_q[:, :, None] - coords_flatten_k[:, None, :]  # 2, Wh_q*Ww_q, Wh_k*Ww_k
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh_q*Ww_q, Wh_k*Ww_k, 2
    relative_coords[:, :, 0] += k_windows[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += k_windows[1] - 1
    relative_coords[:, :, 0] *= (q_windows[1] + k_windows[1]) - 1
    relative_position_index = relative_coords.sum(-1)  # Wh_q*Ww_q, Wh_k*Ww_k
    return relative_position_index

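As a quick sanity check (not part of the commit), the helper above can be probed with illustrative window sizes; the printed values follow directly from the arithmetic in the function.

idx = get_relative_position_index((7, 7), (5, 5))
print(idx.shape)       # torch.Size([49, 25]): one row per query token, one column per key token
print(int(idx.max()))  # 120, so a bias table with (7 + 5 - 1) ** 2 = 121 entries per head covers every offset
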
def get_relative_position_index3d(q_windows, k_windows, num_clips):
    """
    Args:
        q_windows: tuple (query_window_height, query_window_width)
        k_windows: tuple (key_window_height, key_window_width)
        num_clips: number of clips stacked along the temporal axis

    Returns:
        relative_position_index: num_clips*query_window_height*query_window_width, num_clips*key_window_height*key_window_width
    """
    # get pair-wise relative position index for each token inside the spatio-temporal window
    coords_d_q = torch.arange(num_clips)
    coords_h_q = torch.arange(q_windows[0])
    coords_w_q = torch.arange(q_windows[1])
    coords_q = torch.stack(torch.meshgrid([coords_d_q, coords_h_q, coords_w_q]))  # 3, Wd, Wh_q, Ww_q

    coords_d_k = torch.arange(num_clips)
    coords_h_k = torch.arange(k_windows[0])
    coords_w_k = torch.arange(k_windows[1])
    coords_k = torch.stack(torch.meshgrid([coords_d_k, coords_h_k, coords_w_k]))  # 3, Wd, Wh_k, Ww_k

    coords_flatten_q = torch.flatten(coords_q, 1)  # 3, Wd*Wh_q*Ww_q
    coords_flatten_k = torch.flatten(coords_k, 1)  # 3, Wd*Wh_k*Ww_k

    relative_coords = coords_flatten_q[:, :, None] - coords_flatten_k[:, None, :]  # 3, Wd*Wh_q*Ww_q, Wd*Wh_k*Ww_k
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wd*Wh_q*Ww_q, Wd*Wh_k*Ww_k, 3
    relative_coords[:, :, 0] += num_clips - 1  # shift to start from 0
    relative_coords[:, :, 1] += k_windows[0] - 1
    relative_coords[:, :, 2] += k_windows[1] - 1
    relative_coords[:, :, 0] *= (q_windows[0] + k_windows[0] - 1) * (q_windows[1] + k_windows[1] - 1)
    relative_coords[:, :, 1] *= (q_windows[1] + k_windows[1] - 1)
    relative_position_index = relative_coords.sum(-1)  # Wd*Wh_q*Ww_q, Wd*Wh_k*Ww_k
    return relative_position_index


| 221 |
+
class WindowAttention3d3(nn.Module):
|
| 222 |
+
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
|
| 223 |
+
|
| 224 |
+
Args:
|
| 225 |
+
dim (int): Number of input channels.
|
| 226 |
+
expand_size (int): The expand size at focal level 1.
|
| 227 |
+
window_size (tuple[int]): The height and width of the window.
|
| 228 |
+
focal_window (int): Focal region size.
|
| 229 |
+
focal_level (int): Focal attention level.
|
| 230 |
+
num_heads (int): Number of attention heads.
|
| 231 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 232 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
| 233 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
| 234 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
| 235 |
+
pool_method (str): window pooling method. Default: none
|
| 236 |
+
"""
|
| 237 |
+
|
| 238 |
+
def __init__(self, dim, expand_size, window_size, focal_window, focal_level, num_heads,
|
| 239 |
+
qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., pool_method="none", focal_l_clips=[7,1,2], focal_kernel_clips=[7,5,3]):
|
| 240 |
+
|
| 241 |
+
super().__init__()
|
| 242 |
+
self.dim = dim
|
| 243 |
+
self.expand_size = expand_size
|
| 244 |
+
self.window_size = window_size # Wh, Ww
|
| 245 |
+
self.pool_method = pool_method
|
| 246 |
+
self.num_heads = num_heads
|
| 247 |
+
head_dim = dim // num_heads
|
| 248 |
+
self.scale = qk_scale or head_dim ** -0.5
|
| 249 |
+
self.focal_level = focal_level
|
| 250 |
+
self.focal_window = focal_window
|
| 251 |
+
|
| 252 |
+
# define a parameter table of relative position bias for each window
|
| 253 |
+
self.relative_position_bias_table = nn.Parameter(
|
| 254 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
|
| 255 |
+
|
| 256 |
+
# get pair-wise relative position index for each token inside the window
|
| 257 |
+
coords_h = torch.arange(self.window_size[0])
|
| 258 |
+
coords_w = torch.arange(self.window_size[1])
|
| 259 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
| 260 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
| 261 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
| 262 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
| 263 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
| 264 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
| 265 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
| 266 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
| 267 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
| 268 |
+
|
| 269 |
+
num_clips=4
|
| 270 |
+
# # define a parameter table of relative position bias
|
| 271 |
+
# self.relative_position_bias_table = nn.Parameter(
|
| 272 |
+
# torch.zeros((2 * num_clips - 1) * (2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
|
| 273 |
+
|
| 274 |
+
# # get pair-wise relative position index for each token inside the window
|
| 275 |
+
# coords_d = torch.arange(num_clips)
|
| 276 |
+
# coords_h = torch.arange(self.window_size[0])
|
| 277 |
+
# coords_w = torch.arange(self.window_size[1])
|
| 278 |
+
# coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w)) # 3, Wd, Wh, Ww
|
| 279 |
+
# coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww
|
| 280 |
+
# relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww
|
| 281 |
+
# relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3
|
| 282 |
+
# relative_coords[:, :, 0] += num_clips - 1 # shift to start from 0
|
| 283 |
+
# relative_coords[:, :, 1] += self.window_size[0] - 1
|
| 284 |
+
# relative_coords[:, :, 2] += self.window_size[1] - 1
|
| 285 |
+
|
| 286 |
+
# relative_coords[:, :, 0] *= (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)
|
| 287 |
+
# relative_coords[:, :, 1] *= (2 * self.window_size[1] - 1)
|
| 288 |
+
# relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww
|
| 289 |
+
# self.register_buffer("relative_position_index", relative_position_index)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
if self.expand_size > 0 and focal_level > 0:
|
| 293 |
+
# define a parameter table of position bias between window and its fine-grained surroundings
|
| 294 |
+
self.window_size_of_key = self.window_size[0] * self.window_size[1] if self.expand_size == 0 else \
|
| 295 |
+
(4 * self.window_size[0] * self.window_size[1] - 4 * (self.window_size[0] - self.expand_size) * (self.window_size[0] - self.expand_size))
|
| 296 |
+
self.relative_position_bias_table_to_neighbors = nn.Parameter(
|
| 297 |
+
torch.zeros(1, num_heads, self.window_size[0] * self.window_size[1], self.window_size_of_key)) # Wh*Ww, nH, nSurrounding
|
| 298 |
+
trunc_normal_(self.relative_position_bias_table_to_neighbors, std=.02)
|
| 299 |
+
|
| 300 |
+
# get mask for rolled k and rolled v
|
| 301 |
+
mask_tl = torch.ones(self.window_size[0], self.window_size[1]); mask_tl[:-self.expand_size, :-self.expand_size] = 0
|
| 302 |
+
mask_tr = torch.ones(self.window_size[0], self.window_size[1]); mask_tr[:-self.expand_size, self.expand_size:] = 0
|
| 303 |
+
mask_bl = torch.ones(self.window_size[0], self.window_size[1]); mask_bl[self.expand_size:, :-self.expand_size] = 0
|
| 304 |
+
mask_br = torch.ones(self.window_size[0], self.window_size[1]); mask_br[self.expand_size:, self.expand_size:] = 0
|
| 305 |
+
mask_rolled = torch.stack((mask_tl, mask_tr, mask_bl, mask_br), 0).flatten(0)
|
| 306 |
+
self.register_buffer("valid_ind_rolled", mask_rolled.nonzero().view(-1))
|
| 307 |
+
|
| 308 |
+
if pool_method != "none" and focal_level > 1:
|
| 309 |
+
#self.relative_position_bias_table_to_windows = nn.ParameterList()
|
| 310 |
+
#self.relative_position_bias_table_to_windows_clips = nn.ParameterList()
|
| 311 |
+
#self.register_parameter('relative_position_bias_table_to_windows',[])
|
| 312 |
+
#self.register_parameter('relative_position_bias_table_to_windows_clips',[])
|
| 313 |
+
self.unfolds = nn.ModuleList()
|
| 314 |
+
self.unfolds_clips=nn.ModuleList()
|
| 315 |
+
|
| 316 |
+
# build relative position bias between local patch and pooled windows
|
| 317 |
+
for k in range(focal_level-1):
|
| 318 |
+
stride = 2**k
|
| 319 |
+
kernel_size = 2*(self.focal_window // 2) + 2**k + (2**k-1)
|
| 320 |
+
# define unfolding operations
|
| 321 |
+
self.unfolds += [nn.Unfold(
|
| 322 |
+
kernel_size=(kernel_size, kernel_size),
|
| 323 |
+
stride=stride, padding=kernel_size // 2)
|
| 324 |
+
]
|
| 325 |
+
|
| 326 |
+
# define relative position bias table
|
| 327 |
+
relative_position_bias_table_to_windows = nn.Parameter(
|
| 328 |
+
torch.zeros(
|
| 329 |
+
self.num_heads,
|
| 330 |
+
(self.window_size[0] + self.focal_window + 2**k - 2) * (self.window_size[1] + self.focal_window + 2**k - 2),
|
| 331 |
+
)
|
| 332 |
+
)
|
| 333 |
+
trunc_normal_(relative_position_bias_table_to_windows, std=.02)
|
| 334 |
+
#self.relative_position_bias_table_to_windows.append(relative_position_bias_table_to_windows)
|
| 335 |
+
self.register_parameter('relative_position_bias_table_to_windows_{}'.format(k),relative_position_bias_table_to_windows)
|
| 336 |
+
|
| 337 |
+
# define relative position bias index
|
| 338 |
+
relative_position_index_k = get_relative_position_index(self.window_size, to_2tuple(self.focal_window + 2**k - 1))
|
| 339 |
+
# relative_position_index_k = get_relative_position_index3d(self.window_size, to_2tuple(self.focal_window + 2**k - 1), num_clips)
|
| 340 |
+
self.register_buffer("relative_position_index_{}".format(k), relative_position_index_k)
|
| 341 |
+
|
| 342 |
+
# define unfolding index for focal_level > 0
|
| 343 |
+
if k > 0:
|
| 344 |
+
mask = torch.zeros(kernel_size, kernel_size); mask[(2**k)-1:, (2**k)-1:] = 1
|
| 345 |
+
self.register_buffer("valid_ind_unfold_{}".format(k), mask.flatten(0).nonzero().view(-1))
|
| 346 |
+
|
| 347 |
+
for k in range(len(focal_l_clips)):
|
| 348 |
+
# kernel_size=focal_kernel_clips[k]
|
| 349 |
+
focal_l_big_flag=False
|
| 350 |
+
if focal_l_clips[k]>self.window_size[0]:
|
| 351 |
+
stride=1
|
| 352 |
+
padding=0
|
| 353 |
+
kernel_size=focal_kernel_clips[k]
|
| 354 |
+
kernel_size_true=kernel_size
|
| 355 |
+
focal_l_big_flag=True
|
| 356 |
+
# stride=math.ceil(self.window_size/focal_l_clips[k])
|
| 357 |
+
# padding=(kernel_size-stride)/2
|
| 358 |
+
else:
|
| 359 |
+
stride = focal_l_clips[k]
|
| 360 |
+
# kernel_size
|
| 361 |
+
# kernel_size = 2*(focal_kernel_clips[k]// 2) + 2**focal_l_clips[k] + (2**focal_l_clips[k]-1)
|
| 362 |
+
kernel_size = focal_kernel_clips[k]  ## kernel_size must be odd
|
| 363 |
+
assert kernel_size%2==1
|
| 364 |
+
padding=kernel_size // 2
|
| 365 |
+
# kernel_size_true=focal_kernel_clips[k]+2**focal_l_clips[k]-1
|
| 366 |
+
kernel_size_true=kernel_size
|
| 367 |
+
# stride=math.ceil(self.window_size/focal_l_clips[k])
|
| 368 |
+
|
| 369 |
+
self.unfolds_clips += [nn.Unfold(
|
| 370 |
+
kernel_size=(kernel_size, kernel_size),
|
| 371 |
+
stride=stride,
|
| 372 |
+
padding=padding)
|
| 373 |
+
]
|
| 374 |
+
relative_position_bias_table_to_windows = nn.Parameter(
|
| 375 |
+
torch.zeros(
|
| 376 |
+
self.num_heads,
|
| 377 |
+
(self.window_size[0] + kernel_size_true - 1) * (self.window_size[0] + kernel_size_true - 1),
|
| 378 |
+
)
|
| 379 |
+
)
|
| 380 |
+
trunc_normal_(relative_position_bias_table_to_windows, std=.02)
|
| 381 |
+
#self.relative_position_bias_table_to_windows_clips.append(relative_position_bias_table_to_windows)
|
| 382 |
+
self.register_parameter('relative_position_bias_table_to_windows_clips_{}'.format(k),relative_position_bias_table_to_windows)
|
| 383 |
+
relative_position_index_k = get_relative_position_index(self.window_size, to_2tuple(kernel_size_true))
|
| 384 |
+
self.register_buffer("relative_position_index_clips_{}".format(k), relative_position_index_k)
|
| 385 |
+
# if (not focal_l_big_flag) and focal_l_clips[k]>0:
|
| 386 |
+
# mask = torch.zeros(kernel_size, kernel_size); mask[(2**focal_l_clips[k])-1:, (2**focal_l_clips[k])-1:] = 1
|
| 387 |
+
# self.register_buffer("valid_ind_unfold_clips_{}".format(k), mask.flatten(0).nonzero().view(-1))
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 392 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 393 |
+
self.proj = nn.Linear(dim, dim)
|
| 394 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 395 |
+
|
| 396 |
+
self.softmax = nn.Softmax(dim=-1)
|
| 397 |
+
self.focal_l_clips=focal_l_clips
|
| 398 |
+
self.focal_kernel_clips=focal_kernel_clips
|
| 399 |
+
|
| 400 |
+
def forward(self, x_all, mask_all=None, batch_size=None, num_clips=None):
|
| 401 |
+
"""
|
| 402 |
+
Args:
|
| 403 |
+
x_all (list[Tensors]): input features at different granularity
|
| 404 |
+
mask_all (list[Tensors/None]): masks for input features at different granularity
|
| 405 |
+
"""
|
| 406 |
+
x = x_all[0][0] #
|
| 407 |
+
|
| 408 |
+
B0, nH, nW, C = x.shape
|
| 409 |
+
# assert B==batch_size*num_clips
|
| 410 |
+
assert B0==batch_size
|
| 411 |
+
qkv = self.qkv(x).reshape(B0, nH, nW, 3, C).permute(3, 0, 1, 2, 4).contiguous()
|
| 412 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # B0, nH, nW, C
|
| 413 |
+
|
| 414 |
+
# partition q map
|
| 415 |
+
# print("x.shape: ", x.shape)
|
| 416 |
+
# print("q.shape: ", q.shape) # [4, 126, 126, 256]
|
| 417 |
+
(q_windows, k_windows, v_windows) = map(
|
| 418 |
+
lambda t: window_partition(t, self.window_size[0]).view(
|
| 419 |
+
-1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads
|
| 420 |
+
).transpose(1, 2),
|
| 421 |
+
(q, k, v)
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
# q_dim0, q_dim1, q_dim2, q_dim3=q_windows.shape
|
| 425 |
+
# q_windows=q_windows.view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]), q_dim1, q_dim2, q_dim3)
|
| 426 |
+
# q_windows=q_windows[:,-1].contiguous().view(-1, q_dim1, q_dim2, q_dim3) # query for the last frame (target frame)
|
| 427 |
+
|
| 428 |
+
# k_windows.shape [1296, 8, 49, 32]
|
| 429 |
+
|
| 430 |
+
if self.expand_size > 0 and self.focal_level > 0:
|
| 431 |
+
(k_tl, v_tl) = map(
|
| 432 |
+
lambda t: torch.roll(t, shifts=(-self.expand_size, -self.expand_size), dims=(1, 2)), (k, v)
|
| 433 |
+
)
|
| 434 |
+
(k_tr, v_tr) = map(
|
| 435 |
+
lambda t: torch.roll(t, shifts=(-self.expand_size, self.expand_size), dims=(1, 2)), (k, v)
|
| 436 |
+
)
|
| 437 |
+
(k_bl, v_bl) = map(
|
| 438 |
+
lambda t: torch.roll(t, shifts=(self.expand_size, -self.expand_size), dims=(1, 2)), (k, v)
|
| 439 |
+
)
|
| 440 |
+
(k_br, v_br) = map(
|
| 441 |
+
lambda t: torch.roll(t, shifts=(self.expand_size, self.expand_size), dims=(1, 2)), (k, v)
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
+
(k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows) = map(
|
| 445 |
+
lambda t: window_partition(t, self.window_size[0]).view(-1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads),
|
| 446 |
+
(k_tl, k_tr, k_bl, k_br)
|
| 447 |
+
)
|
| 448 |
+
(v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows) = map(
|
| 449 |
+
lambda t: window_partition(t, self.window_size[0]).view(-1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads),
|
| 450 |
+
(v_tl, v_tr, v_bl, v_br)
|
| 451 |
+
)
|
| 452 |
+
k_rolled = torch.cat((k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows), 1).transpose(1, 2)
|
| 453 |
+
v_rolled = torch.cat((v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows), 1).transpose(1, 2)
|
| 454 |
+
|
| 455 |
+
# mask out tokens in current window
|
| 456 |
+
# print("self.valid_ind_rolled.shape: ", self.valid_ind_rolled.shape) # [132]
|
| 457 |
+
# print("k_rolled.shape: ", k_rolled.shape) # [1296, 8, 196, 32]
|
| 458 |
+
k_rolled = k_rolled[:, :, self.valid_ind_rolled]
|
| 459 |
+
v_rolled = v_rolled[:, :, self.valid_ind_rolled]
|
| 460 |
+
k_rolled = torch.cat((k_windows, k_rolled), 2)
|
| 461 |
+
v_rolled = torch.cat((v_windows, v_rolled), 2)
|
| 462 |
+
else:
|
| 463 |
+
k_rolled = k_windows; v_rolled = v_windows;
|
| 464 |
+
|
| 465 |
+
# print("k_rolled.shape: ", k_rolled.shape) # [1296, 8, 181, 32]
|
| 466 |
+
|
| 467 |
+
if self.pool_method != "none" and self.focal_level > 1:
|
| 468 |
+
k_pooled = []
|
| 469 |
+
v_pooled = []
|
| 470 |
+
for k in range(self.focal_level-1):
|
| 471 |
+
stride = 2**k
|
| 472 |
+
x_window_pooled = x_all[0][k+1] # B0, nWh, nWw, C
|
| 473 |
+
nWh, nWw = x_window_pooled.shape[1:3]
|
| 474 |
+
|
| 475 |
+
# generate mask for pooled windows
|
| 476 |
+
# print("x_window_pooled.shape: ", x_window_pooled.shape)
|
| 477 |
+
mask = x_window_pooled.new(nWh, nWw).fill_(1)
|
| 478 |
+
# print("here: ",x_window_pooled.shape, self.unfolds[k].kernel_size, self.unfolds[k](mask.unsqueeze(0).unsqueeze(1)).shape)
|
| 479 |
+
# print(mask.unique())
|
| 480 |
+
unfolded_mask = self.unfolds[k](mask.unsqueeze(0).unsqueeze(1)).view(
|
| 481 |
+
1, 1, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
|
| 482 |
+
view(nWh*nWw // stride // stride, -1, 1)
|
| 483 |
+
|
| 484 |
+
if k > 0:
|
| 485 |
+
valid_ind_unfold_k = getattr(self, "valid_ind_unfold_{}".format(k))
|
| 486 |
+
unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
|
| 487 |
+
|
| 488 |
+
# print("unfolded_mask.shape: ", unfolded_mask.shape, unfolded_mask.unique())
|
| 489 |
+
x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
|
| 490 |
+
# print((x_window_masks == 0).sum(), (x_window_masks > 0).sum(), x_window_masks.unique())
|
| 491 |
+
x_window_masks = x_window_masks.masked_fill(x_window_masks == 0, float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
|
| 492 |
+
# print(x_window_masks.shape)
|
| 493 |
+
mask_all[0][k+1] = x_window_masks
|
| 494 |
+
|
| 495 |
+
# generate k and v for pooled windows
|
| 496 |
+
qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
|
| 497 |
+
k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
(k_pooled_k, v_pooled_k) = map(
|
| 501 |
+
lambda t: self.unfolds[k](t).view(
|
| 502 |
+
B0, C, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
|
| 503 |
+
view(-1, self.unfolds[k].kernel_size[0]*self.unfolds[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
|
| 504 |
+
(k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
# print("k_pooled_k.shape: ", k_pooled_k.shape)
|
| 508 |
+
# print("valid_ind_unfold_k.shape: ", valid_ind_unfold_k.shape)
|
| 509 |
+
|
| 510 |
+
if k > 0:
|
| 511 |
+
(k_pooled_k, v_pooled_k) = map(
|
| 512 |
+
lambda t: t[:, :, valid_ind_unfold_k], (k_pooled_k, v_pooled_k)
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
# print("k_pooled_k.shape: ", k_pooled_k.shape)
|
| 516 |
+
|
| 517 |
+
k_pooled += [k_pooled_k]
|
| 518 |
+
v_pooled += [v_pooled_k]
|
| 519 |
+
|
| 520 |
+
for k in range(len(self.focal_l_clips)):
|
| 521 |
+
focal_l_big_flag=False
|
| 522 |
+
if self.focal_l_clips[k]>self.window_size[0]:
|
| 523 |
+
stride=1
|
| 524 |
+
focal_l_big_flag=True
|
| 525 |
+
else:
|
| 526 |
+
stride = self.focal_l_clips[k]
|
| 527 |
+
# if self.window_size>=focal_l_clips[k]:
|
| 528 |
+
# stride=math.ceil(self.window_size/focal_l_clips[k])
|
| 529 |
+
# # padding=(kernel_size-stride)/2
|
| 530 |
+
# else:
|
| 531 |
+
# stride=1
|
| 532 |
+
# padding=0
|
| 533 |
+
x_window_pooled = x_all[k+1]
|
| 534 |
+
nWh, nWw = x_window_pooled.shape[1:3]
|
| 535 |
+
mask = x_window_pooled.new(nWh, nWw).fill_(1)
|
| 536 |
+
|
| 537 |
+
# import pdb; pdb.set_trace()
|
| 538 |
+
# print(x_window_pooled.shape, self.unfolds_clips[k].kernel_size, self.unfolds_clips[k](mask.unsqueeze(0).unsqueeze(1)).shape)
|
| 539 |
+
|
| 540 |
+
unfolded_mask = self.unfolds_clips[k](mask.unsqueeze(0).unsqueeze(1)).view(
|
| 541 |
+
1, 1, self.unfolds_clips[k].kernel_size[0], self.unfolds_clips[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
|
| 542 |
+
view(nWh*nWw // stride // stride, -1, 1)
|
| 543 |
+
|
| 544 |
+
# if (not focal_l_big_flag) and self.focal_l_clips[k]>0:
|
| 545 |
+
# valid_ind_unfold_k = getattr(self, "valid_ind_unfold_clips_{}".format(k))
|
| 546 |
+
# unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
|
| 547 |
+
|
| 548 |
+
# print("unfolded_mask.shape: ", unfolded_mask.shape, unfolded_mask.unique())
|
| 549 |
+
x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
|
| 550 |
+
# print((x_window_masks == 0).sum(), (x_window_masks > 0).sum(), x_window_masks.unique())
|
| 551 |
+
x_window_masks = x_window_masks.masked_fill(x_window_masks == 0, float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
|
| 552 |
+
# print(x_window_masks.shape)
|
| 553 |
+
mask_all[k+1] = x_window_masks
|
| 554 |
+
|
| 555 |
+
# generate k and v for pooled windows
|
| 556 |
+
qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
|
| 557 |
+
k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
|
| 558 |
+
|
| 559 |
+
if (not focal_l_big_flag):
|
| 560 |
+
(k_pooled_k, v_pooled_k) = map(
|
| 561 |
+
lambda t: self.unfolds_clips[k](t).view(
|
| 562 |
+
B0, C, self.unfolds_clips[k].kernel_size[0], self.unfolds_clips[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
|
| 563 |
+
view(-1, self.unfolds_clips[k].kernel_size[0]*self.unfolds_clips[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
|
| 564 |
+
(k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
|
| 565 |
+
)
|
| 566 |
+
else:
|
| 567 |
+
|
| 568 |
+
(k_pooled_k, v_pooled_k) = map(
|
| 569 |
+
lambda t: self.unfolds_clips[k](t),
|
| 570 |
+
(k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
|
| 571 |
+
)
|
| 572 |
+
LLL=k_pooled_k.size(2)
|
| 573 |
+
LLL_h=int(LLL**0.5)
|
| 574 |
+
assert LLL_h**2==LLL
|
| 575 |
+
k_pooled_k=k_pooled_k.reshape(B0, -1, LLL_h, LLL_h)
|
| 576 |
+
v_pooled_k=v_pooled_k.reshape(B0, -1, LLL_h, LLL_h)
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
# print("k_pooled_k.shape: ", k_pooled_k.shape)
|
| 581 |
+
# print("valid_ind_unfold_k.shape: ", valid_ind_unfold_k.shape)
|
| 582 |
+
# if (not focal_l_big_flag) and self.focal_l_clips[k]:
|
| 583 |
+
# (k_pooled_k, v_pooled_k) = map(
|
| 584 |
+
# lambda t: t[:, :, valid_ind_unfold_k], (k_pooled_k, v_pooled_k)
|
| 585 |
+
# )
|
| 586 |
+
|
| 587 |
+
# print("k_pooled_k.shape: ", k_pooled_k.shape)
|
| 588 |
+
|
| 589 |
+
k_pooled += [k_pooled_k]
|
| 590 |
+
v_pooled += [v_pooled_k]
|
| 591 |
+
|
| 592 |
+
# qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
|
| 593 |
+
# k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
|
| 594 |
+
# (k_pooled_k, v_pooled_k) = map(
|
| 595 |
+
# lambda t: self.unfolds[k](t).view(
|
| 596 |
+
# B0, C, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
|
| 597 |
+
# view(-1, self.unfolds[k].kernel_size[0]*self.unfolds[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
|
| 598 |
+
# (k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
|
| 599 |
+
# )
|
| 600 |
+
# k_pooled += [k_pooled_k]
|
| 601 |
+
# v_pooled += [v_pooled_k]
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
k_all = torch.cat([k_rolled] + k_pooled, 2)
|
| 605 |
+
v_all = torch.cat([v_rolled] + v_pooled, 2)
|
| 606 |
+
else:
|
| 607 |
+
k_all = k_rolled
|
| 608 |
+
v_all = v_rolled
|
| 609 |
+
|
| 610 |
+
N = k_all.shape[-2]
|
| 611 |
+
q_windows = q_windows * self.scale
|
| 612 |
+
# print(q_windows.shape, k_all.shape, v_all.shape)
|
| 613 |
+
# exit()
|
| 614 |
+
# k_all_dim0, k_all_dim1, k_all_dim2, k_all_dim3=k_all.shape
|
| 615 |
+
# k_all=k_all.contiguous().view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]),
|
| 616 |
+
# k_all_dim1, k_all_dim2, k_all_dim3).permute(0,2,3,4,1,5).contiguous().view(-1, k_all_dim1, k_all_dim2*num_clips, k_all_dim3)
|
| 617 |
+
# v_all=v_all.contiguous().view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]),
|
| 618 |
+
# k_all_dim1, k_all_dim2, k_all_dim3).permute(0,2,3,4,1,5).contiguous().view(-1, k_all_dim1, k_all_dim2*num_clips, k_all_dim3)
|
| 619 |
+
|
| 620 |
+
# print(q_windows.shape, k_all.shape, v_all.shape, k_rolled.shape)
|
| 621 |
+
# exit()
|
| 622 |
+
attn = (q_windows @ k_all.transpose(-2, -1)) # B0*nW, nHead, window_size*window_size, focal_window_size*focal_window_size
|
| 623 |
+
|
| 624 |
+
window_area = self.window_size[0] * self.window_size[1]
|
| 625 |
+
# window_area_clips= num_clips*self.window_size[0] * self.window_size[1]
|
| 626 |
+
window_area_rolled = k_rolled.shape[2]
|
| 627 |
+
|
| 628 |
+
# add relative position bias for tokens inside window
|
| 629 |
+
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
|
| 630 |
+
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
|
| 631 |
+
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
| 632 |
+
# print(relative_position_bias.shape, attn.shape)
|
| 633 |
+
attn[:, :, :window_area, :window_area] = attn[:, :, :window_area, :window_area] + relative_position_bias.unsqueeze(0)
|
| 634 |
+
|
| 635 |
+
# relative_position_bias = self.relative_position_bias_table[self.relative_position_index[-window_area:, :window_area_clips].reshape(-1)].view(
|
| 636 |
+
# window_area, window_area_clips, -1) # Wh*Ww,Wd*Wh*Ww,nH
|
| 637 |
+
# relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous().view(self.num_heads,window_area,num_clips,window_area
|
| 638 |
+
# ).permute(0,1,3,2).contiguous().view(self.num_heads,window_area,window_area_clips).contiguous() # nH, Wh*Ww, Wh*Ww*Wd
|
| 639 |
+
# # attn_dim0, attn_dim1, attn_dim2, attn_dim3=attn.shape
|
| 640 |
+
# # attn=attn.view(attn_dim0,attn_dim1,attn_dim2,num_clips,-1)
|
| 641 |
+
# # print(attn.shape, relative_position_bias.shape)
|
| 642 |
+
# attn[:,:,:window_area, :window_area_clips]=attn[:,:,:window_area, :window_area_clips] + relative_position_bias.unsqueeze(0)
|
| 643 |
+
# attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N
|
| 644 |
+
|
| 645 |
+
# add relative position bias for patches inside a window
|
| 646 |
+
if self.expand_size > 0 and self.focal_level > 0:
|
| 647 |
+
attn[:, :, :window_area, window_area:window_area_rolled] = attn[:, :, :window_area, window_area:window_area_rolled] + self.relative_position_bias_table_to_neighbors
|
| 648 |
+
|
| 649 |
+
if self.pool_method != "none" and self.focal_level > 1:
|
| 650 |
+
# add relative position bias for different windows in an image
|
| 651 |
+
offset = window_area_rolled
|
| 652 |
+
# print(offset)
|
| 653 |
+
for k in range(self.focal_level-1):
|
| 654 |
+
# add relative position bias
|
| 655 |
+
relative_position_index_k = getattr(self, 'relative_position_index_{}'.format(k))
|
| 656 |
+
relative_position_bias_to_windows = getattr(self,'relative_position_bias_table_to_windows_{}'.format(k))[:, relative_position_index_k.view(-1)].view(
|
| 657 |
+
-1, self.window_size[0] * self.window_size[1], (self.focal_window+2**k-1)**2,
|
| 658 |
+
) # nH, NWh*NWw,focal_region*focal_region
|
| 659 |
+
attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
|
| 660 |
+
attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
|
| 661 |
+
# add attentional mask
|
| 662 |
+
if mask_all[0][k+1] is not None:
|
| 663 |
+
attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
|
| 664 |
+
attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + \
|
| 665 |
+
mask_all[0][k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[0][k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[0][k+1].shape[-1])
|
| 666 |
+
|
| 667 |
+
offset += (self.focal_window+2**k-1)**2
|
| 668 |
+
# print(offset)
|
| 669 |
+
for k in range(len(self.focal_l_clips)):
|
| 670 |
+
focal_l_big_flag=False
|
| 671 |
+
if self.focal_l_clips[k]>self.window_size[0]:
|
| 672 |
+
stride=1
|
| 673 |
+
padding=0
|
| 674 |
+
kernel_size=self.focal_kernel_clips[k]
|
| 675 |
+
kernel_size_true=kernel_size
|
| 676 |
+
focal_l_big_flag=True
|
| 677 |
+
# stride=math.ceil(self.window_size/focal_l_clips[k])
|
| 678 |
+
# padding=(kernel_size-stride)/2
|
| 679 |
+
else:
|
| 680 |
+
stride = self.focal_l_clips[k]
|
| 681 |
+
# kernel_size
|
| 682 |
+
# kernel_size = 2*(self.focal_kernel_clips[k]// 2) + 2**self.focal_l_clips[k] + (2**self.focal_l_clips[k]-1)
|
| 683 |
+
kernel_size = self.focal_kernel_clips[k]
|
| 684 |
+
padding=kernel_size // 2
|
| 685 |
+
# kernel_size_true=self.focal_kernel_clips[k]+2**self.focal_l_clips[k]-1
|
| 686 |
+
kernel_size_true=kernel_size
|
| 687 |
+
relative_position_index_k = getattr(self, 'relative_position_index_clips_{}'.format(k))
|
| 688 |
+
relative_position_bias_to_windows = getattr(self,'relative_position_bias_table_to_windows_clips_{}'.format(k))[:, relative_position_index_k.view(-1)].view(
|
| 689 |
+
-1, self.window_size[0] * self.window_size[1], (kernel_size_true)**2,
|
| 690 |
+
)
|
| 691 |
+
attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] = \
|
| 692 |
+
attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] + relative_position_bias_to_windows.unsqueeze(0)
|
| 693 |
+
if mask_all[k+1] is not None:
|
| 694 |
+
attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] = \
|
| 695 |
+
attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] + \
|
| 696 |
+
mask_all[k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
|
| 697 |
+
offset += (kernel_size_true)**2
|
| 698 |
+
# print(offset)
|
| 699 |
+
# relative_position_index_k = getattr(self, 'relative_position_index_{}'.format(k))
|
| 700 |
+
# # relative_position_bias_to_windows = self.relative_position_bias_table_to_windows[k][:, relative_position_index_k.view(-1)].view(
|
| 701 |
+
# # -1, self.window_size[0] * self.window_size[1], (self.focal_window+2**k-1)**2,
|
| 702 |
+
# # ) # nH, NWh*NWw,focal_region*focal_region
|
| 703 |
+
# # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
|
| 704 |
+
# # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
|
| 705 |
+
# relative_position_bias_to_windows = self.relative_position_bias_table_to_windows[k][:, relative_position_index_k[-window_area:, :].view(-1)].view(
|
| 706 |
+
# -1, self.window_size[0] * self.window_size[1], num_clips*(self.focal_window+2**k-1)**2,
|
| 707 |
+
# ).contiguous() # nH, NWh*NWw, num_clips*focal_region*focal_region
|
| 708 |
+
# relative_position_bias_to_windows = relative_position_bias_to_windows.view(self.num_heads,
|
| 709 |
+
# window_area,num_clips,-1).permute(0,1,3,2).contiguous().view(self.num_heads,window_area,-1)
|
| 710 |
+
# attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] = \
|
| 711 |
+
# attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
|
| 712 |
+
# # add attentional mask
|
| 713 |
+
# if mask_all[k+1] is not None:
|
| 714 |
+
# # print("inside the mask, be careful 1")
|
| 715 |
+
# # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
|
| 716 |
+
# # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + \
|
| 717 |
+
# # mask_all[k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
|
| 718 |
+
# # print("here: ", mask_all[k+1].shape, mask_all[k+1][:, :, None, None, :].shape)
|
| 719 |
+
|
| 720 |
+
# attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] = \
|
| 721 |
+
# attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] + \
|
| 722 |
+
# mask_all[k+1][:, :, None, None, :,None].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1, num_clips).view(-1, 1, 1, mask_all[k+1].shape[-1]*num_clips)
|
| 723 |
+
# # print()
|
| 724 |
+
|
| 725 |
+
# offset += (self.focal_window+2**k-1)**2
|
| 726 |
+
|
| 727 |
+
# print("mask_all[0]: ", mask_all[0])
|
| 728 |
+
# exit()
|
| 729 |
+
if mask_all[0][0] is not None:
|
| 730 |
+
print("inside the mask, be careful 0")
|
| 731 |
+
nW = mask_all[0].shape[0]
|
| 732 |
+
attn = attn.view(attn.shape[0] // nW, nW, self.num_heads, window_area, N)
|
| 733 |
+
attn[:, :, :, :, :window_area] = attn[:, :, :, :, :window_area] + mask_all[0][None, :, None, :, :]
|
| 734 |
+
attn = attn.view(-1, self.num_heads, window_area, N)
|
| 735 |
+
attn = self.softmax(attn)
|
| 736 |
+
else:
|
| 737 |
+
attn = self.softmax(attn)
|
| 738 |
+
|
| 739 |
+
attn = self.attn_drop(attn)
|
| 740 |
+
|
| 741 |
+
x = (attn @ v_all).transpose(1, 2).reshape(attn.shape[0], window_area, C)
|
| 742 |
+
x = self.proj(x)
|
| 743 |
+
x = self.proj_drop(x)
|
| 744 |
+
# print(x.shape)
|
| 745 |
+
# x = x.view(B/num_clips, nH, nW, C )
|
| 746 |
+
# exit()
|
| 747 |
+
return x
|
| 748 |
+
|
| 749 |
+
def extra_repr(self) -> str:
|
| 750 |
+
return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
|
| 751 |
+
|
| 752 |
+
def flops(self, N, window_size, unfold_size):
|
| 753 |
+
# calculate flops for 1 window with token length of N
|
| 754 |
+
flops = 0
|
| 755 |
+
# qkv = self.qkv(x)
|
| 756 |
+
flops += N * self.dim * 3 * self.dim
|
| 757 |
+
# attn = (q @ k.transpose(-2, -1))
|
| 758 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * N
|
| 759 |
+
if self.pool_method != "none" and self.focal_level > 1:
|
| 760 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * (unfold_size * unfold_size)
|
| 761 |
+
if self.expand_size > 0 and self.focal_level > 0:
|
| 762 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * ((window_size + 2*self.expand_size)**2-window_size**2)
|
| 763 |
+
|
| 764 |
+
# x = (attn @ v)
|
| 765 |
+
flops += self.num_heads * N * N * (self.dim // self.num_heads)
|
| 766 |
+
if self.pool_method != "none" and self.focal_level > 1:
|
| 767 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * (unfold_size * unfold_size)
|
| 768 |
+
if self.expand_size > 0 and self.focal_level > 0:
|
| 769 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * ((window_size + 2*self.expand_size)**2-window_size**2)
|
| 770 |
+
|
| 771 |
+
# x = self.proj(x)
|
| 772 |
+
flops += N * self.dim * self.dim
|
| 773 |
+
return flops
|
| 774 |
+
|
| 775 |
+
|
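A hedged construction sketch, not part of the commit: it assumes the helpers imported elsewhere in this file (timm's trunc_normal_ and to_2tuple, torch.nn) are in scope, and the hyper-parameters are illustrative rather than values prescribed by SpaTrackV2.

attn = WindowAttention3d3(
    dim=256, expand_size=3, window_size=(7, 7),
    focal_window=5, focal_level=2, num_heads=8,
    pool_method="fc",
    focal_l_clips=[7, 2, 4], focal_kernel_clips=[7, 5, 3],
)
# forward() expects x_all = [[target-frame map, pooled focal levels], pooled per-clip maps ...]
# and is normally driven by CffmTransformerBlock3d3.forward below rather than called directly.
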
| 776 |
+
class CffmTransformerBlock3d3(nn.Module):
|
| 777 |
+
r""" Focal Transformer Block.
|
| 778 |
+
|
| 779 |
+
Args:
|
| 780 |
+
dim (int): Number of input channels.
|
| 781 |
+
input_resolution (tuple[int]): Input resolution.
|
| 782 |
+
num_heads (int): Number of attention heads.
|
| 783 |
+
window_size (int): Window size.
|
| 784 |
+
expand_size (int): expand size at first focal level (finest level).
|
| 785 |
+
shift_size (int): Shift size for SW-MSA.
|
| 786 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 787 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 788 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
| 789 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
| 790 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
| 791 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
| 792 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
| 793 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 794 |
+
pool_method (str): window pooling method. Default: none, options: [none|fc|conv]
|
| 795 |
+
focal_level (int): number of focal levels. Default: 1.
|
| 796 |
+
focal_window (int): region size of focal attention. Default: 1
|
| 797 |
+
use_layerscale (bool): whether use layer scale for training stability. Default: False
|
| 798 |
+
layerscale_value (float): scaling value for layer scale. Default: 1e-4
|
| 799 |
+
"""
|
| 800 |
+
|
| 801 |
+
def __init__(self, dim, input_resolution, num_heads, window_size=7, expand_size=0, shift_size=0,
|
| 802 |
+
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
|
| 803 |
+
act_layer=nn.GELU, norm_layer=nn.LayerNorm, pool_method="none",
|
| 804 |
+
focal_level=1, focal_window=1, use_layerscale=False, layerscale_value=1e-4, focal_l_clips=[7,2,4], focal_kernel_clips=[7,5,3]):
|
| 805 |
+
super().__init__()
|
| 806 |
+
self.dim = dim
|
| 807 |
+
self.input_resolution = input_resolution
|
| 808 |
+
self.num_heads = num_heads
|
| 809 |
+
self.window_size = window_size
|
| 810 |
+
self.shift_size = shift_size
|
| 811 |
+
self.expand_size = expand_size
|
| 812 |
+
self.mlp_ratio = mlp_ratio
|
| 813 |
+
self.pool_method = pool_method
|
| 814 |
+
self.focal_level = focal_level
|
| 815 |
+
self.focal_window = focal_window
|
| 816 |
+
self.use_layerscale = use_layerscale
|
| 817 |
+
self.focal_l_clips=focal_l_clips
|
| 818 |
+
self.focal_kernel_clips=focal_kernel_clips
|
| 819 |
+
|
| 820 |
+
if min(self.input_resolution) <= self.window_size:
|
| 821 |
+
# if window size is larger than input resolution, we don't partition windows
|
| 822 |
+
self.expand_size = 0
|
| 823 |
+
self.shift_size = 0
|
| 824 |
+
self.window_size = min(self.input_resolution)
|
| 825 |
+
assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
|
| 826 |
+
|
| 827 |
+
self.window_size_glo = self.window_size
|
| 828 |
+
|
| 829 |
+
self.pool_layers = nn.ModuleList()
|
| 830 |
+
self.pool_layers_clips = nn.ModuleList()
|
| 831 |
+
if self.pool_method != "none":
|
| 832 |
+
for k in range(self.focal_level-1):
|
| 833 |
+
window_size_glo = math.floor(self.window_size_glo / (2 ** k))
|
| 834 |
+
if self.pool_method == "fc":
|
| 835 |
+
self.pool_layers.append(nn.Linear(window_size_glo * window_size_glo, 1))
|
| 836 |
+
self.pool_layers[-1].weight.data.fill_(1./(window_size_glo * window_size_glo))
|
| 837 |
+
self.pool_layers[-1].bias.data.fill_(0)
|
| 838 |
+
elif self.pool_method == "conv":
|
| 839 |
+
self.pool_layers.append(nn.Conv2d(dim, dim, kernel_size=window_size_glo, stride=window_size_glo, groups=dim))
|
| 840 |
+
for k in range(len(focal_l_clips)):
|
| 841 |
+
# window_size_glo = math.floor(self.window_size_glo / (2 ** k))
|
| 842 |
+
if focal_l_clips[k]>self.window_size:
|
| 843 |
+
window_size_glo = focal_l_clips[k]
|
| 844 |
+
else:
|
| 845 |
+
window_size_glo = math.floor(self.window_size_glo / (focal_l_clips[k]))
|
| 846 |
+
# window_size_glo = focal_l_clips[k]
|
| 847 |
+
if self.pool_method == "fc":
|
| 848 |
+
self.pool_layers_clips.append(nn.Linear(window_size_glo * window_size_glo, 1))
|
| 849 |
+
self.pool_layers_clips[-1].weight.data.fill_(1./(window_size_glo * window_size_glo))
|
| 850 |
+
self.pool_layers_clips[-1].bias.data.fill_(0)
|
| 851 |
+
elif self.pool_method == "conv":
|
| 852 |
+
self.pool_layers_clips.append(nn.Conv2d(dim, dim, kernel_size=window_size_glo, stride=window_size_glo, groups=dim))
|
| 853 |
+
|
| 854 |
+
self.norm1 = norm_layer(dim)
|
| 855 |
+
|
| 856 |
+
self.attn = WindowAttention3d3(
|
| 857 |
+
dim, expand_size=self.expand_size, window_size=to_2tuple(self.window_size),
|
| 858 |
+
focal_window=focal_window, focal_level=focal_level, num_heads=num_heads,
|
| 859 |
+
qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, pool_method=pool_method, focal_l_clips=focal_l_clips, focal_kernel_clips=focal_kernel_clips)
|
| 860 |
+
|
| 861 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
| 862 |
+
self.norm2 = norm_layer(dim)
|
| 863 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 864 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
| 865 |
+
|
| 866 |
+
# print("******self.shift_size: ", self.shift_size)
|
| 867 |
+
|
| 868 |
+
if self.shift_size > 0:
|
| 869 |
+
# calculate attention mask for SW-MSA
|
| 870 |
+
H, W = self.input_resolution
|
| 871 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 872 |
+
h_slices = (slice(0, -self.window_size),
|
| 873 |
+
slice(-self.window_size, -self.shift_size),
|
| 874 |
+
slice(-self.shift_size, None))
|
| 875 |
+
w_slices = (slice(0, -self.window_size),
|
| 876 |
+
slice(-self.window_size, -self.shift_size),
|
| 877 |
+
slice(-self.shift_size, None))
|
| 878 |
+
cnt = 0
|
| 879 |
+
for h in h_slices:
|
| 880 |
+
for w in w_slices:
|
| 881 |
+
img_mask[:, h, w, :] = cnt
|
| 882 |
+
cnt += 1
|
| 883 |
+
|
| 884 |
+
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
|
| 885 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
| 886 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 887 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
| 888 |
+
else:
|
| 889 |
+
# print("here mask none")
|
| 890 |
+
attn_mask = None
|
| 891 |
+
self.register_buffer("attn_mask", attn_mask)
|
| 892 |
+
|
| 893 |
+
if self.use_layerscale:
|
| 894 |
+
self.gamma_1 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
|
| 895 |
+
self.gamma_2 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
|
| 896 |
+
|
| 897 |
+
def forward(self, x):
|
| 898 |
+
H0, W0 = self.input_resolution
|
| 899 |
+
# B, L, C = x.shape
|
| 900 |
+
B0, D0, H0, W0, C = x.shape
|
| 901 |
+
shortcut = x
|
| 902 |
+
# assert L == H * W, "input feature has wrong size"
|
| 903 |
+
x=x.reshape(B0*D0,H0,W0,C).reshape(B0*D0,H0*W0,C)
|
| 904 |
+
|
| 905 |
+
|
| 906 |
+
x = self.norm1(x)
|
| 907 |
+
x = x.reshape(B0*D0, H0, W0, C)
|
| 908 |
+
# print("here")
|
| 909 |
+
# exit()
|
| 910 |
+
|
| 911 |
+
# pad feature maps to multiples of window size
|
| 912 |
+
pad_l = pad_t = 0
|
| 913 |
+
pad_r = (self.window_size - W0 % self.window_size) % self.window_size
|
| 914 |
+
pad_b = (self.window_size - H0 % self.window_size) % self.window_size
|
| 915 |
+
if pad_r > 0 or pad_b > 0:
|
| 916 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
| 917 |
+
|
| 918 |
+
B, H, W, C = x.shape ## B=B0*D0
|
| 919 |
+
|
| 920 |
+
if self.shift_size > 0:
|
| 921 |
+
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
|
| 922 |
+
else:
|
| 923 |
+
shifted_x = x
|
| 924 |
+
|
| 925 |
+
# print("shifted_x.shape: ", shifted_x.shape)
|
| 926 |
+
shifted_x=shifted_x.view(B0,D0,H,W,C)
|
| 927 |
+
x_windows_all = [shifted_x[:,-1]]
|
| 928 |
+
x_windows_all_clips=[]
|
| 929 |
+
x_window_masks_all = [self.attn_mask]
|
| 930 |
+
x_window_masks_all_clips=[]
|
| 931 |
+
|
| 932 |
+
if self.focal_level > 1 and self.pool_method != "none":
|
| 933 |
+
# if we add coarser granularity and the pool method is not none
|
| 934 |
+
# pooling_index=0
|
| 935 |
+
for k in range(self.focal_level-1):
|
| 936 |
+
window_size_glo = math.floor(self.window_size_glo / (2 ** k))
|
| 937 |
+
pooled_h = math.ceil(H / self.window_size) * (2 ** k)
|
| 938 |
+
pooled_w = math.ceil(W / self.window_size) * (2 ** k)
|
| 939 |
+
H_pool = pooled_h * window_size_glo
|
| 940 |
+
W_pool = pooled_w * window_size_glo
|
| 941 |
+
|
| 942 |
+
x_level_k = shifted_x[:,-1]
|
| 943 |
+
# trim or pad shifted_x depending on the required size
|
| 944 |
+
if H > H_pool:
|
| 945 |
+
trim_t = (H - H_pool) // 2
|
| 946 |
+
trim_b = H - H_pool - trim_t
|
| 947 |
+
x_level_k = x_level_k[:, trim_t:-trim_b]
|
| 948 |
+
elif H < H_pool:
|
| 949 |
+
pad_t = (H_pool - H) // 2
|
| 950 |
+
pad_b = H_pool - H - pad_t
|
| 951 |
+
x_level_k = F.pad(x_level_k, (0,0,0,0,pad_t,pad_b))
|
| 952 |
+
|
| 953 |
+
if W > W_pool:
|
| 954 |
+
trim_l = (W - W_pool) // 2
|
| 955 |
+
trim_r = W - W_pool - trim_l
|
| 956 |
+
x_level_k = x_level_k[:, :, trim_l:-trim_r]
|
| 957 |
+
elif W < W_pool:
|
| 958 |
+
pad_l = (W_pool - W) // 2
|
| 959 |
+
pad_r = W_pool - W - pad_l
|
| 960 |
+
x_level_k = F.pad(x_level_k, (0,0,pad_l,pad_r))
|
| 961 |
+
|
| 962 |
+
x_windows_noreshape = window_partition_noreshape(x_level_k.contiguous(), window_size_glo) # B0, nw, nw, window_size, window_size, C
|
| 963 |
+
nWh, nWw = x_windows_noreshape.shape[1:3]
|
| 964 |
+
if self.pool_method == "mean":
|
| 965 |
+
x_windows_pooled = x_windows_noreshape.mean([3, 4]) # B0, nWh, nWw, C
|
| 966 |
+
elif self.pool_method == "max":
|
| 967 |
+
x_windows_pooled = x_windows_noreshape.max(-2)[0].max(-2)[0].view(B0, nWh, nWw, C) # B0, nWh, nWw, C
|
| 968 |
+
elif self.pool_method == "fc":
|
| 969 |
+
x_windows_noreshape = x_windows_noreshape.view(B0, nWh, nWw, window_size_glo*window_size_glo, C).transpose(3, 4) # B0, nWh, nWw, C, wsize**2
|
| 970 |
+
x_windows_pooled = self.pool_layers[k](x_windows_noreshape).flatten(-2) # B0, nWh, nWw, C
|
| 971 |
+
elif self.pool_method == "conv":
|
| 972 |
+
x_windows_noreshape = x_windows_noreshape.view(-1, window_size_glo, window_size_glo, C).permute(0, 3, 1, 2).contiguous() # B0 * nw * nw, C, wsize, wsize
|
| 973 |
+
x_windows_pooled = self.pool_layers[k](x_windows_noreshape).view(B0, nWh, nWw, C) # B0, nWh, nWw, C
|
| 974 |
+
|
| 975 |
+
x_windows_all += [x_windows_pooled]
|
| 976 |
+
# print(x_windows_pooled.shape)
|
| 977 |
+
x_window_masks_all += [None]
|
| 978 |
+
# pooling_index=pooling_index+1
|
| 979 |
+
|
| 980 |
+
x_windows_all_clips += [x_windows_all]
|
| 981 |
+
x_window_masks_all_clips += [x_window_masks_all]
|
| 982 |
+
for k in range(len(self.focal_l_clips)):
|
| 983 |
+
if self.focal_l_clips[k]>self.window_size:
|
| 984 |
+
window_size_glo = self.focal_l_clips[k]
|
| 985 |
+
else:
|
| 986 |
+
window_size_glo = math.floor(self.window_size_glo / (self.focal_l_clips[k]))
|
| 987 |
+
|
| 988 |
+
pooled_h = math.ceil(H / self.window_size) * (self.focal_l_clips[k])
|
| 989 |
+
pooled_w = math.ceil(W / self.window_size) * (self.focal_l_clips[k])
|
| 990 |
+
|
| 991 |
+
H_pool = pooled_h * window_size_glo
|
| 992 |
+
W_pool = pooled_w * window_size_glo
|
| 993 |
+
|
| 994 |
+
x_level_k = shifted_x[:,k]
|
| 995 |
+
if H!=H_pool or W!=W_pool:
|
| 996 |
+
x_level_k=F.interpolate(x_level_k.permute(0,3,1,2), size=(H_pool, W_pool), mode='bilinear').permute(0,2,3,1)
|
| 997 |
+
|
| 998 |
+
# print(x_level_k.shape)
|
| 999 |
+
x_windows_noreshape = window_partition_noreshape(x_level_k.contiguous(), window_size_glo) # B0, nw, nw, window_size, window_size, C
|
| 1000 |
+
nWh, nWw = x_windows_noreshape.shape[1:3]
|
| 1001 |
+
if self.pool_method == "mean":
|
| 1002 |
+
x_windows_pooled = x_windows_noreshape.mean([3, 4]) # B0, nWh, nWw, C
|
| 1003 |
+
elif self.pool_method == "max":
|
| 1004 |
+
x_windows_pooled = x_windows_noreshape.max(-2)[0].max(-2)[0].view(B0, nWh, nWw, C) # B0, nWh, nWw, C
|
| 1005 |
+
elif self.pool_method == "fc":
|
| 1006 |
+
x_windows_noreshape = x_windows_noreshape.view(B0, nWh, nWw, window_size_glo*window_size_glo, C).transpose(3, 4) # B0, nWh, nWw, C, wsize**2
|
| 1007 |
+
x_windows_pooled = self.pool_layers_clips[k](x_windows_noreshape).flatten(-2) # B0, nWh, nWw, C
|
| 1008 |
+
elif self.pool_method == "conv":
|
| 1009 |
+
x_windows_noreshape = x_windows_noreshape.view(-1, window_size_glo, window_size_glo, C).permute(0, 3, 1, 2).contiguous() # B0 * nw * nw, C, wsize, wsize
|
| 1010 |
+
x_windows_pooled = self.pool_layers_clips[k](x_windows_noreshape).view(B0, nWh, nWw, C) # B0, nWh, nWw, C
|
| 1011 |
+
|
| 1012 |
+
x_windows_all_clips += [x_windows_pooled]
|
| 1013 |
+
# print(x_windows_pooled.shape)
|
| 1014 |
+
x_window_masks_all_clips += [None]
|
| 1015 |
+
# pooling_index=pooling_index+1
|
| 1016 |
+
# exit()
|
| 1017 |
+
|
| 1018 |
+
attn_windows = self.attn(x_windows_all_clips, mask_all=x_window_masks_all_clips, batch_size=B0, num_clips=D0) # nW*B0, window_size*window_size, C
|
| 1019 |
+
|
| 1020 |
+
attn_windows = attn_windows[:, :self.window_size ** 2]
|
| 1021 |
+
|
| 1022 |
+
# merge windows
|
| 1023 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
| 1024 |
+
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H(padded) W(padded) C
|
| 1025 |
+
|
| 1026 |
+
# reverse cyclic shift
|
| 1027 |
+
if self.shift_size > 0:
|
| 1028 |
+
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
|
| 1029 |
+
else:
|
| 1030 |
+
x = shifted_x
|
| 1031 |
+
# x = x[:, :self.input_resolution[0], :self.input_resolution[1]].contiguous().view(B, -1, C)
|
| 1032 |
+
x = x[:, :H0, :W0].contiguous().view(B0, -1, C)
|
| 1033 |
+
|
| 1034 |
+
# FFN
|
| 1035 |
+
# x = shortcut + self.drop_path(x if (not self.use_layerscale) else (self.gamma_1 * x))
|
| 1036 |
+
# x = x + self.drop_path(self.mlp(self.norm2(x)) if (not self.use_layerscale) else (self.gamma_2 * self.mlp(self.norm2(x))))
|
| 1037 |
+
|
| 1038 |
+
# print(x.shape, shortcut[:,-1].view(B0, -1, C).shape)
|
| 1039 |
+
x = shortcut[:,-1].view(B0, -1, C) + self.drop_path(x if (not self.use_layerscale) else (self.gamma_1 * x))
|
| 1040 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)) if (not self.use_layerscale) else (self.gamma_2 * self.mlp(self.norm2(x))))
|
| 1041 |
+
|
| 1042 |
+
# x=torch.cat([shortcut[:,:-1],x.view(B0,self.input_resolution[0],self.input_resolution[1],C).unsqueeze(1)],1)
|
| 1043 |
+
x=torch.cat([shortcut[:,:-1],x.view(B0,H0,W0,C).unsqueeze(1)],1)
|
| 1044 |
+
|
| 1045 |
+
assert x.shape==shortcut.shape
|
| 1046 |
+
|
| 1047 |
+
# exit()
|
| 1048 |
+
|
| 1049 |
+
return x
|
| 1050 |
+
|
| 1051 |
+
def extra_repr(self) -> str:
|
| 1052 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
|
| 1053 |
+
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
|
| 1054 |
+
|
| 1055 |
+
def flops(self):
|
| 1056 |
+
flops = 0
|
| 1057 |
+
H, W = self.input_resolution
|
| 1058 |
+
# norm1
|
| 1059 |
+
flops += self.dim * H * W
|
| 1060 |
+
|
| 1061 |
+
# W-MSA/SW-MSA
|
| 1062 |
+
nW = H * W / self.window_size / self.window_size
|
| 1063 |
+
flops += nW * self.attn.flops(self.window_size * self.window_size, self.window_size, self.focal_window)
|
| 1064 |
+
|
| 1065 |
+
if self.pool_method != "none" and self.focal_level > 1:
|
| 1066 |
+
for k in range(self.focal_level-1):
|
| 1067 |
+
window_size_glo = math.floor(self.window_size_glo / (2 ** k))
|
| 1068 |
+
nW_glo = nW * (2**k)
|
| 1069 |
+
# (sub)-window pooling
|
| 1070 |
+
flops += nW_glo * self.dim * window_size_glo * window_size_glo
|
| 1071 |
+
# qkv for global levels
|
| 1072 |
+
# NOTE: in our implementation, we pass the pooled window embedding to qkv embedding layer,
|
| 1073 |
+
# but theoretically, we only need to compute k and v.
|
| 1074 |
+
flops += nW_glo * self.dim * 3 * self.dim
|
| 1075 |
+
|
| 1076 |
+
# mlp
|
| 1077 |
+
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
|
| 1078 |
+
# norm2
|
| 1079 |
+
flops += self.dim * H * W
|
| 1080 |
+
return flops
|
| 1081 |
+
|
| 1082 |
+
|
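A minimal usage sketch, not part of the commit. It assumes the helpers imported at the top of this file (Mlp, DropPath, window_partition, window_partition_noreshape, window_reverse, trunc_normal_, to_2tuple) are available, and all sizes are illustrative; the last clip along dim 1 is treated as the target frame.

import torch

block = CffmTransformerBlock3d3(
    dim=256, input_resolution=(14, 14), num_heads=8,
    window_size=7, expand_size=3, shift_size=0,
    pool_method="fc", focal_level=2, focal_window=5,
    focal_l_clips=[7, 2, 4], focal_kernel_clips=[7, 5, 3],
)
clips = torch.randn(1, 4, 14, 14, 256)  # B, num_clips, H, W, C
out = block(clips)
print(out.shape)  # torch.Size([1, 4, 14, 14, 256]); only the target-frame slice is updated
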
| 1083 |
+
class BasicLayer3d3(nn.Module):
|
| 1084 |
+
""" A basic Focal Transformer layer for one stage.
|
| 1085 |
+
|
| 1086 |
+
Args:
|
| 1087 |
+
dim (int): Number of input channels.
|
| 1088 |
+
input_resolution (tuple[int]): Input resolution.
|
| 1089 |
+
depth (int): Number of blocks.
|
| 1090 |
+
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        expand_size (int): Expand size for focal level 1.
        expand_layer (str): Which layers use the expanded window. Default: "all".
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True.
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0.
        attn_drop (float, optional): Attention dropout rate. Default: 0.0.
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
        pool_method (str): Window pooling method. Default: "none".
        focal_level (int): Number of focal levels. Default: 1.
        focal_window (int): Region size at each focal level. Default: 1.
        use_conv_embed (bool): Whether to use an overlapped convolutional patch embedding layer. Default: False.
        use_shift (bool): Whether to use window shift as in Swin Transformer. Default: False.
        use_pre_norm (bool): Whether to apply pre-norm before the patch embedding projection for stability. Default: False.
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        use_layerscale (bool): Whether to use layer scale for stability. Default: False.
        layerscale_value (float): Layerscale value. Default: 1e-4.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size, expand_size, expand_layer="all",
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, pool_method="none",
                 focal_level=1, focal_window=1, use_conv_embed=False, use_shift=False, use_pre_norm=False,
                 downsample=None, use_checkpoint=False, use_layerscale=False, layerscale_value=1e-4,
                 focal_l_clips=[16, 8, 2], focal_kernel_clips=[7, 5, 3]):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # decide which blocks get the expanded window
        if expand_layer == "even":
            expand_factor = 0
        elif expand_layer == "odd":
            expand_factor = 1
        elif expand_layer == "all":
            expand_factor = -1

        # build blocks
        self.blocks = nn.ModuleList([
            CffmTransformerBlock3d3(dim=dim, input_resolution=input_resolution,
                                    num_heads=num_heads, window_size=window_size,
                                    shift_size=(0 if (i % 2 == 0) else window_size // 2) if use_shift else 0,
                                    expand_size=0 if (i % 2 == expand_factor) else expand_size,
                                    mlp_ratio=mlp_ratio,
                                    qkv_bias=qkv_bias, qk_scale=qk_scale,
                                    drop=drop,
                                    attn_drop=attn_drop,
                                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                    norm_layer=norm_layer,
                                    pool_method=pool_method,
                                    focal_level=focal_level,
                                    focal_window=focal_window,
                                    use_layerscale=use_layerscale,
                                    layerscale_value=layerscale_value,
                                    focal_l_clips=focal_l_clips,
                                    focal_kernel_clips=focal_kernel_clips)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(
                img_size=input_resolution, patch_size=2, in_chans=dim, embed_dim=2*dim,
                use_conv_embed=use_conv_embed, norm_layer=norm_layer, use_pre_norm=use_pre_norm,
                is_stem=False
            )
        else:
            self.downsample = None

    def forward(self, x, batch_size=None, num_clips=None, reg_tokens=None):
        B, D, C, H, W = x.shape
        x = rearrange(x, 'b d c h w -> b d h w c')
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        if self.downsample is not None:
            x = x.view(x.shape[0], self.input_resolution[0], self.input_resolution[1], -1).permute(0, 3, 1, 2).contiguous()
            x = self.downsample(x)
        x = rearrange(x, 'b d h w c -> b d c h w')
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops
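A minimal construction sketch for this layer (not part of the diff). It mirrors how stablizer.py below instantiates it as `decoder_focal`; the embedding dimension (256) and depth (2) are placeholder values, since the real ones come from `decoder_params` in the config.

    import torch.nn as nn

    layer = BasicLayer3d3(
        dim=256, input_resolution=(96, 96), depth=2, num_heads=8,
        window_size=7, expand_size=3, expand_layer="all",
        mlp_ratio=4., qkv_bias=True, qk_scale=None,
        drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm,
        pool_method='fc', focal_level=2, focal_window=5,
        downsample=None, use_checkpoint=False,
        focal_l_clips=[7, 4, 2], focal_kernel_clips=[7, 5, 3])
    # forward expects x of shape (B, D, C, H, W) with (H, W) == input_resolution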
models/SpaTrackV2/models/depth_refiner/stablizer.py
ADDED
@@ -0,0 +1,342 @@
import numpy as np
import torch.nn as nn
import torch
# from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
from collections import OrderedDict
# from mmseg.ops import resize
from torch.nn.functional import interpolate as resize
# from builder import HEADS
from models.SpaTrackV2.models.depth_refiner.decode_head import BaseDecodeHead, BaseDecodeHead_clips, BaseDecodeHead_clips_flow
# from mmseg.models.utils import *
import attr
from IPython import embed
from models.SpaTrackV2.models.depth_refiner.stablilization_attention import BasicLayer3d3
import cv2
from models.SpaTrackV2.models.depth_refiner.network import *
import warnings
# from mmcv.utils import Registry, build_from_cfg
from torch import nn
from einops import rearrange
import torch.nn.functional as F
from models.SpaTrackV2.models.blocks import (
    AttnBlock, CrossAttnBlock, Mlp
)


class MLP(nn.Module):
    """
    Linear Embedding
    """
    def __init__(self, input_dim=2048, embed_dim=768):
        super().__init__()
        self.proj = nn.Linear(input_dim, embed_dim)

    def forward(self, x):
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x

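Usage sketch for MLP (illustration only, not part of the file): it flattens a (B, C, H, W) feature map into (B, H*W, embed_dim) tokens.

    import torch

    feats = torch.randn(2, 512, 24, 24)                    # e.g. a backbone feature map
    tokens = MLP(input_dim=512, embed_dim=256)(feats)
    assert tokens.shape == (2, 24 * 24, 256)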
def scatter_multiscale_fast(
    track2d: torch.Tensor,
    trackfeature: torch.Tensor,
    H: int,
    W: int,
    kernel_sizes=[1]
) -> torch.Tensor:
    """
    Scatter sparse track features onto a dense image grid with weighted multi-scale pooling to handle zero-value gaps.

    This function scatters sparse track features into a dense image grid and applies multi-scale average pooling
    while excluding zero-value holes. The weight mask ensures that only valid feature regions contribute to the pooling,
    avoiding dilution by empty pixels.

    Args:
        track2d (torch.Tensor): Float tensor of shape (B, T, N, 2) containing (x, y) pixel coordinates
            for each track point across batches, frames, and points.
        trackfeature (torch.Tensor): Float tensor of shape (B, T, N, C) with C-dimensional features
            for each track point.
        H (int): Height of the target output image.
        W (int): Width of the target output image.
        kernel_sizes (List[int]): List of odd integers for average pooling kernel sizes. Default: [1].

    Returns:
        torch.Tensor: Multi-scale fused feature map of shape (B, T, C, H, W) with hole-resistant pooling.
    """
    B, T, N, C = trackfeature.shape
    device = trackfeature.device

    # 1. Flatten coordinates and filter valid points within image bounds
    coords_flat = track2d.round().long().reshape(-1, 2)  # (B*T*N, 2)
    x = coords_flat[:, 0]  # x coordinates
    y = coords_flat[:, 1]  # y coordinates
    feat_flat = trackfeature.reshape(-1, C)  # Flatten features

    valid_mask = (x >= 0) & (x < W) & (y >= 0) & (y < H)
    x = x[valid_mask]
    y = y[valid_mask]
    feat_flat = feat_flat[valid_mask]
    valid_count = x.shape[0]

    if valid_count == 0:
        return torch.zeros(B, T, C, H, W, device=device)  # Handle no-valid-point case

    # 2. Calculate linear indices and batch-frame indices for scattering
    lin_idx = y * W + x  # Linear index within a single frame (H*W range)

    # Generate batch-frame indices (0 ~ B*T-1 for each frame in the batch)
    bt_idx_raw = (
        torch.arange(B * T, device=device)
        .view(B, T, 1)
        .expand(B, T, N)
        .reshape(-1)
    )
    bt_idx = bt_idx_raw[valid_mask]  # Indices for valid points across batch and frames

    # 3. Create accumulation buffers for features and weights
    total_space = B * T * H * W
    img_accum_flat = torch.zeros(total_space, C, device=device)      # Feature accumulator
    weight_accum_flat = torch.zeros(total_space, 1, device=device)   # Weight accumulator (counts)

    # 4. Scatter features and weights into accumulation buffers
    idx_in_accum = bt_idx * (H * W) + lin_idx  # Global index: batch_frame * H*W + pixel_index

    # Add features to corresponding indices (index_add_ is efficient for sparse updates)
    img_accum_flat.index_add_(0, idx_in_accum, feat_flat)
    weight_accum_flat.index_add_(0, idx_in_accum, torch.ones((valid_count, 1), device=device))

    # 5. Normalize features by valid weights, keep zeros for invalid regions
    valid_mask_flat = weight_accum_flat > 0  # Binary mask for valid pixels
    img_accum_flat = img_accum_flat / (weight_accum_flat + 1e-6)  # Avoid division by zero
    img_accum_flat = img_accum_flat * valid_mask_flat.float()     # Mask out invalid regions

    # 6. Reshape to (B, T, C, H, W) for further processing
    img = (
        img_accum_flat.view(B, T, H, W, C)
        .permute(0, 1, 4, 2, 3)
        .contiguous()
    )  # Shape: (B, T, C, H, W)

    # 7. Multi-scale pooling with weight masking to exclude zero holes
    blurred_outputs = []
    for k in kernel_sizes:
        pad = k // 2
        img_bt = img.view(B*T, C, H, W)  # Flatten batch and time for pooling

        # Create weight mask for valid regions (1 where features exist, 0 otherwise)
        weight_mask = (
            weight_accum_flat.view(B, T, 1, H, W) > 0
        ).float().view(B*T, 1, H, W)  # Shape: (B*T, 1, H, W)

        # Calculate number of valid neighbors in each pooling window
        weight_sum = F.conv2d(
            weight_mask,
            torch.ones((1, 1, k, k), device=device),
            stride=1,
            padding=pad
        )  # Shape: (B*T, 1, H, W)

        # Sum features only in valid regions
        feat_sum = F.conv2d(
            img_bt * weight_mask,  # Mask out invalid regions before summing
            torch.ones((1, 1, k, k), device=device).expand(C, 1, k, k),
            stride=1,
            padding=pad,
            groups=C
        )  # Shape: (B*T, C, H, W)

        # Compute average only over valid neighbors
        feat_avg = feat_sum / (weight_sum + 1e-6)
        blurred_outputs.append(feat_avg)

    # 8. Fuse multi-scale results by averaging across kernel sizes
    fused = torch.stack(blurred_outputs).mean(dim=0)  # Average over kernel sizes
    return fused.view(B, T, C, H, W)  # Restore original shape

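Usage sketch (illustration only): scatter a few track points per frame onto a 64x64 grid and pool at three scales, as the forward pass below does with kernel_sizes=[1, 3, 5].

    import torch

    B, T, N, C = 1, 4, 8, 16
    uv = torch.rand(B, T, N, 2) * 64          # (x, y) pixel coordinates
    feat = torch.randn(B, T, N, C)            # per-point features
    dense = scatter_multiscale_fast(uv, feat, H=64, W=64, kernel_sizes=[1, 3, 5])
    assert dense.shape == (B, T, C, 64, 64)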
#@HEADS.register_module()
class Stabilization_Network_Cross_Attention(BaseDecodeHead_clips_flow):

    def __init__(self, feature_strides, **kwargs):
        super(Stabilization_Network_Cross_Attention, self).__init__(input_transform='multiple_select', **kwargs)
        self.training = False
        assert len(feature_strides) == len(self.in_channels)
        assert min(feature_strides) == feature_strides[0]
        self.feature_strides = feature_strides

        c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels

        decoder_params = kwargs['decoder_params']
        embedding_dim = decoder_params['embed_dim']

        self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim)
        self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim)
        self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim)
        self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim)

        self.linear_fuse = nn.Sequential(
            nn.Conv2d(embedding_dim*4, embedding_dim, kernel_size=(1, 1), stride=(1, 1), bias=False),
            nn.ReLU(inplace=True))

        self.proj_track = nn.Conv2d(100, 128, kernel_size=(1, 1), stride=(1, 1), bias=True)

        depths = decoder_params['depths']

        self.reg_tokens = nn.Parameter(torch.zeros(1, 2, embedding_dim))
        self.global_patch = nn.Conv2d(embedding_dim, embedding_dim, kernel_size=(8, 8), stride=(8, 8), bias=True)

        self.att_temporal = nn.ModuleList(
            [
                AttnBlock(embedding_dim, 8,
                          mlp_ratio=4, flash=True, ckpt_fwd=True)
                for _ in range(8)
            ]
        )
        self.att_spatial = nn.ModuleList(
            [
                AttnBlock(embedding_dim, 8,
                          mlp_ratio=4, flash=True, ckpt_fwd=True)
                for _ in range(8)
            ]
        )
        self.scale_shift_head = nn.Sequential(nn.Linear(embedding_dim, embedding_dim), nn.GELU(), nn.Linear(embedding_dim, 4))

        # Initialize reg tokens
        nn.init.trunc_normal_(self.reg_tokens, std=0.02)

        self.decoder_focal = BasicLayer3d3(dim=embedding_dim,
                                           input_resolution=(96, 96),
                                           depth=depths,
                                           num_heads=8,
                                           window_size=7,
                                           mlp_ratio=4.,
                                           qkv_bias=True,
                                           qk_scale=None,
                                           drop=0.,
                                           attn_drop=0.,
                                           drop_path=0.,
                                           norm_layer=nn.LayerNorm,
                                           pool_method='fc',
                                           downsample=None,
                                           focal_level=2,
                                           focal_window=5,
                                           expand_size=3,
                                           expand_layer="all",
                                           use_conv_embed=False,
                                           use_shift=False,
                                           use_pre_norm=False,
                                           use_checkpoint=False,
                                           use_layerscale=False,
                                           layerscale_value=1e-4,
                                           focal_l_clips=[7, 4, 2],
                                           focal_kernel_clips=[7, 5, 3])

        self.ffm2 = FFM(inchannels=256, midchannels=256, outchannels=128)
        self.ffm1 = FFM(inchannels=128, midchannels=128, outchannels=64)
        self.ffm0 = FFM(inchannels=64, midchannels=64, outchannels=32, upfactor=1)
        self.AO = AO(32, outchannels=3, upfactor=1)
        self._c2 = None
        self._c_further = None

    def buffer_forward(self, inputs, num_clips=None, imgs=None):  # ,infermode=1):

        # input: B T 7 H W (7 means 3 rgb + 3 pointmap + 1 uncertainty), normalized
        if self.training:
            assert self.num_clips == num_clips

        x = self._transform_inputs(inputs)  # len=4, 1/4, 1/8, 1/16, 1/32
        c1, c2, c3, c4 = x

        ############## MLP decoder on C1-C4 ###########
        n, _, h, w = c4.shape
        batch_size = n // num_clips

        _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
        _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)

        _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
        _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False)

        _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3])
        _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False)

        _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3])
        _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1))

        _, _, h, w = _c.shape
        _c_further = _c.reshape(batch_size, num_clips, -1, h, w)  # h2w2

        # Expand reg_tokens to match batch size
        reg_tokens = self.reg_tokens.expand(batch_size*num_clips, -1, -1)  # [B, 2, C]

        _c2 = self.decoder_focal(_c_further, batch_size=batch_size, num_clips=num_clips, reg_tokens=reg_tokens)

        assert _c_further.shape == _c2.shape
        self._c2 = _c2
        self._c_further = _c_further

        # compute the scale and shift of the global patch
        global_patch = self.global_patch(_c2.view(batch_size*num_clips, -1, h, w)).view(batch_size*num_clips, _c2.shape[2], -1).permute(0, 2, 1)
        global_patch = torch.cat([global_patch, reg_tokens], dim=1)
        for i in range(8):
            global_patch = self.att_temporal[i](global_patch)
            global_patch = rearrange(global_patch, '(b t) n c -> (b n) t c', b=batch_size, t=num_clips, c=_c2.shape[2])
            global_patch = self.att_spatial[i](global_patch)
            global_patch = rearrange(global_patch, '(b n) t c -> (b t) n c', b=batch_size, t=num_clips, c=_c2.shape[2])

        reg_tokens = global_patch[:, -2:, :]
        s_ = self.scale_shift_head(reg_tokens)
        scale = 1 + s_[:, 0, :1].view(batch_size, num_clips, 1, 1, 1)
        shift = s_[:, 1, 1:].view(batch_size, num_clips, 3, 1, 1)
        shift[:, :, :2, ...] = 0
        return scale, shift

    def forward(self, inputs, edge_feat, edge_feat1, tracks, tracks_uvd, num_clips=None, imgs=None, vis_track=None):  # ,infermode=1):

        if self._c2 is None:
            scale, shift = self.buffer_forward(inputs, num_clips, imgs)

        B, T, N, _ = tracks.shape

        _c2 = self._c2
        _c_further = self._c_further

        # skip and head
        _c_further = rearrange(_c_further, 'b t c h w -> (b t) c h w', b=B, t=T)
        _c2 = rearrange(_c2, 'b t c h w -> (b t) c h w', b=B, t=T)

        outframe = self.ffm2(_c_further, _c2)

        tracks_uv = tracks_uvd[..., :2].clone()
        track_feature = scatter_multiscale_fast(tracks_uv/2, tracks, outframe.shape[-2], outframe.shape[-1], kernel_sizes=[1, 3, 5])
        # visualize track_feature as video
        # import cv2
        # import imageio
        # import os
        # BT, C, H, W = outframe.shape
        # track_feature_vis = track_feature.view(B, T, 3, H, W).float().detach().cpu().numpy()
        # track_feature_vis = track_feature_vis.transpose(0,1,3,4,2)
        # track_feature_vis = (track_feature_vis - track_feature_vis.min()) / (track_feature_vis.max() - track_feature_vis.min() + 1e-6)
        # track_feature_vis = (track_feature_vis * 255).astype(np.uint8)
        # imgs = (imgs.detach() + 1) * 127.5
        # vis_track.visualize(video=imgs, tracks=tracks_uv, filename="test")
        # for b in range(B):
        #     frames = []
        #     for t in range(T):
        #         frame = track_feature_vis[b,t]
        #         frame = cv2.applyColorMap(frame[...,0], cv2.COLORMAP_JET)
        #         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        #         frames.append(frame)
        #     # Save as gif
        #     imageio.mimsave(f'track_feature_b{b}.gif', frames, duration=0.1)
        # import pdb; pdb.set_trace()
        track_feature = rearrange(track_feature, 'b t c h w -> (b t) c h w')
        track_feature = self.proj_track(track_feature)
        outframe = self.ffm1(edge_feat1 + track_feature, outframe)
        outframe = self.ffm0(edge_feat, outframe)
        outframe = self.AO(outframe)

        return outframe

    def reset_success(self):
        self._c2 = None
        self._c_further = None
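Shape sketch (illustration only, not part of the file) for the alternating attention in buffer_forward: tokens start grouped per frame as (b*t, n, c); the first rearrange regroups them per token as (b*n, t, c) so attention mixes information across time, and the second restores the per-frame grouping. The sizes below are placeholders (144 patch tokens from an 8x8-stride conv over a 96x96 map, plus 2 reg tokens).

    import torch
    from einops import rearrange

    b, t, n, c = 2, 8, 146, 256
    tokens = torch.randn(b * t, n, c)
    across_time = rearrange(tokens, '(b t) n c -> (b n) t c', b=b, t=t)
    assert across_time.shape == (b * n, t, c)
    back = rearrange(across_time, '(b n) t c -> (b t) n c', b=b, t=t)
    assert back.shape == (b * t, n, c)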
models/SpaTrackV2/models/predictor.py
ADDED
@@ -0,0 +1,153 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn.functional as F

from tqdm import tqdm
from models.SpaTrackV2.models.SpaTrack import SpaTrack2
from typing import Literal
import numpy as np
from pathlib import Path
from typing import Union, Optional
import cv2
import os
import decord


class Predictor(torch.nn.Module):
    def __init__(self, args=None):
        super().__init__()
        self.args = args
        self.spatrack = SpaTrack2(loggers=[None, None, None], **args)
        self.S_wind = 200
        self.overlap = 8

    def to(self, device: Union[str, torch.device]):
        self.spatrack.to(device)
        self.spatrack.base_model.to(device)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, Path],
        *,
        force_download: bool = False,
        cache_dir: Optional[str] = None,
        device: Optional[Union[str, torch.device]] = None,
        model_cfg: Optional[dict] = None,
        **kwargs,
    ) -> "Predictor":
        """
        Load a pretrained model from a local file or a remote repository.

        Args:
            pretrained_model_name_or_path (str or Path):
                - Path to a local model file (e.g., `./model.pth`).
                - HuggingFace Hub model ID (e.g., `username/model-name`).
            force_download (bool, optional):
                Whether to force re-download even if cached. Default: False.
            cache_dir (str, optional):
                Custom cache directory. Default: None (use default cache).
            device (str or torch.device, optional):
                Target device (e.g., "cuda", "cpu"). Default: None (keep original).
            **kwargs:
                Additional config overrides.

        Returns:
            Predictor: predictor wrapping the loaded SpaTrack2 weights.
        """
        # (1) check whether the path is local or remote
        if isinstance(pretrained_model_name_or_path, Path):
            model_path = str(pretrained_model_name_or_path)
        else:
            model_path = pretrained_model_name_or_path
        # (2) if the path is remote, download it
        if not os.path.exists(model_path):
            raise NotImplementedError("Remote download not implemented yet. Use a local path.")
        # (3) load the model weights
        state_dict = torch.load(model_path, map_location="cpu")
        # (4) initialize the model (can load config.json if it exists)
        config_path = os.path.join(os.path.dirname(model_path), "config.json")
        config = {}
        if os.path.exists(config_path):
            import json
            with open(config_path, "r") as f:
                config.update(json.load(f))
        config.update(kwargs)  # allow overriding the config
        if model_cfg is not None:
            config = model_cfg
        model = cls(config)
        if "model" in state_dict:
            model.spatrack.load_state_dict(state_dict["model"], strict=False)
        else:
            model.spatrack.load_state_dict(state_dict, strict=False)
        # (5) device management
        if device is not None:
            model.to(device)

        return model

    def forward(self, video: str|torch.Tensor|np.ndarray,
                depth: str|torch.Tensor|np.ndarray=None,
                unc_metric: str|torch.Tensor|np.ndarray=None,
                intrs: str|torch.Tensor|np.ndarray=None,
                extrs: str|torch.Tensor|np.ndarray=None,
                queries=None, queries_3d=None, iters_track=4,
                full_point=False, fps=30, track2d_gt=None,
                fixed_cam=False, query_no_BA=False, stage=0,
                support_frame=0, replace_ratio=0.6):
        """
        video: either a path to a video, a tensor of shape (T, C, H, W), or a numpy array of shape (T, C, H, W)
        queries: (B, N, 2)
        """

        if isinstance(video, str):
            video = decord.VideoReader(video)
            video = video[::fps].asnumpy()  # Convert to numpy array
            video = np.array(video)  # Ensure numpy array
            video = torch.from_numpy(video).permute(0, 3, 1, 2).float()
        elif isinstance(video, np.ndarray):
            video = torch.from_numpy(video).float()

        if isinstance(depth, np.ndarray):
            depth = torch.from_numpy(depth).float()
        if isinstance(intrs, np.ndarray):
            intrs = torch.from_numpy(intrs).float()
        if isinstance(extrs, np.ndarray):
            extrs = torch.from_numpy(extrs).float()
        if isinstance(unc_metric, np.ndarray):
            unc_metric = torch.from_numpy(unc_metric).float()

        T_, C, H, W = video.shape
        step_slide = self.S_wind - self.overlap
        if T_ > self.S_wind:
            # pad the sequence so it splits evenly into sliding windows
            num_windows = (T_ - self.S_wind + step_slide) // step_slide
            T = num_windows * step_slide + self.S_wind
            pad_len = T - T_

            video = torch.cat([video, video[-1:].repeat(T-video.shape[0], 1, 1, 1)], dim=0)
            if depth is not None:
                depth = torch.cat([depth, depth[-1:].repeat(T-depth.shape[0], 1, 1)], dim=0)
            if intrs is not None:
                intrs = torch.cat([intrs, intrs[-1:].repeat(T-intrs.shape[0], 1, 1)], dim=0)
            if extrs is not None:
                extrs = torch.cat([extrs, extrs[-1:].repeat(T-extrs.shape[0], 1, 1)], dim=0)
            if unc_metric is not None:
                unc_metric = torch.cat([unc_metric, unc_metric[-1:].repeat(T-unc_metric.shape[0], 1)], dim=0)
        with torch.no_grad():
            ret = self.spatrack.forward_stream(video, queries, T_org=T_,
                      depth=depth, intrs=intrs, unc_metric_in=unc_metric, extrs=extrs, queries_3d=queries_3d,
                      window_len=self.S_wind, overlap_len=self.overlap, track2d_gt=track2d_gt, full_point=full_point, iters_track=iters_track,
                      fixed_cam=fixed_cam, query_no_BA=query_no_BA, stage=stage, support_frame=support_frame, replace_ratio=replace_ratio) + (video[:T_],)

        return ret
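A minimal usage sketch (not part of the file). The checkpoint path is a placeholder, and `cfg` stands for the SpaTrack2 constructor arguments (e.g. loaded from the repository's config/magic_infer_moge.yaml); both are assumptions for illustration.

    import numpy as np

    model = Predictor.from_pretrained("./checkpoints/spatrack2.pth", model_cfg=cfg, device="cuda")
    video = np.zeros((48, 3, 384, 512), dtype=np.float32)   # (T, C, H, W)
    ret = model(video)   # forward_stream outputs plus the (un-padded) video tensor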
models/SpaTrackV2/models/tracker3D/TrackRefiner.py
ADDED
@@ -0,0 +1,1478 @@
import os, sys
import torch
import torch.amp
from models.SpaTrackV2.models.tracker3D.co_tracker.cotracker_base import CoTrackerThreeOffline, get_1d_sincos_pos_embed_from_grid
import torch.nn.functional as F
from models.SpaTrackV2.utils.visualizer import Visualizer
from models.SpaTrackV2.utils.model_utils import sample_features5d
from models.SpaTrackV2.models.blocks import bilinear_sampler
import torch.nn as nn
from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
    EfficientUpdateFormer, AttnBlock, Attention, CrossAttnBlock,
    sequence_BCE_loss, sequence_loss, sequence_prob_loss, sequence_dyn_prob_loss, sequence_loss_xyz, balanced_binary_cross_entropy
)
from torchvision.io import write_video
import math
from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
    Mlp, BasicEncoder, EfficientUpdateFormer, GeometryEncoder, NeighborTransformer, CorrPointformer
)
from models.SpaTrackV2.utils.embeddings import get_3d_sincos_pos_embed_from_grid
from einops import rearrange, repeat
from models.SpaTrackV2.models.tracker3D.spatrack_modules.utils import (
    EfficientUpdateFormer3D, weighted_procrustes_torch, posenc, key_fr_wprocrustes, get_topo_mask,
    TrackFusion, get_nth_visible_time_index
)
from models.SpaTrackV2.models.tracker3D.spatrack_modules.ba import extract_static_from_3DTracks, ba_pycolmap
from models.SpaTrackV2.models.tracker3D.spatrack_modules.pointmap_updator import PointMapUpdator
from models.SpaTrackV2.models.depth_refiner.depth_refiner import TrackStablizer
from models.SpaTrackV2.models.tracker3D.spatrack_modules.alignment import affine_invariant_global_loss
from models.SpaTrackV2.models.tracker3D.delta_utils.upsample_transformer import UpsampleTransformerAlibi


class TrackRefiner3D(CoTrackerThreeOffline):

    def __init__(self, args=None):
        super().__init__(**args.base)

        """
        This is a 3D wrapper around CoTracker. It loads the CoTracker pretrain and
        jointly refines the `camera pose`, `3D tracks`, `video depth`, `visibility` and `conf`.
        """
        self.updateformer3D = EfficientUpdateFormer3D(self.updateformer)
        self.corr_depth_mlp = Mlp(in_features=256, hidden_features=256, out_features=256)
        self.rel_pos_mlp = Mlp(in_features=75, hidden_features=128, out_features=128)
        self.rel_pos_glob_mlp = Mlp(in_features=75, hidden_features=128, out_features=256)
        self.corr_xyz_mlp = Mlp(in_features=256, hidden_features=128, out_features=128)
        self.xyz_mlp = Mlp(in_features=126, hidden_features=128, out_features=84)
        # self.track_feat_mlp = Mlp(in_features=1110, hidden_features=128, out_features=128)
        self.proj_xyz_embed = Mlp(in_features=1210+50, hidden_features=1110, out_features=1110)
        # get the anchor point's embedding, and init the pts refiner
        update_pts = True
        # self.corr_transformer = nn.ModuleList([
        #         CorrPointformer(
        #             dim=128,
        #             num_heads=8,
        #             head_dim=128 // 8,
        #             mlp_ratio=4.0,
        #         )
        #     for _ in range(self.corr_levels)
        # ])
        self.corr_transformer = nn.ModuleList([
            CorrPointformer(
                dim=128,
                num_heads=8,
                head_dim=128 // 8,
                mlp_ratio=4.0,
            )
        ]
        )
        self.fnet = BasicEncoder(input_dim=3,
                                 output_dim=self.latent_dim, stride=self.stride)
        self.corr3d_radius = 3

        if args.stablizer:
            self.scale_shift_tokens = nn.Parameter(torch.randn(1, 2, self.latent_dim, requires_grad=True))
            self.upsample_kernel_size = 5
            self.residual_embedding = nn.Parameter(torch.randn(
                self.latent_dim, self.model_resolution[0]//16,
                self.model_resolution[1]//16, requires_grad=True))
            self.dense_mlp = nn.Conv2d(2*self.latent_dim+63, self.latent_dim, kernel_size=1, stride=1, padding=0)
            self.upsample_factor = 4
            self.upsample_transformer = UpsampleTransformerAlibi(
                kernel_size=self.upsample_kernel_size,  # kernel_size=3, #
                stride=self.stride,
                latent_dim=self.latent_dim,
                num_attn_blocks=2,
                upsample_factor=4,
            )
        else:
            self.update_pointmap = None

        self.mode = args.mode
        if self.mode == "online":
            self.s_wind = args.s_wind
            self.overlap = args.overlap

    def upsample_with_mask(
        self, inp: torch.Tensor, mask: torch.Tensor
    ) -> torch.Tensor:
        """Upsample flow field [H/P, W/P, 2] -> [H, W, 2] using convex combination"""
        H, W = inp.shape[-2:]
        up_inp = F.unfold(
            inp, [self.upsample_kernel_size, self.upsample_kernel_size], padding=(self.upsample_kernel_size - 1) // 2
        )
        up_inp = rearrange(up_inp, "b c (h w) -> b c h w", h=H, w=W)
        up_inp = F.interpolate(up_inp, scale_factor=self.upsample_factor, mode="nearest")
        up_inp = rearrange(
            up_inp, "b (c i j) h w -> b c (i j) h w", i=self.upsample_kernel_size, j=self.upsample_kernel_size
        )

        up_inp = torch.sum(mask * up_inp, dim=2)
        return up_inp

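Shape sketch for upsample_with_mask (illustration only, not part of the file): with kernel k=5 and upsample_factor f=4, the unfolded neighbourhood has shape (B, C, k*k, H*f, W*f), so `mask` should broadcast as (B, 1, k*k, H*f, W*f) and sum to 1 over dim=2 to give a convex combination of coarse neighbours.

    import torch

    B, C, H, W, k, f = 1, 16, 24, 32, 5, 4
    inp = torch.randn(B, C, H, W)
    mask = torch.softmax(torch.randn(B, 1, k * k, H * f, W * f), dim=2)
    # up = refiner.upsample_with_mask(inp, mask)   # -> (B, C, H*f, W*f); `refiner` is hypothetical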
    def track_from_cam(self, queries, c2w_traj, intrs,
                       rgbs=None, visualize=False):
        """
        This function will generate tracks by camera transform

        Args:
            queries: B T N 4
            c2w_traj: B T 4 4
            intrs: B T 3 3
        """
        B, T, N, _ = queries.shape
        query_t = queries[:, 0, :, 0].to(torch.int64)  # B N
        query_c2w = torch.gather(c2w_traj,
                                 dim=1, index=query_t[..., None, None].expand(-1, -1, 4, 4))  # B N 4 4
        query_intr = torch.gather(intrs,
                                  dim=1, index=query_t[..., None, None].expand(-1, -1, 3, 3))  # B N 3 3
        query_pts = queries[:, 0, :, 1:4].clone()  # B N 3
        query_d = queries[:, 0, :, 3:4]            # B N 1
        query_pts[..., 2] = 1

        cam_pts = torch.einsum("bnij,bnj->bni", torch.inverse(query_intr), query_pts)*query_d  # B N 3
        # convert to world
        cam_pts_h = torch.zeros(B, N, 4, device=cam_pts.device)
        cam_pts_h[..., :3] = cam_pts
        cam_pts_h[..., 3] = 1
        world_pts = torch.einsum("bnij,bnj->bni", query_c2w, cam_pts_h)
        # convert to other frames
        cam_other_pts_ = torch.einsum("btnij,btnj->btni",
                                      torch.inverse(c2w_traj[:, :, None].float().repeat(1, 1, N, 1, 1)),
                                      world_pts[:, None].repeat(1, T, 1, 1))
        cam_depth = cam_other_pts_[..., 2:3]
        cam_other_pts = cam_other_pts_[..., :3] / (cam_other_pts_[..., 2:3].abs()+1e-6)
        cam_other_pts = torch.einsum("btnij,btnj->btni", intrs[:, :, None].repeat(1, 1, N, 1, 1), cam_other_pts[..., :3])
        cam_other_pts[..., 2:] = cam_depth

        if visualize:
            viser = Visualizer(save_dir=".", grayscale=True,
                               fps=10, pad_value=50, tracks_leave_trace=0)
            cam_other_pts[..., 0] /= self.factor_x
            cam_other_pts[..., 1] /= self.factor_y
            viser.visualize(video=rgbs, tracks=cam_other_pts[..., :2], filename="test")

        init_xyzs = cam_other_pts

        return init_xyzs, world_pts[..., :3], cam_other_pts_[..., :3]

    def cam_from_track(self, tracks, intrs,
                       dyn_prob=None, metric_unc=None,
                       vis_est=None, only_cam_pts=False,
                       track_feat_concat=None,
                       tracks_xyz=None,
                       query_pts=None,
                       fixed_cam=False,
                       depth_unproj=None,
                       cam_gt=None,
                       init_pose=False,
                       ):
        """
        This function will generate tracks by camera transform

        Args:
            queries: B T N 3
            scale_est: 1 1
            shift_est: 1 1
            intrs: B T 3 3
            dyn_prob: B T N
            metric_unc: B N 1
            query_pts: B T N 3
        """
        if tracks_xyz is not None:
            B, T, N, _ = tracks.shape
            cam_pts = tracks_xyz
            intr_repeat = intrs[:, :, None].repeat(1, 1, N, 1, 1)
        else:
            B, T, N, _ = tracks.shape
            # get the pts in cam coordinate
            tracks_xy = tracks[..., :3].clone().detach()  # B T N 3
            # tracks_z = 1/(tracks[...,2:] * scale_est + shift_est)  # B T N 1
            tracks_z = tracks[..., 2:].detach()  # B T N 1
            tracks_xy[..., 2] = 1
            intr_repeat = intrs[:, :, None].repeat(1, 1, N, 1, 1)
            cam_pts = torch.einsum("bnij,bnj->bni",
                                   torch.inverse(intr_repeat.view(B*T, N, 3, 3)).float(),
                                   tracks_xy.view(B*T, N, 3))*(tracks_z.view(B*T, N, 1).abs())  # B*T N 3
            cam_pts[..., 2] *= torch.sign(tracks_z.view(B*T, N))
            # get the normalized cam pts, and pts refiner
            mask_z = (tracks_z.max(dim=1)[0] < 200).squeeze()
            cam_pts = cam_pts.view(B, T, N, 3)

        if only_cam_pts:
            return cam_pts
        dyn_prob = dyn_prob.mean(dim=1)[..., None]
        # B T N 3 -> local frames coordinates. transformer static points B T N 3 -> B T N 3 static (B T N 3) -> same -> dynamic points @ C2T.inverse()
        # get the cam pose
        vis_est_ = vis_est[:, :, None, :]
        graph_matrix = (vis_est_*vis_est_.permute(0, 2, 1, 3)).detach()
        # find the max connected component
        key_fr_idx = [0]
        weight_final = (metric_unc)  # * vis_est

        with torch.amp.autocast(enabled=False, device_type='cuda'):
            if fixed_cam:
                c2w_traj_init = self.c2w_est_curr
                c2w_traj_glob = c2w_traj_init
                cam_pts_refine = cam_pts
                intrs_refine = intrs
                xy_refine = query_pts[..., 1:3]
                world_tracks_init = torch.einsum("btij,btnj->btni", c2w_traj_init[:, :, :3, :3], cam_pts) + c2w_traj_init[:, :, None, :3, 3]
                world_tracks_refined = world_tracks_init
                # extract the stable static points for refining the camera pose
                intrs_dn = intrs.clone()
                intrs_dn[..., 0, :] *= self.factor_x
                intrs_dn[..., 1, :] *= self.factor_y
                _, query_world_pts, _ = self.track_from_cam(query_pts, c2w_traj_init, intrs_dn)
                world_tracks_static, mask_static, mask_topk, vis_mask_static, tracks2d_static = extract_static_from_3DTracks(world_tracks_init,
                                                                                            dyn_prob, query_world_pts,
                                                                                            vis_est, tracks, img_size=self.image_size,
                                                                                            K=0)
                world_static_refine = world_tracks_static

            else:

                if (not self.training):
                    # if (self.c2w_est_curr==torch.eye(4, device=cam_pts.device).repeat(B, T, 1, 1)).all():
                    campts_update = torch.einsum("btij,btnj->btni", self.c2w_est_curr[..., :3, :3], cam_pts) + self.c2w_est_curr[..., None, :3, 3]
                    # campts_update = cam_pts
                    c2w_traj_init_update = key_fr_wprocrustes(campts_update, graph_matrix,
                                                              (weight_final*(1-dyn_prob)).permute(0, 2, 1), vis_est_.permute(0, 1, 3, 2))
                    c2w_traj_init = c2w_traj_init_update@self.c2w_est_curr
                    # else:
                    #     c2w_traj_init = self.c2w_est_curr  # extract the stable static points for refining the camera pose
                else:
                    # if (self.c2w_est_curr==torch.eye(4, device=cam_pts.device).repeat(B, T, 1, 1)).all():
                    campts_update = torch.einsum("btij,btnj->btni", self.c2w_est_curr[..., :3, :3], cam_pts) + self.c2w_est_curr[..., None, :3, 3]
                    # campts_update = cam_pts
                    c2w_traj_init_update = key_fr_wprocrustes(campts_update, graph_matrix,
                                                              (weight_final*(1-dyn_prob)).permute(0, 2, 1), vis_est_.permute(0, 1, 3, 2))
                    c2w_traj_init = c2w_traj_init_update@self.c2w_est_curr
                    # else:
                    #     c2w_traj_init = self.c2w_est_curr  # extract the stable static points for refining the camera pose

                intrs_dn = intrs.clone()
                intrs_dn[..., 0, :] *= self.factor_x
                intrs_dn[..., 1, :] *= self.factor_y
                _, query_world_pts, _ = self.track_from_cam(query_pts, c2w_traj_init, intrs_dn)
                # refine the world tracks
                world_tracks_init = torch.einsum("btij,btnj->btni", c2w_traj_init[:, :, :3, :3], cam_pts) + c2w_traj_init[:, :, None, :3, 3]
                world_tracks_static, mask_static, mask_topk, vis_mask_static, tracks2d_static = extract_static_from_3DTracks(world_tracks_init,
                                                                                            dyn_prob, query_world_pts,
                                                                                            vis_est, tracks, img_size=self.image_size,
                                                                                            K=150 if self.training else 1500)
                # calculate the efficient ba
                cam_tracks_static = cam_pts[:, :, mask_static.squeeze(), :][:, :, mask_topk.squeeze(), :]
                cam_tracks_static[..., 2] = depth_unproj.view(B, T, N)[:, :, mask_static.squeeze()][:, :, mask_topk.squeeze()]

                c2w_traj_glob, world_static_refine, intrs_refine = ba_pycolmap(world_tracks_static, intrs,
                                                                               c2w_traj_init, vis_mask_static,
                                                                               tracks2d_static, self.image_size,
                                                                               cam_tracks_static=cam_tracks_static,
                                                                               training=self.training, query_pts=query_pts)
                c2w_traj_glob = c2w_traj_glob.view(B, T, 4, 4)
                world_tracks_refined = world_tracks_init

                #NOTE: merge the index of static points and topk points
                # merge_idx = torch.where(mask_static.squeeze()>0)[0][mask_topk.squeeze()]
                # world_tracks_refined[:,:,merge_idx] = world_static_refine

            # test the procrustes
            w2c_traj_glob = torch.inverse(c2w_traj_init.detach())
            cam_pts_refine = torch.einsum("btij,btnj->btni", w2c_traj_glob[:, :, :3, :3], world_tracks_refined) + w2c_traj_glob[:, :, None, :3, 3]
            # get the xy_refine
            #TODO: refiner
            cam_pts4_proj = cam_pts_refine.clone()
            cam_pts4_proj[..., 2] *= torch.sign(cam_pts4_proj[..., 2:3].view(B*T, N))
            xy_refine = torch.einsum("btnij,btnj->btni", intrs_refine.view(B, T, 1, 3, 3).repeat(1, 1, N, 1, 1), cam_pts4_proj/cam_pts4_proj[..., 2:3].abs())
            xy_refine[..., 2] = cam_pts4_proj[..., 2:3].view(B*T, N)
            # xy_refine = torch.zeros_like(cam_pts_refine)[...,:2]
        return c2w_traj_glob, cam_pts_refine, intrs_refine, xy_refine, world_tracks_init, world_tracks_refined, c2w_traj_init

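Worked example (illustration only) of the pinhole unprojection used by the two methods above: cam_pt = depth * K^-1 @ [u, v, 1]^T, then world_pt = c2w @ [cam_pt, 1]^T. The intrinsics and pixel below are made-up numbers.

    import torch

    K = torch.tensor([[500., 0., 320.],
                      [0., 500., 240.],
                      [0., 0., 1.]])
    uv1 = torch.tensor([420., 340., 1.])          # pixel (u, v) in homogeneous form
    depth = 2.0
    cam_pt = torch.linalg.inv(K) @ uv1 * depth    # -> tensor([0.4, 0.4, 2.0])
    c2w = torch.eye(4)
    world_pt = (c2w @ torch.cat([cam_pt, torch.ones(1)]))[:3]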
    def extract_img_feat(self, video, fmaps_chunk_size=200):
        B, T, C, H, W = video.shape
        dtype = video.dtype
        H4, W4 = H // self.stride, W // self.stride
        # Compute convolutional features for the video or for the current chunk in case of online mode
        if T > fmaps_chunk_size:
            fmaps = []
            for t in range(0, T, fmaps_chunk_size):
                video_chunk = video[:, t : t + fmaps_chunk_size]
                fmaps_chunk = self.fnet(video_chunk.reshape(-1, C, H, W))
                T_chunk = video_chunk.shape[1]
                C_chunk, H_chunk, W_chunk = fmaps_chunk.shape[1:]
                fmaps.append(fmaps_chunk.reshape(B, T_chunk, C_chunk, H_chunk, W_chunk))
            fmaps = torch.cat(fmaps, dim=1).reshape(-1, C_chunk, H_chunk, W_chunk)
        else:
            fmaps = self.fnet(video.reshape(-1, C, H, W))
        fmaps = fmaps.permute(0, 2, 3, 1)
        # L2-normalize the feature vectors
        fmaps = fmaps / torch.sqrt(
            torch.maximum(
                torch.sum(torch.square(fmaps), axis=-1, keepdims=True),
                torch.tensor(1e-12, device=fmaps.device),
            )
        )
        fmaps = fmaps.permute(0, 3, 1, 2).reshape(
            B, -1, self.latent_dim, H // self.stride, W // self.stride
        )
        fmaps = fmaps.to(dtype)

        return fmaps

    def norm_xyz(self, xyz):
        """
        xyz can be (B T N 3) or (B T 3 H W) or (B N 3)
        """
        if xyz.ndim == 3:
            min_pts = self.min_pts
            max_pts = self.max_pts
            return (xyz - min_pts[None, None, :]) / (max_pts - min_pts)[None, None, :] * 2 - 1
        elif xyz.ndim == 4:
            min_pts = self.min_pts
            max_pts = self.max_pts
            return (xyz - min_pts[None, None, None, :]) / (max_pts - min_pts)[None, None, None, :] * 2 - 1
        elif xyz.ndim == 5:
            if xyz.shape[2] == 3:
                min_pts = self.min_pts
                max_pts = self.max_pts
                return (xyz - min_pts[None, None, :, None, None]) / (max_pts - min_pts)[None, None, :, None, None] * 2 - 1
            elif xyz.shape[-1] == 3:
                min_pts = self.min_pts
                max_pts = self.max_pts
                return (xyz - min_pts[None, None, None, None, :]) / (max_pts - min_pts)[None, None, None, None, :] * 2 - 1

    def denorm_xyz(self, xyz):
        """
        xyz can be (B T N 3) or (B T 3 H W) or (B N 3)
        """
        if xyz.ndim == 3:
            min_pts = self.min_pts
            max_pts = self.max_pts
            return (xyz + 1) / 2 * (max_pts - min_pts)[None, None, :] + min_pts[None, None, :]
        elif xyz.ndim == 4:
            min_pts = self.min_pts
            max_pts = self.max_pts
            return (xyz + 1) / 2 * (max_pts - min_pts)[None, None, None, :] + min_pts[None, None, None, :]
        elif xyz.ndim == 5:
            if xyz.shape[2] == 3:
                min_pts = self.min_pts
                max_pts = self.max_pts
                return (xyz + 1) / 2 * (max_pts - min_pts)[None, None, :, None, None] + min_pts[None, None, :, None, None]
            elif xyz.shape[-1] == 3:
                min_pts = self.min_pts
                max_pts = self.max_pts
                return (xyz + 1) / 2 * (max_pts - min_pts)[None, None, None, None, :] + min_pts[None, None, None, None, :]

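Round-trip sketch for the normalization above (illustration only, with made-up bounds): points are mapped affinely into [-1, 1] per axis using min_pts/max_pts, and denorm_xyz inverts the mapping exactly.

    import torch

    min_pts = torch.tensor([-2.0, -2.0, 0.0])
    max_pts = torch.tensor([2.0, 2.0, 10.0])
    xyz = torch.tensor([[[0.0, 1.0, 5.0]]])                       # (B=1, N=1, 3)
    norm = (xyz - min_pts) / (max_pts - min_pts) * 2 - 1          # -> [0.0, 0.5, 0.0]
    back = (norm + 1) / 2 * (max_pts - min_pts) + min_pts
    assert torch.allclose(back, xyz)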
| 367 |
+
def forward(
|
| 368 |
+
self,
|
| 369 |
+
video,
|
| 370 |
+
metric_depth,
|
| 371 |
+
metric_unc,
|
| 372 |
+
point_map,
|
| 373 |
+
queries,
|
| 374 |
+
pts_q_3d=None,
|
| 375 |
+
overlap_d=None,
|
| 376 |
+
iters=4,
|
| 377 |
+
add_space_attn=True,
|
| 378 |
+
fmaps_chunk_size=200,
|
| 379 |
+
intrs=None,
|
| 380 |
+
traj3d_gt=None,
|
| 381 |
+
custom_vid=False,
|
| 382 |
+
vis_gt=None,
|
| 383 |
+
prec_fx=None,
|
| 384 |
+
prec_fy=None,
|
| 385 |
+
cam_gt=None,
|
| 386 |
+
init_pose=False,
|
| 387 |
+
support_pts_q=None,
|
| 388 |
+
update_pointmap=True,
|
| 389 |
+
fixed_cam=False,
|
| 390 |
+
query_no_BA=False,
|
| 391 |
+
stage=0,
|
| 392 |
+
cache=None,
|
| 393 |
+
points_map_gt=None,
|
| 394 |
+
valid_only=False,
|
| 395 |
+
replace_ratio=0.6,
|
| 396 |
+
):
|
| 397 |
+
"""Predict tracks
|
| 398 |
+
|
| 399 |
+
Args:
|
| 400 |
+
video (FloatTensor[B, T, 3 H W]): input videos.
|
| 401 |
+
queries (FloatTensor[B, N, 3]): point queries.
|
| 402 |
+
iters (int, optional): number of updates. Defaults to 4.
|
| 403 |
+
vdp_feats_cache: last layer's feature of depth
|
| 404 |
+
tracks_init: B T N 3 the initialization of 3D tracks computed by cam pose
|
| 405 |
+
Returns:
|
| 406 |
+
- coords_predicted (FloatTensor[B, T, N, 2]):
|
| 407 |
+
- vis_predicted (FloatTensor[B, T, N]):
|
| 408 |
+
- train_data: `None` if `is_train` is false, otherwise:
|
| 409 |
+
- all_vis_predictions (List[FloatTensor[B, S, N, 1]]):
|
| 410 |
+
- all_coords_predictions (List[FloatTensor[B, S, N, 2]]):
|
| 411 |
+
- mask (BoolTensor[B, T, N]):
|
| 412 |
+
"""
|
| 413 |
+
        self.stage = stage

        if cam_gt is not None:
            cam_gt = cam_gt.clone()
            cam_gt = torch.inverse(cam_gt[:,:1,...])@cam_gt
        B, T, C, _, _ = video.shape
        _, _, H_, W_ = metric_depth.shape
        _, _, N, __ = queries.shape
        if (vis_gt is not None)&(queries.shape[1] == T):
            aug_visb = True
            if aug_visb:
                number_visible = vis_gt.sum(dim=1)
                ratio_rand = torch.rand(B, N, device=vis_gt.device)
                # first_positive_inds = get_nth_visible_time_index(vis_gt, 1)
                first_positive_inds = get_nth_visible_time_index(vis_gt, (number_visible*ratio_rand).long().clamp(min=1, max=T))

                assert (torch.gather(vis_gt, 1, first_positive_inds[:, None, :].repeat(1, T, 1)) < 0).sum() == 0
            else:
                __, first_positive_inds = torch.max(vis_gt, dim=1)
                first_positive_inds = first_positive_inds.long()
            gather = torch.gather(
                queries, 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 2)
            )
            xys = torch.diagonal(gather, dim1=1, dim2=2).permute(0, 2, 1)
            gather_xyz = torch.gather(
                traj3d_gt, 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 3)
            )
            z_gt_query = torch.diagonal(gather_xyz, dim1=1, dim2=2).permute(0, 2, 1)[...,2]
            queries = torch.cat([first_positive_inds[:, :, None], xys], dim=-1)
            queries = torch.cat([queries, support_pts_q[:,0]], dim=1)
        else:
            # Generate the 768 points randomly in the whole video
            queries = queries.squeeze(1)
            ba_len = queries.shape[1]
            z_gt_query = None
            if support_pts_q is not None:
                queries = torch.cat([queries, support_pts_q[:,0]], dim=1)

        if (abs(prec_fx-1.0) > 1e-4) & (self.training) & (traj3d_gt is not None):
            traj3d_gt[..., 0] /= prec_fx
            traj3d_gt[..., 1] /= prec_fy
            queries[...,1] /= prec_fx
            queries[...,2] /= prec_fy

        video_vis = F.interpolate(video.clone().view(B*T, 3, video.shape[-2], video.shape[-1]), (H_, W_), mode="bilinear", align_corners=False).view(B, T, 3, H_, W_)
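When visibility labels are available, the query frame for each point is drawn from its visible time steps via `get_nth_visible_time_index`; the fallback branch simply takes the first visible frame with `torch.max` over the time axis. A small self-contained illustration of that fallback (the visibility mask below is made up):

```python
import torch

vis_gt = torch.tensor([[[0., 0., 1.],
                        [1., 0., 1.],
                        [1., 1., 1.]]])           # B=1, T=3, N=3 visibility flags
# torch.max over time returns the index of the first maximal value,
# i.e. the first frame in which each point becomes visible
_, first_positive_inds = torch.max(vis_gt, dim=1)
print(first_positive_inds)                         # tensor([[1, 2, 0]])
```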
|
| 459 |
+
self.image_size = torch.tensor([H_, W_])
|
| 460 |
+
# self.model_resolution = (H_, W_)
|
| 461 |
+
# resize the queries and intrs
|
| 462 |
+
self.factor_x = self.model_resolution[1]/W_
|
| 463 |
+
self.factor_y = self.model_resolution[0]/H_
|
| 464 |
+
queries[...,1] *= self.factor_x
|
| 465 |
+
queries[...,2] *= self.factor_y
|
| 466 |
+
intrs_org = intrs.clone()
|
| 467 |
+
intrs[...,0,:] *= self.factor_x
|
| 468 |
+
intrs[...,1,:] *= self.factor_y
|
| 469 |
+
|
| 470 |
+
# get the fmaps and color features
|
| 471 |
+
video = F.interpolate(video.view(B*T, 3, video.shape[-2], video.shape[-1]),
|
| 472 |
+
(self.model_resolution[0], self.model_resolution[1])).view(B, T, 3, self.model_resolution[0], self.model_resolution[1])
|
| 473 |
+
_, _, _, H, W = video.shape
|
| 474 |
+
if cache is not None:
|
| 475 |
+
T_cache = cache["fmaps"].shape[0]
|
| 476 |
+
fmaps = self.extract_img_feat(video[:,T_cache:], fmaps_chunk_size=fmaps_chunk_size)
|
| 477 |
+
fmaps = torch.cat([cache["fmaps"][None], fmaps], dim=1)
|
| 478 |
+
else:
|
| 479 |
+
fmaps = self.extract_img_feat(video, fmaps_chunk_size=fmaps_chunk_size)
|
| 480 |
+
fmaps_org = fmaps.clone()
|
| 481 |
+
|
| 482 |
+
metric_depth = F.interpolate(metric_depth.view(B*T, 1, H_, W_),
|
| 483 |
+
(self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 1, self.model_resolution[0], self.model_resolution[1]).clamp(0.01, 200)
|
| 484 |
+
self.metric_unc_org = metric_unc.clone()
|
| 485 |
+
metric_unc = F.interpolate(metric_unc.view(B*T, 1, H_, W_),
|
| 486 |
+
(self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 1, self.model_resolution[0], self.model_resolution[1])
|
| 487 |
+
if (self.stage == 2) & (self.training):
|
| 488 |
+
scale_rand = (torch.rand(B, T, device=video.device) - 0.5) + 1
|
| 489 |
+
point_map = scale_rand.view(B*T,1,1,1) * point_map
|
| 490 |
+
|
| 491 |
+
point_map_org = point_map.permute(0,3,1,2).view(B*T, 3, H_, W_).clone()
|
| 492 |
+
point_map = F.interpolate(point_map_org.clone(),
|
| 493 |
+
(self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 3, self.model_resolution[0], self.model_resolution[1])
|
| 494 |
+
# align the point map
|
| 495 |
+
point_map_org_train = point_map_org.view(B*T, 3, H_, W_).clone()
|
| 496 |
+
|
| 497 |
+
if (stage == 2):
|
| 498 |
+
# align the point map
|
| 499 |
+
try:
|
| 500 |
+
self.pred_points, scale_gt, shift_gt = affine_invariant_global_loss(
|
| 501 |
+
point_map_org_train.permute(0,2,3,1),
|
| 502 |
+
points_map_gt,
|
| 503 |
+
mask=self.metric_unc_org[:,0]>0.5,
|
| 504 |
+
align_resolution=32,
|
| 505 |
+
only_align=True
|
| 506 |
+
)
|
| 507 |
+
except:
|
| 508 |
+
scale_gt, shift_gt = torch.ones(B*T).to(video.device), torch.zeros(B*T,3).to(video.device)
|
| 509 |
+
self.scale_gt, self.shift_gt = scale_gt, shift_gt
|
| 510 |
+
else:
|
| 511 |
+
scale_est, shift_est = None, None
|
| 512 |
+
|
| 513 |
+
# extract the pts features
|
| 514 |
+
device = queries.device
|
| 515 |
+
assert H % self.stride == 0 and W % self.stride == 0
|
| 516 |
+
|
| 517 |
+
B, N, __ = queries.shape
|
| 518 |
+
queries_z = sample_features5d(metric_depth.view(B, T, 1, H, W),
|
| 519 |
+
queries[:,None], interp_mode="nearest").squeeze(1)
|
| 520 |
+
queries_z_unc = sample_features5d(metric_unc.view(B, T, 1, H, W),
|
| 521 |
+
queries[:,None], interp_mode="nearest").squeeze(1)
|
| 522 |
+
|
| 523 |
+
queries_rgb = sample_features5d(video.view(B, T, C, H, W),
|
| 524 |
+
queries[:,None], interp_mode="nearest").squeeze(1)
|
| 525 |
+
queries_point_map = sample_features5d(point_map.view(B, T, 3, H, W),
|
| 526 |
+
queries[:,None], interp_mode="nearest").squeeze(1)
|
| 527 |
+
if ((queries_z > 100)*(queries_z == 0)).sum() > 0:
|
| 528 |
+
import pdb; pdb.set_trace()
|
| 529 |
+
|
| 530 |
+
if overlap_d is not None:
|
| 531 |
+
queries_z[:,:overlap_d.shape[1],:] = overlap_d[...,None]
|
| 532 |
+
queries_point_map[:,:overlap_d.shape[1],2:] = overlap_d[...,None]
|
| 533 |
+
|
| 534 |
+
if pts_q_3d is not None:
|
| 535 |
+
scale_factor = (pts_q_3d[...,-1].permute(0,2,1) / queries_z[:,:pts_q_3d.shape[2],:]).squeeze().median()
|
| 536 |
+
queries_z[:,:pts_q_3d.shape[2],:] = pts_q_3d[...,-1].permute(0,2,1) / scale_factor
|
| 537 |
+
queries_point_map[:,:pts_q_3d.shape[2],2:] = pts_q_3d[...,-1].permute(0,2,1) / scale_factor
|
| 538 |
+
|
| 539 |
+
# normalize the points
|
| 540 |
+
self.min_pts, self.max_pts = queries_point_map.mean(dim=(0,1)) - 3*queries_point_map.std(dim=(0,1)), queries_point_map.mean(dim=(0,1)) + 3*queries_point_map.std(dim=(0,1))
|
| 541 |
+
queries_point_map = self.norm_xyz(queries_point_map)
|
| 542 |
+
queries_point_map_ = queries_point_map.reshape(B, 1, N, 3).expand(B, T, N, 3).clone()
|
| 543 |
+
point_map = self.norm_xyz(point_map.view(B, T, 3, H, W)).view(B*T, 3, H, W)
|
| 544 |
+
|
| 545 |
+
if z_gt_query is not None:
|
| 546 |
+
queries_z[:,:z_gt_query.shape[1],:] = z_gt_query[:,:,None]
|
| 547 |
+
mask_traj_gt = ((queries_z[:,:z_gt_query.shape[1],:] - z_gt_query[:,:,None])).abs() < 0.1
|
| 548 |
+
else:
|
| 549 |
+
if traj3d_gt is not None:
|
| 550 |
+
mask_traj_gt = torch.ones_like(queries_z[:, :traj3d_gt.shape[2]]).bool()
|
| 551 |
+
else:
|
| 552 |
+
mask_traj_gt = torch.ones_like(queries_z).bool()
|
| 553 |
+
|
| 554 |
+
queries_xyz = torch.cat([queries, queries_z], dim=-1)[:,None].repeat(1, T, 1, 1)
|
| 555 |
+
if cache is not None:
|
| 556 |
+
cache_T, cache_N = cache["track2d_pred_cache"].shape[0], cache["track2d_pred_cache"].shape[1]
|
| 557 |
+
cachexy = cache["track2d_pred_cache"].clone()
|
| 558 |
+
cachexy[...,0] = cachexy[...,0] * self.factor_x
|
| 559 |
+
cachexy[...,1] = cachexy[...,1] * self.factor_y
|
| 560 |
+
# initialize the 2d points with cache
|
| 561 |
+
queries_xyz[:,:cache_T,:cache_N,1:] = cachexy
|
| 562 |
+
queries_xyz[:,cache_T:,:cache_N,1:] = cachexy[-1:]
|
| 563 |
+
# initialize the 3d points with cache
|
| 564 |
+
queries_point_map_[:,:cache_T,:cache_N,:] = self.norm_xyz(cache["track3d_pred_cache"][None])
|
| 565 |
+
queries_point_map_[:,cache_T:,:cache_N,:] = self.norm_xyz(cache["track3d_pred_cache"][-1:][None])
|
| 566 |
+
|
| 567 |
+
if cam_gt is not None:
|
| 568 |
+
q_static_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz, cam_gt,
|
| 569 |
+
intrs, rgbs=video_vis, visualize=False)
|
| 570 |
+
q_static_proj[..., 0] /= self.factor_x
|
| 571 |
+
q_static_proj[..., 1] /= self.factor_y
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
        assert T >= 1  # at least one frame is required; meaningful tracking needs two or more
|
| 575 |
+
video = 2 * (video / 255.0) - 1.0
|
| 576 |
+
dtype = video.dtype
|
| 577 |
+
queried_frames = queries[:, :, 0].long()
|
| 578 |
+
|
| 579 |
+
queried_coords = queries[..., 1:3]
|
| 580 |
+
queried_coords = queried_coords / self.stride
|
| 581 |
+
|
| 582 |
+
# We store our predictions here
|
| 583 |
+
(all_coords_predictions, all_coords_xyz_predictions,all_vis_predictions,
|
| 584 |
+
all_confidence_predictions, all_cam_predictions, all_dynamic_prob_predictions,
|
| 585 |
+
all_cam_pts_predictions, all_world_tracks_predictions, all_world_tracks_refined_predictions,
|
| 586 |
+
all_scale_est, all_shift_est) = (
|
| 587 |
+
[],
|
| 588 |
+
[],
|
| 589 |
+
[],
|
| 590 |
+
[],
|
| 591 |
+
[],
|
| 592 |
+
[],
|
| 593 |
+
[],
|
| 594 |
+
[],
|
| 595 |
+
[],
|
| 596 |
+
[],
|
| 597 |
+
[]
|
| 598 |
+
)
|
| 599 |
+
|
| 600 |
+
# We compute track features
|
| 601 |
+
fmaps_pyramid = []
|
| 602 |
+
point_map_pyramid = []
|
| 603 |
+
track_feat_pyramid = []
|
| 604 |
+
track_feat_support_pyramid = []
|
| 605 |
+
track_feat3d_pyramid = []
|
| 606 |
+
track_feat_support3d_pyramid = []
|
| 607 |
+
track_depth_support_pyramid = []
|
| 608 |
+
track_point_map_pyramid = []
|
| 609 |
+
track_point_map_support_pyramid = []
|
| 610 |
+
fmaps_pyramid.append(fmaps)
|
| 611 |
+
metric_depth = metric_depth
|
| 612 |
+
point_map = point_map
|
| 613 |
+
metric_depth_align = F.interpolate(metric_depth, scale_factor=0.25, mode='nearest')
|
| 614 |
+
point_map_align = F.interpolate(point_map, scale_factor=0.25, mode='nearest')
|
| 615 |
+
point_map_pyramid.append(point_map_align.view(B, T, 3, point_map_align.shape[-2], point_map_align.shape[-1]))
|
| 616 |
+
for i in range(self.corr_levels - 1):
|
| 617 |
+
fmaps_ = fmaps.reshape(
|
| 618 |
+
B * T, self.latent_dim, fmaps.shape[-2], fmaps.shape[-1]
|
| 619 |
+
)
|
| 620 |
+
fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
|
| 621 |
+
fmaps = fmaps_.reshape(
|
| 622 |
+
B, T, self.latent_dim, fmaps_.shape[-2], fmaps_.shape[-1]
|
| 623 |
+
)
|
| 624 |
+
fmaps_pyramid.append(fmaps)
|
| 625 |
+
# downsample the depth
|
| 626 |
+
metric_depth_ = metric_depth_align.reshape(B*T,1,metric_depth_align.shape[-2],metric_depth_align.shape[-1])
|
| 627 |
+
metric_depth_ = F.interpolate(metric_depth_, scale_factor=0.5, mode='nearest')
|
| 628 |
+
metric_depth_align = metric_depth_.reshape(B,T,1,metric_depth_.shape[-2], metric_depth_.shape[-1])
|
| 629 |
+
# downsample the point map
|
| 630 |
+
point_map_ = point_map_align.reshape(B*T,3,point_map_align.shape[-2],point_map_align.shape[-1])
|
| 631 |
+
point_map_ = F.interpolate(point_map_, scale_factor=0.5, mode='nearest')
|
| 632 |
+
point_map_align = point_map_.reshape(B,T,3,point_map_.shape[-2], point_map_.shape[-1])
|
| 633 |
+
point_map_pyramid.append(point_map_align)
|
| 634 |
+
|
| 635 |
+
for i in range(self.corr_levels):
|
| 636 |
+
if cache is not None:
|
| 637 |
+
cache_N = cache["track_feat_pyramid"][i].shape[2]
|
| 638 |
+
track_feat_cached, track_feat_support_cached = cache["track_feat_pyramid"][i], cache["track_feat_support_pyramid"][i]
|
| 639 |
+
track_feat3d_cached, track_feat_support3d_cached = cache["track_feat3d_pyramid"][i], cache["track_feat_support3d_pyramid"][i]
|
| 640 |
+
track_point_map_cached, track_point_map_support_cached = self.norm_xyz(cache["track_point_map_pyramid"][i]), self.norm_xyz(cache["track_point_map_support_pyramid"][i])
|
| 641 |
+
queried_coords_new = queried_coords[:,cache_N:,:] / 2**i
|
| 642 |
+
queried_frames_new = queried_frames[:,cache_N:]
|
| 643 |
+
else:
|
| 644 |
+
queried_coords_new = queried_coords / 2**i
|
| 645 |
+
queried_frames_new = queried_frames
|
| 646 |
+
track_feat, track_feat_support = self.get_track_feat(
|
| 647 |
+
fmaps_pyramid[i],
|
| 648 |
+
queried_frames_new,
|
| 649 |
+
queried_coords_new,
|
| 650 |
+
support_radius=self.corr_radius,
|
| 651 |
+
)
|
| 652 |
+
# get 3d track feat
|
| 653 |
+
track_point_map, track_point_map_support = self.get_track_feat(
|
| 654 |
+
point_map_pyramid[i],
|
| 655 |
+
queried_frames_new,
|
| 656 |
+
queried_coords_new,
|
| 657 |
+
support_radius=self.corr3d_radius,
|
| 658 |
+
)
|
| 659 |
+
track_feat3d, track_feat_support3d = self.get_track_feat(
|
| 660 |
+
fmaps_pyramid[i],
|
| 661 |
+
queried_frames_new,
|
| 662 |
+
queried_coords_new,
|
| 663 |
+
support_radius=self.corr3d_radius,
|
| 664 |
+
)
|
| 665 |
+
if cache is not None:
|
| 666 |
+
track_feat = torch.cat([track_feat_cached, track_feat], dim=2)
|
| 667 |
+
track_point_map = torch.cat([track_point_map_cached, track_point_map], dim=2)
|
| 668 |
+
track_feat_support = torch.cat([track_feat_support_cached[:,0], track_feat_support], dim=2)
|
| 669 |
+
track_point_map_support = torch.cat([track_point_map_support_cached[:,0], track_point_map_support], dim=2)
|
| 670 |
+
track_feat3d = torch.cat([track_feat3d_cached, track_feat3d], dim=2)
|
| 671 |
+
track_feat_support3d = torch.cat([track_feat_support3d_cached[:,0], track_feat_support3d], dim=2)
|
| 672 |
+
track_feat_pyramid.append(track_feat.repeat(1, T, 1, 1))
|
| 673 |
+
track_feat_support_pyramid.append(track_feat_support.unsqueeze(1))
|
| 674 |
+
track_feat3d_pyramid.append(track_feat3d.repeat(1, T, 1, 1))
|
| 675 |
+
track_feat_support3d_pyramid.append(track_feat_support3d.unsqueeze(1))
|
| 676 |
+
track_point_map_pyramid.append(track_point_map.repeat(1, T, 1, 1))
|
| 677 |
+
track_point_map_support_pyramid.append(track_point_map_support.unsqueeze(1))
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
D_coords = 2
|
| 681 |
+
(coord_preds, coords_xyz_preds, vis_preds, confidence_preds,
|
| 682 |
+
dynamic_prob_preds, cam_preds, pts3d_cam_pred, world_tracks_pred,
|
| 683 |
+
world_tracks_refined_pred, point_map_preds, scale_ests, shift_ests) = (
|
| 684 |
+
[], [], [], [], [], [], [], [], [], [], [], []
|
| 685 |
+
)
|
| 686 |
+
|
| 687 |
+
c2w_ests = []
|
| 688 |
+
vis = torch.zeros((B, T, N), device=device).float()
|
| 689 |
+
confidence = torch.zeros((B, T, N), device=device).float()
|
| 690 |
+
dynamic_prob = torch.zeros((B, T, N), device=device).float()
|
| 691 |
+
pro_analysis_w = torch.zeros((B, T, N), device=device).float()
|
| 692 |
+
|
| 693 |
+
coords = queries_xyz[...,1:].clone()
|
| 694 |
+
coords[...,:2] /= self.stride
|
| 695 |
+
# coords[...,:2] = queried_coords.reshape(B, 1, N, 2).expand(B, T, N, 2).float()[...,:2]
|
| 696 |
+
# initialize the 3d points
|
| 697 |
+
coords_xyz = queries_point_map_.clone()
|
| 698 |
+
|
| 699 |
+
# if cache is not None:
|
| 700 |
+
# viser = Visualizer(save_dir=".", grayscale=True,
|
| 701 |
+
# fps=10, pad_value=50, tracks_leave_trace=0)
|
| 702 |
+
# coords_clone = coords.clone()
|
| 703 |
+
# coords_clone[...,:2] *= self.stride
|
| 704 |
+
# coords_clone[..., 0] /= self.factor_x
|
| 705 |
+
# coords_clone[..., 1] /= self.factor_y
|
| 706 |
+
# viser.visualize(video=video_vis, tracks=coords_clone[..., :2], filename="test")
|
| 707 |
+
# import pdb; pdb.set_trace()
|
| 708 |
+
|
| 709 |
+
if init_pose:
|
| 710 |
+
q_init_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz, cam_gt,
|
| 711 |
+
intrs, rgbs=video_vis, visualize=False)
|
| 712 |
+
q_init_proj[..., 0] /= self.stride
|
| 713 |
+
q_init_proj[..., 1] /= self.stride
|
| 714 |
+
|
| 715 |
+
r = 2 * self.corr_radius + 1
|
| 716 |
+
r_depth = 2 * self.corr3d_radius + 1
|
| 717 |
+
anchor_loss = 0
|
| 718 |
+
# two current states
|
| 719 |
+
self.c2w_est_curr = torch.eye(4, device=device).repeat(B, T , 1, 1)
|
| 720 |
+
coords_proj_curr = coords.view(B * T, N, 3)[...,:2]
|
| 721 |
+
if init_pose:
|
| 722 |
+
self.c2w_est_curr = cam_gt.to(coords_proj_curr.device).to(coords_proj_curr.dtype)
|
| 723 |
+
sync_loss = 0
|
| 724 |
+
if stage == 2:
|
| 725 |
+
extra_sparse_tokens = self.scale_shift_tokens[:,:,None,:].repeat(B, 1, T, 1)
|
| 726 |
+
extra_dense_tokens = self.residual_embedding[None,None].repeat(B, T, 1, 1, 1)
|
| 727 |
+
xyz_pos_enc = posenc(point_map_pyramid[-2].permute(0,1,3,4,2), min_deg=0, max_deg=10).permute(0,1,4,2,3)
|
| 728 |
+
extra_dense_tokens = torch.cat([xyz_pos_enc, extra_dense_tokens, fmaps_pyramid[-2]], dim=2)
|
| 729 |
+
extra_dense_tokens = rearrange(extra_dense_tokens, 'b t c h w -> (b t) c h w')
|
| 730 |
+
extra_dense_tokens = self.dense_mlp(extra_dense_tokens)
|
| 731 |
+
extra_dense_tokens = rearrange(extra_dense_tokens, '(b t) c h w -> b t c h w', b=B, t=T)
|
| 732 |
+
else:
|
| 733 |
+
extra_sparse_tokens = None
|
| 734 |
+
extra_dense_tokens = None
|
| 735 |
+
|
| 736 |
+
scale_est, shift_est = torch.ones(B, T, 1, 1, device=device), torch.zeros(B, T, 1, 3, device=device)
|
| 737 |
+
residual_point = torch.zeros(B, T, 3, self.model_resolution[0]//self.stride,
|
| 738 |
+
self.model_resolution[1]//self.stride, device=device)
|
| 739 |
+
|
| 740 |
+
for it in range(iters):
|
| 741 |
+
# query points scale and shift
|
| 742 |
+
scale_est_query = torch.gather(scale_est, dim=1, index=queries[:,:,None,:1].long())
|
| 743 |
+
shift_est_query = torch.gather(shift_est, dim=1, index=queries[:,:,None,:1].long().repeat(1, 1, 1, 3))
|
| 744 |
+
|
| 745 |
+
coords = coords.detach() # B T N 3
|
| 746 |
+
coords_xyz = coords_xyz.detach()
|
| 747 |
+
vis = vis.detach()
|
| 748 |
+
confidence = confidence.detach()
|
| 749 |
+
dynamic_prob = dynamic_prob.detach()
|
| 750 |
+
pro_analysis_w = pro_analysis_w.detach()
|
| 751 |
+
coords_init = coords.view(B * T, N, 3)
|
| 752 |
+
coords_xyz_init = coords_xyz.view(B * T, N, 3)
|
| 753 |
+
corr_embs = []
|
| 754 |
+
corr_depth_embs = []
|
| 755 |
+
corr_feats = []
|
| 756 |
+
for i in range(self.corr_levels):
|
| 757 |
+
# K_level = int(32*0.8**(i))
|
| 758 |
+
K_level = 16
|
| 759 |
+
corr_feat = self.get_correlation_feat(
|
| 760 |
+
fmaps_pyramid[i], coords_init[...,:2] / 2**i
|
| 761 |
+
)
|
| 762 |
+
#NOTE: update the point map
|
| 763 |
+
residual_point_i = F.interpolate(residual_point.view(B*T,3,residual_point.shape[-2],residual_point.shape[-1]),
|
| 764 |
+
size=(point_map_pyramid[i].shape[-2], point_map_pyramid[i].shape[-1]), mode='nearest')
|
| 765 |
+
point_map_pyramid_i = (self.denorm_xyz(point_map_pyramid[i]) * scale_est[...,None]
|
| 766 |
+
+ shift_est.permute(0,1,3,2)[...,None] + residual_point_i.view(B,T,3,point_map_pyramid[i].shape[-2], point_map_pyramid[i].shape[-1])).clone().detach()
|
| 767 |
+
|
| 768 |
+
corr_point_map = self.get_correlation_feat(
|
| 769 |
+
self.norm_xyz(point_map_pyramid_i), coords_proj_curr / 2**i, radius=self.corr3d_radius
|
| 770 |
+
)
|
| 771 |
+
|
| 772 |
+
corr_point_feat = self.get_correlation_feat(
|
| 773 |
+
fmaps_pyramid[i], coords_proj_curr / 2**i, radius=self.corr3d_radius
|
| 774 |
+
)
|
| 775 |
+
track_feat_support = (
|
| 776 |
+
track_feat_support_pyramid[i]
|
| 777 |
+
.view(B, 1, r, r, N, self.latent_dim)
|
| 778 |
+
.squeeze(1)
|
| 779 |
+
.permute(0, 3, 1, 2, 4)
|
| 780 |
+
)
|
| 781 |
+
track_feat_support3d = (
|
| 782 |
+
track_feat_support3d_pyramid[i]
|
| 783 |
+
.view(B, 1, r_depth, r_depth, N, self.latent_dim)
|
| 784 |
+
.squeeze(1)
|
| 785 |
+
.permute(0, 3, 1, 2, 4)
|
| 786 |
+
)
|
| 787 |
+
#NOTE: update the point map
|
| 788 |
+
track_point_map_support_pyramid_i = (self.denorm_xyz(track_point_map_support_pyramid[i]) * scale_est_query.view(B,1,1,N,1)
|
| 789 |
+
+ shift_est_query.view(B,1,1,N,3)).clone().detach()
|
| 790 |
+
|
| 791 |
+
track_point_map_support = (
|
| 792 |
+
self.norm_xyz(track_point_map_support_pyramid_i)
|
| 793 |
+
.view(B, 1, r_depth, r_depth, N, 3)
|
| 794 |
+
.squeeze(1)
|
| 795 |
+
.permute(0, 3, 1, 2, 4)
|
| 796 |
+
)
|
| 797 |
+
corr_volume = torch.einsum(
|
| 798 |
+
"btnhwc,bnijc->btnhwij", corr_feat, track_feat_support
|
| 799 |
+
)
|
| 800 |
+
corr_emb = self.corr_mlp(corr_volume.reshape(B, T, N, r * r * r * r))
|
| 801 |
+
|
| 802 |
+
with torch.no_grad():
|
| 803 |
+
rel_pos_query_ = track_point_map_support - track_point_map_support[:,:,self.corr3d_radius,self.corr3d_radius,:][...,None,None,:]
|
| 804 |
+
rel_pos_target_ = corr_point_map - coords_xyz_init.view(B, T, N, 1, 1, 3)
|
| 805 |
+
# select the top 9 points
|
| 806 |
+
rel_pos_query_idx = rel_pos_query_.norm(dim=-1).view(B, N, -1).topk(K_level+1, dim=-1, largest=False)[1][...,1:,None]
|
| 807 |
+
rel_pos_target_idx = rel_pos_target_.norm(dim=-1).view(B, T, N, -1).topk(K_level+1, dim=-1, largest=False)[1][...,1:,None]
|
| 808 |
+
rel_pos_query_ = torch.gather(rel_pos_query_.view(B, N, -1, 3), dim=-2, index=rel_pos_query_idx.expand(B, N, K_level, 3))
|
| 809 |
+
rel_pos_target_ = torch.gather(rel_pos_target_.view(B, T, N, -1, 3), dim=-2, index=rel_pos_target_idx.expand(B, T, N, K_level, 3))
|
| 810 |
+
rel_pos_query = rel_pos_query_
|
| 811 |
+
rel_pos_target = rel_pos_target_
|
| 812 |
+
rel_pos_query = posenc(rel_pos_query, min_deg=0, max_deg=12)
|
| 813 |
+
rel_pos_target = posenc(rel_pos_target, min_deg=0, max_deg=12)
|
| 814 |
+
rel_pos_target = self.rel_pos_mlp(rel_pos_target)
|
| 815 |
+
rel_pos_query = self.rel_pos_mlp(rel_pos_query)
|
| 816 |
+
with torch.no_grad():
|
| 817 |
+
# integrate with feature
|
| 818 |
+
track_feat_support_ = rearrange(track_feat_support3d, 'b n r k c -> b n (r k) c', r=r_depth, k=r_depth, n=N, b=B)
|
| 819 |
+
track_feat_support_ = torch.gather(track_feat_support_, dim=-2, index=rel_pos_query_idx.expand(B, N, K_level, 128))
|
| 820 |
+
queried_feat = torch.cat([rel_pos_query, track_feat_support_], dim=-1)
|
| 821 |
+
corr_feat_ = rearrange(corr_point_feat, 'b t n r k c -> b t n (r k) c', t=T, n=N, b=B)
|
| 822 |
+
corr_feat_ = torch.gather(corr_feat_, dim=-2, index=rel_pos_target_idx.expand(B, T, N, K_level, 128))
|
| 823 |
+
target_feat = torch.cat([rel_pos_target, corr_feat_], dim=-1)
|
| 824 |
+
|
| 825 |
+
# 3d attention
|
| 826 |
+
queried_feat = self.corr_xyz_mlp(queried_feat)
|
| 827 |
+
target_feat = self.corr_xyz_mlp(target_feat)
|
| 828 |
+
queried_feat = repeat(queried_feat, 'b n k c -> b t n k c', k=K_level, t=T, n=N, b=B)
|
| 829 |
+
corr_depth_emb = self.corr_transformer[0](queried_feat.reshape(B*T*N,-1,128),
|
| 830 |
+
target_feat.reshape(B*T*N,-1,128),
|
| 831 |
+
target_rel_pos=rel_pos_target.reshape(B*T*N,-1,128))
|
| 832 |
+
corr_depth_emb = rearrange(corr_depth_emb, '(b t n) 1 c -> b t n c', t=T, n=N, b=B)
|
| 833 |
+
corr_depth_emb = self.corr_depth_mlp(corr_depth_emb)
|
| 834 |
+
valid_mask = self.denorm_xyz(coords_xyz_init).view(B, T, N, -1)[...,2:3] > 0
|
| 835 |
+
corr_depth_embs.append(corr_depth_emb*valid_mask)
|
| 836 |
+
|
| 837 |
+
corr_embs.append(corr_emb)
|
| 838 |
+
corr_embs = torch.cat(corr_embs, dim=-1)
|
| 839 |
+
corr_embs = corr_embs.view(B, T, N, corr_embs.shape[-1])
|
| 840 |
+
corr_depth_embs = torch.cat(corr_depth_embs, dim=-1)
|
| 841 |
+
corr_depth_embs = corr_depth_embs.view(B, T, N, corr_depth_embs.shape[-1])
|
| 842 |
+
transformer_input = [vis[..., None], confidence[..., None], corr_embs]
|
| 843 |
+
transformer_input_depth = [vis[..., None], confidence[..., None], corr_depth_embs]
|
| 844 |
+
|
| 845 |
+
rel_coords_forward = coords[:,:-1,...,:2] - coords[:,1:,...,:2]
|
| 846 |
+
rel_coords_backward = coords[:, 1:,...,:2] - coords[:, :-1,...,:2]
|
| 847 |
+
|
| 848 |
+
rel_xyz_forward = coords_xyz[:,:-1,...,:3] - coords_xyz[:,1:,...,:3]
|
| 849 |
+
rel_xyz_backward = coords_xyz[:, 1:,...,:3] - coords_xyz[:, :-1,...,:3]
|
| 850 |
+
|
| 851 |
+
rel_coords_forward = torch.nn.functional.pad(
|
| 852 |
+
rel_coords_forward, (0, 0, 0, 0, 0, 1)
|
| 853 |
+
)
|
| 854 |
+
rel_coords_backward = torch.nn.functional.pad(
|
| 855 |
+
rel_coords_backward, (0, 0, 0, 0, 1, 0)
|
| 856 |
+
)
|
| 857 |
+
rel_xyz_forward = torch.nn.functional.pad(
|
| 858 |
+
rel_xyz_forward, (0, 0, 0, 0, 0, 1)
|
| 859 |
+
)
|
| 860 |
+
rel_xyz_backward = torch.nn.functional.pad(
|
| 861 |
+
rel_xyz_backward, (0, 0, 0, 0, 1, 0)
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
scale = (
|
| 865 |
+
torch.tensor(
|
| 866 |
+
[self.model_resolution[1], self.model_resolution[0]],
|
| 867 |
+
device=coords.device,
|
| 868 |
+
)
|
| 869 |
+
/ self.stride
|
| 870 |
+
)
|
| 871 |
+
rel_coords_forward = rel_coords_forward / scale
|
| 872 |
+
rel_coords_backward = rel_coords_backward / scale
|
| 873 |
+
|
| 874 |
+
rel_pos_emb_input = posenc(
|
| 875 |
+
torch.cat([rel_coords_forward, rel_coords_backward], dim=-1),
|
| 876 |
+
min_deg=0,
|
| 877 |
+
max_deg=10,
|
| 878 |
+
) # batch, num_points, num_frames, 84
|
| 879 |
+
rel_xyz_emb_input = posenc(
|
| 880 |
+
torch.cat([rel_xyz_forward, rel_xyz_backward], dim=-1),
|
| 881 |
+
min_deg=0,
|
| 882 |
+
max_deg=10,
|
| 883 |
+
) # batch, num_points, num_frames, 126
|
| 884 |
+
rel_xyz_emb_input = self.xyz_mlp(rel_xyz_emb_input)
|
| 885 |
+
transformer_input.append(rel_pos_emb_input)
|
| 886 |
+
transformer_input_depth.append(rel_xyz_emb_input)
|
| 887 |
+
# get the queries world
|
| 888 |
+
with torch.no_grad():
|
| 889 |
+
# update the query points with scale and shift
|
| 890 |
+
queries_xyz_i = queries_xyz.clone().detach()
|
| 891 |
+
queries_xyz_i[..., -1] = queries_xyz_i[..., -1] * scale_est_query.view(B,1,N) + shift_est_query.view(B,1,N,3)[...,2]
|
| 892 |
+
_, _, q_xyz_cam = self.track_from_cam(queries_xyz_i, self.c2w_est_curr,
|
| 893 |
+
intrs, rgbs=None, visualize=False)
|
| 894 |
+
q_xyz_cam = self.norm_xyz(q_xyz_cam)
|
| 895 |
+
|
| 896 |
+
query_t = queries[:,None,:,:1].repeat(B, T, 1, 1)
|
| 897 |
+
q_xyz_cam = torch.cat([query_t/T, q_xyz_cam], dim=-1)
|
| 898 |
+
T_all = torch.arange(T, device=device)[None,:,None,None].repeat(B, 1, N, 1)
|
| 899 |
+
current_xyzt = torch.cat([T_all/T, coords_xyz_init.view(B, T, N, -1)], dim=-1)
|
| 900 |
+
rel_pos_query_glob = q_xyz_cam - current_xyzt
|
| 901 |
+
# embed the confidence and dynamic probability
|
| 902 |
+
confidence_curr = torch.sigmoid(confidence[...,None])
|
| 903 |
+
dynamic_prob_curr = torch.sigmoid(dynamic_prob[...,None]).mean(dim=1, keepdim=True).repeat(1,T,1,1)
|
| 904 |
+
# embed the confidence and dynamic probability
|
| 905 |
+
rel_pos_query_glob = torch.cat([rel_pos_query_glob, confidence_curr, dynamic_prob_curr], dim=-1)
|
| 906 |
+
rel_pos_query_glob = posenc(rel_pos_query_glob, min_deg=0, max_deg=12)
|
| 907 |
+
transformer_input_depth.append(rel_pos_query_glob)
|
| 908 |
+
|
| 909 |
+
x = (
|
| 910 |
+
torch.cat(transformer_input, dim=-1)
|
| 911 |
+
.permute(0, 2, 1, 3)
|
| 912 |
+
.reshape(B * N, T, -1)
|
| 913 |
+
)
|
| 914 |
+
x_depth = (
|
| 915 |
+
torch.cat(transformer_input_depth, dim=-1)
|
| 916 |
+
.permute(0, 2, 1, 3)
|
| 917 |
+
.reshape(B * N, T, -1)
|
| 918 |
+
)
|
| 919 |
+
x_depth = self.proj_xyz_embed(x_depth)
|
| 920 |
+
|
| 921 |
+
x = x + self.interpolate_time_embed(x, T)
|
| 922 |
+
x = x.view(B, N, T, -1) # (B N) T D -> B N T D
|
| 923 |
+
x_depth = x_depth + self.interpolate_time_embed(x_depth, T)
|
| 924 |
+
x_depth = x_depth.view(B, N, T, -1) # (B N) T D -> B N T D
|
| 925 |
+
delta, delta_depth, delta_dynamic_prob, delta_pro_analysis_w, scale_shift_out, dense_res_out = self.updateformer3D(
|
| 926 |
+
x,
|
| 927 |
+
x_depth,
|
| 928 |
+
self.updateformer,
|
| 929 |
+
add_space_attn=add_space_attn,
|
| 930 |
+
extra_sparse_tokens=extra_sparse_tokens,
|
| 931 |
+
extra_dense_tokens=extra_dense_tokens,
|
| 932 |
+
)
|
| 933 |
+
# update the scale and shift
|
| 934 |
+
if scale_shift_out is not None:
|
| 935 |
+
extra_sparse_tokens = extra_sparse_tokens + scale_shift_out[...,:128]
|
| 936 |
+
scale_update = scale_shift_out[:,:1,:,-1].permute(0,2,1)[...,None]
|
| 937 |
+
shift_update = scale_shift_out[:,1:,:,-1].permute(0,2,1)[...,None]
|
| 938 |
+
scale_est = scale_est + scale_update
|
| 939 |
+
shift_est[...,2:] = shift_est[...,2:] + shift_update / 10
|
| 940 |
+
# dense tokens update
|
| 941 |
+
extra_dense_tokens = extra_dense_tokens + dense_res_out[:,:,-128:]
|
| 942 |
+
res_low = dense_res_out[:,:,:3]
|
| 943 |
+
up_mask = self.upsample_transformer(extra_dense_tokens.mean(dim=1), res_low)
|
| 944 |
+
up_mask = repeat(up_mask, "b k h w -> b s k h w", s=T)
|
| 945 |
+
up_mask = rearrange(up_mask, "b s c h w -> (b s) 1 c h w")
|
| 946 |
+
res_up = self.upsample_with_mask(
|
| 947 |
+
rearrange(res_low, 'b t c h w -> (b t) c h w'),
|
| 948 |
+
up_mask,
|
| 949 |
+
)
|
| 950 |
+
res_up = rearrange(res_up, "(b t) c h w -> b t c h w", b=B, t=T)
|
| 951 |
+
# residual_point = residual_point + res_up
|
| 952 |
+
|
| 953 |
+
delta_coords = delta[..., :D_coords].permute(0, 2, 1, 3)
|
| 954 |
+
delta_vis = delta[..., D_coords].permute(0, 2, 1)
|
| 955 |
+
delta_confidence = delta[..., D_coords + 1].permute(0, 2, 1)
|
| 956 |
+
|
| 957 |
+
vis = vis + delta_vis
|
| 958 |
+
confidence = confidence + delta_confidence
|
| 959 |
+
dynamic_prob = dynamic_prob + delta_dynamic_prob[...,0].permute(0, 2, 1)
|
| 960 |
+
pro_analysis_w = pro_analysis_w + delta_pro_analysis_w[...,0].permute(0, 2, 1)
|
| 961 |
+
# update the depth
|
| 962 |
+
vis_est = torch.sigmoid(vis.detach())
|
| 963 |
+
|
| 964 |
+
delta_xyz = delta_depth[...,:3].permute(0,2,1,3)
|
| 965 |
+
denorm_delta_depth = (self.denorm_xyz(coords_xyz+delta_xyz)-self.denorm_xyz(coords_xyz))[...,2:3]
|
| 966 |
+
|
| 967 |
+
|
| 968 |
+
delta_depth_ = denorm_delta_depth.detach()
|
| 969 |
+
delta_coords = torch.cat([delta_coords, delta_depth_],dim=-1)
|
| 970 |
+
coords = coords + delta_coords
|
| 971 |
+
coords_append = coords.clone()
|
| 972 |
+
coords_xyz_append = self.denorm_xyz(coords_xyz + delta_xyz).clone()
|
| 973 |
+
|
| 974 |
+
coords_append[..., :2] = coords_append[..., :2] * float(self.stride)
|
| 975 |
+
coords_append[..., 0] /= self.factor_x
|
| 976 |
+
coords_append[..., 1] /= self.factor_y
|
| 977 |
+
|
| 978 |
+
# get the camera pose from tracks
|
| 979 |
+
dynamic_prob_curr = torch.sigmoid(dynamic_prob.detach())*torch.sigmoid(pro_analysis_w)
|
| 980 |
+
mask_out = (coords_append[...,0]<W_)&(coords_append[...,0]>0)&(coords_append[...,1]<H_)&(coords_append[...,1]>0)
|
| 981 |
+
if query_no_BA:
|
| 982 |
+
dynamic_prob_curr[:,:,:ba_len] = torch.ones_like(dynamic_prob_curr[:,:,:ba_len])
|
| 983 |
+
point_map_org_i = scale_est.view(B*T,1,1,1)*point_map_org.clone().detach() + shift_est.view(B*T,3,1,1)
|
| 984 |
+
# depth_unproj = bilinear_sampler(point_map_org_i, coords_append[...,:2].view(B*T, N, 1, 2), mode="nearest")[:,2,:,0].detach()
|
| 985 |
+
|
| 986 |
+
depth_unproj_neg = self.get_correlation_feat(
|
| 987 |
+
point_map_org_i.view(B,T,3,point_map_org_i.shape[-2], point_map_org_i.shape[-1]),
|
| 988 |
+
coords_append[...,:2].view(B*T, N, 2), radius=self.corr3d_radius
|
| 989 |
+
)[..., 2]
|
| 990 |
+
depth_diff = (depth_unproj_neg.view(B,T,N,-1) - coords_append[...,2:]).abs()
|
| 991 |
+
idx_neg = torch.argmin(depth_diff, dim=-1)
|
| 992 |
+
depth_unproj = depth_unproj_neg.view(B,T,N,-1)[torch.arange(B)[:, None, None, None],
|
| 993 |
+
torch.arange(T)[None, :, None, None],
|
| 994 |
+
torch.arange(N)[None, None, :, None],
|
| 995 |
+
idx_neg.view(B,T,N,1)].view(B*T, N)
|
| 996 |
+
|
| 997 |
+
unc_unproj = bilinear_sampler(self.metric_unc_org, coords_append[...,:2].view(B*T, N, 1, 2), mode="nearest")[:,0,:,0].detach()
|
| 998 |
+
depth_unproj[unc_unproj<0.5] = 0.0
|
| 999 |
+
|
| 1000 |
+
# replace the depth for visible and solid points
|
| 1001 |
+
conf_est = torch.sigmoid(confidence.detach())
|
| 1002 |
+
replace_mask = (depth_unproj.view(B,T,N)>0.0) * (vis_est>0.5) # * (conf_est>0.5)
|
| 1003 |
+
#NOTE: way1: find the jitter points
|
| 1004 |
+
depth_rel = (depth_unproj.view(B, T, N) - queries_z.permute(0, 2, 1))
|
| 1005 |
+
depth_ddt1 = depth_rel[:, 1:, :] - depth_rel[:, :-1, :]
|
| 1006 |
+
depth_ddt2 = depth_rel[:, 2:, :] - 2 * depth_rel[:, 1:-1, :] + depth_rel[:, :-2, :]
|
| 1007 |
+
jitter_mask = torch.zeros_like(depth_rel, dtype=torch.bool)
|
| 1008 |
+
if depth_ddt2.abs().max()>0:
|
| 1009 |
+
thre2 = torch.quantile(depth_ddt2.abs()[depth_ddt2.abs()>0], replace_ratio)
|
| 1010 |
+
jitter_mask[:, 1:-1, :] = (depth_ddt2.abs() < thre2)
|
| 1011 |
+
thre1 = torch.quantile(depth_ddt1.abs()[depth_ddt1.abs()>0], replace_ratio)
|
| 1012 |
+
jitter_mask[:, :-1, :] *= (depth_ddt1.abs() < thre1)
|
| 1013 |
+
replace_mask = replace_mask * jitter_mask
|
| 1014 |
+
|
| 1015 |
+
#NOTE: way2: top k topological change detection
|
| 1016 |
+
# coords_2d_lift = coords_append.clone()
|
| 1017 |
+
# coords_2d_lift[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
|
| 1018 |
+
# coords_2d_lift = self.cam_from_track(coords_2d_lift.clone(), intrs_org, only_cam_pts=True)
|
| 1019 |
+
# coords_2d_lift[~replace_mask] = coords_xyz_append[~replace_mask]
|
| 1020 |
+
# import pdb; pdb.set_trace()
|
| 1021 |
+
# jitter_mask = get_topo_mask(coords_xyz_append, coords_2d_lift, replace_ratio)
|
| 1022 |
+
# replace_mask = replace_mask * jitter_mask
|
| 1023 |
+
|
| 1024 |
+
# replace the depth
|
| 1025 |
+
if self.training:
|
| 1026 |
+
replace_mask = torch.zeros_like(replace_mask)
|
| 1027 |
+
coords_append[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
|
| 1028 |
+
coords_xyz_unproj = self.cam_from_track(coords_append.clone(), intrs_org, only_cam_pts=True)
|
| 1029 |
+
coords[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
|
| 1030 |
+
# coords_xyz_append[replace_mask] = coords_xyz_unproj[replace_mask]
|
| 1031 |
+
coords_xyz_append_refine = coords_xyz_append.clone()
|
| 1032 |
+
coords_xyz_append_refine[replace_mask] = coords_xyz_unproj[replace_mask]
|
| 1033 |
+
|
| 1034 |
+
c2w_traj_est, cam_pts_est, intrs_refine, coords_refine, world_tracks, world_tracks_refined, c2w_traj_init = self.cam_from_track(coords_append.clone(),
|
| 1035 |
+
intrs_org, dynamic_prob_curr, queries_z_unc, conf_est*vis_est*mask_out.float(),
|
| 1036 |
+
track_feat_concat=x_depth, tracks_xyz=coords_xyz_append_refine, init_pose=init_pose,
|
| 1037 |
+
query_pts=queries_xyz_i, fixed_cam=fixed_cam, depth_unproj=depth_unproj, cam_gt=cam_gt)
|
| 1038 |
+
intrs_org = intrs_refine.view(B, T, 3, 3).to(intrs_org.dtype)
|
| 1039 |
+
|
| 1040 |
+
# get the queries world
|
| 1041 |
+
self.c2w_est_curr = c2w_traj_est.detach()
|
| 1042 |
+
|
| 1043 |
+
# update coords and coords_append
|
| 1044 |
+
coords[..., 2] = (cam_pts_est)[...,2]
|
| 1045 |
+
coords_append[..., 2] = (cam_pts_est)[...,2]
|
| 1046 |
+
|
| 1047 |
+
# update coords_xyz_append
|
| 1048 |
+
# coords_xyz_append = cam_pts_est
|
| 1049 |
+
coords_xyz = self.norm_xyz(cam_pts_est)
|
| 1050 |
+
|
| 1051 |
+
|
| 1052 |
+
# proj
|
| 1053 |
+
coords_xyz_de = coords_xyz_append.clone()
|
| 1054 |
+
coords_xyz_de[coords_xyz_de[...,2].abs()<1e-6] = -1e-4
|
| 1055 |
+
mask_nan = coords_xyz_de[...,2].abs()<1e-2
|
| 1056 |
+
coords_proj = torch.einsum("btij,btnj->btni", intrs_org, coords_xyz_de/coords_xyz_de[...,2:3].abs())[...,:2]
|
| 1057 |
+
coords_proj[...,0] *= self.factor_x
|
| 1058 |
+
coords_proj[...,1] *= self.factor_y
|
| 1059 |
+
coords_proj[...,:2] /= float(self.stride)
|
| 1060 |
+
# make sure it is aligned with 2d tracking
|
| 1061 |
+
coords_proj_curr = coords[...,:2].view(B*T, N, 2).detach()
|
| 1062 |
+
vis_est = (vis_est>0.5).float()
|
| 1063 |
+
sync_loss += (vis_est.detach()[...,None]*(coords_proj_curr - coords_proj).norm(dim=-1, keepdim=True)*(1-mask_nan[...,None].float())).mean()
|
| 1064 |
+
# coords_proj_curr[~mask_nan.view(B*T, N)] = coords_proj.view(B*T, N, 2)[~mask_nan.view(B*T, N)].to(coords_proj_curr.dtype)
|
| 1065 |
+
# if torch.isnan(coords_proj_curr).sum()>0:
|
| 1066 |
+
# import pdb; pdb.set_trace()
|
| 1067 |
+
|
| 1068 |
+
if False:
|
| 1069 |
+
point_map_resize = point_map.clone().view(B, T, 3, H, W)
|
| 1070 |
+
update_input = torch.cat([point_map_resize, metric_unc.view(B,T,1,H,W)], dim=2)
|
| 1071 |
+
coords_append_resize = coords.clone().detach()
|
| 1072 |
+
coords_append_resize[..., :2] = coords_append_resize[..., :2] * float(self.stride)
|
| 1073 |
+
update_track_input = self.norm_xyz(cam_pts_est)*5
|
| 1074 |
+
update_track_input = torch.cat([update_track_input, vis_est[...,None]], dim=-1)
|
| 1075 |
+
update_track_input = posenc(update_track_input, min_deg=0, max_deg=12)
|
| 1076 |
+
update = self.update_pointmap.stablizer(update_input,
|
| 1077 |
+
update_track_input, coords_append_resize)#, imgs=video, vis_track=viser)
|
| 1078 |
+
#NOTE: update the point map
|
| 1079 |
+
point_map_resize += update
|
| 1080 |
+
point_map_refine_out = F.interpolate(point_map_resize.view(B*T, -1, H, W),
|
| 1081 |
+
size=(self.image_size[0].item(), self.image_size[1].item()), mode='nearest')
|
| 1082 |
+
point_map_refine_out = rearrange(point_map_refine_out, '(b t) c h w -> b t c h w', t=T, b=B)
|
| 1083 |
+
point_map_preds.append(self.denorm_xyz(point_map_refine_out))
|
| 1084 |
+
point_map_org = self.denorm_xyz(point_map_refine_out).view(B*T, 3, H_, W_)
|
| 1085 |
+
|
| 1086 |
+
# if torch.isnan(coords).sum()>0:
|
| 1087 |
+
# import pdb; pdb.set_trace()
|
| 1088 |
+
#NOTE: the 2d tracking + unproject depth
|
| 1089 |
+
fix_cam_est = coords_append.clone()
|
| 1090 |
+
fix_cam_est[...,2] = depth_unproj
|
| 1091 |
+
fix_cam_pts = self.cam_from_track(
|
| 1092 |
+
fix_cam_est, intrs_org, only_cam_pts=True
|
| 1093 |
+
)
|
| 1094 |
+
|
| 1095 |
+
coord_preds.append(coords_append)
|
| 1096 |
+
coords_xyz_preds.append(coords_xyz_append)
|
| 1097 |
+
vis_preds.append(vis)
|
| 1098 |
+
cam_preds.append(c2w_traj_init)
|
| 1099 |
+
pts3d_cam_pred.append(cam_pts_est)
|
| 1100 |
+
world_tracks_pred.append(world_tracks)
|
| 1101 |
+
world_tracks_refined_pred.append(world_tracks_refined)
|
| 1102 |
+
confidence_preds.append(confidence)
|
| 1103 |
+
dynamic_prob_preds.append(dynamic_prob)
|
| 1104 |
+
scale_ests.append(scale_est)
|
| 1105 |
+
shift_ests.append(shift_est)
|
| 1106 |
+
|
| 1107 |
+
if stage!=0:
|
| 1108 |
+
all_coords_predictions.append([coord for coord in coord_preds])
|
| 1109 |
+
all_coords_xyz_predictions.append([coord_xyz for coord_xyz in coords_xyz_preds])
|
| 1110 |
+
all_vis_predictions.append(vis_preds)
|
| 1111 |
+
all_confidence_predictions.append(confidence_preds)
|
| 1112 |
+
all_dynamic_prob_predictions.append(dynamic_prob_preds)
|
| 1113 |
+
all_cam_predictions.append([cam for cam in cam_preds])
|
| 1114 |
+
all_cam_pts_predictions.append([pts for pts in pts3d_cam_pred])
|
| 1115 |
+
all_world_tracks_predictions.append([world_tracks for world_tracks in world_tracks_pred])
|
| 1116 |
+
all_world_tracks_refined_predictions.append([world_tracks_refined for world_tracks_refined in world_tracks_refined_pred])
|
| 1117 |
+
all_scale_est.append(scale_ests)
|
| 1118 |
+
all_shift_est.append(shift_ests)
|
| 1119 |
+
if stage!=0:
|
| 1120 |
+
train_data = (
|
| 1121 |
+
all_coords_predictions,
|
| 1122 |
+
all_coords_xyz_predictions,
|
| 1123 |
+
all_vis_predictions,
|
| 1124 |
+
all_confidence_predictions,
|
| 1125 |
+
all_dynamic_prob_predictions,
|
| 1126 |
+
all_cam_predictions,
|
| 1127 |
+
all_cam_pts_predictions,
|
| 1128 |
+
all_world_tracks_predictions,
|
| 1129 |
+
all_world_tracks_refined_predictions,
|
| 1130 |
+
all_scale_est,
|
| 1131 |
+
all_shift_est,
|
| 1132 |
+
torch.ones_like(vis_preds[-1], device=vis_preds[-1].device),
|
| 1133 |
+
)
|
| 1134 |
+
else:
|
| 1135 |
+
train_data = None
|
| 1136 |
+
# resize back
|
| 1137 |
+
# init the trajectories by camera motion
|
| 1138 |
+
|
| 1139 |
+
# if cache is not None:
|
| 1140 |
+
# viser = Visualizer(save_dir=".", grayscale=True,
|
| 1141 |
+
# fps=10, pad_value=50, tracks_leave_trace=0)
|
| 1142 |
+
# coords_clone = coords.clone()
|
| 1143 |
+
# coords_clone[...,:2] *= self.stride
|
| 1144 |
+
# coords_clone[..., 0] /= self.factor_x
|
| 1145 |
+
# coords_clone[..., 1] /= self.factor_y
|
| 1146 |
+
# viser.visualize(video=video_vis, tracks=coords_clone[..., :2], filename="test_refine")
|
| 1147 |
+
# import pdb; pdb.set_trace()
|
| 1148 |
+
|
| 1149 |
+
if train_data is not None:
|
| 1150 |
+
# get the gt pts in the world coordinate
|
| 1151 |
+
self_supervised = False
|
| 1152 |
+
if (traj3d_gt is not None):
|
| 1153 |
+
if traj3d_gt[...,2].abs().max()>0:
|
| 1154 |
+
gt_cam_pts = self.cam_from_track(
|
| 1155 |
+
traj3d_gt, intrs_org, only_cam_pts=True
|
| 1156 |
+
)
|
| 1157 |
+
else:
|
| 1158 |
+
self_supervised = True
|
| 1159 |
+
else:
|
| 1160 |
+
self_supervised = True
|
| 1161 |
+
|
| 1162 |
+
if self_supervised:
|
| 1163 |
+
gt_cam_pts = self.cam_from_track(
|
| 1164 |
+
coord_preds[-1].detach(), intrs_org, only_cam_pts=True
|
| 1165 |
+
)
|
| 1166 |
+
|
| 1167 |
+
if cam_gt is not None:
|
| 1168 |
+
gt_world_pts = torch.einsum(
|
| 1169 |
+
"btij,btnj->btni",
|
| 1170 |
+
cam_gt[...,:3,:3],
|
| 1171 |
+
gt_cam_pts
|
| 1172 |
+
) + cam_gt[...,None, :3,3] # B T N 3
|
| 1173 |
+
else:
|
| 1174 |
+
gt_world_pts = torch.einsum(
|
| 1175 |
+
"btij,btnj->btni",
|
| 1176 |
+
self.c2w_est_curr[...,:3,:3],
|
| 1177 |
+
gt_cam_pts
|
| 1178 |
+
) + self.c2w_est_curr[...,None, :3,3] # B T N 3
|
| 1179 |
+
# update the query points with scale and shift
|
| 1180 |
+
queries_xyz_i = queries_xyz.clone().detach()
|
| 1181 |
+
queries_xyz_i[..., -1] = queries_xyz_i[..., -1] * scale_est_query.view(B,1,N) + shift_est_query.view(B,1,N,3)[...,2]
|
| 1182 |
+
q_static_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz_i,
|
| 1183 |
+
self.c2w_est_curr,
|
| 1184 |
+
intrs, rgbs=video_vis, visualize=False)
|
| 1185 |
+
|
| 1186 |
+
q_static_proj[..., 0] /= self.factor_x
|
| 1187 |
+
q_static_proj[..., 1] /= self.factor_y
|
| 1188 |
+
cam_gt = self.c2w_est_curr[:,:,:3,:]
|
| 1189 |
+
|
| 1190 |
+
if traj3d_gt is not None:
|
| 1191 |
+
ret_loss = self.loss(train_data, traj3d_gt,
|
| 1192 |
+
vis_gt, None, cam_gt, queries_z_unc,
|
| 1193 |
+
q_xyz_world, q_static_proj, anchor_loss=anchor_loss, fix_cam_pts=fix_cam_pts, video_vis=video_vis, stage=stage,
|
| 1194 |
+
gt_world_pts=gt_world_pts, mask_traj_gt=mask_traj_gt, intrs=intrs_org, custom_vid=custom_vid, valid_only=valid_only,
|
| 1195 |
+
c2w_ests=c2w_ests, point_map_preds=point_map_preds, points_map_gt=points_map_gt, metric_unc=metric_unc, scale_est=scale_est,
|
| 1196 |
+
shift_est=shift_est, point_map_org_train=point_map_org_train)
|
| 1197 |
+
else:
|
| 1198 |
+
ret_loss = self.loss(train_data, traj3d_gt,
|
| 1199 |
+
vis_gt, None, cam_gt, queries_z_unc,
|
| 1200 |
+
q_xyz_world, q_static_proj, anchor_loss=anchor_loss, fix_cam_pts=fix_cam_pts, video_vis=video_vis, stage=stage,
|
| 1201 |
+
gt_world_pts=gt_world_pts, mask_traj_gt=mask_traj_gt, intrs=intrs_org, custom_vid=custom_vid, valid_only=valid_only,
|
| 1202 |
+
c2w_ests=c2w_ests, point_map_preds=point_map_preds, points_map_gt=points_map_gt, metric_unc=metric_unc, scale_est=scale_est,
|
| 1203 |
+
shift_est=shift_est, point_map_org_train=point_map_org_train)
|
| 1204 |
+
if custom_vid:
|
| 1205 |
+
sync_loss = 0*sync_loss
|
| 1206 |
+
if (sync_loss > 50) and (stage==1):
|
| 1207 |
+
ret_loss = (0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss) + (0*sync_loss,)
|
| 1208 |
+
else:
|
| 1209 |
+
ret_loss = ret_loss+(10*sync_loss,)
|
| 1210 |
+
|
| 1211 |
+
else:
|
| 1212 |
+
ret_loss = None
|
| 1213 |
+
|
| 1214 |
+
        color_pts = torch.cat([pts3d_cam_pred[-1], queries_rgb[:,None].repeat(1, T, 1, 1)], dim=-1)

        #TODO: for evaluation only; the model shows some bias on invisible points after training (to be fixed)
        vis_pred_out = torch.sigmoid(vis_preds[-1]) + 0.2

        ret = {"preds": coord_preds[-1], "vis_pred": vis_pred_out,
               "conf_pred": torch.sigmoid(confidence_preds[-1]),
               "cam_pred": self.c2w_est_curr, "loss": ret_loss}

        cache = {
            "fmaps": fmaps_org[0].detach(),
            "track_feat_support3d_pyramid": [track_feat_support3d_pyramid[i].detach() for i in range(len(track_feat_support3d_pyramid))],
            "track_point_map_support_pyramid": [self.denorm_xyz(track_point_map_support_pyramid[i].detach()) for i in range(len(track_point_map_support_pyramid))],
            "track_feat3d_pyramid": [track_feat3d_pyramid[i].detach() for i in range(len(track_feat3d_pyramid))],
            "track_point_map_pyramid": [self.denorm_xyz(track_point_map_pyramid[i].detach()) for i in range(len(track_point_map_pyramid))],
            "track_feat_pyramid": [track_feat_pyramid[i].detach() for i in range(len(track_feat_pyramid))],
            "track_feat_support_pyramid": [track_feat_support_pyramid[i].detach() for i in range(len(track_feat_support_pyramid))],
            "track2d_pred_cache": coord_preds[-1][0].clone().detach(),
            "track3d_pred_cache": pts3d_cam_pred[-1][0].clone().detach(),
        }
        #NOTE: update the point map
        point_map_org = scale_est.view(B*T,1,1,1)*point_map_org + shift_est.view(B*T,3,1,1)
        point_map_org_refined = point_map_org
        return ret, torch.sigmoid(dynamic_prob_preds[-1])*queries_z_unc[:,None,:,0], coord_preds[-1], color_pts, intrs_org, point_map_org_refined, cache

    def track_d2_loss(self, tracks3d, stride=[1,2,3], dyn_prob=None, mask=None):
        """
        tracks3d: B T N 3
        dyn_prob: B T N 1
        """
        r = 0.8
        t_diff_total = 0.0
        for i, s_ in enumerate(stride):
            w_ = r**i
            tracks3d_stride = tracks3d[:, ::s_, :, :] # B T//s_ N 3
            t_diff_tracks3d = (tracks3d_stride[:, 1:, :, :] - tracks3d_stride[:, :-1, :, :])
            t_diff2 = (t_diff_tracks3d[:, 1:, :, :] - t_diff_tracks3d[:, :-1, :, :])
            t_diff_total += w_*(t_diff2.norm(dim=-1).mean())

        return 1e2*t_diff_total

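`track_d2_loss` penalizes the second temporal difference of the 3D tracks (an acceleration-style smoothness term), accumulated over several temporal strides with geometrically decaying weights 0.8^i and scaled by 1e2. A standalone sketch of the same quantity on dummy data; shapes and values are illustrative:

```python
import torch

def d2_smoothness(tracks3d, strides=(1, 2, 3), r=0.8):
    # tracks3d: (B, T, N, 3); weighted mean norm of the second temporal difference
    total = 0.0
    for i, s in enumerate(strides):
        sub = tracks3d[:, ::s]                     # subsample time by stride s
        d1 = sub[:, 1:] - sub[:, :-1]              # first difference (velocity)
        d2 = d1[:, 1:] - d1[:, :-1]                # second difference (acceleration)
        total = total + (r ** i) * d2.norm(dim=-1).mean()
    return 1e2 * total

print(d2_smoothness(torch.randn(1, 24, 64, 3)))
```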
    def loss(self, train_data, traj3d_gt=None,
             vis_gt=None, static_tracks_gt=None, cam_gt=None,
             z_unc=None, q_xyz_world=None, q_static_proj=None, anchor_loss=0, valid_only=False,
             gt_world_pts=None, mask_traj_gt=None, intrs=None, c2w_ests=None, custom_vid=False, video_vis=None, stage=0,
             fix_cam_pts=None, point_map_preds=None, points_map_gt=None, metric_unc=None, scale_est=None, shift_est=None, point_map_org_train=None):
        """
        Compute the losses of the 3D tracking problem.
        """
|
| 1265 |
+
(
|
| 1266 |
+
coord_predictions, coords_xyz_predictions, vis_predictions, confidence_predicitons,
|
| 1267 |
+
dynamic_prob_predictions, camera_predictions, cam_pts_predictions, world_tracks_predictions,
|
| 1268 |
+
world_tracks_refined_predictions, scale_ests, shift_ests, valid_mask
|
| 1269 |
+
) = train_data
|
| 1270 |
+
B, T, _, _ = cam_gt.shape
|
| 1271 |
+
if (stage == 2) and self.training:
|
| 1272 |
+
# get the scale and shift gt
|
| 1273 |
+
self.metric_unc_org[:,0] = self.metric_unc_org[:,0] * (points_map_gt.norm(dim=-1)>0).float() * (self.metric_unc_org[:,0]>0.5).float()
|
| 1274 |
+
if not (self.scale_gt==torch.ones(B*T).to(self.scale_gt.device)).all():
|
| 1275 |
+
scale_gt, shift_gt = self.scale_gt, self.shift_gt
|
| 1276 |
+
scale_re = scale_gt[:4].mean()
|
| 1277 |
+
scale_loss = 0.0
|
| 1278 |
+
shift_loss = 0.0
|
| 1279 |
+
for i_scale in range(len(scale_ests[0])):
|
| 1280 |
+
scale_loss += 0.8**(len(scale_ests[0])-i_scale-1)*10*(scale_gt - scale_re*scale_ests[0][i_scale].view(-1)).abs().mean()
|
| 1281 |
+
shift_loss += 0.8**(len(shift_ests[0])-i_scale-1)*10*(shift_gt - scale_re*shift_ests[0][i_scale].view(-1,3)).abs().mean()
|
| 1282 |
+
else:
|
| 1283 |
+
scale_loss = 0.0 * scale_ests[0][0].mean()
|
| 1284 |
+
shift_loss = 0.0 * shift_ests[0][0].mean()
|
| 1285 |
+
scale_re = 1.0
|
| 1286 |
+
else:
|
| 1287 |
+
scale_loss = 0.0
|
| 1288 |
+
shift_loss = 0.0
|
| 1289 |
+
|
| 1290 |
+
if len(point_map_preds)>0:
|
| 1291 |
+
point_map_loss = 0.0
|
| 1292 |
+
for i in range(len(point_map_preds)):
|
| 1293 |
+
point_map_preds_i = point_map_preds[i]
|
| 1294 |
+
point_map_preds_i = rearrange(point_map_preds_i, 'b t c h w -> (b t) c h w', b=B, t=T)
|
| 1295 |
+
base_loss = ((self.pred_points - points_map_gt).norm(dim=-1) * self.metric_unc_org[:,0]).mean()
|
| 1296 |
+
point_map_loss_i = ((point_map_preds_i - points_map_gt.permute(0,3,1,2)).norm(dim=1) * self.metric_unc_org[:,0]).mean()
|
| 1297 |
+
point_map_loss += point_map_loss_i
|
| 1298 |
+
# point_map_loss += ((point_map_org_train - points_map_gt.permute(0,3,1,2)).norm(dim=1) * self.metric_unc_org[:,0]).mean()
|
| 1299 |
+
if scale_loss == 0.0:
|
| 1300 |
+
point_map_loss = 0*point_map_preds_i.sum()
|
| 1301 |
+
else:
|
| 1302 |
+
point_map_loss = 0.0
|
| 1303 |
+
|
| 1304 |
+
# camera loss
|
| 1305 |
+
cam_loss = 0.0
|
| 1306 |
+
dyn_loss = 0.0
|
| 1307 |
+
N_gt = gt_world_pts.shape[2]
|
| 1308 |
+
|
| 1309 |
+
# self supervised dynamic mask
|
| 1310 |
+
H_org, W_org = self.image_size[0], self.image_size[1]
|
| 1311 |
+
q_static_proj[torch.isnan(q_static_proj)] = -200
|
| 1312 |
+
in_view_mask = (q_static_proj[...,0]>0) & (q_static_proj[...,0]<W_org) & (q_static_proj[...,1]>0) & (q_static_proj[...,1]<H_org)
|
| 1313 |
+
dyn_mask_final = (((coord_predictions[0][-1] - q_static_proj))[...,:2].norm(dim=-1) * in_view_mask)
|
| 1314 |
+
dyn_mask_final = dyn_mask_final.sum(dim=1) / (in_view_mask.sum(dim=1) + 1e-2)
|
| 1315 |
+
dyn_mask_final = dyn_mask_final > 6
|
| 1316 |
+
|
| 1317 |
+
for iter_, cam_pred_i in enumerate(camera_predictions[0]):
|
| 1318 |
+
# points loss
|
| 1319 |
+
pts_i_world = world_tracks_predictions[0][iter_].view(B, T, -1, 3)
|
| 1320 |
+
|
| 1321 |
+
coords_xyz_i_world = coords_xyz_predictions[0][iter_].view(B, T, -1, 3)
|
| 1322 |
+
coords_i = coord_predictions[0][iter_].view(B, T, -1, 3)[..., :2]
|
| 1323 |
+
pts_i_world_refined = torch.einsum(
|
| 1324 |
+
"btij,btnj->btni",
|
| 1325 |
+
cam_gt[...,:3,:3],
|
| 1326 |
+
coords_xyz_i_world
|
| 1327 |
+
) + cam_gt[...,None, :3,3] # B T N 3
|
| 1328 |
+
|
| 1329 |
+
# pts_i_world_refined = world_tracks_refined_predictions[0][iter_].view(B, T, -1, 3)
|
| 1330 |
+
pts_world = pts_i_world
|
| 1331 |
+
dyn_prob_i_logits = dynamic_prob_predictions[0][iter_].mean(dim=1)
|
| 1332 |
+
dyn_prob_i = torch.sigmoid(dyn_prob_i_logits).detach()
|
| 1333 |
+
mask = pts_world.norm(dim=-1) < 200
|
| 1334 |
+
|
| 1335 |
+
# general
|
| 1336 |
+
vis_i_logits = vis_predictions[0][iter_]
|
| 1337 |
+
vis_i = torch.sigmoid(vis_i_logits).detach()
|
| 1338 |
+
if mask_traj_gt is not None:
|
| 1339 |
+
try:
|
| 1340 |
+
N_gt_mask = mask_traj_gt.shape[1]
|
| 1341 |
+
align_loss = (gt_world_pts - q_xyz_world[:,None,:N_gt,:,]).norm(dim=-1)[...,:N_gt_mask] * (mask_traj_gt.permute(0,2,1))
|
| 1342 |
+
visb_traj = (align_loss * vis_i[:,:,:N_gt_mask]).sum(dim=1)/vis_i[:,:,:N_gt_mask].sum(dim=1)
|
| 1343 |
+
except:
|
| 1344 |
+
import pdb; pdb.set_trace()
|
| 1345 |
+
else:
|
| 1346 |
+
visb_traj = ((gt_world_pts - q_xyz_world[:,None,:N_gt,:,]).norm(dim=-1) * vis_i[:,:,:N_gt]).sum(dim=1)/vis_i[:,:,:N_gt].sum(dim=1)
|
| 1347 |
+
|
| 1348 |
+
# pts_loss = ((q_xyz_world[:,None,...] - pts_world)[:,:,:N_gt,:].norm(dim=-1)*(1-dyn_prob_i[:,None,:N_gt])) # - 0.1*(1-dyn_prob_i[:,None,:N_gt]).log()
|
| 1349 |
+
pts_loss = 0
|
| 1350 |
+
static_mask = ~dyn_mask_final # more strict for static points
|
| 1351 |
+
dyn_mask = dyn_mask_final
|
| 1352 |
+
pts_loss_refined = ((q_xyz_world[:,None,...] - pts_i_world_refined).norm(dim=-1)*static_mask[:,None,:]).sum()/static_mask.sum() # - 0.1*(1-dyn_prob_i[:,None,:N_gt]).log()
|
| 1353 |
+
vis_logits_final = vis_predictions[0][-1].detach()
|
| 1354 |
+
vis_final = torch.sigmoid(vis_logits_final)+0.2 > 0.5 # more strict for visible points
|
| 1355 |
+
dyn_vis_mask = dyn_mask*vis_final * (fix_cam_pts[...,2] > 0.1)
|
| 1356 |
+
pts_loss_dynamic = ((fix_cam_pts - coords_xyz_i_world).norm(dim=-1)*dyn_vis_mask[:,None,:]).sum()/dyn_vis_mask.sum()
|
| 1357 |
+
|
| 1358 |
+
# pts_loss_refined = 0
|
| 1359 |
+
if traj3d_gt is not None:
|
| 1360 |
+
tap_traj = (gt_world_pts[:,:-1,...] - gt_world_pts[:,1:,...]).norm(dim=-1).sum(dim=1)[...,:N_gt_mask]
|
| 1361 |
+
mask_dyn = tap_traj>0.5
|
| 1362 |
+
if mask_traj_gt.sum() > 0:
|
| 1363 |
+
dyn_loss_i = 20*balanced_binary_cross_entropy(dyn_prob_i_logits[:,:N_gt_mask][mask_traj_gt.squeeze(-1)],
|
| 1364 |
+
mask_dyn.float()[mask_traj_gt.squeeze(-1)])
|
| 1365 |
+
else:
|
| 1366 |
+
dyn_loss_i = 0
|
| 1367 |
+
else:
|
| 1368 |
+
dyn_loss_i = 10*balanced_binary_cross_entropy(dyn_prob_i_logits, dyn_mask_final.float())
|
| 1369 |
+
|
| 1370 |
+
dyn_loss += dyn_loss_i
|
| 1371 |
+
|
| 1372 |
+
# visible loss for out of view points
|
| 1373 |
+
vis_i_train = torch.sigmoid(vis_i_logits)
|
| 1374 |
+
out_of_view_mask = (coords_i[...,0]<0)|(coords_i[...,0]>self.image_size[1])|(coords_i[...,1]<0)|(coords_i[...,1]>self.image_size[0])
|
| 1375 |
+
vis_loss_out_of_view = vis_i_train[out_of_view_mask].sum() / out_of_view_mask.sum()
|
| 1376 |
+
|
| 1377 |
+
|
| 1378 |
+
if traj3d_gt is not None:
|
| 1379 |
+
world_pts_loss = (((gt_world_pts - pts_i_world_refined[:,:,:gt_world_pts.shape[2],...]).norm(dim=-1))[...,:N_gt_mask] * mask_traj_gt.permute(0,2,1)).sum() / mask_traj_gt.sum()
|
| 1380 |
+
# world_pts_init_loss = (((gt_world_pts - pts_i_world[:,:,:gt_world_pts.shape[2],...]).norm(dim=-1))[...,:N_gt_mask] * mask_traj_gt.permute(0,2,1)).sum() / mask_traj_gt.sum()
|
| 1381 |
+
else:
|
| 1382 |
+
world_pts_loss = 0
|
| 1383 |
+
|
| 1384 |
+
# cam regress
|
| 1385 |
+
t_err = (cam_pred_i[...,:3,3] - cam_gt[...,:3,3]).norm(dim=-1).sum()
|
| 1386 |
+
|
| 1387 |
+
# xyz loss
|
| 1388 |
+
in_view_mask_large = (q_static_proj[...,0]>-50) & (q_static_proj[...,0]<W_org+50) & (q_static_proj[...,1]>-50) & (q_static_proj[...,1]<H_org+50)
|
| 1389 |
+
static_vis_mask = (q_static_proj[...,2]>0.05).float() * static_mask[:,None,:] * in_view_mask_large
|
| 1390 |
+
xyz_loss = ((coord_predictions[0][iter_] - q_static_proj)).abs()[...,:2].norm(dim=-1)*static_vis_mask
|
| 1391 |
+
xyz_loss = xyz_loss.sum()/static_vis_mask.sum()
|
| 1392 |
+
|
| 1393 |
+
# visualize the q_static_proj
|
| 1394 |
+
# viser = Visualizer(save_dir=".", grayscale=True,
|
| 1395 |
+
# fps=10, pad_value=50, tracks_leave_trace=0)
|
| 1396 |
+
# video_vis_ = F.interpolate(video_vis.view(B*T,3,video_vis.shape[-2],video_vis.shape[-1]), (H_org, W_org), mode='bilinear', align_corners=False)
|
| 1397 |
+
# viser.visualize(video=video_vis_, tracks=q_static_proj[:,:,dyn_mask_final.squeeze(), :2], filename="test")
|
| 1398 |
+
# viser.visualize(video=video_vis_, tracks=coord_predictions[0][-1][:,:,dyn_mask_final.squeeze(), :2], filename="test_pred")
|
| 1399 |
+
# import pdb; pdb.set_trace()
|
| 1400 |
+
|
| 1401 |
+
# temporal loss
|
| 1402 |
+
t_loss = self.track_d2_loss(pts_i_world_refined, [1,2,3], dyn_prob=dyn_prob_i, mask=mask)
|
| 1403 |
+
R_err = (cam_pred_i[...,:3,:3] - cam_gt[...,:3,:3]).abs().sum(dim=-1).mean()
|
| 1404 |
+
if self.stage == 1:
|
| 1405 |
+
cam_loss += 0.8**(len(camera_predictions[0])-iter_-1)*(10*t_err + 500*R_err + 20*pts_loss_refined + 10*xyz_loss + 20*pts_loss_dynamic + 10*vis_loss_out_of_view) #+ 5*(pts_loss + pts_loss_refined + world_pts_loss) + t_loss)
|
| 1406 |
+
elif self.stage == 3:
|
| 1407 |
+
cam_loss += 0.8**(len(camera_predictions[0])-iter_-1)*(10*t_err + 500*R_err + 10*vis_loss_out_of_view) #+ 5*(pts_loss + pts_loss_refined + world_pts_loss) + t_loss)
|
| 1408 |
+
else:
|
| 1409 |
+
cam_loss += 0*vis_loss_out_of_view
|
| 1410 |
+
|
| 1411 |
+
if (cam_loss > 20000)|(torch.isnan(cam_loss)):
|
| 1412 |
+
cam_loss = torch.zeros_like(cam_loss)
|
| 1413 |
+
|
| 1414 |
+
|
| 1415 |
+
if traj3d_gt is None:
|
| 1416 |
+
# ================ Condition 1: The self-supervised signals from the self-consistency ===================
|
| 1417 |
+
return cam_loss, train_data[0][0][0].mean()*0, dyn_loss, train_data[0][0][0].mean()*0, point_map_loss, scale_loss, shift_loss
|
| 1418 |
+
|
| 1419 |
+
|
| 1420 |
+
# ================ Condition 2: The supervision signal given by the ground truth trajectories ===================
|
| 1421 |
+
if (
|
| 1422 |
+
(torch.isnan(traj3d_gt).any()
|
| 1423 |
+
or traj3d_gt.abs().max() > 2000) and (custom_vid==False)
|
| 1424 |
+
):
|
| 1425 |
+
return cam_loss, train_data[0][0][0].mean()*0, dyn_loss, train_data[0][0][0].mean()*0, point_map_loss, scale_loss, shift_loss
|
| 1426 |
+
|
| 1427 |
+
|
| 1428 |
+
vis_gts = [vis_gt.float()]
|
| 1429 |
+
invis_gts = [1-vis_gt.float()]
|
| 1430 |
+
traj_gts = [traj3d_gt]
|
| 1431 |
+
valids_gts = [valid_mask]
|
| 1432 |
+
seq_loss_all = sequence_loss(
|
| 1433 |
+
coord_predictions,
|
| 1434 |
+
traj_gts,
|
| 1435 |
+
valids_gts,
|
| 1436 |
+
vis=vis_gts,
|
| 1437 |
+
gamma=0.8,
|
| 1438 |
+
add_huber_loss=False,
|
| 1439 |
+
loss_only_for_visible=False if custom_vid==False else True,
|
| 1440 |
+
z_unc=z_unc,
|
| 1441 |
+
mask_traj_gt=mask_traj_gt
|
| 1442 |
+
)
|
| 1443 |
+
|
| 1444 |
+
confidence_loss = sequence_prob_loss(
|
| 1445 |
+
coord_predictions, confidence_predicitons, traj_gts, vis_gts
|
| 1446 |
+
)
|
| 1447 |
+
|
| 1448 |
+
seq_loss_xyz = sequence_loss_xyz(
|
| 1449 |
+
coords_xyz_predictions,
|
| 1450 |
+
traj_gts,
|
| 1451 |
+
valids_gts,
|
| 1452 |
+
intrs=intrs,
|
| 1453 |
+
vis=vis_gts,
|
| 1454 |
+
gamma=0.8,
|
| 1455 |
+
add_huber_loss=False,
|
| 1456 |
+
loss_only_for_visible=False,
|
| 1457 |
+
mask_traj_gt=mask_traj_gt
|
| 1458 |
+
)
|
| 1459 |
+
|
| 1460 |
+
# filter the blinking points
|
| 1461 |
+
mask_vis = vis_gts[0].clone() # B T N
|
| 1462 |
+
mask_vis[mask_vis==0] = -1
|
| 1463 |
+
blink_mask = mask_vis[:,:-1,:] * mask_vis[:,1:,:] # first derivative B (T-1) N
|
| 1464 |
+
mask_vis[:,:-1,:], mask_vis[:,-1,:] = (blink_mask == 1), 0
|
| 1465 |
+
|
| 1466 |
+
vis_loss = sequence_BCE_loss(vis_predictions, vis_gts, mask=[mask_vis])
|
| 1467 |
+
|
| 1468 |
+
track_loss_out = (seq_loss_all+2*seq_loss_xyz + cam_loss)
|
| 1469 |
+
if valid_only:
|
| 1470 |
+
vis_loss = 0.0*vis_loss
|
| 1471 |
+
if custom_vid:
|
| 1472 |
+
return seq_loss_all, 0.0*seq_loss_all, 0.0*seq_loss_all, 10*vis_loss, 0.0*seq_loss_all, 0.0*seq_loss_all, 0.0*seq_loss_all
|
| 1473 |
+
|
| 1474 |
+
return track_loss_out, confidence_loss, dyn_loss, 10*vis_loss, point_map_loss, scale_loss, shift_loss
|
| 1475 |
+
|
| 1476 |
+
|
| 1477 |
+
|
| 1478 |
+
|
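Editor's note: the stage-dependent camera/track losses above all weight every refinement iteration by an exponentially decaying factor (0.8 ** (num_iters - iter_ - 1)), so the last iteration contributes most. A minimal, self-contained sketch of that weighting scheme, with illustrative names and random data (not taken from the repository):

import torch

def weighted_iterative_loss(predictions, target, gamma=0.8):
    # predictions: list of per-iteration estimates; later iterations get weights closer to 1
    n = len(predictions)
    total = torch.zeros(())
    for i, pred in enumerate(predictions):
        weight = gamma ** (n - i - 1)
        total = total + weight * (pred - target).abs().mean()
    return total

preds = [torch.randn(2, 8, 3) for _ in range(4)]   # four refinement iterations
gt = torch.randn(2, 8, 3)
loss = weighted_iterative_loss(preds, gt)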
models/SpaTrackV2/models/tracker3D/co_tracker/cotracker_base.py
ADDED
|
@@ -0,0 +1,418 @@
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from models.SpaTrackV2.utils.model_utils import sample_features5d, bilinear_sampler
|
| 11 |
+
|
| 12 |
+
from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
|
| 13 |
+
Mlp, BasicEncoder, EfficientUpdateFormer
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
torch.manual_seed(0)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_1d_sincos_pos_embed_from_grid(
|
| 20 |
+
embed_dim: int, pos: torch.Tensor
|
| 21 |
+
) -> torch.Tensor:
|
| 22 |
+
"""
|
| 23 |
+
This function generates a 1D positional embedding from a given grid using sine and cosine functions.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
- embed_dim: The embedding dimension.
|
| 27 |
+
- pos: The position to generate the embedding from.
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
- emb: The generated 1D positional embedding.
|
| 31 |
+
"""
|
| 32 |
+
assert embed_dim % 2 == 0
|
| 33 |
+
omega = torch.arange(embed_dim // 2, dtype=torch.double)
|
| 34 |
+
omega /= embed_dim / 2.0
|
| 35 |
+
omega = 1.0 / 10000**omega # (D/2,)
|
| 36 |
+
|
| 37 |
+
pos = pos.reshape(-1) # (M,)
|
| 38 |
+
out = torch.einsum("m,d->md", pos, omega) # (M, D/2), outer product
|
| 39 |
+
|
| 40 |
+
emb_sin = torch.sin(out) # (M, D/2)
|
| 41 |
+
emb_cos = torch.cos(out) # (M, D/2)
|
| 42 |
+
|
| 43 |
+
emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
|
| 44 |
+
return emb[None].float()
|
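A quick shape check for the helper above (the positions are made up for illustration; assumes torch is imported as in this file):

positions = torch.arange(8, dtype=torch.float32)         # e.g. 8 time steps
emb = get_1d_sincos_pos_embed_from_grid(64, positions)   # sin/cos halves concatenated
assert emb.shape == (1, 8, 64)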
| 45 |
+
|
| 46 |
+
def posenc(x, min_deg, max_deg):
|
| 47 |
+
"""Cat x with a positional encoding of x with scales 2^[min_deg, max_deg-1].
|
| 48 |
+
Instead of computing [sin(x), cos(x)], we use the trig identity
|
| 49 |
+
cos(x) = sin(x + pi/2) and do one vectorized call to sin([x, x+pi/2]).
|
| 50 |
+
Args:
|
| 51 |
+
x: torch.Tensor, variables to be encoded. Note that x should be in [-pi, pi].
|
| 52 |
+
min_deg: int, the minimum (inclusive) degree of the encoding.
|
| 53 |
+
max_deg: int, the maximum (exclusive) degree of the encoding.
|
| 54 |
+
|
| 55 |
+
Returns:
|
| 56 |
+
encoded: torch.Tensor, encoded variables.
|
| 57 |
+
"""
|
| 58 |
+
if min_deg == max_deg:
|
| 59 |
+
return x
|
| 60 |
+
scales = torch.tensor(
|
| 61 |
+
[2**i for i in range(min_deg, max_deg)], dtype=x.dtype, device=x.device
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
xb = (x[..., None, :] * scales[:, None]).reshape(list(x.shape[:-1]) + [-1])
|
| 65 |
+
four_feat = torch.sin(torch.cat([xb, xb + 0.5 * torch.pi], dim=-1))
|
| 66 |
+
return torch.cat([x] + [four_feat], dim=-1)
|
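For reference, posenc returns the input concatenated with sin/cos features over max_deg - min_deg octaves, giving d + 2*d*(max_deg - min_deg) channels for a d-dimensional input. A small, illustrative shape check:

x = torch.rand(2, 16, 4) * 2 - 1             # values in [-1, 1], within the [-pi, pi] range the docstring asks for
enc = posenc(x, min_deg=0, max_deg=10)
assert enc.shape == (2, 16, 4 + 2 * 4 * 10)  # 84 channels, matching the comment in the forward pass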
| 67 |
+
|
| 68 |
+
class CoTrackerThreeBase(nn.Module):
|
| 69 |
+
def __init__(
|
| 70 |
+
self,
|
| 71 |
+
window_len=8,
|
| 72 |
+
stride=4,
|
| 73 |
+
corr_radius=3,
|
| 74 |
+
corr_levels=4,
|
| 75 |
+
num_virtual_tracks=64,
|
| 76 |
+
model_resolution=(384, 512),
|
| 77 |
+
add_space_attn=True,
|
| 78 |
+
linear_layer_for_vis_conf=True,
|
| 79 |
+
):
|
| 80 |
+
super(CoTrackerThreeBase, self).__init__()
|
| 81 |
+
self.window_len = window_len
|
| 82 |
+
self.stride = stride
|
| 83 |
+
self.corr_radius = corr_radius
|
| 84 |
+
self.corr_levels = corr_levels
|
| 85 |
+
self.hidden_dim = 256
|
| 86 |
+
self.latent_dim = 128
|
| 87 |
+
|
| 88 |
+
self.linear_layer_for_vis_conf = linear_layer_for_vis_conf
|
| 89 |
+
self.fnet = BasicEncoder(input_dim=3, output_dim=self.latent_dim, stride=stride)
|
| 90 |
+
|
| 91 |
+
highres_dim = 128
|
| 92 |
+
lowres_dim = 256
|
| 93 |
+
|
| 94 |
+
self.num_virtual_tracks = num_virtual_tracks
|
| 95 |
+
self.model_resolution = model_resolution
|
| 96 |
+
|
| 97 |
+
self.input_dim = 1110
|
| 98 |
+
|
| 99 |
+
self.updateformer = EfficientUpdateFormer(
|
| 100 |
+
space_depth=3,
|
| 101 |
+
time_depth=3,
|
| 102 |
+
input_dim=self.input_dim,
|
| 103 |
+
hidden_size=384,
|
| 104 |
+
output_dim=4,
|
| 105 |
+
mlp_ratio=4.0,
|
| 106 |
+
num_virtual_tracks=num_virtual_tracks,
|
| 107 |
+
add_space_attn=add_space_attn,
|
| 108 |
+
linear_layer_for_vis_conf=linear_layer_for_vis_conf,
|
| 109 |
+
)
|
| 110 |
+
self.corr_mlp = Mlp(in_features=49 * 49, hidden_features=384, out_features=256)
|
| 111 |
+
|
| 112 |
+
time_grid = torch.linspace(0, window_len - 1, window_len).reshape(
|
| 113 |
+
1, window_len, 1
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
self.register_buffer(
|
| 117 |
+
"time_emb", get_1d_sincos_pos_embed_from_grid(self.input_dim, time_grid[0])
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
def get_support_points(self, coords, r, reshape_back=True):
|
| 121 |
+
B, _, N, _ = coords.shape
|
| 122 |
+
device = coords.device
|
| 123 |
+
centroid_lvl = coords.reshape(B, N, 1, 1, 3)
|
| 124 |
+
|
| 125 |
+
dx = torch.linspace(-r, r, 2 * r + 1, device=device)
|
| 126 |
+
dy = torch.linspace(-r, r, 2 * r + 1, device=device)
|
| 127 |
+
|
| 128 |
+
xgrid, ygrid = torch.meshgrid(dy, dx, indexing="ij")
|
| 129 |
+
zgrid = torch.zeros_like(xgrid, device=device)
|
| 130 |
+
delta = torch.stack([zgrid, xgrid, ygrid], axis=-1)
|
| 131 |
+
delta_lvl = delta.view(1, 1, 2 * r + 1, 2 * r + 1, 3)
|
| 132 |
+
coords_lvl = centroid_lvl + delta_lvl
|
| 133 |
+
|
| 134 |
+
if reshape_back:
|
| 135 |
+
return coords_lvl.reshape(B, N, (2 * r + 1) ** 2, 3).permute(0, 2, 1, 3)
|
| 136 |
+
else:
|
| 137 |
+
return coords_lvl
|
| 138 |
+
|
| 139 |
+
def get_track_feat(self, fmaps, queried_frames, queried_coords, support_radius=0):
|
| 140 |
+
|
| 141 |
+
sample_frames = queried_frames[:, None, :, None]
|
| 142 |
+
sample_coords = torch.cat(
|
| 143 |
+
[
|
| 144 |
+
sample_frames,
|
| 145 |
+
queried_coords[:, None],
|
| 146 |
+
],
|
| 147 |
+
dim=-1,
|
| 148 |
+
)
|
| 149 |
+
support_points = self.get_support_points(sample_coords, support_radius)
|
| 150 |
+
support_track_feats = sample_features5d(fmaps, support_points)
|
| 151 |
+
return (
|
| 152 |
+
support_track_feats[:, None, support_track_feats.shape[1] // 2],
|
| 153 |
+
support_track_feats,
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
def get_correlation_feat(self, fmaps, queried_coords, radius=None, padding_mode="border"):
|
| 157 |
+
B, T, D, H_, W_ = fmaps.shape
|
| 158 |
+
N = queried_coords.shape[1]
|
| 159 |
+
if radius is None:
|
| 160 |
+
r = self.corr_radius
|
| 161 |
+
else:
|
| 162 |
+
r = radius
|
| 163 |
+
sample_coords = torch.cat(
|
| 164 |
+
[torch.zeros_like(queried_coords[..., :1]), queried_coords], dim=-1
|
| 165 |
+
)[:, None]
|
| 166 |
+
support_points = self.get_support_points(sample_coords, r, reshape_back=False)
|
| 167 |
+
correlation_feat = bilinear_sampler(
|
| 168 |
+
fmaps.reshape(B * T, D, 1, H_, W_), support_points, padding_mode=padding_mode
|
| 169 |
+
)
|
| 170 |
+
return correlation_feat.view(B, T, D, N, (2 * r + 1), (2 * r + 1)).permute(
|
| 171 |
+
0, 1, 3, 4, 5, 2
|
| 172 |
+
)
|
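For orientation: get_correlation_feat samples a (2r+1) x (2r+1) feature patch around every query, so with the default corr_radius=3 each track sees a 7x7 window per pyramid level, and the correlation volume built from it in the forward pass has 7*7*7*7 entries, which is exactly what corr_mlp (in_features=49*49) consumes. Quick arithmetic check:

r = 3                                         # default corr_radius
window = 2 * r + 1                            # 7x7 sampling window
assert window ** 2 * window ** 2 == 49 * 49   # matches Mlp(in_features=49 * 49)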
| 173 |
+
|
| 174 |
+
def interpolate_time_embed(self, x, t):
|
| 175 |
+
previous_dtype = x.dtype
|
| 176 |
+
T = self.time_emb.shape[1]
|
| 177 |
+
|
| 178 |
+
if t == T:
|
| 179 |
+
return self.time_emb
|
| 180 |
+
|
| 181 |
+
time_emb = self.time_emb.float()
|
| 182 |
+
time_emb = F.interpolate(
|
| 183 |
+
time_emb.permute(0, 2, 1), size=t, mode="linear"
|
| 184 |
+
).permute(0, 2, 1)
|
| 185 |
+
return time_emb.to(previous_dtype)
|
| 186 |
+
|
| 187 |
+
class CoTrackerThreeOffline(CoTrackerThreeBase):
|
| 188 |
+
def __init__(self, **args):
|
| 189 |
+
super(CoTrackerThreeOffline, self).__init__(**args)
|
| 190 |
+
|
| 191 |
+
def forward(
|
| 192 |
+
self,
|
| 193 |
+
video,
|
| 194 |
+
queries,
|
| 195 |
+
iters=4,
|
| 196 |
+
is_train=False,
|
| 197 |
+
add_space_attn=True,
|
| 198 |
+
fmaps_chunk_size=200,
|
| 199 |
+
):
|
| 200 |
+
"""Predict tracks
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
video (FloatTensor[B, T, 3, H, W]): input videos.
|
| 204 |
+
queries (FloatTensor[B, N, 3]): point queries.
|
| 205 |
+
iters (int, optional): number of updates. Defaults to 4.
|
| 206 |
+
is_train (bool, optional): enables training mode. Defaults to False.
|
| 207 |
+
Returns:
|
| 208 |
+
- coords_predicted (FloatTensor[B, T, N, 2]):
|
| 209 |
+
- vis_predicted (FloatTensor[B, T, N]):
|
| 210 |
+
- train_data: `None` if `is_train` is false, otherwise:
|
| 211 |
+
- all_vis_predictions (List[FloatTensor[B, S, N, 1]]):
|
| 212 |
+
- all_coords_predictions (List[FloatTensor[B, S, N, 2]]):
|
| 213 |
+
- mask (BoolTensor[B, T, N]):
|
| 214 |
+
"""
|
| 215 |
+
|
| 216 |
+
B, T, C, H, W = video.shape
|
| 217 |
+
device = queries.device
|
| 218 |
+
assert H % self.stride == 0 and W % self.stride == 0
|
| 219 |
+
|
| 220 |
+
B, N, __ = queries.shape
|
| 221 |
+
# B = batch size
|
| 222 |
+
# S_trimmed = actual number of frames in the window
|
| 223 |
+
# N = number of tracks
|
| 224 |
+
# C = color channels (3 for RGB)
|
| 225 |
+
# E = positional embedding size
|
| 226 |
+
# LRR = local receptive field radius
|
| 227 |
+
# D = dimension of the transformer input tokens
|
| 228 |
+
|
| 229 |
+
# video = B T C H W
|
| 230 |
+
# queries = B N 3
|
| 231 |
+
# coords_init = B T N 2
|
| 232 |
+
# vis_init = B T N 1
|
| 233 |
+
|
| 234 |
+
assert T >= 1  # at least one frame is required (two or more for meaningful tracking)
|
| 235 |
+
|
| 236 |
+
video = 2 * (video / 255.0) - 1.0
|
| 237 |
+
dtype = video.dtype
|
| 238 |
+
queried_frames = queries[:, :, 0].long()
|
| 239 |
+
|
| 240 |
+
queried_coords = queries[..., 1:3]
|
| 241 |
+
queried_coords = queried_coords / self.stride
|
| 242 |
+
|
| 243 |
+
# We store our predictions here
|
| 244 |
+
all_coords_predictions, all_vis_predictions, all_confidence_predictions = (
|
| 245 |
+
[],
|
| 246 |
+
[],
|
| 247 |
+
[],
|
| 248 |
+
)
|
| 249 |
+
C_ = C
|
| 250 |
+
H4, W4 = H // self.stride, W // self.stride
|
| 251 |
+
# Compute convolutional features for the video or for the current chunk in case of online mode
|
| 252 |
+
|
| 253 |
+
if T > fmaps_chunk_size:
|
| 254 |
+
fmaps = []
|
| 255 |
+
for t in range(0, T, fmaps_chunk_size):
|
| 256 |
+
video_chunk = video[:, t : t + fmaps_chunk_size]
|
| 257 |
+
fmaps_chunk = self.fnet(video_chunk.reshape(-1, C_, H, W))
|
| 258 |
+
T_chunk = video_chunk.shape[1]
|
| 259 |
+
C_chunk, H_chunk, W_chunk = fmaps_chunk.shape[1:]
|
| 260 |
+
fmaps.append(fmaps_chunk.reshape(B, T_chunk, C_chunk, H_chunk, W_chunk))
|
| 261 |
+
fmaps = torch.cat(fmaps, dim=1).reshape(-1, C_chunk, H_chunk, W_chunk)
|
| 262 |
+
else:
|
| 263 |
+
fmaps = self.fnet(video.reshape(-1, C_, H, W))
|
| 264 |
+
fmaps = fmaps.permute(0, 2, 3, 1)
|
| 265 |
+
fmaps = fmaps / torch.sqrt(
|
| 266 |
+
torch.maximum(
|
| 267 |
+
torch.sum(torch.square(fmaps), axis=-1, keepdims=True),
|
| 268 |
+
torch.tensor(1e-12, device=fmaps.device),
|
| 269 |
+
)
|
| 270 |
+
)
|
| 271 |
+
fmaps = fmaps.permute(0, 3, 1, 2).reshape(
|
| 272 |
+
B, -1, self.latent_dim, H // self.stride, W // self.stride
|
| 273 |
+
)
|
| 274 |
+
fmaps = fmaps.to(dtype)
|
| 275 |
+
|
| 276 |
+
# We compute track features
|
| 277 |
+
fmaps_pyramid = []
|
| 278 |
+
track_feat_pyramid = []
|
| 279 |
+
track_feat_support_pyramid = []
|
| 280 |
+
fmaps_pyramid.append(fmaps)
|
| 281 |
+
for i in range(self.corr_levels - 1):
|
| 282 |
+
fmaps_ = fmaps.reshape(
|
| 283 |
+
B * T, self.latent_dim, fmaps.shape[-2], fmaps.shape[-1]
|
| 284 |
+
)
|
| 285 |
+
fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
|
| 286 |
+
fmaps = fmaps_.reshape(
|
| 287 |
+
B, T, self.latent_dim, fmaps_.shape[-2], fmaps_.shape[-1]
|
| 288 |
+
)
|
| 289 |
+
fmaps_pyramid.append(fmaps)
|
| 290 |
+
|
| 291 |
+
for i in range(self.corr_levels):
|
| 292 |
+
track_feat, track_feat_support = self.get_track_feat(
|
| 293 |
+
fmaps_pyramid[i],
|
| 294 |
+
queried_frames,
|
| 295 |
+
queried_coords / 2**i,
|
| 296 |
+
support_radius=self.corr_radius,
|
| 297 |
+
)
|
| 298 |
+
track_feat_pyramid.append(track_feat.repeat(1, T, 1, 1))
|
| 299 |
+
track_feat_support_pyramid.append(track_feat_support.unsqueeze(1))
|
| 300 |
+
|
| 301 |
+
D_coords = 2
|
| 302 |
+
|
| 303 |
+
coord_preds, vis_preds, confidence_preds = [], [], []
|
| 304 |
+
|
| 305 |
+
vis = torch.zeros((B, T, N), device=device).float()
|
| 306 |
+
confidence = torch.zeros((B, T, N), device=device).float()
|
| 307 |
+
coords = queried_coords.reshape(B, 1, N, 2).expand(B, T, N, 2).float()
|
| 308 |
+
|
| 309 |
+
r = 2 * self.corr_radius + 1
|
| 310 |
+
|
| 311 |
+
for it in range(iters):
|
| 312 |
+
coords = coords.detach() # B T N 2
|
| 313 |
+
coords_init = coords.view(B * T, N, 2)
|
| 314 |
+
corr_embs = []
|
| 315 |
+
corr_feats = []
|
| 316 |
+
for i in range(self.corr_levels):
|
| 317 |
+
corr_feat = self.get_correlation_feat(
|
| 318 |
+
fmaps_pyramid[i], coords_init / 2**i
|
| 319 |
+
)
|
| 320 |
+
track_feat_support = (
|
| 321 |
+
track_feat_support_pyramid[i]
|
| 322 |
+
.view(B, 1, r, r, N, self.latent_dim)
|
| 323 |
+
.squeeze(1)
|
| 324 |
+
.permute(0, 3, 1, 2, 4)
|
| 325 |
+
)
|
| 326 |
+
corr_volume = torch.einsum(
|
| 327 |
+
"btnhwc,bnijc->btnhwij", corr_feat, track_feat_support
|
| 328 |
+
)
|
| 329 |
+
corr_emb = self.corr_mlp(corr_volume.reshape(B * T * N, r * r * r * r))
|
| 330 |
+
corr_embs.append(corr_emb)
|
| 331 |
+
corr_embs = torch.cat(corr_embs, dim=-1)
|
| 332 |
+
corr_embs = corr_embs.view(B, T, N, corr_embs.shape[-1])
|
| 333 |
+
|
| 334 |
+
transformer_input = [vis[..., None], confidence[..., None], corr_embs]
|
| 335 |
+
|
| 336 |
+
rel_coords_forward = coords[:, :-1] - coords[:, 1:]
|
| 337 |
+
rel_coords_backward = coords[:, 1:] - coords[:, :-1]
|
| 338 |
+
|
| 339 |
+
rel_coords_forward = torch.nn.functional.pad(
|
| 340 |
+
rel_coords_forward, (0, 0, 0, 0, 0, 1)
|
| 341 |
+
)
|
| 342 |
+
rel_coords_backward = torch.nn.functional.pad(
|
| 343 |
+
rel_coords_backward, (0, 0, 0, 0, 1, 0)
|
| 344 |
+
)
|
| 345 |
+
scale = (
|
| 346 |
+
torch.tensor(
|
| 347 |
+
[self.model_resolution[1], self.model_resolution[0]],
|
| 348 |
+
device=coords.device,
|
| 349 |
+
)
|
| 350 |
+
/ self.stride
|
| 351 |
+
)
|
| 352 |
+
rel_coords_forward = rel_coords_forward / scale
|
| 353 |
+
rel_coords_backward = rel_coords_backward / scale
|
| 354 |
+
|
| 355 |
+
rel_pos_emb_input = posenc(
|
| 356 |
+
torch.cat([rel_coords_forward, rel_coords_backward], dim=-1),
|
| 357 |
+
min_deg=0,
|
| 358 |
+
max_deg=10,
|
| 359 |
+
) # batch, num_points, num_frames, 84
|
| 360 |
+
transformer_input.append(rel_pos_emb_input)
|
| 361 |
+
|
| 362 |
+
x = (
|
| 363 |
+
torch.cat(transformer_input, dim=-1)
|
| 364 |
+
.permute(0, 2, 1, 3)
|
| 365 |
+
.reshape(B * N, T, -1)
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
x = x + self.interpolate_time_embed(x, T)
|
| 369 |
+
x = x.view(B, N, T, -1) # (B N) T D -> B N T D
|
| 370 |
+
|
| 371 |
+
delta = self.updateformer(
|
| 372 |
+
x,
|
| 373 |
+
add_space_attn=add_space_attn,
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
delta_coords = delta[..., :D_coords].permute(0, 2, 1, 3)
|
| 377 |
+
delta_vis = delta[..., D_coords].permute(0, 2, 1)
|
| 378 |
+
delta_confidence = delta[..., D_coords + 1].permute(0, 2, 1)
|
| 379 |
+
|
| 380 |
+
vis = vis + delta_vis
|
| 381 |
+
confidence = confidence + delta_confidence
|
| 382 |
+
|
| 383 |
+
coords = coords + delta_coords
|
| 384 |
+
coords_append = coords.clone()
|
| 385 |
+
coords_append[..., :2] = coords_append[..., :2] * float(self.stride)
|
| 386 |
+
coord_preds.append(coords_append)
|
| 387 |
+
vis_preds.append(torch.sigmoid(vis))
|
| 388 |
+
confidence_preds.append(torch.sigmoid(confidence))
|
| 389 |
+
|
| 390 |
+
if is_train:
|
| 391 |
+
all_coords_predictions.append([coord[..., :2] for coord in coord_preds])
|
| 392 |
+
all_vis_predictions.append(vis_preds)
|
| 393 |
+
all_confidence_predictions.append(confidence_preds)
|
| 394 |
+
|
| 395 |
+
if is_train:
|
| 396 |
+
train_data = (
|
| 397 |
+
all_coords_predictions,
|
| 398 |
+
all_vis_predictions,
|
| 399 |
+
all_confidence_predictions,
|
| 400 |
+
torch.ones_like(vis_preds[-1], device=vis_preds[-1].device),
|
| 401 |
+
)
|
| 402 |
+
else:
|
| 403 |
+
train_data = None
|
| 404 |
+
|
| 405 |
+
return coord_preds[-1][..., :2], vis_preds[-1], confidence_preds[-1], train_data
|
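A hedged inference sketch for the offline tracker above; tensor sizes are placeholders and no pretrained weights are loaded here (see the __main__ block below for checkpoint loading):

model = CoTrackerThreeOffline(stride=4, corr_radius=3, window_len=60).eval()
video = torch.randint(0, 255, (1, 24, 3, 384, 512)).float()      # B T C H W, raw RGB in [0, 255]
queries = torch.tensor([[[0.0, 100.0, 150.0],                     # (start frame, x, y) per query
                         [0.0, 200.0, 300.0]]])
with torch.no_grad():
    coords, vis, conf, _ = model(video, queries, iters=4)
# coords: (1, 24, 2, 2) pixel tracks; vis / conf: (1, 24, 2) probabilities in [0, 1]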
| 406 |
+
|
| 407 |
+
|
| 408 |
+
if __name__ == "__main__":
|
| 409 |
+
cotrack_cktp = "/data0/xyx/scaled_offline.pth"
|
| 410 |
+
cotracker = CoTrackerThreeOffline(
|
| 411 |
+
stride=4, corr_radius=3, window_len=60
|
| 412 |
+
)
|
| 413 |
+
with open(cotrack_cktp, "rb") as f:
|
| 414 |
+
state_dict = torch.load(f, map_location="cpu")
|
| 415 |
+
if "model" in state_dict:
|
| 416 |
+
state_dict = state_dict["model"]
|
| 417 |
+
cotracker.load_state_dict(state_dict)
|
| 418 |
+
import pdb; pdb.set_trace()
|
models/SpaTrackV2/models/tracker3D/co_tracker/utils.py
ADDED
|
@@ -0,0 +1,929 @@
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from functools import partial
|
| 5 |
+
from typing import Callable, List
|
| 6 |
+
import collections
|
| 7 |
+
from torch import Tensor
|
| 8 |
+
from itertools import repeat
|
| 9 |
+
from models.SpaTrackV2.utils.model_utils import bilinear_sampler
|
| 10 |
+
from models.SpaTrackV2.models.blocks import CrossAttnBlock as CrossAttnBlock_F
|
| 11 |
+
from torch.nn.functional import scaled_dot_product_attention
|
| 12 |
+
from torch.nn.attention import sdpa_kernel, SDPBackend
|
| 13 |
+
# import flash_attn
|
| 14 |
+
EPS = 1e-6
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ResidualBlock(nn.Module):
|
| 18 |
+
def __init__(self, in_planes, planes, norm_fn="group", stride=1):
|
| 19 |
+
super(ResidualBlock, self).__init__()
|
| 20 |
+
|
| 21 |
+
self.conv1 = nn.Conv2d(
|
| 22 |
+
in_planes,
|
| 23 |
+
planes,
|
| 24 |
+
kernel_size=3,
|
| 25 |
+
padding=1,
|
| 26 |
+
stride=stride,
|
| 27 |
+
padding_mode="zeros",
|
| 28 |
+
)
|
| 29 |
+
self.conv2 = nn.Conv2d(
|
| 30 |
+
planes, planes, kernel_size=3, padding=1, padding_mode="zeros"
|
| 31 |
+
)
|
| 32 |
+
self.relu = nn.ReLU(inplace=True)
|
| 33 |
+
|
| 34 |
+
num_groups = planes // 8
|
| 35 |
+
|
| 36 |
+
if norm_fn == "group":
|
| 37 |
+
self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
|
| 38 |
+
self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
|
| 39 |
+
if not stride == 1:
|
| 40 |
+
self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
|
| 41 |
+
|
| 42 |
+
elif norm_fn == "batch":
|
| 43 |
+
self.norm1 = nn.BatchNorm2d(planes)
|
| 44 |
+
self.norm2 = nn.BatchNorm2d(planes)
|
| 45 |
+
if not stride == 1:
|
| 46 |
+
self.norm3 = nn.BatchNorm2d(planes)
|
| 47 |
+
|
| 48 |
+
elif norm_fn == "instance":
|
| 49 |
+
self.norm1 = nn.InstanceNorm2d(planes)
|
| 50 |
+
self.norm2 = nn.InstanceNorm2d(planes)
|
| 51 |
+
if not stride == 1:
|
| 52 |
+
self.norm3 = nn.InstanceNorm2d(planes)
|
| 53 |
+
|
| 54 |
+
elif norm_fn == "none":
|
| 55 |
+
self.norm1 = nn.Sequential()
|
| 56 |
+
self.norm2 = nn.Sequential()
|
| 57 |
+
if not stride == 1:
|
| 58 |
+
self.norm3 = nn.Sequential()
|
| 59 |
+
|
| 60 |
+
if stride == 1:
|
| 61 |
+
self.downsample = None
|
| 62 |
+
|
| 63 |
+
else:
|
| 64 |
+
self.downsample = nn.Sequential(
|
| 65 |
+
nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
y = x
|
| 70 |
+
y = self.relu(self.norm1(self.conv1(y)))
|
| 71 |
+
y = self.relu(self.norm2(self.conv2(y)))
|
| 72 |
+
|
| 73 |
+
if self.downsample is not None:
|
| 74 |
+
x = self.downsample(x)
|
| 75 |
+
|
| 76 |
+
return self.relu(x + y)
|
| 77 |
+
|
| 78 |
+
def reduce_masked_mean(input, mask, dim=None, keepdim=False):
|
| 79 |
+
r"""Masked mean
|
| 80 |
+
|
| 81 |
+
`reduce_masked_mean(x, mask)` computes the mean of a tensor :attr:`input`
|
| 82 |
+
over a mask :attr:`mask`, returning
|
| 83 |
+
|
| 84 |
+
.. math::
|
| 85 |
+
\text{output} =
|
| 86 |
+
\frac
|
| 87 |
+
{\sum_{i=1}^N \text{input}_i \cdot \text{mask}_i}
|
| 88 |
+
{\epsilon + \sum_{i=1}^N \text{mask}_i}
|
| 89 |
+
|
| 90 |
+
where :math:`N` is the number of elements in :attr:`input` and
|
| 91 |
+
:attr:`mask`, and :math:`\epsilon` is a small constant to avoid
|
| 92 |
+
division by zero.
|
| 93 |
+
|
| 94 |
+
`reduced_masked_mean(x, mask, dim)` computes the mean of a tensor
|
| 95 |
+
:attr:`input` over a mask :attr:`mask` along a dimension :attr:`dim`.
|
| 96 |
+
Optionally, the dimension can be kept in the output by setting
|
| 97 |
+
:attr:`keepdim` to `True`. Tensor :attr:`mask` must be broadcastable to
|
| 98 |
+
the same dimension as :attr:`input`.
|
| 99 |
+
|
| 100 |
+
The interface is similar to `torch.mean()`.
|
| 101 |
+
|
| 102 |
+
Args:
|
| 103 |
+
input (Tensor): input tensor.
|
| 104 |
+
mask (Tensor): mask.
|
| 105 |
+
dim (int, optional): Dimension to sum over. Defaults to None.
|
| 106 |
+
keepdim (bool, optional): Keep the summed dimension. Defaults to False.
|
| 107 |
+
|
| 108 |
+
Returns:
|
| 109 |
+
Tensor: mean tensor.
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
mask = mask.expand_as(input)
|
| 113 |
+
|
| 114 |
+
prod = input * mask
|
| 115 |
+
|
| 116 |
+
if dim is None:
|
| 117 |
+
numer = torch.sum(prod)
|
| 118 |
+
denom = torch.sum(mask)
|
| 119 |
+
else:
|
| 120 |
+
numer = torch.sum(prod, dim=dim, keepdim=keepdim)
|
| 121 |
+
denom = torch.sum(mask, dim=dim, keepdim=keepdim)
|
| 122 |
+
|
| 123 |
+
mean = numer / (EPS + denom)
|
| 124 |
+
return mean
|
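A small sanity check for the masked mean above (values chosen only for illustration):

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
m = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
print(reduce_masked_mean(x, m))          # ~1.5: only the first two entries count
print(reduce_masked_mean(x, m, dim=1))   # same value, reduced along dim 1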
| 125 |
+
|
| 126 |
+
class GeometryEncoder(nn.Module):
|
| 127 |
+
def __init__(self, input_dim=3, output_dim=128, stride=4):
|
| 128 |
+
super(GeometryEncoder, self).__init__()
|
| 129 |
+
self.stride = stride
|
| 130 |
+
self.norm_fn = "instance"
|
| 131 |
+
self.in_planes = output_dim // 2
|
| 132 |
+
self.norm1 = nn.InstanceNorm2d(self.in_planes)
|
| 133 |
+
self.norm2 = nn.InstanceNorm2d(output_dim * 2)
|
| 134 |
+
self.conv1 = nn.Conv2d(
|
| 135 |
+
input_dim,
|
| 136 |
+
self.in_planes,
|
| 137 |
+
kernel_size=7,
|
| 138 |
+
stride=2,
|
| 139 |
+
padding=3,
|
| 140 |
+
padding_mode="zeros",
|
| 141 |
+
)
|
| 142 |
+
self.relu1 = nn.ReLU(inplace=True)
|
| 143 |
+
self.layer1 = self._make_layer(output_dim // 2, stride=1)
|
| 144 |
+
self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
|
| 145 |
+
|
| 146 |
+
self.conv2 = nn.Conv2d(
|
| 147 |
+
output_dim * 5 // 4,
|
| 148 |
+
output_dim,
|
| 149 |
+
kernel_size=3,
|
| 150 |
+
padding=1,
|
| 151 |
+
padding_mode="zeros",
|
| 152 |
+
)
|
| 153 |
+
self.relu2 = nn.ReLU(inplace=True)
|
| 154 |
+
self.conv3 = nn.Conv2d(output_dim, output_dim, kernel_size=1)
|
| 155 |
+
for m in self.modules():
|
| 156 |
+
if isinstance(m, nn.Conv2d):
|
| 157 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 158 |
+
elif isinstance(m, (nn.InstanceNorm2d)):
|
| 159 |
+
if m.weight is not None:
|
| 160 |
+
nn.init.constant_(m.weight, 1)
|
| 161 |
+
if m.bias is not None:
|
| 162 |
+
nn.init.constant_(m.bias, 0)
|
| 163 |
+
|
| 164 |
+
def _make_layer(self, dim, stride=1):
|
| 165 |
+
layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
|
| 166 |
+
layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
|
| 167 |
+
layers = (layer1, layer2)
|
| 168 |
+
|
| 169 |
+
self.in_planes = dim
|
| 170 |
+
return nn.Sequential(*layers)
|
| 171 |
+
|
| 172 |
+
def forward(self, x):
|
| 173 |
+
_, _, H, W = x.shape
|
| 174 |
+
x = self.conv1(x)
|
| 175 |
+
x = self.norm1(x)
|
| 176 |
+
x = self.relu1(x)
|
| 177 |
+
a = self.layer1(x)
|
| 178 |
+
b = self.layer2(a)
|
| 179 |
+
def _bilinear_intepolate(x):
|
| 180 |
+
return F.interpolate(
|
| 181 |
+
x,
|
| 182 |
+
(H // self.stride, W // self.stride),
|
| 183 |
+
mode="bilinear",
|
| 184 |
+
align_corners=True,
|
| 185 |
+
)
|
| 186 |
+
a = _bilinear_intepolate(a)
|
| 187 |
+
b = _bilinear_intepolate(b)
|
| 188 |
+
x = self.conv2(torch.cat([a, b], dim=1))
|
| 189 |
+
x = self.norm2(x)
|
| 190 |
+
x = self.relu2(x)
|
| 191 |
+
x = self.conv3(x)
|
| 192 |
+
return x
|
| 193 |
+
|
| 194 |
+
class BasicEncoder(nn.Module):
|
| 195 |
+
def __init__(self, input_dim=3, output_dim=128, stride=4):
|
| 196 |
+
super(BasicEncoder, self).__init__()
|
| 197 |
+
self.stride = stride
|
| 198 |
+
self.norm_fn = "instance"
|
| 199 |
+
self.in_planes = output_dim // 2
|
| 200 |
+
self.norm1 = nn.InstanceNorm2d(self.in_planes)
|
| 201 |
+
self.norm2 = nn.InstanceNorm2d(output_dim * 2)
|
| 202 |
+
|
| 203 |
+
self.conv1 = nn.Conv2d(
|
| 204 |
+
input_dim,
|
| 205 |
+
self.in_planes,
|
| 206 |
+
kernel_size=7,
|
| 207 |
+
stride=2,
|
| 208 |
+
padding=3,
|
| 209 |
+
padding_mode="zeros",
|
| 210 |
+
)
|
| 211 |
+
self.relu1 = nn.ReLU(inplace=True)
|
| 212 |
+
self.layer1 = self._make_layer(output_dim // 2, stride=1)
|
| 213 |
+
self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
|
| 214 |
+
self.layer3 = self._make_layer(output_dim, stride=2)
|
| 215 |
+
self.layer4 = self._make_layer(output_dim, stride=2)
|
| 216 |
+
|
| 217 |
+
self.conv2 = nn.Conv2d(
|
| 218 |
+
output_dim * 3 + output_dim // 4,
|
| 219 |
+
output_dim * 2,
|
| 220 |
+
kernel_size=3,
|
| 221 |
+
padding=1,
|
| 222 |
+
padding_mode="zeros",
|
| 223 |
+
)
|
| 224 |
+
self.relu2 = nn.ReLU(inplace=True)
|
| 225 |
+
self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
|
| 226 |
+
for m in self.modules():
|
| 227 |
+
if isinstance(m, nn.Conv2d):
|
| 228 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 229 |
+
elif isinstance(m, (nn.InstanceNorm2d)):
|
| 230 |
+
if m.weight is not None:
|
| 231 |
+
nn.init.constant_(m.weight, 1)
|
| 232 |
+
if m.bias is not None:
|
| 233 |
+
nn.init.constant_(m.bias, 0)
|
| 234 |
+
|
| 235 |
+
def _make_layer(self, dim, stride=1):
|
| 236 |
+
layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
|
| 237 |
+
layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
|
| 238 |
+
layers = (layer1, layer2)
|
| 239 |
+
|
| 240 |
+
self.in_planes = dim
|
| 241 |
+
return nn.Sequential(*layers)
|
| 242 |
+
|
| 243 |
+
def forward(self, x):
|
| 244 |
+
_, _, H, W = x.shape
|
| 245 |
+
|
| 246 |
+
x = self.conv1(x)
|
| 247 |
+
x = self.norm1(x)
|
| 248 |
+
x = self.relu1(x)
|
| 249 |
+
|
| 250 |
+
a = self.layer1(x)
|
| 251 |
+
b = self.layer2(a)
|
| 252 |
+
c = self.layer3(b)
|
| 253 |
+
d = self.layer4(c)
|
| 254 |
+
|
| 255 |
+
def _bilinear_intepolate(x):
|
| 256 |
+
return F.interpolate(
|
| 257 |
+
x,
|
| 258 |
+
(H // self.stride, W // self.stride),
|
| 259 |
+
mode="bilinear",
|
| 260 |
+
align_corners=True,
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
a = _bilinear_intepolate(a)
|
| 264 |
+
b = _bilinear_intepolate(b)
|
| 265 |
+
c = _bilinear_intepolate(c)
|
| 266 |
+
d = _bilinear_intepolate(d)
|
| 267 |
+
|
| 268 |
+
x = self.conv2(torch.cat([a, b, c, d], dim=1))
|
| 269 |
+
x = self.norm2(x)
|
| 270 |
+
x = self.relu2(x)
|
| 271 |
+
x = self.conv3(x)
|
| 272 |
+
return x
|
| 273 |
+
|
| 274 |
+
# From PyTorch internals
|
| 275 |
+
def _ntuple(n):
|
| 276 |
+
def parse(x):
|
| 277 |
+
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
|
| 278 |
+
return tuple(x)
|
| 279 |
+
return tuple(repeat(x, n))
|
| 280 |
+
|
| 281 |
+
return parse
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def exists(val):
|
| 285 |
+
return val is not None
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def default(val, d):
|
| 289 |
+
return val if exists(val) else d
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
to_2tuple = _ntuple(2)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
class Mlp(nn.Module):
|
| 296 |
+
"""MLP as used in Vision Transformer, MLP-Mixer and related networks"""
|
| 297 |
+
|
| 298 |
+
def __init__(
|
| 299 |
+
self,
|
| 300 |
+
in_features,
|
| 301 |
+
hidden_features=None,
|
| 302 |
+
out_features=None,
|
| 303 |
+
act_layer=nn.GELU,
|
| 304 |
+
norm_layer=None,
|
| 305 |
+
bias=True,
|
| 306 |
+
drop=0.0,
|
| 307 |
+
use_conv=False,
|
| 308 |
+
):
|
| 309 |
+
super().__init__()
|
| 310 |
+
out_features = out_features or in_features
|
| 311 |
+
hidden_features = hidden_features or in_features
|
| 312 |
+
bias = to_2tuple(bias)
|
| 313 |
+
drop_probs = to_2tuple(drop)
|
| 314 |
+
linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
|
| 315 |
+
|
| 316 |
+
self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
|
| 317 |
+
self.act = act_layer()
|
| 318 |
+
self.drop1 = nn.Dropout(drop_probs[0])
|
| 319 |
+
self.norm = (
|
| 320 |
+
norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
|
| 321 |
+
)
|
| 322 |
+
self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
|
| 323 |
+
self.drop2 = nn.Dropout(drop_probs[1])
|
| 324 |
+
|
| 325 |
+
def forward(self, x):
|
| 326 |
+
x = self.fc1(x)
|
| 327 |
+
x = self.act(x)
|
| 328 |
+
x = self.drop1(x)
|
| 329 |
+
x = self.fc2(x)
|
| 330 |
+
x = self.drop2(x)
|
| 331 |
+
return x
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
class Attention(nn.Module):
|
| 335 |
+
def __init__(
|
| 336 |
+
self, query_dim, context_dim=None, num_heads=8, dim_head=48, qkv_bias=False
|
| 337 |
+
):
|
| 338 |
+
super().__init__()
|
| 339 |
+
inner_dim = dim_head * num_heads
|
| 340 |
+
self.inner_dim = inner_dim
|
| 341 |
+
context_dim = default(context_dim, query_dim)
|
| 342 |
+
self.scale = dim_head**-0.5
|
| 343 |
+
self.heads = num_heads
|
| 344 |
+
|
| 345 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
|
| 346 |
+
self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
|
| 347 |
+
self.to_out = nn.Linear(inner_dim, query_dim)
|
| 348 |
+
|
| 349 |
+
def forward(self, x, context=None, attn_bias=None, flash=True):
|
| 350 |
+
B, N1, C = x.shape
|
| 351 |
+
h = self.heads
|
| 352 |
+
|
| 353 |
+
q = self.to_q(x).reshape(B, N1, h, self.inner_dim // h).permute(0, 2, 1, 3)
|
| 354 |
+
context = default(context, x)
|
| 355 |
+
k, v = self.to_kv(context).chunk(2, dim=-1)
|
| 356 |
+
|
| 357 |
+
N2 = context.shape[1]
|
| 358 |
+
k = k.reshape(B, N2, h, self.inner_dim // h).permute(0, 2, 1, 3)
|
| 359 |
+
v = v.reshape(B, N2, h, self.inner_dim // h).permute(0, 2, 1, 3)
|
| 360 |
+
|
| 361 |
+
if (
|
| 362 |
+
(N1 < 64 and N2 < 64) or
|
| 363 |
+
(B > 1e4) or
|
| 364 |
+
(q.shape[1] != k.shape[1]) or
|
| 365 |
+
(q.shape[1] % k.shape[1] != 0)
|
| 366 |
+
):
|
| 367 |
+
flash = False
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
if flash == False:
|
| 371 |
+
sim = (q @ k.transpose(-2, -1)) * self.scale
|
| 372 |
+
if attn_bias is not None:
|
| 373 |
+
sim = sim + attn_bias
|
| 374 |
+
if sim.abs().max() > 1e2:
|
| 375 |
+
import pdb; pdb.set_trace()
|
| 376 |
+
attn = sim.softmax(dim=-1)
|
| 377 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N1, self.inner_dim)
|
| 378 |
+
else:
|
| 379 |
+
|
| 380 |
+
input_args = [x.contiguous() for x in [q, k, v]]
|
| 381 |
+
try:
|
| 382 |
+
# print(f"q.shape: {q.shape}, dtype: {q.dtype}, device: {q.device}")
|
| 383 |
+
# print(f"Flash SDP available: {torch.backends.cuda.flash_sdp_enabled()}")
|
| 384 |
+
# print(f"Flash SDP allowed: {torch.backends.cuda.enable_flash_sdp}")
|
| 385 |
+
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):  # use the non-deprecated API imported at the top of this file
|
| 386 |
+
x = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).reshape(B,N1,-1) # type: ignore
|
| 387 |
+
except Exception as e:
|
| 388 |
+
print(e)
|
| 389 |
+
|
| 390 |
+
if self.to_out.bias.dtype != x.dtype:
|
| 391 |
+
x = x.to(self.to_out.bias.dtype)
|
| 392 |
+
|
| 393 |
+
return self.to_out(x)
|
| 394 |
+
|
| 395 |
+
class CrossAttnBlock(nn.Module):
|
| 396 |
+
def __init__(
|
| 397 |
+
self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs
|
| 398 |
+
):
|
| 399 |
+
super().__init__()
|
| 400 |
+
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 401 |
+
self.norm_context = nn.LayerNorm(context_dim)
|
| 402 |
+
self.cross_attn = Attention(
|
| 403 |
+
hidden_size,
|
| 404 |
+
context_dim=context_dim,
|
| 405 |
+
num_heads=num_heads,
|
| 406 |
+
qkv_bias=True,
|
| 407 |
+
**block_kwargs
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 411 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
| 412 |
+
approx_gelu = lambda: nn.GELU(approximate="tanh")
|
| 413 |
+
self.mlp = Mlp(
|
| 414 |
+
in_features=hidden_size,
|
| 415 |
+
hidden_features=mlp_hidden_dim,
|
| 416 |
+
act_layer=approx_gelu,
|
| 417 |
+
drop=0,
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
def forward(self, x, context, mask=None):
|
| 421 |
+
attn_bias = None
|
| 422 |
+
if mask is not None:
|
| 423 |
+
if mask.shape[1] == x.shape[1]:
|
| 424 |
+
mask = mask[:, None, :, None].expand(
|
| 425 |
+
-1, self.cross_attn.heads, -1, context.shape[1]
|
| 426 |
+
)
|
| 427 |
+
else:
|
| 428 |
+
mask = mask[:, None, None].expand(
|
| 429 |
+
-1, self.cross_attn.heads, x.shape[1], -1
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
max_neg_value = -torch.finfo(x.dtype).max
|
| 433 |
+
attn_bias = (~mask) * max_neg_value
|
| 434 |
+
x = x + self.cross_attn(
|
| 435 |
+
self.norm1(x), context=self.norm_context(context), attn_bias=attn_bias
|
| 436 |
+
)
|
| 437 |
+
x = x + self.mlp(self.norm2(x))
|
| 438 |
+
return x
|
| 439 |
+
|
| 440 |
+
class AttnBlock(nn.Module):
|
| 441 |
+
def __init__(
|
| 442 |
+
self,
|
| 443 |
+
hidden_size,
|
| 444 |
+
num_heads,
|
| 445 |
+
attn_class: Callable[..., nn.Module] = Attention,
|
| 446 |
+
mlp_ratio=4.0,
|
| 447 |
+
**block_kwargs
|
| 448 |
+
):
|
| 449 |
+
super().__init__()
|
| 450 |
+
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 451 |
+
self.attn = attn_class(
|
| 452 |
+
hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs
|
| 453 |
+
)
|
| 454 |
+
|
| 455 |
+
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 456 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
| 457 |
+
approx_gelu = lambda: nn.GELU(approximate="tanh")
|
| 458 |
+
self.mlp = Mlp(
|
| 459 |
+
in_features=hidden_size,
|
| 460 |
+
hidden_features=mlp_hidden_dim,
|
| 461 |
+
act_layer=approx_gelu,
|
| 462 |
+
drop=0,
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
def forward(self, x, mask=None):
|
| 466 |
+
attn_bias = mask
|
| 467 |
+
if mask is not None:
|
| 468 |
+
mask = (
|
| 469 |
+
(mask[:, None] * mask[:, :, None])
|
| 470 |
+
.unsqueeze(1)
|
| 471 |
+
.expand(-1, self.attn.num_heads, -1, -1)
|
| 472 |
+
)
|
| 473 |
+
max_neg_value = -torch.finfo(x.dtype).max
|
| 474 |
+
attn_bias = (~mask) * max_neg_value
|
| 475 |
+
x = x + self.attn(self.norm1(x), attn_bias=attn_bias)
|
| 476 |
+
x = x + self.mlp(self.norm2(x))
|
| 477 |
+
return x
|
| 478 |
+
|
| 479 |
+
class EfficientUpdateFormer(nn.Module):
|
| 480 |
+
"""
|
| 481 |
+
Transformer model that updates track estimates.
|
| 482 |
+
"""
|
| 483 |
+
|
| 484 |
+
def __init__(
|
| 485 |
+
self,
|
| 486 |
+
space_depth=6,
|
| 487 |
+
time_depth=6,
|
| 488 |
+
input_dim=320,
|
| 489 |
+
hidden_size=384,
|
| 490 |
+
num_heads=8,
|
| 491 |
+
output_dim=130,
|
| 492 |
+
mlp_ratio=4.0,
|
| 493 |
+
num_virtual_tracks=64,
|
| 494 |
+
add_space_attn=True,
|
| 495 |
+
linear_layer_for_vis_conf=False,
|
| 496 |
+
patch_feat=False,
|
| 497 |
+
patch_dim=128,
|
| 498 |
+
):
|
| 499 |
+
super().__init__()
|
| 500 |
+
self.out_channels = 2
|
| 501 |
+
self.num_heads = num_heads
|
| 502 |
+
self.hidden_size = hidden_size
|
| 503 |
+
self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
|
| 504 |
+
if linear_layer_for_vis_conf:
|
| 505 |
+
self.flow_head = torch.nn.Linear(hidden_size, output_dim - 2, bias=True)
|
| 506 |
+
self.vis_conf_head = torch.nn.Linear(hidden_size, 2, bias=True)
|
| 507 |
+
else:
|
| 508 |
+
self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
|
| 509 |
+
|
| 510 |
+
if patch_feat==False:
|
| 511 |
+
self.virual_tracks = nn.Parameter(
|
| 512 |
+
torch.randn(1, num_virtual_tracks, 1, hidden_size)
|
| 513 |
+
)
|
| 514 |
+
self.num_virtual_tracks = num_virtual_tracks
|
| 515 |
+
else:
|
| 516 |
+
self.patch_proj = nn.Linear(patch_dim, hidden_size, bias=True)
|
| 517 |
+
|
| 518 |
+
self.add_space_attn = add_space_attn
|
| 519 |
+
self.linear_layer_for_vis_conf = linear_layer_for_vis_conf
|
| 520 |
+
self.time_blocks = nn.ModuleList(
|
| 521 |
+
[
|
| 522 |
+
AttnBlock(
|
| 523 |
+
hidden_size,
|
| 524 |
+
num_heads,
|
| 525 |
+
mlp_ratio=mlp_ratio,
|
| 526 |
+
attn_class=Attention,
|
| 527 |
+
)
|
| 528 |
+
for _ in range(time_depth)
|
| 529 |
+
]
|
| 530 |
+
)
|
| 531 |
+
|
| 532 |
+
if add_space_attn:
|
| 533 |
+
self.space_virtual_blocks = nn.ModuleList(
|
| 534 |
+
[
|
| 535 |
+
AttnBlock(
|
| 536 |
+
hidden_size,
|
| 537 |
+
num_heads,
|
| 538 |
+
mlp_ratio=mlp_ratio,
|
| 539 |
+
attn_class=Attention,
|
| 540 |
+
)
|
| 541 |
+
for _ in range(space_depth)
|
| 542 |
+
]
|
| 543 |
+
)
|
| 544 |
+
self.space_point2virtual_blocks = nn.ModuleList(
|
| 545 |
+
[
|
| 546 |
+
CrossAttnBlock(
|
| 547 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 548 |
+
)
|
| 549 |
+
for _ in range(space_depth)
|
| 550 |
+
]
|
| 551 |
+
)
|
| 552 |
+
self.space_virtual2point_blocks = nn.ModuleList(
|
| 553 |
+
[
|
| 554 |
+
CrossAttnBlock(
|
| 555 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 556 |
+
)
|
| 557 |
+
for _ in range(space_depth)
|
| 558 |
+
]
|
| 559 |
+
)
|
| 560 |
+
assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
|
| 561 |
+
self.initialize_weights()
|
| 562 |
+
|
| 563 |
+
def initialize_weights(self):
|
| 564 |
+
def _basic_init(module):
|
| 565 |
+
if isinstance(module, nn.Linear):
|
| 566 |
+
torch.nn.init.xavier_uniform_(module.weight)
|
| 567 |
+
if module.bias is not None:
|
| 568 |
+
nn.init.constant_(module.bias, 0)
|
| 569 |
+
torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001)
|
| 570 |
+
if self.linear_layer_for_vis_conf:
|
| 571 |
+
torch.nn.init.trunc_normal_(self.vis_conf_head.weight, std=0.001)
|
| 572 |
+
|
| 573 |
+
def _trunc_init(module):
|
| 574 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
| 575 |
+
if isinstance(module, nn.Linear):
|
| 576 |
+
torch.nn.init.trunc_normal_(module.weight, std=0.02)
|
| 577 |
+
if module.bias is not None:
|
| 578 |
+
nn.init.zeros_(module.bias)
|
| 579 |
+
|
| 580 |
+
self.apply(_basic_init)
|
| 581 |
+
|
| 582 |
+
def forward(self, input_tensor, mask=None, add_space_attn=True, patch_feat=None):
|
| 583 |
+
tokens = self.input_transform(input_tensor)
|
| 584 |
+
|
| 585 |
+
B, _, T, _ = tokens.shape
|
| 586 |
+
if patch_feat is None:
|
| 587 |
+
virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
|
| 588 |
+
tokens = torch.cat([tokens, virtual_tokens], dim=1)
|
| 589 |
+
else:
|
| 590 |
+
patch_feat = self.patch_proj(patch_feat.detach())
|
| 591 |
+
tokens = torch.cat([tokens, patch_feat], dim=1)
|
| 592 |
+
self.num_virtual_tracks = patch_feat.shape[1]
|
| 593 |
+
|
| 594 |
+
_, N, _, _ = tokens.shape
|
| 595 |
+
j = 0
|
| 596 |
+
layers = []
|
| 597 |
+
for i in range(len(self.time_blocks)):
|
| 598 |
+
time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
|
| 599 |
+
time_tokens = torch.utils.checkpoint.checkpoint(
|
| 600 |
+
self.time_blocks[i],
|
| 601 |
+
time_tokens
|
| 602 |
+
)
|
| 603 |
+
|
| 604 |
+
tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
|
| 605 |
+
if (
|
| 606 |
+
add_space_attn
|
| 607 |
+
and hasattr(self, "space_virtual_blocks")
|
| 608 |
+
and (i % (len(self.time_blocks) // len(self.space_virtual_blocks)) == 0)
|
| 609 |
+
):
|
| 610 |
+
space_tokens = (
|
| 611 |
+
tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
|
| 612 |
+
) # B N T C -> (B T) N C
|
| 613 |
+
|
| 614 |
+
point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
|
| 615 |
+
virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]
|
| 616 |
+
|
| 617 |
+
virtual_tokens = torch.utils.checkpoint.checkpoint(
|
| 618 |
+
self.space_virtual2point_blocks[j],
|
| 619 |
+
virtual_tokens, point_tokens, mask
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
virtual_tokens = torch.utils.checkpoint.checkpoint(
|
| 623 |
+
self.space_virtual_blocks[j],
|
| 624 |
+
virtual_tokens
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
+
point_tokens = torch.utils.checkpoint.checkpoint(
|
| 628 |
+
self.space_point2virtual_blocks[j],
|
| 629 |
+
point_tokens, virtual_tokens, mask
|
| 630 |
+
)
|
| 631 |
+
|
| 632 |
+
space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
|
| 633 |
+
tokens = space_tokens.view(B, T, N, -1).permute(
|
| 634 |
+
0, 2, 1, 3
|
| 635 |
+
) # (B T) N C -> B N T C
|
| 636 |
+
j += 1
|
| 637 |
+
tokens = tokens[:, : N - self.num_virtual_tracks]
|
| 638 |
+
|
| 639 |
+
flow = self.flow_head(tokens)
|
| 640 |
+
if self.linear_layer_for_vis_conf:
|
| 641 |
+
vis_conf = self.vis_conf_head(tokens)
|
| 642 |
+
flow = torch.cat([flow, vis_conf], dim=-1)
|
| 643 |
+
|
| 644 |
+
return flow
|
| 645 |
+
|
| 646 |
+
def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
|
| 647 |
+
probs = torch.sigmoid(logits)
|
| 648 |
+
ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
|
| 649 |
+
p_t = probs * targets + (1 - probs) * (1 - targets)
|
| 650 |
+
loss = alpha * (1 - p_t) ** gamma * ce_loss
|
| 651 |
+
return loss.mean()
|
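The function above is the standard focal loss, FL(p_t) = alpha * (1 - p_t)**gamma * CE, which down-weights well-classified examples. Illustrative call:

logits = torch.tensor([2.0, -1.0, 0.5])
targets = torch.tensor([1.0, 0.0, 1.0])
print(focal_loss(logits, targets))       # scalar; confident correct predictions contribute little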
| 652 |
+
|
| 653 |
+
def balanced_binary_cross_entropy(logits, targets, balance_weight=1.0, eps=1e-6, reduction="mean", pos_bias=0.0, mask=None):
|
| 654 |
+
"""
|
| 655 |
+
logits: Tensor of arbitrary shape
|
| 656 |
+
targets: same shape as logits
|
| 657 |
+
balance_weight: scaling the loss
|
| 658 |
+
reduction: 'mean', 'sum', or 'none'
|
| 659 |
+
"""
|
| 660 |
+
targets = targets.float()
|
| 661 |
+
positive = (targets == 1).float().sum()
|
| 662 |
+
total = targets.numel()
|
| 663 |
+
positive_ratio = positive / (total + eps)
|
| 664 |
+
|
| 665 |
+
pos_weight = (1 - positive_ratio) / (positive_ratio + eps)
|
| 666 |
+
pos_weight = pos_weight.clamp(min=0.1, max=10.0)
|
| 667 |
+
loss = F.binary_cross_entropy_with_logits(
|
| 668 |
+
logits,
|
| 669 |
+
targets,
|
| 670 |
+
pos_weight=pos_weight+pos_bias,
|
| 671 |
+
reduction=reduction
|
| 672 |
+
)
|
| 673 |
+
if mask is not None:
|
| 674 |
+
loss = (loss * mask).sum() / (mask.sum() + eps)
|
| 675 |
+
return balance_weight * loss
|
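balanced_binary_cross_entropy estimates the positive ratio of the batch on the fly and uses its (clamped) inverse as pos_weight, so rare positives are up-weighted; when a mask is supplied it should be combined with reduction="none", as done in sequence_BCE_loss below. Illustrative call with synthetic labels:

logits = torch.randn(4, 16)
targets = (torch.rand(4, 16) > 0.9).float()   # ~10% positives -> pos_weight near its 10.0 cap
loss = balanced_binary_cross_entropy(logits, targets)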
| 676 |
+
|
| 677 |
+
def sequence_loss(
|
| 678 |
+
flow_preds,
|
| 679 |
+
flow_gt,
|
| 680 |
+
valids,
|
| 681 |
+
vis=None,
|
| 682 |
+
gamma=0.8,
|
| 683 |
+
add_huber_loss=False,
|
| 684 |
+
loss_only_for_visible=False,
|
| 685 |
+
depth_sample=None,
|
| 686 |
+
z_unc=None,
|
| 687 |
+
mask_traj_gt=None
|
| 688 |
+
):
|
| 689 |
+
"""Loss function defined over sequence of flow predictions"""
|
| 690 |
+
total_flow_loss = 0.0
|
| 691 |
+
for j in range(len(flow_gt)):
|
| 692 |
+
B, S, N, D = flow_gt[j].shape
|
| 693 |
+
B, S2, N = valids[j].shape
|
| 694 |
+
assert S == S2
|
| 695 |
+
n_predictions = len(flow_preds[j])
|
| 696 |
+
flow_loss = 0.0
|
| 697 |
+
for i in range(n_predictions):
|
| 698 |
+
i_weight = gamma ** (n_predictions - i - 1)
|
| 699 |
+
flow_pred = flow_preds[j][i][:,:,:flow_gt[j].shape[2]]
|
| 700 |
+
if flow_pred.shape[-1] == 3:
|
| 701 |
+
flow_pred[...,2] = flow_pred[...,2]
|
| 702 |
+
if add_huber_loss:
|
| 703 |
+
i_loss = huber_loss(flow_pred, flow_gt[j], delta=6.0)
|
| 704 |
+
else:
|
| 705 |
+
if flow_gt[j][...,2].abs().max() != 0:
|
| 706 |
+
track_z_loss = (flow_pred- flow_gt[j])[...,2].abs().mean()
|
| 707 |
+
if mask_traj_gt is not None:
|
| 708 |
+
track_z_loss = ((flow_pred- flow_gt[j])[...,2].abs() * mask_traj_gt.permute(0,2,1)).sum() / (mask_traj_gt.sum(dim=1)+1e-6)
|
| 709 |
+
else:
|
| 710 |
+
track_z_loss = 0
|
| 711 |
+
i_loss = (flow_pred[...,:2] - flow_gt[j][...,:2]).abs() # B, S, N, 2
|
| 712 |
+
# print((flow_pred - flow_gt[j])[...,2].abs()[vis[j].bool()].mean())
|
| 713 |
+
i_loss = torch.mean(i_loss, dim=3) # B, S, N
|
| 714 |
+
valid_ = valids[j].clone()[:,:, :flow_gt[j].shape[2]] # Ensure valid_ has the same shape as i_loss
|
| 715 |
+
valid_ = valid_ * (flow_gt[j][...,:2].norm(dim=-1) > 0).float()
|
| 716 |
+
if loss_only_for_visible:
|
| 717 |
+
valid_ = valid_ * vis[j]
|
| 718 |
+
# print(reduce_masked_mean(i_loss, valid_).item(), track_z_loss.item()/16)
|
| 719 |
+
flow_loss += i_weight * (reduce_masked_mean(i_loss, valid_) + track_z_loss + 10*reduce_masked_mean(i_loss, valid_* vis[j]))
|
| 720 |
+
# if flow_loss > 5e2:
|
| 721 |
+
# import pdb; pdb.set_trace()
|
| 722 |
+
flow_loss = flow_loss / n_predictions
|
| 723 |
+
total_flow_loss += flow_loss
|
| 724 |
+
return total_flow_loss / len(flow_gt)
|
| 725 |
+
|
| 726 |
+
def sequence_loss_xyz(
|
| 727 |
+
flow_preds,
|
| 728 |
+
flow_gt,
|
| 729 |
+
valids,
|
| 730 |
+
intrs,
|
| 731 |
+
vis=None,
|
| 732 |
+
gamma=0.8,
|
| 733 |
+
add_huber_loss=False,
|
| 734 |
+
loss_only_for_visible=False,
|
| 735 |
+
mask_traj_gt=None
|
| 736 |
+
):
|
| 737 |
+
"""Loss function defined over sequence of flow predictions"""
|
| 738 |
+
total_flow_loss = 0.0
|
| 739 |
+
for j in range(len(flow_gt)):
|
| 740 |
+
B, S, N, D = flow_gt[j].shape
|
| 741 |
+
B, S2, N = valids[j].shape
|
| 742 |
+
assert S == S2
|
| 743 |
+
n_predictions = len(flow_preds[j])
|
| 744 |
+
flow_loss = 0.0
|
| 745 |
+
for i in range(n_predictions):
|
| 746 |
+
i_weight = gamma ** (n_predictions - i - 1)
|
| 747 |
+
flow_pred = flow_preds[j][i][:,:,:flow_gt[j].shape[2]]
|
| 748 |
+
flow_gt_ = flow_gt[j]
|
| 749 |
+
flow_gt_one = torch.cat([flow_gt_[...,:2], torch.ones_like(flow_gt_[:,:,:,:1])], dim=-1)
|
| 750 |
+
flow_gt_cam = torch.einsum('btsc,btnc->btns', torch.inverse(intrs), flow_gt_one)
|
| 751 |
+
flow_gt_cam *= flow_gt_[...,2:3].abs()
|
| 752 |
+
flow_gt_cam[...,2] *= torch.sign(flow_gt_cam[...,2])
|
| 753 |
+
|
| 754 |
+
if add_huber_loss:
|
| 755 |
+
i_loss = huber_loss(flow_pred, flow_gt_cam, delta=6.0)
|
| 756 |
+
else:
|
| 757 |
+
i_loss = (flow_pred- flow_gt_cam).norm(dim=-1,keepdim=True) # B, S, N, 2
|
| 758 |
+
|
| 759 |
+
# print((flow_pred - flow_gt[j])[...,2].abs()[vis[j].bool()].mean())
|
| 760 |
+
i_loss = torch.mean(i_loss, dim=3) # B, S, N
|
| 761 |
+
valid_ = valids[j].clone()[:,:, :flow_gt[j].shape[2]] # Ensure valid_ has the same shape as i_loss
|
| 762 |
+
if loss_only_for_visible:
|
| 763 |
+
valid_ = valid_ * vis[j]
|
| 764 |
+
# print(reduce_masked_mean(i_loss, valid_).item(), track_z_loss.item()/16)
|
| 765 |
+
flow_loss += i_weight * (reduce_masked_mean(i_loss, valid_)) * 1000
|
| 766 |
+
# if flow_loss > 5e2:
|
| 767 |
+
# import pdb; pdb.set_trace()
|
| 768 |
+
flow_loss = flow_loss / n_predictions
|
| 769 |
+
total_flow_loss += flow_loss
|
| 770 |
+
return total_flow_loss / len(flow_gt)
|
| 771 |
+
|
| 772 |
+
def huber_loss(x, y, delta=1.0):
|
| 773 |
+
"""Calculate element-wise Huber loss between x and y"""
|
| 774 |
+
diff = x - y
|
| 775 |
+
abs_diff = diff.abs()
|
| 776 |
+
flag = (abs_diff <= delta).float()
|
| 777 |
+
return flag * 0.5 * diff**2 + (1 - flag) * delta * (abs_diff - 0.5 * delta)
|
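The piecewise form implemented above is 0.5 * d**2 for |d| <= delta and delta * (|d| - 0.5 * delta) otherwise. Quick numeric check:

print(huber_loss(torch.tensor([0.5]), torch.tensor([0.0]), delta=1.0))  # tensor([0.1250]), quadratic branch
print(huber_loss(torch.tensor([3.0]), torch.tensor([0.0]), delta=1.0))  # tensor([2.5000]), linear branch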
| 778 |
+
|
def sequence_BCE_loss(vis_preds, vis_gts, mask=None):
    total_bce_loss = 0.0
    for j in range(len(vis_preds)):
        n_predictions = len(vis_preds[j])
        bce_loss = 0.0
        for i in range(n_predictions):
            N_gt = vis_gts[j].shape[-1]
            if mask is not None:
                vis_loss = balanced_binary_cross_entropy(vis_preds[j][i][..., :N_gt], vis_gts[j], mask=mask[j], reduction="none")
            else:
                vis_loss = balanced_binary_cross_entropy(vis_preds[j][i][..., :N_gt], vis_gts[j]) + focal_loss(vis_preds[j][i][..., :N_gt], vis_gts[j])
            # print(vis_loss, ((torch.sigmoid(vis_preds[j][i][...,:N_gt])>0.5).float() - vis_gts[j]).abs().sum())
            bce_loss += vis_loss
        bce_loss = bce_loss / n_predictions
        total_bce_loss += bce_loss
    return total_bce_loss / len(vis_preds)


def sequence_prob_loss(
    tracks: torch.Tensor,
    confidence: torch.Tensor,
    target_points: torch.Tensor,
    visibility: torch.Tensor,
    expected_dist_thresh: float = 12.0,
):
    """Loss for classifying if a point is within pixel threshold of its target."""
    # Points with an error larger than 12 pixels are likely to be useless; marking
    # them as occluded will actually improve Jaccard metrics and give
    # qualitatively better results.
    total_logprob_loss = 0.0
    for j in range(len(tracks)):
        n_predictions = len(tracks[j])
        logprob_loss = 0.0
        for i in range(n_predictions):
            N_gt = target_points[j].shape[2]
            err = torch.sum((tracks[j][i].detach()[:, :, :N_gt, :2] - target_points[j][..., :2]) ** 2, dim=-1)
            valid = (err <= expected_dist_thresh**2).float()
            logprob = balanced_binary_cross_entropy(confidence[j][i][..., :N_gt], valid, reduction="none")
            logprob *= visibility[j]
            logprob = torch.mean(logprob, dim=[1, 2])
            logprob_loss += logprob
        logprob_loss = logprob_loss / n_predictions
        total_logprob_loss += logprob_loss
    return total_logprob_loss / len(tracks)


def sequence_dyn_prob_loss(
    tracks: torch.Tensor,
    confidence: torch.Tensor,
    target_points: torch.Tensor,
    visibility: torch.Tensor,
    expected_dist_thresh: float = 6.0,
):
    """Loss for classifying if a point is within pixel threshold of its target."""
    # Points with an error larger than 12 pixels are likely to be useless; marking
    # them as occluded will actually improve Jaccard metrics and give
    # qualitatively better results.
    total_logprob_loss = 0.0
    for j in range(len(tracks)):
        n_predictions = len(tracks[j])
        logprob_loss = 0.0
        for i in range(n_predictions):
            err = torch.sum((tracks[j][i].detach() - target_points[j]) ** 2, dim=-1)
            valid = (err <= expected_dist_thresh**2).float()
            valid = (valid.sum(dim=1) > 0).float()
            logprob = balanced_binary_cross_entropy(confidence[j][i].mean(dim=1), valid, reduction="none")
            # logprob *= visibility[j]
            logprob = torch.mean(logprob, dim=[0, 1])
            logprob_loss += logprob
        logprob_loss = logprob_loss / n_predictions
        total_logprob_loss += logprob_loss
    return total_logprob_loss / len(tracks)


def masked_mean(data: torch.Tensor, mask: torch.Tensor, dim: List[int]):
    if mask is None:
        return data.mean(dim=dim, keepdim=True)
    mask = mask.float()
    mask_sum = torch.sum(mask, dim=dim, keepdim=True)
    mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
        mask_sum, min=1.0
    )
    return mask_mean


def masked_mean_var(data: torch.Tensor, mask: torch.Tensor, dim: List[int]):
    if mask is None:
        return data.mean(dim=dim, keepdim=True), data.var(dim=dim, keepdim=True)
    mask = mask.float()
    mask_sum = torch.sum(mask, dim=dim, keepdim=True)
    mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
        mask_sum, min=1.0
    )
    mask_var = torch.sum(
        mask * (data - mask_mean) ** 2, dim=dim, keepdim=True
    ) / torch.clamp(mask_sum, min=1.0)
    return mask_mean.squeeze(dim), mask_var.squeeze(dim)


class NeighborTransformer(nn.Module):
    def __init__(self, dim: int, num_heads: int, head_dim: int, mlp_ratio: float):
        super().__init__()
        self.dim = dim
        self.output_token_1 = nn.Parameter(torch.randn(1, dim))
        self.output_token_2 = nn.Parameter(torch.randn(1, dim))
        self.xblock1_2 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
        self.xblock2_1 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
        self.aggr1 = Attention(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim)
        self.aggr2 = Attention(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim)

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        from einops import rearrange, repeat
        import torch.utils.checkpoint as checkpoint

        assert len(x.shape) == 3, "x should be of shape (B, N, D)"
        assert len(y.shape) == 3, "y should be of shape (B, N, D)"

        # not work so well ...

        def forward_chunk(x, y):
            new_x = self.xblock1_2(x, y)
            new_y = self.xblock2_1(y, x)
            out1 = self.aggr1(repeat(self.output_token_1, 'n d -> b n d', b=x.shape[0]), context=new_x)
            out2 = self.aggr2(repeat(self.output_token_2, 'n d -> b n d', b=x.shape[0]), context=new_y)
            return out1 + out2

        return checkpoint.checkpoint(forward_chunk, x, y)


class CorrPointformer(nn.Module):
    def __init__(self, dim: int, num_heads: int, head_dim: int, mlp_ratio: float):
        super().__init__()
        self.dim = dim
        self.xblock1_2 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
        # self.xblock2_1 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
        self.aggr = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
        self.out_proj = nn.Linear(dim, 2 * dim)

    def forward(self, query: torch.Tensor, target: torch.Tensor, target_rel_pos: torch.Tensor) -> torch.Tensor:
        from einops import rearrange, repeat
        import torch.utils.checkpoint as checkpoint

        def forward_chunk(query, target, target_rel_pos):
            new_query = self.xblock1_2(query, target).mean(dim=1, keepdim=True)
            # new_target = self.xblock2_1(target, query).mean(dim=1, keepdim=True)
            # new_aggr = new_query + new_target
            out = self.aggr(new_query, target + target_rel_pos)  # (potential delta xyz) (target - center)
            out = self.out_proj(out)
            return out

        return checkpoint.checkpoint(forward_chunk, query, target, target_rel_pos)
models/SpaTrackV2/models/tracker3D/delta_utils/__init__.py
ADDED
File without changes
models/SpaTrackV2/models/tracker3D/delta_utils/blocks.py
ADDED
@@ -0,0 +1,842 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import collections
from functools import partial
from itertools import repeat
from typing import Callable

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.SpaTrackV2.models.blocks import bilinear_sampler
from einops import rearrange
from torch import Tensor, einsum


# From PyTorch internals
def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))

    return parse


def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


to_2tuple = _ntuple(2)


class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
        zero_init=False,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)
        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
        self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

        if zero_init:
            self.zero_init()

    def zero_init(self):
        nn.init.constant_(self.fc2.weight, 0)
        if self.fc2.bias is not None:
            nn.init.constant_(self.fc2.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x, mode="nearest"):
        x = F.interpolate(x, scale_factor=2.0, mode=mode)
        if self.with_conv:
            x = self.conv(x)
        return x


class ResidualBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=3,
            padding=1,
            stride=stride,
            padding_mode="zeros",
        )
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, padding_mode="zeros")
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not stride == 1:
                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)

        elif norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.BatchNorm2d(planes)

        elif norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.InstanceNorm2d(planes)

        elif norm_fn == "none":
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            if not stride == 1:
                self.norm3 = nn.Sequential()

        if stride == 1:
            self.downsample = None

        else:
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class BasicEncoder(nn.Module):
    def __init__(self, input_dim=3, output_dim=128, stride=4):
        super(BasicEncoder, self).__init__()
        self.stride = stride
        self.norm_fn = "instance"
        self.in_planes = output_dim // 2

        self.norm1 = nn.InstanceNorm2d(self.in_planes)
        self.norm2 = nn.InstanceNorm2d(output_dim * 2)

        self.conv1 = nn.Conv2d(
            input_dim,
            self.in_planes,
            kernel_size=7,
            stride=2,
            padding=3,
            padding_mode="zeros",
        )
        self.relu1 = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(output_dim // 2, stride=1)
        self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
        self.layer3 = self._make_layer(output_dim, stride=2)
        self.layer4 = self._make_layer(output_dim, stride=2)

        self.conv2 = nn.Conv2d(
            output_dim * 3 + output_dim // 4,
            output_dim * 2,
            kernel_size=3,
            padding=1,
            padding_mode="zeros",
        )
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.InstanceNorm2d)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x, return_intermediate=False):
        _, _, H, W = x.shape

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        a = self.layer1(x)
        b = self.layer2(a)
        c = self.layer3(b)
        d = self.layer4(c)

        def _bilinear_intepolate(x):
            return F.interpolate(
                x,
                (H // self.stride, W // self.stride),
                mode="bilinear",
                align_corners=True,
            )

        # a = _bilinear_intepolate(a)
        # b = _bilinear_intepolate(b)
        # c = _bilinear_intepolate(c)
        # d = _bilinear_intepolate(d)

        cat_feat = torch.cat(
            [_bilinear_intepolate(a), _bilinear_intepolate(b), _bilinear_intepolate(c), _bilinear_intepolate(d)], dim=1
        )
        x = self.conv2(cat_feat)
        x = self.norm2(x)
        x = self.relu2(x)
        x = self.conv3(x)

        # breakpoint()
        if return_intermediate:
            if self.stride == 4:
                return x, a, c  # 128, h/4, w/4, - 64, h/2, w/2 - 128, h/8, w/8
            elif self.stride == 8:
                return x, b, d
            else:
                raise NotImplementedError
        return x


class CorrBlockFP16:
    def __init__(
        self,
        fmaps,
        num_levels=4,
        radius=4,
        multiple_track_feats=False,
        padding_mode="zeros",
    ):
        B, S, C, H, W = fmaps.shape
        self.S, self.C, self.H, self.W = S, C, H, W
        self.padding_mode = padding_mode
        self.num_levels = num_levels
        self.radius = radius
        self.fmaps_pyramid = []
        self.multiple_track_feats = multiple_track_feats

        self.fmaps_pyramid.append(fmaps)
        for i in range(self.num_levels - 1):
            fmaps_ = fmaps.reshape(B * S, C, H, W)
            fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
            _, _, H, W = fmaps_.shape
            fmaps = fmaps_.reshape(B, S, C, H, W)
            self.fmaps_pyramid.append(fmaps)

    def sample(self, coords):
        r = self.radius
        B, S, N, D = coords.shape
        assert D == 2

        H, W = self.H, self.W
        out_pyramid = []
        for i in range(self.num_levels):
            corrs = self.corrs_pyramid[i]  # B, S, N, H, W
            *_, H, W = corrs.shape

            dx = torch.linspace(-r, r, 2 * r + 1)
            dy = torch.linspace(-r, r, 2 * r + 1)
            delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(coords.device)

            centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2**i
            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
            coords_lvl = centroid_lvl + delta_lvl

            # breakpoint()
            corrs = bilinear_sampler(
                corrs.reshape(B * S * N, 1, H, W),
                coords_lvl,
                padding_mode=self.padding_mode,
            )
            corrs = corrs.view(B, S, N, -1)
            out_pyramid.append(corrs)

        del self.corrs_pyramid

        out = torch.cat(out_pyramid, dim=-1)  # B, S, N, LRR*2
        out = out.permute(0, 2, 1, 3).contiguous().view(B * N, S, -1).float()
        return out

    def corr(self, targets):
        B, S, N, C = targets.shape
        if self.multiple_track_feats:
            targets_split = targets.split(C // self.num_levels, dim=-1)
            B, S, N, C = targets_split[0].shape

        assert C == self.C
        assert S == self.S

        fmap1 = targets

        self.corrs_pyramid = []
        for i, fmaps in enumerate(self.fmaps_pyramid):
            *_, H, W = fmaps.shape
            fmap2s = fmaps.view(B, S, C, H * W)  # B S C H W -> B S C (H W)
            if self.multiple_track_feats:
                fmap1 = targets_split[i]
            corrs = torch.matmul(fmap1, fmap2s)
            corrs = corrs.view(B, S, N, H, W)  # B S N (H W) -> B S N H W
            corrs = corrs / torch.sqrt(torch.tensor(C).float())
            # breakpoint()
            self.corrs_pyramid.append(corrs)


class CorrBlock:
    def __init__(
        self,
        fmaps,
        num_levels=4,
        radius=4,
        multiple_track_feats=False,
        padding_mode="zeros",
    ):
        B, S, C, H, W = fmaps.shape
        self.S, self.C, self.H, self.W = S, C, H, W
        self.padding_mode = padding_mode
        self.num_levels = num_levels
        self.radius = radius
        self.fmaps_pyramid = []
        self.multiple_track_feats = multiple_track_feats

        self.fmaps_pyramid.append(fmaps)
        for i in range(self.num_levels - 1):
            fmaps_ = fmaps.reshape(B * S, C, H, W)
            fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
            _, _, H, W = fmaps_.shape
            fmaps = fmaps_.reshape(B, S, C, H, W)
            self.fmaps_pyramid.append(fmaps)

    def sample(self, coords, delete=True):
        r = self.radius
        B, S, N, D = coords.shape
        assert D == 2

        H, W = self.H, self.W
        out_pyramid = []
        for i in range(self.num_levels):
            corrs = self.corrs_pyramid[i]  # B, S, N, H, W
            *_, H, W = corrs.shape

            dx = torch.linspace(-r, r, 2 * r + 1)
            dy = torch.linspace(-r, r, 2 * r + 1)
            delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(coords.device)

            centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2**i
            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
            coords_lvl = centroid_lvl + delta_lvl

            # breakpoint()

            # t1 = time.time()
            corrs = bilinear_sampler(
                corrs.reshape(B * S * N, 1, H, W),
                coords_lvl,
                padding_mode=self.padding_mode,
            )
            # t2 = time.time()

            # print(coords_lvl.shape, t2 - t1)
            corrs = corrs.view(B, S, N, -1)
            out_pyramid.append(corrs)

        if delete:
            del self.corrs_pyramid

        out = torch.cat(out_pyramid, dim=-1)  # B, S, N, LRR*2
        out = out.permute(0, 2, 1, 3).contiguous().view(B * N, S, -1).float()
        return out

    def corr(self, targets):
        B, S, N, C = targets.shape
        if self.multiple_track_feats:
            targets_split = targets.split(C // self.num_levels, dim=-1)
            B, S, N, C = targets_split[0].shape

        assert C == self.C
        assert S == self.S

        fmap1 = targets

        self.corrs_pyramid = []
        for i, fmaps in enumerate(self.fmaps_pyramid):
            *_, H, W = fmaps.shape
            fmap2s = fmaps.view(B, S, C, H * W)  # B S C H W -> B S C (H W)
            if self.multiple_track_feats:
                fmap1 = targets_split[i]
            corrs = torch.matmul(fmap1, fmap2s)
            corrs = corrs.view(B, S, N, H, W)  # B S N (H W) -> B S N H W
            corrs = corrs / torch.sqrt(torch.tensor(C).float())
            # breakpoint()
            self.corrs_pyramid.append(corrs)

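# --- Usage sketch (editor's note, not part of the original file) ---
# CorrBlock builds an average-pooled feature pyramid once; correlation is then a
# two-step call: corr() computes per-level cost volumes against the track features,
# and sample() reads a (2r+1)x(2r+1) window around each track location. A
# hypothetical call with fmaps of shape (B, S, C, H, W) and N tracks:
#
#   corr_block = CorrBlock(fmaps, num_levels=4, radius=3)
#   corr_block.corr(track_feats)            # track_feats: (B, S, N, C)
#   corr_feats = corr_block.sample(coords)  # coords: (B, S, N, 2) -> (B*N, S, num_levels*(2r+1)**2)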

class Attention(nn.Module):
    def __init__(
        self,
        query_dim,
        context_dim=None,
        num_heads=8,
        dim_head=48,
        qkv_bias=False,
        flash=False,
        alibi=False,
        zero_init=False,
    ):
        super().__init__()
        inner_dim = dim_head * num_heads
        context_dim = default(context_dim, query_dim)
        self.scale = dim_head**-0.5
        self.heads = num_heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
        self.to_out = nn.Linear(inner_dim, query_dim)

        self.flash = flash
        self.alibi = alibi

        if zero_init:
            self.zero_init()
        # if self.alibi:
        #     self.training_length = 24

        #     bias_forward = get_alibi_slope(self.heads // 2) * get_relative_positions(self.training_length)
        #     bias_forward = bias_forward + torch.triu(torch.full_like(bias_forward, -1e9), diagonal=1)
        #     bias_backward = get_alibi_slope(self.heads // 2) * get_relative_positions(self.training_length, reverse=True)
        #     bias_backward = bias_backward + torch.tril(torch.full_like(bias_backward, -1e9), diagonal=-1)

        #     self.precomputed_attn_bias = self.register_buffer("precomputed_attn_bias", torch.cat([bias_forward, bias_backward], dim=0), persistent=False)

    def zero_init(self):
        nn.init.constant_(self.to_out.weight, 0)
        nn.init.constant_(self.to_out.bias, 0)

        # breakpoint()

    def forward(self, x, context=None, attn_bias=None):
        B, N1, C = x.shape
        h = self.heads

        q = self.to_q(x).reshape(B, N1, h, C // h)
        context = default(context, x)
        N2 = context.shape[1]
        k, v = self.to_kv(context).chunk(2, dim=-1)
        k = k.reshape(B, N2, h, C // h)
        v = v.reshape(B, N2, h, C // h)

        if self.flash:
            with torch.autocast(device_type="cuda", enabled=True):
                x = flash_attn_func(q.half(), k.half(), v.half())
                x = x.reshape(B, N1, C)
                x = x.float()
        else:
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)

            sim = (q @ k.transpose(-2, -1)) * self.scale

            if attn_bias is not None:
                sim = sim + attn_bias
            attn = sim.softmax(dim=-1)

            x = attn @ v
            x = x.transpose(1, 2).reshape(B, N1, C)
        x = self.to_out(x)
        return x

    def forward_noattn(self, x):
        # B, N1, C = x.shape
        # h = self.heads
        _, x = self.to_kv(x).chunk(2, dim=-1)
        # x = x.reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
        # x = x.transpose(1, 2).reshape(B, N1, C)
        x = self.to_out(x)

        return x

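# --- Editor's note (not part of the original file) ---
# The flash=True branch above calls flash_attn_func, which is not imported in this
# module; it presumably comes from the optional flash-attn package
# (from flash_attn import flash_attn_func). With the default flash=False the class
# uses the plain softmax attention path and needs no extra dependency.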

def get_relative_positions(seq_len, reverse=False, device="cpu"):
    x = torch.arange(seq_len, device=device)[None, :]
    y = torch.arange(seq_len, device=device)[:, None]
    return torch.tril(x - y) if not reverse else torch.triu(y - x)


def get_alibi_slope(num_heads, device="cpu"):
    x = (24) ** (1 / num_heads)
    return torch.tensor([1 / x ** (i + 1) for i in range(num_heads)], device=device, dtype=torch.float32).view(
        -1, 1, 1
    )

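# --- Usage sketch (editor's note, not part of the original file) ---
# get_alibi_slope and get_relative_positions together form an ALiBi-style additive
# attention bias: per-head slopes scale a (seq_len, seq_len) matrix of signed
# relative offsets, and the result is added to the attention logits before softmax.
# A quick shape check, assuming 8 heads and 24 frames:
#
#   bias = get_alibi_slope(8) * get_relative_positions(24)   # (8, 24, 24)
#   # entries in the kept triangle become more negative as |i - j| grows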

class RelativeAttention(nn.Module):
    """Multi-headed attention (MHA) module."""

    def __init__(self, query_dim, num_heads=8, qkv_bias=True, model_size=None, flash=False):
        super(RelativeAttention, self).__init__()

        query_dim = query_dim // num_heads
        self.num_heads = num_heads
        self.query_dim = query_dim
        self.value_size = query_dim
        self.model_size = query_dim * num_heads

        self.qkv_bias = qkv_bias

        self.query_proj = nn.Linear(num_heads * query_dim, num_heads * query_dim, bias=qkv_bias)
        self.key_proj = nn.Linear(num_heads * query_dim, num_heads * query_dim, bias=qkv_bias)
        self.value_proj = nn.Linear(num_heads * self.value_size, num_heads * self.value_size, bias=qkv_bias)
        self.final_proj = nn.Linear(num_heads * self.value_size, self.model_size, bias=qkv_bias)

        self.training_length = 24

        bias_forward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(self.training_length)
        bias_forward = bias_forward + torch.triu(torch.full_like(bias_forward, -1e9), diagonal=1)
        bias_backward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(
            self.training_length, reverse=True
        )
        bias_backward = bias_backward + torch.tril(torch.full_like(bias_backward, -1e9), diagonal=-1)

        self.register_buffer(
            "precomputed_attn_bias", torch.cat([bias_forward, bias_backward], dim=0), persistent=False
        )

    def forward(self, x, attn_bias=None):
        batch_size, sequence_length, _ = x.size()

        query_heads = self._linear_projection(x, self.query_dim, self.query_proj)  # [T', H, Q=K]
        key_heads = self._linear_projection(x, self.query_dim, self.key_proj)  # [T, H, K]
        value_heads = self._linear_projection(x, self.value_size, self.value_proj)  # [T, H, V]

        if self.training_length == sequence_length:
            new_attn_bias = self.precomputed_attn_bias
        else:
            device = x.device
            bias_forward = get_alibi_slope(self.num_heads // 2, device=device) * get_relative_positions(
                sequence_length, device=device
            )
            bias_forward = bias_forward + torch.triu(torch.full_like(bias_forward, -1e9), diagonal=1)
            bias_backward = get_alibi_slope(self.num_heads // 2, device=device) * get_relative_positions(
                sequence_length, reverse=True, device=device
            )
            bias_backward = bias_backward + torch.tril(torch.full_like(bias_backward, -1e9), diagonal=-1)
            new_attn_bias = torch.cat([bias_forward, bias_backward], dim=0)

        if attn_bias is not None:
            attn_bias = attn_bias + new_attn_bias
        else:
            attn_bias = new_attn_bias

        attn = F.scaled_dot_product_attention(
            query_heads, key_heads, value_heads, attn_mask=new_attn_bias, scale=1 / np.sqrt(self.query_dim)
        )
        attn = attn.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, -1)

        return self.final_proj(attn)  # [T', D']

        # attn_logits = torch.einsum("...thd,...Thd->...htT", query_heads, key_heads)
        # attn_logits = attn_logits / np.sqrt(self.query_dim) + new_attn_bias

        # # breakpoint()
        # if attn_bias is not None:
        #     if attn_bias.ndim != attn_logits.ndim:
        #         raise ValueError(f"Mask dimensionality {attn_bias.ndim} must match logits dimensionality {attn_logits.ndim}.")
        #     attn_logits = torch.where(attn_bias, attn_logits, torch.tensor(-1e30))

        # attn_weights = F.softmax(attn_logits, dim=-1)  # [H, T', T]

        # attn = torch.einsum("...htT,...Thd->...thd", attn_weights, value_heads)
        # attn = attn.reshape(batch_size, sequence_length, -1)  # [T', H*V]

        # return self.final_proj(attn)  # [T', D']

    # def _linear_projection(self, x, head_size, proj_layer):
    #     y = proj_layer(x)
    #     *leading_dims, _ = x.shape
    #     return y.reshape((*leading_dims, self.num_heads, head_size))

    def _linear_projection(self, x, head_size, proj_layer):
        y = proj_layer(x)
        batch_size, sequence_length, _ = x.shape
        return y.reshape((batch_size, sequence_length, self.num_heads, head_size)).permute(0, 2, 1, 3)


class AttnBlock(nn.Module):
    def __init__(
        self, hidden_size, num_heads, attn_class: Callable[..., nn.Module] = Attention, mlp_ratio=4.0, **block_kwargs
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = attn_class(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)

        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=mlp_hidden_dim,
            act_layer=approx_gelu,
            drop=0,
        )

    def forward(self, x, mask=None):
        attn_bias = mask
        if mask is not None:
            mask = (mask[:, None] * mask[:, :, None]).unsqueeze(1).expand(-1, self.attn.heads, -1, -1)
            max_neg_value = -torch.finfo(x.dtype).max
            attn_bias = (~mask) * max_neg_value
        x = x + self.attn(self.norm1(x), attn_bias=attn_bias)
        x = x + self.mlp(self.norm2(x))
        return x

    def forward_noattn(self, x):
        x = x + self.attn.forward_noattn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x


def pix2cam(coords, intr, detach=True):
    """
    Args:
        coords: [B, T, N, 3]
        intr: [B, T, 3, 3]
    """
    if detach:
        coords = coords.detach()

    (
        B,
        S,
        N,
        _,
    ) = coords.shape
    xy_src = coords.reshape(B * S * N, 3)
    intr = intr[:, :, None, ...].repeat(1, 1, N, 1, 1).reshape(B * S * N, 3, 3)
    xy_src = torch.cat([xy_src[..., :2], torch.ones_like(xy_src[..., :1])], dim=-1)
    xyz_src = (torch.inverse(intr) @ xy_src[..., None])[..., 0]
    dp_pred = coords[..., 2]
    xyz_src_ = xyz_src * (dp_pred.reshape(S * N, 1))
    xyz_src_ = xyz_src_.reshape(B, S, N, 3)
    return xyz_src_


def cam2pix(coords, intr):
    """
    Args:
        coords: [B, T, N, 3]
        intr: [B, T, 3, 3]
    """
    coords = coords.detach()
    (
        B,
        S,
        N,
        _,
    ) = coords.shape
    xy_src = coords.reshape(B * S * N, 3).clone()
    intr = intr[:, :, None, ...].repeat(1, 1, N, 1, 1).reshape(B * S * N, 3, 3)
    xy_src = xy_src / (xy_src[..., 2:] + 1e-5)
    xyz_src = (intr @ xy_src[..., None])[..., 0]
    dp_pred = coords[..., 2]
    xyz_src[..., 2] *= dp_pred.reshape(S * N)
    xyz_src = xyz_src.reshape(B, S, N, 3)
    return xyz_src


class BroadMultiHeadAttention(nn.Module):
    def __init__(self, dim, heads):
        super(BroadMultiHeadAttention, self).__init__()
        self.dim = dim
        self.heads = heads
        self.scale = (dim / heads) ** -0.5
        self.attend = nn.Softmax(dim=-1)

    def attend_with_rpe(self, Q, K):
        Q = rearrange(Q.squeeze(), "i (heads d) -> heads i d", heads=self.heads)
        K = rearrange(K, "b j (heads d) -> b heads j d", heads=self.heads)

        dots = einsum("hid, bhjd -> bhij", Q, K) * self.scale  # (b hw) heads 1 pointnum

        return self.attend(dots)

    def forward(self, Q, K, V):
        attn = self.attend_with_rpe(Q, K)
        B, _, _ = K.shape
        _, N, _ = Q.shape

        V = rearrange(V, "b j (heads d) -> b heads j d", heads=self.heads)

        out = einsum("bhij, bhjd -> bhid", attn, V)
        out = rearrange(out, "b heads n d -> b n (heads d)", b=B, n=N)

        return out


class CrossAttentionLayer(nn.Module):
    def __init__(
        self,
        qk_dim,
        v_dim,
        query_token_dim,
        tgt_token_dim,
        num_heads=8,
        attn_drop=0.0,
        proj_drop=0.0,
        drop_path=0.0,
        dropout=0.0,
    ):
        super(CrossAttentionLayer, self).__init__()
        assert qk_dim % num_heads == 0, f"dim {qk_dim} should be divided by num_heads {num_heads}."
        assert v_dim % num_heads == 0, f"dim {v_dim} should be divided by num_heads {num_heads}."
        """
        Query Token: [N, C] -> [N, qk_dim] (Q)
        Target Token: [M, D] -> [M, qk_dim] (K), [M, v_dim] (V)
        """
        self.num_heads = num_heads
        head_dim = qk_dim // num_heads
        self.scale = head_dim**-0.5

        self.norm1 = nn.LayerNorm(query_token_dim)
        self.norm2 = nn.LayerNorm(query_token_dim)
        self.multi_head_attn = BroadMultiHeadAttention(qk_dim, num_heads)
        self.q, self.k, self.v = (
            nn.Linear(query_token_dim, qk_dim, bias=True),
            nn.Linear(tgt_token_dim, qk_dim, bias=True),
            nn.Linear(tgt_token_dim, v_dim, bias=True),
        )

        self.proj = nn.Linear(v_dim, query_token_dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.ffn = nn.Sequential(
            nn.Linear(query_token_dim, query_token_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(query_token_dim, query_token_dim),
            nn.Dropout(dropout),
        )

    def forward(self, query, tgt_token):
        """
        x: [BH1W1, H3W3, D]
        """
        short_cut = query
        query = self.norm1(query)

        q, k, v = self.q(query), self.k(tgt_token), self.v(tgt_token)

        x = self.multi_head_attn(q, k, v)

        x = short_cut + self.proj_drop(self.proj(x))

        x = x + self.drop_path(self.ffn(self.norm2(x)))

        return x


class LayerNormProxy(nn.Module):
    def __init__(self, dim):

        super().__init__()
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):

        x = rearrange(x, "b c h w -> b h w c")
        x = self.norm(x)
        return rearrange(x, "b h w c -> b c h w")


def posenc(x, min_deg, max_deg, legacy_posenc_order=False):
    """Cat x with a positional encoding of x with scales 2^[min_deg, max_deg-1].

    Instead of computing [sin(x), cos(x)], we use the trig identity
    cos(x) = sin(x + pi/2) and do one vectorized call to sin([x, x+pi/2]).

    Args:
      x: torch.Tensor, variables to be encoded. Note that x should be in [-pi, pi].
      min_deg: int, the minimum (inclusive) degree of the encoding.
      max_deg: int, the maximum (exclusive) degree of the encoding.
      legacy_posenc_order: bool, keep the same ordering as the original tf code.

    Returns:
      encoded: torch.Tensor, encoded variables.
    """
    if min_deg == max_deg:
        return x
    scales = torch.tensor([2**i for i in range(min_deg, max_deg)], dtype=x.dtype, device=x.device)
    if legacy_posenc_order:
        xb = x[..., None, :] * scales[:, None]
        four_feat = torch.reshape(torch.sin(torch.stack([xb, xb + 0.5 * np.pi], dim=-2)), list(x.shape[:-1]) + [-1])
    else:
        xb = torch.reshape((x[..., None, :] * scales[:, None]), list(x.shape[:-1]) + [-1])
        four_feat = torch.sin(torch.cat([xb, xb + 0.5 * np.pi], dim=-1))
    return torch.cat([x] + [four_feat], dim=-1)

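# --- Usage sketch (editor's note, not part of the original file) ---
# posenc concatenates the input with sin/cos features at max_deg - min_deg octaves,
# so the last dimension grows from d to d + 2 * d * (max_deg - min_deg). For example:
#
#   x = torch.rand(1, 100, 3) * 2 * np.pi - np.pi   # values in [-pi, pi]
#   posenc(x, min_deg=0, max_deg=4).shape           # -> (1, 100, 3 + 2*3*4) = (1, 100, 27)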

def gaussian2D2(shape, sigma=(1, 1), rho=0):
    if not isinstance(sigma, tuple):
        sigma = (sigma, sigma)
    sigma_x, sigma_y = sigma

    m, n = [(ss - 1.0) / 2.0 for ss in shape]
    y, x = np.ogrid[-m : m + 1, -n : n + 1]

    energy = (x * x) / (sigma_x * sigma_x) - 2 * rho * x * y / (sigma_x * sigma_y) + (y * y) / (sigma_y * sigma_y)
    h = np.exp(-energy / (2 * (1 - rho * rho)))
    h[h < np.finfo(h.dtype).eps * h.max()] = 0
    return h / h.sum()

models/SpaTrackV2/models/tracker3D/delta_utils/upsample_transformer.py
ADDED
@@ -0,0 +1,438 @@
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import einsum, rearrange, repeat
from jaxtyping import Float, Int64
from torch import Tensor, nn

from models.SpaTrackV2.models.tracker3D.delta_utils.blocks import (
    Attention,
    AttnBlock,
    BasicEncoder,
    CorrBlock,
    Mlp,
    ResidualBlock,
    Upsample,
    cam2pix,
    pix2cam,
)

from models.SpaTrackV2.models.blocks import bilinear_sampler

def get_grid(height, width, shape=None, dtype="torch", device="cpu", align_corners=True, normalize=True):
    H, W = height, width
    S = shape if shape else []
    if align_corners:
        x = torch.linspace(0, 1, W, device=device)
        y = torch.linspace(0, 1, H, device=device)
        if not normalize:
            x = x * (W - 1)
            y = y * (H - 1)
    else:
        x = torch.linspace(0.5 / W, 1.0 - 0.5 / W, W, device=device)
        y = torch.linspace(0.5 / H, 1.0 - 0.5 / H, H, device=device)
        if not normalize:
            x = x * W
            y = y * H
    x_view, y_view, exp = [1 for _ in S] + [1, -1], [1 for _ in S] + [-1, 1], S + [H, W]
    x = x.view(*x_view).expand(*exp)
    y = y.view(*y_view).expand(*exp)
    grid = torch.stack([x, y], dim=-1)
    if dtype == "numpy":
        grid = grid.numpy()
    return grid

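# --- Usage sketch (editor's note, not part of the original file) ---
# get_grid returns per-pixel (x, y) coordinates, normalized to [0, 1] by default,
# stacked along the last dimension:
#
#   grid = get_grid(4, 5)                      # (4, 5, 2), torch.float32
#   grid_px = get_grid(4, 5, normalize=False)  # pixel coordinates, 0..W-1 and 0..H-1
#   grid_b = get_grid(4, 5, shape=[2])         # (2, 4, 5, 2), broadcast over a leading dim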
| 48 |
+
class RelativeAttention(nn.Module):
|
| 49 |
+
"""Multi-headed attention (MHA) module."""
|
| 50 |
+
|
| 51 |
+
def __init__(self, query_dim, num_heads=8, qkv_bias=True, model_size=None, flash=False):
|
| 52 |
+
super(RelativeAttention, self).__init__()
|
| 53 |
+
|
| 54 |
+
query_dim = query_dim // num_heads
|
| 55 |
+
self.num_heads = num_heads
|
| 56 |
+
self.query_dim = query_dim
|
| 57 |
+
self.value_size = query_dim
|
| 58 |
+
self.model_size = query_dim * num_heads
|
| 59 |
+
|
| 60 |
+
self.qkv_bias = qkv_bias
|
| 61 |
+
|
| 62 |
+
self.flash = flash
|
| 63 |
+
|
| 64 |
+
self.query_proj = nn.Linear(num_heads * query_dim, num_heads * query_dim, bias=qkv_bias)
|
| 65 |
+
self.key_proj = nn.Linear(num_heads * query_dim, num_heads * query_dim, bias=qkv_bias)
|
| 66 |
+
self.value_proj = nn.Linear(num_heads * self.value_size, num_heads * self.value_size, bias=qkv_bias)
|
| 67 |
+
self.final_proj = nn.Linear(num_heads * self.value_size, self.model_size, bias=qkv_bias)
|
| 68 |
+
|
| 69 |
+
self.scale = 1.0 / math.sqrt(self.query_dim)
|
| 70 |
+
# self.training_length = 24
|
| 71 |
+
|
| 72 |
+
# bias_forward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(self.training_length)
|
| 73 |
+
# bias_forward = bias_forward + torch.triu(torch.full_like(bias_forward, -1e9), diagonal=1)
|
| 74 |
+
# bias_backward = get_alibi_slope(self.num_heads // 2) * get_relative_positions(self.training_length, reverse=True)
|
| 75 |
+
# bias_backward = bias_backward + torch.tril(torch.full_like(bias_backward, -1e9), diagonal=-1)
|
| 76 |
+
|
| 77 |
+
# self.register_buffer("precomputed_attn_bias", torch.cat([bias_forward, bias_backward], dim=0), persistent=False)
|
| 78 |
+
|
| 79 |
+
def forward(self, x, context, attn_bias=None):
|
| 80 |
+
B, N1, C = x.size()
|
| 81 |
+
|
| 82 |
+
q = self._linear_projection(x, self.query_dim, self.query_proj) # [T', H, Q=K]
|
| 83 |
+
k = self._linear_projection(context, self.query_dim, self.key_proj) # [T, H, K]
|
| 84 |
+
v = self._linear_projection(context, self.value_size, self.value_proj) # [T, H, V]
|
| 85 |
+
|
| 86 |
+
if self.flash:
|
| 87 |
+
with torch.autocast(device_type="cuda", enabled=True):
|
| 88 |
+
x = flash_attn_func(q.half(), k.half(), v.half())
|
| 89 |
+
x = x.reshape(B, N1, C)
|
| 90 |
+
x = x.float()
|
| 91 |
+
else:
|
| 92 |
+
q = q.permute(0, 2, 1, 3)
|
| 93 |
+
k = k.permute(0, 2, 1, 3)
|
| 94 |
+
v = v.permute(0, 2, 1, 3)
|
| 95 |
+
|
| 96 |
+
sim = (q @ k.transpose(-2, -1)) * self.scale
|
| 97 |
+
|
| 98 |
+
if attn_bias is not None:
|
| 99 |
+
sim = sim + attn_bias
|
| 100 |
+
attn = sim.softmax(dim=-1)
|
| 101 |
+
|
| 102 |
+
x = attn @ v
|
| 103 |
+
x = x.transpose(1, 2).reshape(B, N1, C)
|
| 104 |
+
|
| 105 |
+
# with torch.autocast(device_type="cuda", dtype=torch.float32):
|
| 106 |
+
# attn = F.scaled_dot_product_attention(query_heads, key_heads, value_heads, attn_mask=attn_bias, scale=1.0 / math.sqrt(self.query_dim))
|
| 107 |
+
# else:
|
| 108 |
+
|
| 109 |
+
# sim = (query_heads @ key_heads.transpose(-2, -1)) * self.scale
|
| 110 |
+
|
| 111 |
+
# if attn_bias is not None:
|
| 112 |
+
# sim = sim + attn_bias
|
| 113 |
+
# attn = sim.softmax(dim=-1)
|
| 114 |
+
|
| 115 |
+
# attn = (attn @ value_heads)
|
| 116 |
+
# attn = attn.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, -1)
|
| 117 |
+
|
| 118 |
+
return self.final_proj(x) # [T', D']
|
| 119 |
+
|
| 120 |
+
def _linear_projection(self, x, head_size, proj_layer):
|
| 121 |
+
batch_size, sequence_length, _ = x.shape
|
| 122 |
+
y = proj_layer(x)
|
| 123 |
+
y = y.reshape((batch_size, sequence_length, self.num_heads, head_size))
|
| 124 |
+
|
| 125 |
+
return y
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class UpsampleCrossAttnBlock(nn.Module):
|
| 129 |
+
def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs):
|
| 130 |
+
super().__init__()
|
| 131 |
+
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 132 |
+
self.norm_context = nn.LayerNorm(hidden_size)
|
| 133 |
+
self.cross_attn = RelativeAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
|
| 134 |
+
|
| 135 |
+
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 136 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
| 137 |
+
approx_gelu = lambda: nn.GELU(approximate="tanh")
|
| 138 |
+
self.mlp = Mlp(
|
| 139 |
+
in_features=hidden_size,
|
| 140 |
+
hidden_features=mlp_hidden_dim,
|
| 141 |
+
act_layer=approx_gelu,
|
| 142 |
+
drop=0,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
def forward(self, x, context, attn_bias=None):
|
| 146 |
+
x = x + self.cross_attn(x=self.norm1(x), context=self.norm_context(context), attn_bias=attn_bias)
|
| 147 |
+
x = x + self.mlp(self.norm2(x))
|
| 148 |
+
return x
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class DecoderUpsampler(nn.Module):
|
| 152 |
+
def __init__(self, in_channels: int, middle_channels: int, out_channels: int = None, stride: int = 4):
|
| 153 |
+
super().__init__()
|
| 154 |
+
|
| 155 |
+
self.stride = stride
|
| 156 |
+
|
| 157 |
+
if out_channels is None:
|
| 158 |
+
out_channels = middle_channels
|
| 159 |
+
|
| 160 |
+
self.conv_in = nn.Conv2d(in_channels, middle_channels, kernel_size=(3, 3), stride=(1, 1), padding=1)
|
| 161 |
+
self.norm1 = nn.GroupNorm(num_groups=middle_channels // 8, num_channels=middle_channels, eps=1e-6)
|
| 162 |
+
|
| 163 |
+
self.res_blocks = nn.ModuleList()
|
| 164 |
+
self.upsample_blocks = nn.ModuleList()
|
| 165 |
+
|
| 166 |
+
for i in range(int(math.log2(self.stride))):
|
| 167 |
+
self.res_blocks.append(ResidualBlock(middle_channels, middle_channels))
|
| 168 |
+
self.upsample_blocks.append(Upsample(middle_channels, with_conv=True))
|
| 169 |
+
|
| 170 |
+
# in_channels = middle_channels
|
| 171 |
+
|
| 172 |
+
self.norm2 = nn.GroupNorm(num_groups=middle_channels // 8, num_channels=middle_channels, eps=1e-6)
|
| 173 |
+
self.conv_out = nn.Conv2d(middle_channels, out_channels, kernel_size=(3, 3), stride=(1, 1), padding=1)
|
| 174 |
+
|
| 175 |
+
self.initialize_weight()
|
| 176 |
+
|
| 177 |
+
def initialize_weight(self):
|
| 178 |
+
def _basic_init(module):
|
| 179 |
+
if isinstance(module, nn.Conv2d):
|
| 180 |
+
torch.nn.init.xavier_uniform_(module.weight)
|
| 181 |
+
if module.bias is not None:
|
| 182 |
+
nn.init.constant_(module.bias, 0)
|
| 183 |
+
|
| 184 |
+
self.res_blocks.apply(_basic_init)
|
| 185 |
+
self.conv_in.apply(_basic_init)
|
| 186 |
+
self.conv_out.apply(_basic_init)
|
| 187 |
+
|
| 188 |
+
def forward(
|
| 189 |
+
self,
|
| 190 |
+
x: Float[Tensor, "b c1 h_down w_down"],
|
| 191 |
+
mode: str = "nearest",
|
| 192 |
+
) -> Float[Tensor, "b c1 h_up w_up"]:
|
| 193 |
+
|
| 194 |
+
x = F.relu(self.norm1(self.conv_in(x)))
|
| 195 |
+
|
| 196 |
+
for i in range(len(self.res_blocks)):
|
| 197 |
+
x = self.res_blocks[i](x)
|
| 198 |
+
x = self.upsample_blocks[i](x, mode=mode)
|
| 199 |
+
|
| 200 |
+
x = self.conv_out(F.relu(self.norm2(x)))
|
| 201 |
+
return x
|
| 202 |
+
|
| 203 |
+
|
class UpsampleTransformer(nn.Module):
    def __init__(
        self,
        kernel_size: int = 3,
        stride: int = 4,
        latent_dim: int = 128,
        n_heads: int = 4,
        num_attn_blocks: int = 2,
        use_rel_emb: bool = True,
        flash: bool = False,
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.stride = stride
        self.latent_dim = latent_dim

        self.n_heads = n_heads

        self.attnup_feat_cnn = DecoderUpsampler(
            in_channels=self.latent_dim, middle_channels=self.latent_dim, out_channels=self.latent_dim
        )

        self.cross_blocks = nn.ModuleList(
            [
                UpsampleCrossAttnBlock(latent_dim + 64, latent_dim + 64, num_heads=n_heads, mlp_ratio=4, flash=flash)
                for _ in range(num_attn_blocks)
            ]
        )

        self.flow_mlp = nn.Sequential(
            nn.Conv2d(2 * 16, 128, 7, padding=3),
            nn.ReLU(),
            nn.Conv2d(128, 64, 3, padding=1),
            nn.ReLU(),
        )

        self.out = nn.Linear(latent_dim + 64, kernel_size * kernel_size, bias=True)

        if use_rel_emb:
            self.rpb_attnup = nn.Parameter(torch.zeros(kernel_size * kernel_size))
            torch.nn.init.trunc_normal_(self.rpb_attnup, std=0.1, mean=0.0, a=-2.0, b=2.0)
        else:
            self.rpb_attnup = None

    def forward(
        self,
        feat_map: Float[Tensor, "b c1 h w"],
        flow_map: Float[Tensor, "b c2 h w"],
    ):
        B = feat_map.shape[0]
        H_down, W_down = feat_map.shape[-2:]
        # x0, y0 = x0y0

        feat_map_up = self.attnup_feat_cnn(feat_map)  # learnable upsample by 4
        # feat_map_down = F.interpolate(feat_map_up, scale_factor=1/self.stride, mode='nearest') # B C H*4 W*4
        feat_map_down = feat_map
        # depths_down = F.interpolate(depths, scale_factor=1/self.stride, mode='nearest')

        # NOTE prepare attention bias
        # depths_down_ = torch.stack([depths_down[b, :, y0_:y0_+H_down, x0_:x0_+W_down] for b, (x0_,y0_) in enumerate(zip(x0, y0))], dim=0)
        # depths_ = torch.stack([depths[b, :, y0_*4:y0_*4+H_down*4, x0_*4:x0_*4+W_down*4] for b, (x0_,y0_) in enumerate(zip(x0, y0))], dim=0)
        # guidance_downsample = F.interpolate(guidance, size=(H, W), mode='nearest')
        pad_val = (self.kernel_size - 1) // 2
        # depths_down_padded = F.pad(depths_down_, (pad_val, pad_val, pad_val, pad_val), "replicate")

        if self.rpb_attnup is not None:
            relative_pos_attn_map = self.rpb_attnup.view(1, 1, -1, 1, 1).repeat(
                B, self.n_heads, 1, H_down * 4, W_down * 4
            )
            relative_pos_attn_map = rearrange(relative_pos_attn_map, "b k n h w -> (b h w) k 1 n")
            attn_bias = relative_pos_attn_map
        else:
            attn_bias = None

        # NOTE prepare context (low-reso feat)
        context = feat_map_down
        context = F.unfold(context, kernel_size=self.kernel_size, padding=pad_val)  # B C*kernel**2 H W
        context = rearrange(context, "b c (h w) -> b c h w", h=H_down, w=W_down)
        context = F.interpolate(context, scale_factor=self.stride, mode="nearest")  # B C*kernel**2 H*4 W*4
        context = rearrange(context, "b (c i j) h w -> (b h w) (i j) c", i=self.kernel_size, j=self.kernel_size)

        # NOTE prepare queries (high-reso feat)
        x = feat_map_up
        x = rearrange(x, "b c h w -> (b h w) 1 c")

        assert flow_map.shape[-2:] == feat_map.shape[-2:]

        flow_map = rearrange(flow_map, "b t c h w -> b (t c) h w")
        flow_map = self.flow_mlp(flow_map)

        nn_flow_map = F.unfold(flow_map, kernel_size=self.kernel_size, padding=pad_val)  # B C*kernel**2 H W
        nn_flow_map = rearrange(nn_flow_map, "b c (h w) -> b c h w", h=H_down, w=W_down)
        nn_flow_map = F.interpolate(nn_flow_map, scale_factor=self.stride, mode="nearest")  # B C*kernel**2 H*4 W*4
        nn_flow_map = rearrange(
            nn_flow_map, "b (c i j) h w -> (b h w) (i j) c", i=self.kernel_size, j=self.kernel_size
        )

        up_flow_map = F.interpolate(flow_map, scale_factor=4, mode="nearest")  # NN up # b 2 h w
        up_flow_map = rearrange(up_flow_map, "b c h w -> (b h w) 1 c")

        context = torch.cat([context, nn_flow_map], dim=-1)
        x = torch.cat([x, up_flow_map], dim=-1)

        for lvl in range(len(self.cross_blocks)):
            x = self.cross_blocks[lvl](x, context, attn_bias)

        mask_out = self.out(x)
        mask_out = F.softmax(mask_out, dim=-1)
        mask_out = rearrange(mask_out, "(b h w) 1 c -> b c h w", h=H_down * self.stride, w=W_down * self.stride)

        return mask_out
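The 3x3 softmax weights returned by `forward` above are meant to be consumed as a convex combination over the low-resolution neighborhood, in the spirit of RAFT-style convex upsampling. The helper below is a hedged sketch, not part of this file; `apply_upsample_mask` is a hypothetical name showing one way such a mask can be applied to an arbitrary low-resolution field.

```python
import torch.nn.functional as F
from einops import rearrange

def apply_upsample_mask(field, mask, kernel_size=3, stride=4):
    # field: (B, C, H, W) low-resolution field; mask: (B, k*k, H*stride, W*stride), softmaxed over dim 1.
    B, C, H, W = field.shape
    pad = (kernel_size - 1) // 2
    neighbors = F.unfold(field, kernel_size=kernel_size, padding=pad)          # (B, C*k*k, H*W)
    neighbors = rearrange(neighbors, "b c (h w) -> b c h w", h=H, w=W)         # (B, C*k*k, H, W)
    neighbors = F.interpolate(neighbors, scale_factor=stride, mode="nearest")  # (B, C*k*k, H*s, W*s)
    neighbors = rearrange(neighbors, "b (c n) h w -> b c n h w", c=C)          # (B, C, k*k, H*s, W*s)
    return (neighbors * mask.unsqueeze(1)).sum(dim=2)                          # (B, C, H*s, W*s)
```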
def get_alibi_slope(num_heads):
    x = (24) ** (1 / num_heads)
    return torch.tensor([1 / x ** (i + 1) for i in range(num_heads)]).float()
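For intuition, a quick numeric check of `get_alibi_slope` (illustrative only, not from the commit): the slopes decay geometrically with the head index, as in ALiBi.

```python
print(get_alibi_slope(2))  # ≈ tensor([0.2041, 0.0417]), since x = 24 ** (1/2) ≈ 4.899
print(get_alibi_slope(4))  # ≈ tensor([0.4518, 0.2041, 0.0922, 0.0417])
```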
class UpsampleTransformerAlibi(nn.Module):
    def __init__(
        self,
        kernel_size: int = 3,
        stride: int = 4,
        latent_dim: int = 128,
        n_heads: int = 4,
        num_attn_blocks: int = 2,
        upsample_factor: int = 4,
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.stride = stride
        self.latent_dim = latent_dim
        self.upsample_factor = upsample_factor

        self.n_heads = n_heads

        self.attnup_feat_cnn = DecoderUpsampler(
            in_channels=self.latent_dim,
            middle_channels=self.latent_dim,
            out_channels=self.latent_dim,
            # stride=self.upsample_factor
        )

        self.cross_blocks = nn.ModuleList(
            [
                UpsampleCrossAttnBlock(
                    latent_dim + 64,
                    latent_dim + 64,
                    num_heads=n_heads,
                    mlp_ratio=4,
                    flash=False
                )
                for _ in range(num_attn_blocks)
            ]
        )

        self.flow_mlp = nn.Sequential(
            nn.Conv2d(3 * 32, 128, 7, padding=3),
            nn.ReLU(),
            nn.Conv2d(128, 64, 3, padding=1),
            nn.ReLU(),
        )

        self.out = nn.Linear(latent_dim + 64, kernel_size * kernel_size, bias=True)

        alibi_slope = get_alibi_slope(n_heads // 2)
        grid_kernel = get_grid(kernel_size, kernel_size, normalize=False).reshape(kernel_size, kernel_size, 2)
        grid_kernel = grid_kernel - (kernel_size - 1) / 2
        grid_kernel = -torch.abs(grid_kernel)
        alibi_bias = torch.cat([
            alibi_slope.view(-1, 1, 1) * grid_kernel[..., 0].view(1, kernel_size, kernel_size),
            alibi_slope.view(-1, 1, 1) * grid_kernel[..., 1].view(1, kernel_size, kernel_size)
        ])  # n_heads, kernel_size, kernel_size

        self.register_buffer("alibi_bias", alibi_bias)

    def forward(
        self,
        feat_map: Float[Tensor, "b c1 h w"],
        flow_map: Float[Tensor, "b c2 h w"],
    ):
        B = feat_map.shape[0]
        H_down, W_down = feat_map.shape[-2:]

        feat_map_up = self.attnup_feat_cnn(feat_map)  # learnable upsample by 4
        if self.upsample_factor != 4:
            additional_scale = float(self.upsample_factor / 4)
            if additional_scale > 1:
                feat_map_up = F.interpolate(feat_map_up, scale_factor=additional_scale, mode='bilinear', align_corners=False)
            else:
                feat_map_up = F.interpolate(feat_map_up, scale_factor=additional_scale, mode='nearest')

        feat_map_down = feat_map

        pad_val = (self.kernel_size - 1) // 2

        attn_bias = self.alibi_bias.view(1, self.n_heads, self.kernel_size**2, 1, 1).repeat(B, 1, 1, H_down * self.upsample_factor, W_down * self.upsample_factor)
        attn_bias = rearrange(attn_bias, "b k n h w -> (b h w) k 1 n")

        # NOTE prepare context (low-reso feat)
        context = feat_map_down
        context = F.unfold(context, kernel_size=self.kernel_size, padding=pad_val)  # B C*kernel**2 H W
        context = rearrange(context, 'b c (h w) -> b c h w', h=H_down, w=W_down)
        context = F.interpolate(context, scale_factor=self.upsample_factor, mode='nearest')  # B C*kernel**2 H*4 W*4
        context = rearrange(context, 'b (c i j) h w -> (b h w) (i j) c', i=self.kernel_size, j=self.kernel_size)

        # NOTE prepare queries (high-reso feat)
        x = feat_map_up
        x = rearrange(x, 'b c h w -> (b h w) 1 c')

        assert flow_map.shape[-2:] == feat_map.shape[-2:]

        flow_map = rearrange(flow_map, 'b t c h w -> b (t c) h w')
        flow_map = self.flow_mlp(flow_map)

        nn_flow_map = F.unfold(flow_map, kernel_size=self.kernel_size, padding=pad_val)  # B C*kernel**2 H W
        nn_flow_map = rearrange(nn_flow_map, 'b c (h w) -> b c h w', h=H_down, w=W_down)
        nn_flow_map = F.interpolate(nn_flow_map, scale_factor=self.upsample_factor, mode='nearest')  # B C*kernel**2 H*4 W*4
        nn_flow_map = rearrange(nn_flow_map, 'b (c i j) h w -> (b h w) (i j) c', i=self.kernel_size, j=self.kernel_size)
        up_flow_map = F.interpolate(flow_map, scale_factor=self.upsample_factor, mode="nearest")  # NN up # b 2 h w
        up_flow_map = rearrange(up_flow_map, 'b c h w -> (b h w) 1 c')
        context = torch.cat([context, nn_flow_map], dim=-1)
        x = torch.cat([x, up_flow_map], dim=-1)
        for lvl in range(len(self.cross_blocks)):
            x = self.cross_blocks[lvl](x, context, attn_bias)

        mask_out = self.out(x)
        mask_out = F.softmax(mask_out, dim=-1)
        mask_out = rearrange(mask_out, '(b h w) 1 c -> b c h w', h=H_down * self.upsample_factor, w=W_down * self.upsample_factor)

        return mask_out
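A hedged usage sketch of `UpsampleTransformerAlibi` (not part of the commit). The flow input is assumed to be a 5D tensor whose time and channel axes multiply to 3*32 = 96, matching the first `nn.Conv2d(3*32, ...)` of `flow_mlp`; the other shapes below are arbitrary.

```python
model = UpsampleTransformerAlibi(kernel_size=3, latent_dim=128, n_heads=4, upsample_factor=4)
feat_map = torch.randn(1, 128, 24, 32)     # low-resolution feature map (B, latent_dim, H, W)
flow_map = torch.randn(1, 32, 3, 24, 32)   # (B, T, C, H, W); flattened to 96 channels inside forward
mask = model(feat_map, flow_map)           # (1, 9, 96, 128): softmax weights over the 3x3 kernel
```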
models/SpaTrackV2/models/tracker3D/spatrack_modules/alignment.py
ADDED
@@ -0,0 +1,471 @@
from typing import *
import math
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.types
import utils3d

from models.SpaTrackV2.models.tracker3D.spatrack_modules.geometry_torch import (
    weighted_mean,
    harmonic_mean,
    geometric_mean,
    mask_aware_nearest_resize,
    normalized_view_plane_uv,
    angle_diff_vec3
)

def scatter_min(size: int, dim: int, index: torch.LongTensor, src: torch.Tensor) -> torch.return_types.min:
    "Scatter-reduce the minimum of `src` along the given dimension into a tensor of length `size`, at the indices specified in `index`."
    shape = src.shape[:dim] + (size,) + src.shape[dim + 1:]
    minimum = torch.full(shape, float('inf'), dtype=src.dtype, device=src.device).scatter_reduce(dim=dim, index=index, src=src, reduce='amin', include_self=False)
    minimum_where = torch.where(src == torch.gather(minimum, dim=dim, index=index))
    indices = torch.full(shape, -1, dtype=torch.long, device=src.device)
    indices[(*minimum_where[:dim], index[minimum_where], *minimum_where[dim + 1:])] = minimum_where[dim]
    return torch.return_types.min((minimum, indices))


def split_batch_fwd(fn: Callable, chunk_size: int, *args, **kwargs):
    batch_size = next(x for x in (*args, *kwargs.values()) if isinstance(x, torch.Tensor)).shape[0]
    n_chunks = batch_size // chunk_size + (batch_size % chunk_size > 0)
    splited_args = tuple(arg.split(chunk_size, dim=0) if isinstance(arg, torch.Tensor) else [arg] * n_chunks for arg in args)
    splited_kwargs = {k: v.split(chunk_size, dim=0) if isinstance(v, torch.Tensor) else [v] * n_chunks for k, v in kwargs.items()}
    results = []
    for i in range(n_chunks):
        chunk_args = tuple(arg[i] for arg in splited_args)
        chunk_kwargs = {k: v[i] for k, v in splited_kwargs.items()}
        results.append(fn(*chunk_args, **chunk_kwargs))

    if isinstance(results[0], tuple):
        return tuple(torch.cat(r, dim=0) for r in zip(*results))
    else:
        return torch.cat(results, dim=0)
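A small illustration of `scatter_min` (not part of the commit): it returns both the per-bucket minimum of `src` and the position inside `src` where that minimum was found.

```python
src = torch.tensor([3.0, 1.0, 5.0, 2.0])
index = torch.tensor([0, 0, 1, 1])
values, argmins = scatter_min(size=2, dim=0, index=index, src=src)
# values  -> tensor([1., 2.])  (minimum of each bucket)
# argmins -> tensor([1, 3])    (positions of those minima in src)
```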
def _pad_inf(x_: torch.Tensor):
    return torch.cat([torch.full_like(x_[..., :1], -torch.inf), x_, torch.full_like(x_[..., :1], torch.inf)], dim=-1)


def _pad_cumsum(cumsum: torch.Tensor):
    return torch.cat([torch.zeros_like(cumsum[..., :1]), cumsum, cumsum[..., -1:]], dim=-1)


def _compute_residual(a: torch.Tensor, xyw: torch.Tensor, trunc: float):
    return a.mul(xyw[..., 0]).sub_(xyw[..., 1]).abs_().mul_(xyw[..., 2]).clamp_max_(trunc).sum(dim=-1)


def align(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor, trunc: Optional[Union[float, torch.Tensor]] = None, eps: float = 1e-7) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
    """
    If trunc is None, solve `min sum_i w_i * |a * x_i - y_i|`, otherwise solve `min sum_i min(trunc, w_i * |a * x_i - y_i|)`.

    w_i must be >= 0.

    ### Parameters:
    - `x`: tensor of shape (..., n)
    - `y`: tensor of shape (..., n)
    - `w`: tensor of shape (..., n)
    - `trunc`: optional, float or tensor of shape (..., n) or None

    ### Returns:
    - `a`: tensor of shape (...), differentiable
    - `loss`: tensor of shape (...), value of loss function at `a`, detached
    - `index`: tensor of shape (...), where a = y[idx] / x[idx]
    """
    if trunc is None:
        x, y, w = torch.broadcast_tensors(x, y, w)
        sign = torch.sign(x)
        x, y = x * sign, y * sign
        y_div_x = y / x.clamp_min(eps)
        y_div_x, argsort = y_div_x.sort(dim=-1)

        wx = torch.gather(x * w, dim=-1, index=argsort)
        derivatives = 2 * wx.cumsum(dim=-1) - wx.sum(dim=-1, keepdim=True)
        search = torch.searchsorted(derivatives, torch.zeros_like(derivatives[..., :1]), side='left').clamp_max(derivatives.shape[-1] - 1)

        a = y_div_x.gather(dim=-1, index=search).squeeze(-1)
        index = argsort.gather(dim=-1, index=search).squeeze(-1)
        loss = (w * (a[..., None] * x - y).abs()).sum(dim=-1)

    else:
        # Reshape to (batch_size, n) for simplicity
        x, y, w = torch.broadcast_tensors(x, y, w)
        batch_shape = x.shape[:-1]
        batch_size = math.prod(batch_shape)
        x, y, w = x.reshape(-1, x.shape[-1]), y.reshape(-1, y.shape[-1]), w.reshape(-1, w.shape[-1])

        sign = torch.sign(x)
        x, y = x * sign, y * sign
        wx, wy = w * x, w * y
        xyw = torch.stack([x, y, w], dim=-1)  # Stacked for convenient gathering

        y_div_x = A = y / x.clamp_min(eps)
        B = (wy - trunc) / wx.clamp_min(eps)
        C = (wy + trunc) / wx.clamp_min(eps)
        with torch.no_grad():
            # Calculate prefix sums in the orders of A, B, C
            A, A_argsort = A.sort(dim=-1)
            Q_A = torch.cumsum(torch.gather(wx, dim=-1, index=A_argsort), dim=-1)
            A, Q_A = _pad_inf(A), _pad_cumsum(Q_A)  # Pad [-inf, A1, ..., An, inf] and [0, Q1, ..., Qn, Qn] to handle edge cases.

            B, B_argsort = B.sort(dim=-1)
            Q_B = torch.cumsum(torch.gather(wx, dim=-1, index=B_argsort), dim=-1)
            B, Q_B = _pad_inf(B), _pad_cumsum(Q_B)

            C, C_argsort = C.sort(dim=-1)
            Q_C = torch.cumsum(torch.gather(wx, dim=-1, index=C_argsort), dim=-1)
            C, Q_C = _pad_inf(C), _pad_cumsum(Q_C)

            # Calculate left and right derivatives at A
            j_A = torch.searchsorted(A, y_div_x, side='left').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='left').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='left').sub_(1)
            left_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)
            j_A = torch.searchsorted(A, y_div_x, side='right').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='right').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='right').sub_(1)
            right_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)

            # Find extrema
            is_extrema = (left_derivative < 0) & (right_derivative >= 0)
            is_extrema[..., 0] |= ~is_extrema.any(dim=-1)  # In case all derivatives are zero, take the first one as extrema.
            where_extrema_batch, where_extrema_index = torch.where(is_extrema)

            # Calculate objective value at extrema
            extrema_a = y_div_x[where_extrema_batch, where_extrema_index]  # (num_extrema,)
            MAX_ELEMENTS = 4096 ** 2  # Split into small batches to avoid OOM in case there are too many extrema. (~1G)
            SPLIT_SIZE = MAX_ELEMENTS // x.shape[-1]
            extrema_value = torch.cat([
                _compute_residual(extrema_a_split[:, None], xyw[extrema_i_split, :, :], trunc)
                for extrema_a_split, extrema_i_split in zip(extrema_a.split(SPLIT_SIZE), where_extrema_batch.split(SPLIT_SIZE))
            ])  # (num_extrema,)

        # Find minima among corresponding extrema
        minima, indices = scatter_min(size=batch_size, dim=0, index=where_extrema_batch, src=extrema_value)  # (batch_size,)
        index = where_extrema_index[indices]

        a = torch.gather(y, dim=-1, index=index[..., None]) / torch.gather(x, dim=-1, index=index[..., None]).clamp_min(eps)
        a = a.reshape(batch_shape)
        loss = minima.reshape(batch_shape)
        index = index.reshape(batch_shape)

    return a, loss, index
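A worked example of `align` (illustrative, not from the commit): with unit weights and `trunc=None`, the optimal scale is a weighted median of the ratios `y_i / x_i`, which makes the fit robust to a single outlier.

```python
x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
y = torch.tensor([[2.0, 4.0, 6.0, 100.0]])   # true scale 2, last pair is an outlier
a, loss, idx = align(x, y, torch.ones_like(x))
# a ≈ 2.0: the L1 / weighted-median fit ignores the outlier,
# whereas a least-squares fit would be pulled far above 2.
```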
def align_depth_scale(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `depth_src` to `depth_tgt` with a single scale, using the given constant weights.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...)
    """
    scale, _, _ = align(depth_src, depth_tgt, weight, trunc)

    return scale


def align_depth_affine(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `depth_src` to `depth_tgt` with a scale and shift, using the given constant weights.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight: torch.Tensor` of shape (..., N)
    - `trunc: float` or tensor of shape (..., N) or None

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (...).
    """
    dtype, device = depth_src.dtype, depth_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = depth_src.shape[:-1], depth_src.shape[-1]
    batch_size = math.prod(batch_shape)
    depth_src, depth_tgt, weight = depth_src.reshape(batch_size, n), depth_tgt.reshape(batch_size, n), weight.reshape(batch_size, n)

    # Here, we take anchors only for non-zero weights.
    # Although the results would still be correct even if anchor points had zero weight,
    # it wastes computation and may cause instability in some cases, e.g. too many extrema.
    anchors_where_batch, anchors_where_n = torch.where(weight > 0)

    # Stop gradient when solving optimal anchors
    with torch.no_grad():
        depth_src_anchor = depth_src[anchors_where_batch, anchors_where_n]  # (anchors,)
        depth_tgt_anchor = depth_tgt[anchors_where_batch, anchors_where_n]  # (anchors,)

        depth_src_anchored = depth_src[anchors_where_batch, :] - depth_src_anchor[..., None]  # (anchors, n)
        depth_tgt_anchored = depth_tgt[anchors_where_batch, :] - depth_tgt_anchor[..., None]  # (anchors, n)
        weight_anchored = weight[anchors_where_batch, :]  # (anchors, n)

        scale, loss, index = align(depth_src_anchored, depth_tgt_anchored, weight_anchored, trunc)  # (anchors,)

        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchors_where_batch, src=loss)  # (batch_size,)

    # Reproduce by indexing for a shorter compute graph
    index_1 = anchors_where_n[index_anchor]  # (batch_size,)
    index_2 = index[index_anchor]  # (batch_size,)

    tgt_1, src_1 = torch.gather(depth_tgt, dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(depth_tgt, dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1e-7)
    shift = tgt_1 - scale * src_1

    scale, shift = scale.reshape(batch_shape), shift.reshape(batch_shape)

    return scale, shift

def align_depth_affine_irls(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], max_iter: int = 100, eps: float = 1e-12):
    """
    Align `depth_src` to `depth_tgt` with given constant weights using IRLS.
    """
    dtype, device = depth_src.dtype, depth_src.device

    w = weight
    x = torch.stack([depth_src, torch.ones_like(depth_src)], dim=-1)
    y = depth_tgt

    for i in range(max_iter):
        beta = (x.transpose(-1, -2) @ (w * y)) @ (x.transpose(-1, -2) @ (w[..., None] * x)).inverse().transpose(-2, -1)
        w = 1 / (y - (x @ beta[..., None])[..., 0]).abs().clamp_min(eps)

    return beta[..., 0], beta[..., 1]
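An illustrative use of `align_depth_affine` (not part of the commit): recover a per-sample scale and shift that maps an affine-invariant depth prediction onto reference depth, then apply it.

```python
pred = torch.rand(2, 1024)             # predicted depth, flattened to (B, N)
gt = 3.0 * pred + 0.5                  # reference depth (here an exact affine transform)
weight = torch.ones_like(pred)         # e.g. a validity mask or confidence
scale, shift = align_depth_affine(pred, gt, weight, trunc=1.0)
aligned = scale[..., None] * pred + shift[..., None]   # scale ≈ 3.0, shift ≈ 0.5
```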
def align_points_scale(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `a: torch.Tensor` of shape (...). Only positive solutions are guaranteed. You should filter out negative scales before using it.
    - `b: torch.Tensor` of shape (...)
    """
    dtype, device = points_src.dtype, points_src.device

    scale, _, _ = align(points_src.flatten(-2), points_tgt.flatten(-2), weight[..., None].expand_as(points_src).flatten(-2), trunc)

    return scale


def align_points_scale_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `points_src` to `points_tgt` with respect to a shared xyz scale and z shift.
    It is similar to `align_affine` but scale and shift are applied to different dimensions.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3). x and y shifts are zeros.
    """
    dtype, device = points_src.dtype, points_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)
    with torch.no_grad():
        zeros = torch.zeros(anchor_where_batch.shape[0], device=device, dtype=dtype)
        points_src_anchor = torch.stack([zeros, zeros, points_src[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)
        points_tgt_anchor = torch.stack([zeros, zeros, points_tgt[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)

        points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
        points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
        weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

        # Solve optimal scale and shift for each anchor
        MAX_ELEMENTS = 2 ** 20
        scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // n, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

    # Reproduce by indexing for a shorter compute graph
    index_2 = index[index_anchor]  # (batch_size,) in [0, 3n)
    index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) in [0, 3n)

    zeros = torch.zeros((batch_size, n), device=device, dtype=dtype)
    points_tgt_00z, points_src_00z = torch.stack([zeros, zeros, points_tgt[..., 2]], dim=-1), torch.stack([zeros, zeros, points_src[..., 2]], dim=-1)
    tgt_1, src_1 = torch.gather(points_tgt_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_src_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    shift = torch.gather(points_tgt_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)
    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift


def align_points_scale_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a shared xyz scale and xyz shift.
    It is similar to `align_affine` but scale and shift are applied to different dimensions.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)

    with torch.no_grad():
        points_src_anchor = points_src[anchor_where_batch, anchor_where_n]  # (anchors, 3)
        points_tgt_anchor = points_tgt[anchor_where_batch, anchor_where_n]  # (anchors, 3)

        points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
        points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
        weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

        # Solve optimal scale and shift for each anchor
        MAX_ELEMENTS = 2 ** 20
        scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // 2, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

        # Get optimal scale and shift for each batch element
        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

    index_2 = index[index_anchor]  # (batch_size,) in [0, 3n)
    index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) in [0, 3n)

    src_1, tgt_1 = torch.gather(points_src.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    src_2, tgt_2 = torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    shift = torch.gather(points_tgt, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)

    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift


def align_points_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a Z-axis shift.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3). Only the z component is non-zero.
    """
    dtype, device = points_src.dtype, points_src.device

    shift, _, _ = align(torch.ones_like(points_src[..., 2]), points_tgt[..., 2] - points_src[..., 2], weight, trunc)
    shift = torch.stack([torch.zeros_like(shift), torch.zeros_like(shift), shift], dim=-1)

    return shift


def align_points_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a per-axis (x, y, z) shift.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device

    shift, _, _ = align(torch.ones_like(points_src).swapaxes(-2, -1), (points_tgt - points_src).swapaxes(-2, -1), weight[..., None, :], trunc)

    return shift


def align_affine_lstsq(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Solve `min sum_i w_i * (a * x_i + b - y_i) ^ 2`, where `a` and `b` are scalars, with respect to `a` and `b` using least squares.

    ### Parameters:
    - `x: torch.Tensor` of shape (..., N)
    - `y: torch.Tensor` of shape (..., N)
    - `w: torch.Tensor` of shape (..., N)

    ### Returns:
    - `a: torch.Tensor` of shape (...,)
    - `b: torch.Tensor` of shape (...,)
    """
    w_sqrt = torch.ones_like(x) if w is None else w.sqrt()
    A = torch.stack([w_sqrt * x, torch.ones_like(x)], dim=-1)
    B = (w_sqrt * y)[..., None]
    a, b = torch.linalg.lstsq(A, B)[0].squeeze(-1).unbind(-1)
    return a, b

def _smooth(err: torch.FloatTensor, beta: float = 0.0) -> torch.FloatTensor:
    if beta == 0:
        return err
    else:
        return torch.where(err < beta, 0.5 * err.square() / beta, err - 0.5 * beta)

def affine_invariant_global_loss(
    pred_points: torch.Tensor,
    gt_points: torch.Tensor,
    mask: torch.Tensor,
    align_resolution: int = 64,
    beta: float = 0.0,
    trunc: float = 1.0,
    sparsity_aware: bool = False,
    only_align: bool = False
):
    device = pred_points.device

    # Align
    (pred_points_lr, gt_points_lr), lr_mask = mask_aware_nearest_resize((pred_points, gt_points), mask=mask, size=(align_resolution, align_resolution))
    scale, shift = align_points_scale_z_shift(pred_points_lr.flatten(-3, -2), gt_points_lr.flatten(-3, -2), lr_mask.flatten(-2, -1) / gt_points_lr[..., 2].flatten(-2, -1).clamp_min(1e-2), trunc=trunc)
    valid = scale > 0
    scale, shift = torch.where(valid, scale, 0), torch.where(valid[..., None], shift, 0)

    pred_points = scale[..., None, None, None] * pred_points + shift[..., None, None, :]
    if only_align:
        return pred_points, scale, shift
    # Compute loss
    weight = (valid[..., None, None] & mask).float() / gt_points[..., 2].clamp_min(1e-5)
    weight = weight.clamp_max(10.0 * weighted_mean(weight, mask, dim=(-2, -1), keepdim=True))  # In case your data contains extremely small depth values
    loss = _smooth((pred_points - gt_points).abs() * weight[..., None], beta=beta).mean(dim=(-3, -2, -1))

    if sparsity_aware:
        # Reweighting improves performance on sparse depth data. NOTE: this is not used in MoGe-1.
        sparsity = mask.float().mean(dim=(-2, -1)) / lr_mask.float().mean(dim=(-2, -1))
        loss = loss / (sparsity + 1e-7)

    err = (pred_points.detach() - gt_points).norm(dim=-1) / gt_points[..., 2]

    # Record any scalar metric
    misc = {
        'truncated_error': weighted_mean(err.clamp_max(1.0), mask).item(),
        'delta': weighted_mean((err < 1).float(), mask).item()
    }

    return loss, misc, scale.detach(), shift.detach()
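A hedged usage sketch of `affine_invariant_global_loss` (not part of the commit). Point maps are assumed to be channel-last `(B, H, W, 3)` with a boolean validity mask, following the reductions inside the function; `mask_aware_nearest_resize` comes from the geometry_torch module imported at the top of this file.

```python
pred_points = torch.rand(2, 64, 64, 3, requires_grad=True)   # predicted point map
gt_points = torch.rand(2, 64, 64, 3)                          # reference point map
mask = torch.ones(2, 64, 64, dtype=torch.bool)                # valid-pixel mask
loss, misc, scale, shift = affine_invariant_global_loss(pred_points, gt_points, mask)
loss.mean().backward()   # gradients flow through the aligned prediction, not the alignment solve
```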
models/SpaTrackV2/models/tracker3D/spatrack_modules/ba.py
ADDED
@@ -0,0 +1,538 @@
import pycolmap
import torch
import numpy as np
import pyceres
from pyceres import SolverOptions, LinearSolverType, PreconditionerType, TrustRegionStrategyType, LoggingType
import logging
from scipy.spatial.transform import Rotation as R

# configure logging and make sure it prints to the console
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_static_from_3DTracks(world_tracks, dyn_prob,
                                 query_3d_pts, vis_est, tracks2d, img_size, K=100, maintain_invisb=False):
    """
    world_tracks: B T N 3   coarse 3d tracks in world coordinates
    dyn_prob:     B T N     dynamic probability of the 3d tracks
    query_3d_pts: B T N 3   query 3d points in world coordinates (coarse, from camera pose)
    vis_est:      B T N     visibility of the 3d tracks
    tracks2d:     B T N 2   2d tracks
    K:            int       number of top static points to keep
    """
    B, T, N, _ = world_tracks.shape
    static_msk = (dyn_prob < 0.5).bool()
    world_tracks_static = world_tracks[:, :, static_msk.squeeze(), :]
    query_3d_pts_static = query_3d_pts[:, static_msk.squeeze(), :]
    if maintain_invisb:
        vis = (tracks2d[..., 0] > 0).bool() * (tracks2d[..., 1] > 0).bool()
        vis_mask = vis * (img_size[1] > tracks2d[..., 0]) * (img_size[0] > tracks2d[..., 1])
        vis_mask = vis_mask[:, :, static_msk.squeeze()]
    else:
        vis_mask = (vis_est > 0.5).bool()[:, :, static_msk.squeeze()]
    tracks2d_static = tracks2d[:, :, static_msk.squeeze(), :]
    world_tracks_static = (world_tracks_static * vis_mask[..., None]).sum(dim=1) / (vis_mask.sum(dim=1)[..., None] + 1e-6)
    # get the distance between the query_3d_pts_static and the world_tracks_static
    dist = (query_3d_pts_static - world_tracks_static).norm(dim=-1)
    # get the top K static points, which have the smallest distance
    topk_idx = torch.argsort(dist, dim=-1)[:, :K]
    world_tracks_static = world_tracks_static[torch.arange(B)[:, None, None], topk_idx]
    query_3d_pts_static = query_3d_pts_static[torch.arange(B)[:, None, None], topk_idx]
    # get the visible selected
    vis_mask_static = vis_mask[:, :, topk_idx.squeeze()]
    tracks2d_static = tracks2d_static[:, :, topk_idx.squeeze(), :]

    return world_tracks_static, static_msk, topk_idx, vis_mask_static, tracks2d_static
+
def log_ba_summary(summary):
|
| 47 |
+
logging.info(f"Residuals : {summary.num_residuals_reduced}")
|
| 48 |
+
if summary.num_residuals_reduced > 0:
|
| 49 |
+
logging.info(f"Parameters : {summary.num_effective_parameters_reduced}")
|
| 50 |
+
logging.info(
|
| 51 |
+
f"Iterations : {summary.num_successful_steps + summary.num_unsuccessful_steps}"
|
| 52 |
+
)
|
| 53 |
+
logging.info(f"Time : {summary.total_time_in_seconds} [s]")
|
| 54 |
+
logging.info(
|
| 55 |
+
f"Initial cost : {np.sqrt(summary.initial_cost / summary.num_residuals_reduced)} [px]"
|
| 56 |
+
)
|
| 57 |
+
logging.info(
|
| 58 |
+
f"Final cost : {np.sqrt(summary.final_cost / summary.num_residuals_reduced)} [px]"
|
| 59 |
+
)
|
| 60 |
+
return True
|
| 61 |
+
else:
|
| 62 |
+
print("No residuals reduced")
|
| 63 |
+
return False
|
| 64 |
+
|
| 65 |
+
# def solve_bundle_adjustment(reconstruction, ba_options, ba_config):
|
| 66 |
+
# bundle_adjuster = pycolmap.BundleAdjuster(ba_options, ba_config)
|
| 67 |
+
# bundle_adjuster.set_up_problem(
|
| 68 |
+
# reconstruction, ba_options.create_loss_function()
|
| 69 |
+
# )
|
| 70 |
+
# solver_options = bundle_adjuster.set_up_solver_options(
|
| 71 |
+
# bundle_adjuster.problem, ba_options.solver_options
|
| 72 |
+
# )
|
| 73 |
+
# summary = pyceres.SolverSummary()
|
| 74 |
+
# pyceres.solve(solver_options, bundle_adjuster.problem, summary)
|
| 75 |
+
# return summary
|
| 76 |
+
|
| 77 |
+
def efficient_solver(solver_options, stability_mode=True):
|
| 78 |
+
# Set linear solver to ITERATIVE_SCHUR (using PCG to solve Schur complement)
|
| 79 |
+
solver_options.linear_solver_type = LinearSolverType.ITERATIVE_SCHUR
|
| 80 |
+
|
| 81 |
+
# Set preconditioner (critical for PCG)
|
| 82 |
+
solver_options.preconditioner_type = PreconditionerType.SCHUR_JACOBI
|
| 83 |
+
|
| 84 |
+
# Optimize trust region strategy
|
| 85 |
+
solver_options.trust_region_strategy_type = TrustRegionStrategyType.LEVENBERG_MARQUARDT
|
| 86 |
+
|
| 87 |
+
# Enable multi-threading acceleration
|
| 88 |
+
solver_options.num_threads = 32 # Adjust based on CPU cores
|
| 89 |
+
|
| 90 |
+
if stability_mode:
|
| 91 |
+
# Stability-first configuration
|
| 92 |
+
solver_options.initial_trust_region_radius = 1.0 # Reduce initial step size
|
| 93 |
+
solver_options.max_trust_region_radius = 10.0 # Limit max step size
|
| 94 |
+
solver_options.min_trust_region_radius = 1e-6 # Allow small step convergence
|
| 95 |
+
|
| 96 |
+
# Increase regularization parameters
|
| 97 |
+
solver_options.use_nonmonotonic_steps = True # Allow non-monotonic steps
|
| 98 |
+
solver_options.max_consecutive_nonmonotonic_steps = 10
|
| 99 |
+
|
| 100 |
+
# Adjust iteration termination conditions
|
| 101 |
+
solver_options.max_num_iterations = 100 # Increase max iterations
|
| 102 |
+
solver_options.function_tolerance = 1e-8 # Stricter function convergence
|
| 103 |
+
solver_options.gradient_tolerance = 1e-12 # Stricter gradient convergence
|
| 104 |
+
solver_options.parameter_tolerance = 1e-10 # Stricter parameter convergence
|
| 105 |
+
|
| 106 |
+
# Control PCG iterations and precision
|
| 107 |
+
solver_options.min_linear_solver_iterations = 10
|
| 108 |
+
solver_options.max_linear_solver_iterations = 100
|
| 109 |
+
solver_options.inner_iteration_tolerance = 0.01 # Higher inner iteration precision
|
| 110 |
+
|
| 111 |
+
# Increase damping factor
|
| 112 |
+
solver_options.min_lm_diagonal = 1e-3 # Increase min LM diagonal
|
| 113 |
+
solver_options.max_lm_diagonal = 1e+10 # Limit max LM diagonal
|
| 114 |
+
|
| 115 |
+
# Enable parameter change limits
|
| 116 |
+
solver_options.update_state_every_iteration = True # Update state each iteration
|
| 117 |
+
|
| 118 |
+
else:
|
| 119 |
+
# Efficiency-first configuration (original settings)
|
| 120 |
+
solver_options.initial_trust_region_radius = 10000.0
|
| 121 |
+
solver_options.max_trust_region_radius = 1e+16
|
| 122 |
+
solver_options.max_num_iterations = 50
|
| 123 |
+
solver_options.function_tolerance = 1e-6
|
| 124 |
+
solver_options.gradient_tolerance = 1e-10
|
| 125 |
+
solver_options.parameter_tolerance = 1e-8
|
| 126 |
+
solver_options.min_linear_solver_iterations = 5
|
| 127 |
+
solver_options.max_linear_solver_iterations = 50
|
| 128 |
+
solver_options.inner_iteration_tolerance = 0.1
|
| 129 |
+
|
| 130 |
+
# Enable Jacobi scaling for better numerical stability
|
| 131 |
+
solver_options.jacobi_scaling = True
|
| 132 |
+
|
| 133 |
+
# Disable verbose logging for better performance (enable for debugging)
|
| 134 |
+
solver_options.logging_type = LoggingType.SILENT
|
| 135 |
+
solver_options.minimizer_progress_to_stdout = False
|
| 136 |
+
|
| 137 |
+
return solver_options
|
| 138 |
+
|
| 139 |
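An illustrative sketch of how `efficient_solver` is meant to be used (not part of the commit): it mutates a pyceres `SolverOptions` in place and returns it, so it can be dropped in right before `pyceres.solve`. Constructing `SolverOptions()` directly is an assumption here; in `solve_bundle_adjustment` below the options instead come from `ba_options.create_solver_options`.

```python
options = efficient_solver(SolverOptions(), stability_mode=True)  # SolverOptions imported from pyceres above
summary = pyceres.SolverSummary()
# pyceres.solve(options, problem, summary)   # `problem` would be a pyceres.Problem built elsewhere
# log_ba_summary(summary)
```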
+
class SpatTrackCost_static(pyceres.CostFunction):
|
| 140 |
+
def __init__(self, observed_depth):
|
| 141 |
+
"""
|
| 142 |
+
observed_depth: float
|
| 143 |
+
"""
|
| 144 |
+
super().__init__()
|
| 145 |
+
self.observed_depth = float(observed_depth)
|
| 146 |
+
self.set_num_residuals(1)
|
| 147 |
+
self.set_parameter_block_sizes([4, 3, 3]) # [rotation_quat, translation, xyz]
|
| 148 |
+
|
| 149 |
+
def Evaluate(self, parameters, residuals, jacobians):
|
| 150 |
+
# Unpack parameters
|
| 151 |
+
quat = parameters[0] # shape: (4,) [w, x, y, z]
|
| 152 |
+
t = parameters[1] # shape: (3,)
|
| 153 |
+
point = parameters[2] # shape: (3,)
|
| 154 |
+
|
| 155 |
+
# Convert COLMAP-style quat [w, x, y, z] to scipy format [x, y, z, w]
|
| 156 |
+
r = R.from_quat([quat[1], quat[2], quat[3], quat[0]])
|
| 157 |
+
R_mat = r.as_matrix() # (3, 3)
|
| 158 |
+
|
| 159 |
+
# Transform point to camera frame
|
| 160 |
+
X_cam = R_mat @ point + t
|
| 161 |
+
z = X_cam[2]
|
| 162 |
+
|
| 163 |
+
# Compute residual (normalized depth error)
|
| 164 |
+
residuals[0] = 20.0 * (z - self.observed_depth) / self.observed_depth
|
| 165 |
+
|
| 166 |
+
if jacobians is not None:
|
| 167 |
+
if jacobians[2] is not None:
|
| 168 |
+
# dr/d(point3D): only z-axis matters, so only 3rd row of R
|
| 169 |
+
jacobians[2][0] = 20.0 * R_mat[2, 0] / self.observed_depth
|
| 170 |
+
jacobians[2][1] = 20.0 * R_mat[2, 1] / self.observed_depth
|
| 171 |
+
jacobians[2][2] = 20.0 * R_mat[2, 2] / self.observed_depth
|
| 172 |
+
|
| 173 |
+
if jacobians[1] is not None:
|
| 174 |
+
# dr/dt = ∂residual/∂translation = d(z)/dt = [0, 0, 1]
|
| 175 |
+
jacobians[1][0] = 0.0
|
| 176 |
+
jacobians[1][1] = 0.0
|
| 177 |
+
jacobians[1][2] = 20.0 / self.observed_depth
|
| 178 |
+
|
| 179 |
+
if jacobians[0] is not None:
|
| 180 |
+
# Optional: dr/d(quat) — not trivial to derive, can be left for autodiff if needed
|
| 181 |
+
# Set zero for now (not ideal but legal)
|
| 182 |
+
jacobians[0][:] = 0.0
|
| 183 |
+
|
| 184 |
+
return True
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class SpatTrackCost_dynamic(pyceres.CostFunction):
|
| 188 |
+
|
| 189 |
+
def __init__(self, observed_uv, image, point3D, camera):
|
| 190 |
+
"""
|
| 191 |
+
observed_uv: 1 1 K 2 this is the 2d tracks
|
| 192 |
+
image: pycolmap.Image object
|
| 193 |
+
point3D: pycolmap.Point3D object
|
| 194 |
+
camera: pycolmap.Camera object
|
| 195 |
+
"""
|
| 196 |
+
sizes = [image.cam_from_world.params.shape[0], point3D.xyz.shape[0], camera.params.shape[0]]
|
| 197 |
+
super().__init__(self, residual_size=2, parameter_block_sizes=sizes)
|
| 198 |
+
self.observed_uv = observed_uv
|
| 199 |
+
self.image = image
|
| 200 |
+
self.point3D = point3D
|
| 201 |
+
self.camera = camera
|
| 202 |
+
|
| 203 |
+
def solve_bundle_adjustment(reconstruction, ba_options,
|
| 204 |
+
ba_config=None, extra_residual=None):
|
| 205 |
+
"""
|
| 206 |
+
Perform bundle adjustment optimization (compatible with pycolmap 0.5+)
|
| 207 |
+
|
| 208 |
+
Args:
|
| 209 |
+
reconstruction: pycolmap.Reconstruction object
|
| 210 |
+
ba_options: pycolmap.BundleAdjustmentOptions object
|
| 211 |
+
ba_config: pycolmap.BundleAdjustmentConfig object (optional)
|
| 212 |
+
"""
|
| 213 |
+
# Alternatively, you can customize the existing problem or options as:
|
| 214 |
+
# import pyceres
|
| 215 |
+
bundle_adjuster = pycolmap.create_default_bundle_adjuster(
|
| 216 |
+
ba_options, ba_config, reconstruction
|
| 217 |
+
)
|
| 218 |
+
solver_options = ba_options.create_solver_options(
|
| 219 |
+
ba_config, bundle_adjuster.problem
|
| 220 |
+
)
|
| 221 |
+
summary = pyceres.SolverSummary()
|
| 222 |
+
solver_options = efficient_solver(solver_options)
|
| 223 |
+
problem = bundle_adjuster.problem
|
| 224 |
+
# problem = pyceres.Problem()
|
| 225 |
+
# if (extra_residual is not None):
|
| 226 |
+
# observed_depths = []
|
| 227 |
+
# quaternions = []
|
| 228 |
+
# translations = []
|
| 229 |
+
# points3d = []
|
| 230 |
+
# for res_ in extra_residual:
|
| 231 |
+
# point_id_i = res_["point3D_id"]
|
| 232 |
+
# for img_id_i, obs_depth_i in zip(res_["image_ids"], res_["observed_depth"]):
|
| 233 |
+
# if obs_depth_i > 0:
|
| 234 |
+
# observed_depths.append(obs_depth_i)
|
| 235 |
+
# quaternions.append(reconstruction.images[img_id_i].cam_from_world.rotation.quat)
|
| 236 |
+
# translations.append(reconstruction.images[img_id_i].cam_from_world.translation)
|
| 237 |
+
# points3d.append(reconstruction.points3D[point_id_i].xyz)
|
| 238 |
+
# pyceres.add_spatrack_static_problem(
|
| 239 |
+
# problem,
|
| 240 |
+
# observed_depths,
|
| 241 |
+
# quaternions,
|
| 242 |
+
# translations,
|
| 243 |
+
# points3d,
|
| 244 |
+
# huber_loss_delta=5.0
|
| 245 |
+
# )
|
| 246 |
+
|
| 247 |
+
pyceres.solve(solver_options, problem, summary)
|
| 248 |
+
|
| 249 |
+
return summary
|
| 250 |
+
|
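A hedged end-to-end sketch (not part of the commit). It assumes `recon` is a populated `pycolmap.Reconstruction`, for example the output of `batch_matrix_to_pycolmap` defined just below (its return statement lies outside this hunk), and leaves `ba_config` and `extra_residual` at their defaults.

```python
ba_options = pycolmap.BundleAdjustmentOptions()
summary = solve_bundle_adjustment(recon, ba_options)
log_ba_summary(summary)
```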
def batch_matrix_to_pycolmap(
    points3d,
    extrinsics,
    intrinsics,
    tracks,
    masks,
    image_size,
    max_points3D_val=3000,
    shared_camera=False,
    camera_type="SIMPLE_PINHOLE",
    extra_params=None,
    cam_tracks_static=None,
    query_pts=None,
):
    """
    Convert Batched Pytorch Tensors to PyCOLMAP

    Check https://github.com/colmap/pycolmap for more details about its format
    """
    # points3d: Px3
    # extrinsics: Nx3x4
    # intrinsics: Nx3x3
    # tracks: NxPx2
    # masks: NxP
    # image_size: 2, assume all the frames have been padded to the same size
    # where N is the number of frames and P is the number of tracks

    N, P, _ = tracks.shape
    assert len(extrinsics) == N
    assert len(intrinsics) == N
    assert len(points3d) == P
    assert image_size.shape[0] == 2

    extrinsics = extrinsics.cpu().numpy()
    intrinsics = intrinsics.cpu().numpy()

    if extra_params is not None:
        extra_params = extra_params.cpu().numpy()

    tracks = tracks.cpu().numpy()
    masks = masks.cpu().numpy()
    points3d = points3d.cpu().numpy()
    image_size = image_size.cpu().numpy()
    if cam_tracks_static is not None:
        cam_tracks_static = cam_tracks_static.cpu().numpy()

    # Reconstruction object, following the format of PyCOLMAP/COLMAP
    reconstruction = pycolmap.Reconstruction()

    inlier_num = masks.sum(0)
    valid_mask = inlier_num >= 2  # a track is invalid without at least two inliers
    valid_idx = np.nonzero(valid_mask)[0]

    # Only add 3D points that have sufficient 2D points
    point3d_ids = []
    for vidx in valid_idx:
        point3d_id = reconstruction.add_point3D(
            points3d[vidx], pycolmap.Track(), np.zeros(3)
        )
        point3d_ids.append(point3d_id)

    # add the residual pair
    if cam_tracks_static is not None:
        extra_residual = []
        for id_x, vidx in enumerate(valid_idx):
            points_3d_id = point3d_ids[id_x]
            point_residual = {
                "point3D_id": points_3d_id,
                "image_ids": [],
                "observed_depth": [],
            }
            query_i = query_pts[:, :, vidx]
            point_residual["image_ids"].append(int(query_i[0, 0, 0]))
            point_residual["observed_depth"].append(query_i[0, 0, -1])
            extra_residual.append(point_residual)
    else:
        extra_residual = None

    num_points3D = len(valid_idx)

    camera = None
    # frame idx
    for fidx in range(N):
        # set camera
        if camera is None or (not shared_camera):
            if camera_type == "SIMPLE_RADIAL":
                pycolmap_intri = np.array(
                    [
                        intrinsics[fidx][0, 0],
                        intrinsics[fidx][0, 2],
                        intrinsics[fidx][1, 2],
                        extra_params[fidx][0],
                    ]
                )
            elif camera_type == "SIMPLE_PINHOLE":
                pycolmap_intri = np.array(
                    [
                        intrinsics[fidx][0, 0],
                        intrinsics[fidx][0, 2],
                        intrinsics[fidx][1, 2],
                    ]
                )
            else:
                raise ValueError(
                    f"Camera type {camera_type} is not supported yet"
                )

            camera = pycolmap.Camera(
                model=camera_type,
                width=image_size[0],
                height=image_size[1],
                params=pycolmap_intri,
                camera_id=fidx,
            )

            # add camera
            reconstruction.add_camera(camera)

        # set image
        cam_from_world = pycolmap.Rigid3d(
            pycolmap.Rotation3d(extrinsics[fidx][:3, :3]),
            extrinsics[fidx][:3, 3],
        )  # Rot and Trans
        image = pycolmap.Image(
            id=fidx,
            name=f"image_{fidx}",
            camera_id=camera.camera_id,
            cam_from_world=cam_from_world,
        )

        points2D_list = []

        point2D_idx = 0
        # NOTE point3D_id start by 1
        for point3D_id in range(1, num_points3D + 1):
            original_track_idx = valid_idx[point3D_id - 1]

            if (
                reconstruction.points3D[point3D_id].xyz < max_points3D_val
            ).all():
                if masks[fidx][original_track_idx]:
                    # It seems we don't need +0.5 for BA
                    point2D_xy = tracks[fidx][original_track_idx]
                    # Please note when adding the Point2D object
                    # It not only requires the 2D xy location, but also the id to 3D point
                    points2D_list.append(
                        pycolmap.Point2D(point2D_xy, point3D_id)
                    )

                    # add element
|
| 401 |
+
track = reconstruction.points3D[point3D_id].track
|
| 402 |
+
track.add_element(fidx, point2D_idx)
|
| 403 |
+
point2D_idx += 1
|
| 404 |
+
|
| 405 |
+
assert point2D_idx == len(points2D_list)
|
| 406 |
+
try:
|
| 407 |
+
image.points2D = pycolmap.ListPoint2D(points2D_list)
|
| 408 |
+
except Exception as e:
|
| 409 |
+
print(f"frame {fidx} is out of BA: {e}")
|
| 410 |
+
|
| 411 |
+
# add image
|
| 412 |
+
reconstruction.add_image(image)
|
| 413 |
+
|
| 414 |
+
return reconstruction, valid_idx, extra_residual
|
| 415 |
+
|
| 416 |
+
def pycolmap_to_batch_matrix(
|
| 417 |
+
reconstruction, device="cuda", camera_type="SIMPLE_PINHOLE"
|
| 418 |
+
):
|
| 419 |
+
"""
|
| 420 |
+
Convert a PyCOLMAP Reconstruction Object to batched PyTorch tensors.
|
| 421 |
+
|
| 422 |
+
Args:
|
| 423 |
+
reconstruction (pycolmap.Reconstruction): The reconstruction object from PyCOLMAP.
|
| 424 |
+
device (str): The device to place the tensors on (default: "cuda").
|
| 425 |
+
camera_type (str): The type of camera model used (default: "SIMPLE_PINHOLE").
|
| 426 |
+
|
| 427 |
+
Returns:
|
| 428 |
+
tuple: A tuple containing points3D, extrinsics, intrinsics, and optionally extra_params.
|
| 429 |
+
"""
|
| 430 |
+
|
| 431 |
+
num_images = len(reconstruction.images)
|
| 432 |
+
max_points3D_id = max(reconstruction.point3D_ids())
|
| 433 |
+
points3D = np.zeros((max_points3D_id, 3))
|
| 434 |
+
|
| 435 |
+
for point3D_id in reconstruction.points3D:
|
| 436 |
+
points3D[point3D_id - 1] = reconstruction.points3D[point3D_id].xyz
|
| 437 |
+
points3D = torch.from_numpy(points3D).to(device)
|
| 438 |
+
|
| 439 |
+
extrinsics = []
|
| 440 |
+
intrinsics = []
|
| 441 |
+
|
| 442 |
+
extra_params = [] if camera_type == "SIMPLE_RADIAL" else None
|
| 443 |
+
|
| 444 |
+
for i in range(num_images):
|
| 445 |
+
# Extract and append extrinsics
|
| 446 |
+
pyimg = reconstruction.images[i]
|
| 447 |
+
pycam = reconstruction.cameras[pyimg.camera_id]
|
| 448 |
+
matrix = pyimg.cam_from_world.matrix()
|
| 449 |
+
extrinsics.append(matrix)
|
| 450 |
+
|
| 451 |
+
# Extract and append intrinsics
|
| 452 |
+
calibration_matrix = pycam.calibration_matrix()
|
| 453 |
+
intrinsics.append(calibration_matrix)
|
| 454 |
+
|
| 455 |
+
if camera_type == "SIMPLE_RADIAL":
|
| 456 |
+
extra_params.append(pycam.params[-1])
|
| 457 |
+
|
| 458 |
+
# Convert lists to torch tensors
|
| 459 |
+
extrinsics = torch.from_numpy(np.stack(extrinsics)).to(device)
|
| 460 |
+
|
| 461 |
+
intrinsics = torch.from_numpy(np.stack(intrinsics)).to(device)
|
| 462 |
+
|
| 463 |
+
if camera_type == "SIMPLE_RADIAL":
|
| 464 |
+
extra_params = torch.from_numpy(np.stack(extra_params)).to(device)
|
| 465 |
+
extra_params = extra_params[:, None]
|
| 466 |
+
|
| 467 |
+
return points3D, extrinsics, intrinsics, extra_params
|
| 468 |
+
|
| 469 |
+
def ba_pycolmap(world_tracks, intrs, c2w_traj, visb, tracks2d, image_size, cam_tracks_static=None, training=True, query_pts=None):
|
| 470 |
+
"""
|
| 471 |
+
world_tracks: 1 1 K 3 this is the coarse 3d tracks in world coordinate (coarse 3d tracks)
|
| 472 |
+
intrs: B T 3 3 this is the intrinsic matrix
|
| 473 |
+
c2w_traj: B T 4 4 this is the camera trajectory
|
| 474 |
+
visb: B T K this is the visibility of the 3d tracks
|
| 475 |
+
tracks2d: B T K 2 this is the 2d tracks
|
| 476 |
+
"""
|
| 477 |
+
with torch.no_grad():
|
| 478 |
+
B, _, K, _ = world_tracks.shape
|
| 479 |
+
T = c2w_traj.shape[1]
|
| 480 |
+
world_tracks = world_tracks.view(K, 3).detach()
|
| 481 |
+
world_tracks_refine = world_tracks.view(K, 3).detach().clone()
|
| 482 |
+
c2w_traj_glob = c2w_traj.view(B*T, 4, 4).detach().clone()
|
| 483 |
+
c2w_traj = c2w_traj.view(B*T, 4, 4).detach()
|
| 484 |
+
intrs = intrs.view(B*T, 3, 3).detach()
|
| 485 |
+
visb = visb.view(B*T, K).detach()
|
| 486 |
+
tracks2d = tracks2d[...,:2].view(B*T, K, 2).detach()
|
| 487 |
+
|
| 488 |
+
rec, valid_idx_pts, extra_residual = batch_matrix_to_pycolmap(
|
| 489 |
+
world_tracks,
|
| 490 |
+
torch.inverse(c2w_traj)[:,:3,:],
|
| 491 |
+
intrs,
|
| 492 |
+
tracks2d,
|
| 493 |
+
visb,
|
| 494 |
+
image_size,
|
| 495 |
+
cam_tracks_static=cam_tracks_static,
|
| 496 |
+
query_pts=query_pts,
|
| 497 |
+
)
|
| 498 |
+
# NOTE It is window_size + 1 instead of window_size
|
| 499 |
+
ba_options = pycolmap.BundleAdjustmentOptions()
|
| 500 |
+
ba_options.refine_focal_length = False
|
| 501 |
+
ba_options.refine_principal_point = False
|
| 502 |
+
ba_options.refine_extra_params = False
|
| 503 |
+
ba_config = pycolmap.BundleAdjustmentConfig()
|
| 504 |
+
for image_id in rec.reg_image_ids():
|
| 505 |
+
ba_config.add_image(image_id)
|
| 506 |
+
# Fix frame 0, i.e, the end frame of the last window
|
| 507 |
+
ba_config.set_constant_cam_pose(0)
|
| 508 |
+
|
| 509 |
+
# fix the 3d points
|
| 510 |
+
for point3D_id in rec.points3D:
|
| 511 |
+
if training:
|
| 512 |
+
# ba_config.add_constant_point(point3D_id)
|
| 513 |
+
ba_config.add_variable_point(point3D_id)
|
| 514 |
+
else:
|
| 515 |
+
ba_config.add_variable_point(point3D_id)
|
| 516 |
+
# ba_config.add_constant_point(point3D_id)
|
| 517 |
+
if (len(ba_config.variable_point3D_ids) < 50) and (len(ba_config.constant_point3D_ids) < 50):
|
| 518 |
+
return c2w_traj_glob, world_tracks_refine, intrs
|
| 519 |
+
summary = solve_bundle_adjustment(rec, ba_options, ba_config, extra_residual=extra_residual)
|
| 520 |
+
# free the 3d points
|
| 521 |
+
# for point3D_id in rec.points3D:
|
| 522 |
+
# ba_config.remove_constant_point(point3D_id)
|
| 523 |
+
# ba_config.add_variable_point(point3D_id)
|
| 524 |
+
# summary = solve_bundle_adjustment(rec, ba_options, ba_config)
|
| 525 |
+
if not training:
|
| 526 |
+
ba_success = log_ba_summary(summary)
|
| 527 |
+
# get the refined results
|
| 528 |
+
points3D, extrinsics, intrinsics, extra_params = pycolmap_to_batch_matrix(rec, device="cuda", camera_type="SIMPLE_PINHOLE")
|
| 529 |
+
c2w_traj_glob[:, :3, :] = extrinsics
|
| 530 |
+
c2w_traj_glob = torch.inverse(c2w_traj_glob)
|
| 531 |
+
world_tracks_refine[valid_idx_pts] = points3D.to(world_tracks_refine.device).to(world_tracks_refine.dtype)
|
| 532 |
+
intrinsics = intrinsics.to(world_tracks_refine.device).to(world_tracks_refine.dtype)
|
| 533 |
+
# import pdb; pdb.set_trace()
|
| 534 |
+
return c2w_traj_glob, world_tracks_refine, intrinsics
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
|
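Note (illustrative, not part of the committed file): a minimal sketch of how the ba_pycolmap wrapper above might be invoked. The tensor shapes follow its docstring (B=1 batch, T frames, K tracks); the random values, the 512x512 image size, and the focal length of 500 are placeholder assumptions, and pycolmap/pyceres must be installed for the solve to run.

import torch

B, T, K = 1, 8, 128
world_tracks = torch.rand(1, 1, K, 3)           # coarse 3D tracks in world coordinates
intrs = torch.eye(3).repeat(B, T, 1, 1) * 500.0  # dummy pinhole intrinsics, fx = fy = 500
intrs[..., 0, 2], intrs[..., 1, 2], intrs[..., 2, 2] = 256.0, 256.0, 1.0
c2w_traj = torch.eye(4).repeat(B, T, 1, 1)       # camera-to-world trajectory per frame
visb = torch.ones(B, T, K, dtype=torch.bool)     # per-frame visibility of each track
tracks2d = torch.rand(B, T, K, 2) * 512          # 2D tracks in pixels
image_size = torch.tensor([512, 512])

# Returns the BA-refined camera trajectory, 3D tracks and intrinsics.
c2w_refined, tracks_refined, intrs_refined = ba_pycolmap(
    world_tracks, intrs, c2w_traj, visb, tracks2d, image_size, training=False
)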
models/SpaTrackV2/models/tracker3D/spatrack_modules/blocks.py
ADDED
@@ -0,0 +1,15 @@
import torch
import torch.nn as nn
import torch.nn.functional as F



class PointDinoV2(nn.Module):
    """
    PointDinoV2 is a 3D point tracking model that uses a backbone and head to extract features from points and track them.
    """
    def __init__(self, ):
        super(PointDinoV2, self).__init__()
        # self.backbone = PointDinoV2Backbone()
        # self.head = PointDinoV2Head()

models/SpaTrackV2/models/tracker3D/spatrack_modules/dynamic_point_refine.py
ADDED
File without changes
models/SpaTrackV2/models/tracker3D/spatrack_modules/geometry_numpy.py
ADDED
@@ -0,0 +1,401 @@
from typing import *
from functools import partial
import math

import cv2
import numpy as np
from scipy.signal import fftconvolve
import numpy as np
import utils3d

from .tools import timeit


def weighted_mean_numpy(x: np.ndarray, w: np.ndarray = None, axis: Union[int, Tuple[int,...]] = None, keepdims: bool = False, eps: float = 1e-7) -> np.ndarray:
    if w is None:
        return np.mean(x, axis=axis)
    else:
        w = w.astype(x.dtype)
        return (x * w).mean(axis=axis) / np.clip(w.mean(axis=axis), eps, None)


def harmonic_mean_numpy(x: np.ndarray, w: np.ndarray = None, axis: Union[int, Tuple[int,...]] = None, keepdims: bool = False, eps: float = 1e-7) -> np.ndarray:
    if w is None:
        return 1 / (1 / np.clip(x, eps, None)).mean(axis=axis)
    else:
        w = w.astype(x.dtype)
        return 1 / (weighted_mean_numpy(1 / (x + eps), w, axis=axis, keepdims=keepdims, eps=eps) + eps)


def normalized_view_plane_uv_numpy(width: int, height: int, aspect_ratio: float = None, dtype: np.dtype = np.float32) -> np.ndarray:
    "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
    if aspect_ratio is None:
        aspect_ratio = width / height

    span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
    span_y = 1 / (1 + aspect_ratio ** 2) ** 0.5

    u = np.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype)
    v = np.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype)
    u, v = np.meshgrid(u, v, indexing='xy')
    uv = np.stack([u, v], axis=-1)
    return uv


def focal_to_fov_numpy(focal: np.ndarray):
    return 2 * np.arctan(0.5 / focal)


def fov_to_focal_numpy(fov: np.ndarray):
    return 0.5 / np.tan(fov / 2)


def intrinsics_to_fov_numpy(intrinsics: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    fov_x = focal_to_fov_numpy(intrinsics[..., 0, 0])
    fov_y = focal_to_fov_numpy(intrinsics[..., 1, 1])
    return fov_x, fov_y


def point_map_to_depth_legacy_numpy(points: np.ndarray):
    height, width = points.shape[-3:-1]
    diagonal = (height ** 2 + width ** 2) ** 0.5
    uv = normalized_view_plane_uv_numpy(width, height, dtype=points.dtype)  # (H, W, 2)
    _, uv = np.broadcast_arrays(points[..., :2], uv)

    # Solve least squares problem
    b = (uv * points[..., 2:]).reshape(*points.shape[:-3], -1)  # (..., H * W * 2)
    A = np.stack([points[..., :2], -uv], axis=-1).reshape(*points.shape[:-3], -1, 2)  # (..., H * W * 2, 2)

    M = A.swapaxes(-2, -1) @ A
    solution = (np.linalg.inv(M + 1e-6 * np.eye(2)) @ (A.swapaxes(-2, -1) @ b[..., None])).squeeze(-1)
    focal, shift = solution

    depth = points[..., 2] + shift[..., None, None]
    fov_x = np.arctan(width / diagonal / focal) * 2
    fov_y = np.arctan(height / diagonal / focal) * 2
    return depth, fov_x, fov_y, shift


def solve_optimal_focal_shift(uv: np.ndarray, xyz: np.ndarray):
    "Solve `min |focal * xy / (z + shift) - uv|` with respect to shift and focal"
    from scipy.optimize import least_squares
    uv, xy, z = uv.reshape(-1, 2), xyz[..., :2].reshape(-1, 2), xyz[..., 2].reshape(-1)

    def fn(uv: np.ndarray, xy: np.ndarray, z: np.ndarray, shift: np.ndarray):
        xy_proj = xy / (z + shift)[: , None]
        f = (xy_proj * uv).sum() / np.square(xy_proj).sum()
        err = (f * xy_proj - uv).ravel()
        return err

    solution = least_squares(partial(fn, uv, xy, z), x0=0, ftol=1e-3, method='lm')
    optim_shift = solution['x'].squeeze().astype(np.float32)

    xy_proj = xy / (z + optim_shift)[: , None]
    optim_focal = (xy_proj * uv).sum() / np.square(xy_proj).sum()

    return optim_shift, optim_focal


def solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray, focal: float):
    "Solve `min |focal * xy / (z + shift) - uv|` with respect to shift"
    from scipy.optimize import least_squares
    uv, xy, z = uv.reshape(-1, 2), xyz[..., :2].reshape(-1, 2), xyz[..., 2].reshape(-1)

    def fn(uv: np.ndarray, xy: np.ndarray, z: np.ndarray, shift: np.ndarray):
        xy_proj = xy / (z + shift)[: , None]
        err = (focal * xy_proj - uv).ravel()
        return err

    solution = least_squares(partial(fn, uv, xy, z), x0=0, ftol=1e-3, method='lm')
    optim_shift = solution['x'].squeeze().astype(np.float32)

    return optim_shift


def recover_focal_shift_numpy(points: np.ndarray, mask: np.ndarray = None, focal: float = None, downsample_size: Tuple[int, int] = (64, 64)):
    import cv2
    assert points.shape[-1] == 3, "Points should (H, W, 3)"

    height, width = points.shape[-3], points.shape[-2]
    diagonal = (height ** 2 + width ** 2) ** 0.5

    uv = normalized_view_plane_uv_numpy(width=width, height=height)

    if mask is None:
        points_lr = cv2.resize(points, downsample_size, interpolation=cv2.INTER_LINEAR).reshape(-1, 3)
        uv_lr = cv2.resize(uv, downsample_size, interpolation=cv2.INTER_LINEAR).reshape(-1, 2)
    else:
        (points_lr, uv_lr), mask_lr = mask_aware_nearest_resize_numpy((points, uv), mask, downsample_size)

    if points_lr.size < 2:
        return 1., 0.

    if focal is None:
        focal, shift = solve_optimal_focal_shift(uv_lr, points_lr)
    else:
        shift = solve_optimal_shift(uv_lr, points_lr, focal)

    return focal, shift


def mask_aware_nearest_resize_numpy(
    inputs: Union[np.ndarray, Tuple[np.ndarray, ...], None],
    mask: np.ndarray,
    size: Tuple[int, int],
    return_index: bool = False
) -> Tuple[Union[np.ndarray, Tuple[np.ndarray, ...], None], np.ndarray, Tuple[np.ndarray, ...]]:
    """
    Resize 2D map by nearest interpolation. Return the nearest neighbor index and mask of the resized map.

    ### Parameters
    - `inputs`: a single or a list of input 2D map(s) of shape (..., H, W, ...).
    - `mask`: input 2D mask of shape (..., H, W)
    - `size`: target size (width, height)

    ### Returns
    - `*resized_maps`: resized map(s) of shape (..., target_height, target_width, ...).
    - `resized_mask`: mask of the resized map of shape (..., target_height, target_width)
    - `nearest_idx`: if return_index is True, nearest neighbor index of the resized map of shape (..., target_height, target_width) for each dimension.
    """
    height, width = mask.shape[-2:]
    target_width, target_height = size
    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f), math.ceil(filter_w_f)
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv
    uv = utils3d.numpy.image_pixel_center(width=width, height=height, dtype=np.float32)
    indices = np.arange(height * width, dtype=np.int32).reshape(height, width)
    padded_uv = np.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=np.float32)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = np.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=bool)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = np.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=np.int32)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.numpy.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, axis=(0, 1))
    windowed_mask = utils3d.numpy.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, axis=(-2, -1))
    windowed_indices = utils3d.numpy.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, axis=(0, 1))

    # Gather the target pixels's local window
    target_centers = utils3d.numpy.image_uv(width=target_width, height=target_height, dtype=np.float32) * np.array([width, height], dtype=np.float32)
    target_lefttop = target_centers - np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_window = np.round(target_lefttop).astype(np.int32) + np.array((padding_w, padding_h), dtype=np.int32)

    target_window_centers = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(*([-1] * (mask.ndim - 2)), target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)

    # Compute nearest neighbor in the local window for each pixel
    dist = np.square(target_window_centers - target_centers[..., None])
    dist = dist[..., 0, :] + dist[..., 1, :]
    dist = np.where(target_window_mask, dist, np.inf)  # (..., target_height, tgt_width, filter_size)
    nearest_in_window = np.argmin(dist, axis=-1, keepdims=True)  # (..., target_height, tgt_width, 1)
    nearest_idx = np.take_along_axis(target_window_indices, nearest_in_window, axis=-1).squeeze(-1)  # (..., target_height, tgt_width)
    nearest_i, nearest_j = nearest_idx // width, nearest_idx % width
    target_mask = np.any(target_window_mask, axis=-1)
    batch_indices = [np.arange(n).reshape([1] * i + [n] + [1] * (mask.ndim - i - 1)) for i, n in enumerate(mask.shape[:-2])]

    index = (*batch_indices, nearest_i, nearest_j)

    if inputs is None:
        outputs = None
    elif isinstance(inputs, np.ndarray):
        outputs = inputs[index]
    elif isinstance(inputs, Sequence):
        outputs = tuple(x[index] for x in inputs)
    else:
        raise ValueError(f'Invalid input type: {type(inputs)}')

    if return_index:
        return outputs, target_mask, index
    else:
        return outputs, target_mask


def mask_aware_area_resize_numpy(image: np.ndarray, mask: np.ndarray, target_width: int, target_height: int) -> Tuple[Tuple[np.ndarray, ...], np.ndarray]:
    """
    Resize 2D map by nearest interpolation. Return the nearest neighbor index and mask of the resized map.

    ### Parameters
    - `image`: Input 2D image of shape (..., H, W, C)
    - `mask`: Input 2D mask of shape (..., H, W)
    - `target_width`: target width of the resized map
    - `target_height`: target height of the resized map

    ### Returns
    - `nearest_idx`: Nearest neighbor index of the resized map of shape (..., target_height, target_width).
    - `target_mask`: Mask of the resized map of shape (..., target_height, target_width)
    """
    height, width = mask.shape[-2:]

    if image.shape[-2:] == (height, width):
        omit_channel_dim = True
    else:
        omit_channel_dim = False
    if omit_channel_dim:
        image = image[..., None]

    image = np.where(mask[..., None], image, 0)

    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f) + 1, math.ceil(filter_w_f) + 1
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv (non-copy)
    uv = utils3d.numpy.image_pixel_center(width=width, height=height, dtype=np.float32)
    indices = np.arange(height * width, dtype=np.int32).reshape(height, width)
    padded_uv = np.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=np.float32)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = np.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=bool)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = np.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=np.int32)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.numpy.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, axis=(0, 1))
    windowed_mask = utils3d.numpy.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, axis=(-2, -1))
    windowed_indices = utils3d.numpy.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, axis=(0, 1))

    # Gather the target pixels's local window
    target_center = utils3d.numpy.image_uv(width=target_width, height=target_height, dtype=np.float32) * np.array([width, height], dtype=np.float32)
    target_lefttop = target_center - np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_bottomright = target_center + np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_window = np.floor(target_lefttop).astype(np.int32) + np.array((padding_w, padding_h), dtype=np.int32)

    target_window_centers = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)

    # Compute pixel area in the local windows
    target_window_lefttop = np.maximum(target_window_centers - 0.5, target_lefttop[..., None])
    target_window_bottomright = np.minimum(target_window_centers + 0.5, target_bottomright[..., None])
    target_window_area = (target_window_bottomright - target_window_lefttop).clip(0, None)
    target_window_area = np.where(target_window_mask, target_window_area[..., 0, :] * target_window_area[..., 1, :], 0)

    # Weighted sum by area
    target_window_image = image.reshape(*image.shape[:-3], height * width, -1)[..., target_window_indices, :].swapaxes(-2, -1)
    target_mask = np.sum(target_window_area, axis=-1) >= 0.25
    target_image = weighted_mean_numpy(target_window_image, target_window_area[..., None, :], axis=-1)

    if omit_channel_dim:
        target_image = target_image[..., 0]

    return target_image, target_mask


def norm3d(x: np.ndarray) -> np.ndarray:
    "Faster `np.linalg.norm(x, axis=-1)` for 3D vectors"
    return np.sqrt(np.square(x[..., 0]) + np.square(x[..., 1]) + np.square(x[..., 2]))


def depth_occlusion_edge_numpy(depth: np.ndarray, mask: np.ndarray, kernel_size: int = 3, tol: float = 0.1):
    disp = np.where(mask, 1 / depth, 0)
    disp_pad = np.pad(disp, (kernel_size // 2, kernel_size // 2), constant_values=0)
    mask_pad = np.pad(mask, (kernel_size // 2, kernel_size // 2), constant_values=False)
    disp_window = utils3d.numpy.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, axis=(-2, -1))  # [..., H, W, kernel_size ** 2]
    mask_window = utils3d.numpy.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, axis=(-2, -1))  # [..., H, W, kernel_size ** 2]

    disp_mean = weighted_mean_numpy(disp_window, mask_window, axis=(-2, -1))
    fg_edge_mask = mask & (disp > (1 + tol) * disp_mean)
    bg_edge_mask = mask & (disp_mean > (1 + tol) * disp)
    return fg_edge_mask, bg_edge_mask


def disk_kernel(radius: int) -> np.ndarray:
    """
    Generate disk kernel with given radius.

    Args:
        radius (int): Radius of the disk (in pixels).

    Returns:
        np.ndarray: (2*radius+1, 2*radius+1) normalized convolution kernel.
    """
    # Create coordinate grid centered at (0,0)
    L = np.arange(-radius, radius + 1)
    X, Y = np.meshgrid(L, L)
    # Generate disk: region inside circle with radius R is 1
    kernel = ((X**2 + Y**2) <= radius**2).astype(np.float32)
    # Normalize the kernel
    kernel /= np.sum(kernel)
    return kernel


def disk_blur(image: np.ndarray, radius: int) -> np.ndarray:
    """
    Apply disk blur to an image using FFT convolution.

    Args:
        image (np.ndarray): Input image, can be grayscale or color.
        radius (int): Blur radius (in pixels).

    Returns:
        np.ndarray: Blurred image.
    """
    if radius == 0:
        return image
    kernel = disk_kernel(radius)
    if image.ndim == 2:
        blurred = fftconvolve(image, kernel, mode='same')
    elif image.ndim == 3:
        channels = []
        for i in range(image.shape[2]):
            blurred_channel = fftconvolve(image[..., i], kernel, mode='same')
            channels.append(blurred_channel)
        blurred = np.stack(channels, axis=-1)
    else:
        raise ValueError("Image must be 2D or 3D.")
    return blurred


def depth_of_field(
    img: np.ndarray,
    disp: np.ndarray,
    focus_disp : float,
    max_blur_radius : int = 10,
) -> np.ndarray:
    """
    Apply depth of field effect to an image.

    Args:
        img (numpy.ndarray): (H, W, 3) input image.
        depth (numpy.ndarray): (H, W) depth map of the scene.
        focus_depth (float): Focus depth of the lens.
        strength (float): Strength of the depth of field effect.
        max_blur_radius (int): Maximum blur radius (in pixels).

    Returns:
        numpy.ndarray: (H, W, 3) output image with depth of field effect applied.
    """
    # Precalculate dialated depth map for each blur radius
    max_disp = np.max(disp)
    disp = disp / max_disp
    focus_disp = focus_disp / max_disp
    dilated_disp = []
    for radius in range(max_blur_radius + 1):
        dilated_disp.append(cv2.dilate(disp, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*radius+1, 2*radius+1)), iterations=1))

    # Determine the blur radius for each pixel based on the depth map
    blur_radii = np.clip(abs(disp - focus_disp) * max_blur_radius, 0, max_blur_radius).astype(np.int32)
    for radius in range(max_blur_radius + 1):
        dialted_blur_radii = np.clip(abs(dilated_disp[radius] - focus_disp) * max_blur_radius, 0, max_blur_radius).astype(np.int32)
        mask = (dialted_blur_radii >= radius) & (dialted_blur_radii >= blur_radii) & (dilated_disp[radius] > disp)
        blur_radii[mask] = dialted_blur_radii[mask]
    blur_radii = np.clip(blur_radii, 0, max_blur_radius)
    blur_radii = cv2.blur(blur_radii, (5, 5))

    # Precalculate the blured image for each blur radius
    unique_radii = np.unique(blur_radii)
    precomputed = {}
    for radius in range(max_blur_radius + 1):
        if radius not in unique_radii:
            continue
        precomputed[radius] = disk_blur(img, radius)

    # Composit the blured image for each pixel
    output = np.zeros_like(img)
    for r in unique_radii:
        mask = blur_radii == r
        output[mask] = precomputed[r][mask]

    return output
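Note (illustrative, not part of the committed file): a small sketch of using recover_focal_shift_numpy above on a synthetic point map. The array sizes and random values are assumptions; the FoV conversion follows the convention of point_map_to_depth_legacy_numpy in the same file (focal relative to the half diagonal).

import numpy as np

H, W = 48, 64
points = np.random.rand(H, W, 3).astype(np.float32)   # stand-in for a predicted point map
points[..., 2] += 1.0                                  # keep z away from zero

focal, shift = recover_focal_shift_numpy(points, mask=None)
depth = points[..., 2] + shift                         # z plus recovered shift gives depth
diagonal = np.hypot(W, H)
fov_x = 2 * np.arctan(W / diagonal / focal)            # horizontal FoV in radians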
models/SpaTrackV2/models/tracker3D/spatrack_modules/geometry_torch.py
ADDED
@@ -0,0 +1,323 @@
from typing import *
import math
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.types
import utils3d

from .tools import timeit
from .geometry_numpy import solve_optimal_focal_shift, solve_optimal_shift


def weighted_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    if w is None:
        return x.mean(dim=dim, keepdim=keepdim)
    else:
        w = w.to(x.dtype)
        return (x * w).mean(dim=dim, keepdim=keepdim) / w.mean(dim=dim, keepdim=keepdim).add(eps)


def harmonic_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    if w is None:
        return x.add(eps).reciprocal().mean(dim=dim, keepdim=keepdim).reciprocal()
    else:
        w = w.to(x.dtype)
        return weighted_mean(x.add(eps).reciprocal(), w, dim=dim, keepdim=keepdim, eps=eps).add(eps).reciprocal()


def geometric_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    if w is None:
        return x.add(eps).log().mean(dim=dim).exp()
    else:
        w = w.to(x.dtype)
        return weighted_mean(x.add(eps).log(), w, dim=dim, keepdim=keepdim, eps=eps).exp()


def normalized_view_plane_uv(width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None) -> torch.Tensor:
    "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
    if aspect_ratio is None:
        aspect_ratio = width / height

    span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
    span_y = 1 / (1 + aspect_ratio ** 2) ** 0.5

    u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device)
    v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device)
    u, v = torch.meshgrid(u, v, indexing='xy')
    uv = torch.stack([u, v], dim=-1)
    return uv


def gaussian_blur_2d(input: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
    kernel = torch.exp(-(torch.arange(-kernel_size // 2 + 1, kernel_size // 2 + 1, dtype=input.dtype, device=input.device) ** 2) / (2 * sigma ** 2))
    kernel = kernel / kernel.sum()
    kernel = (kernel[:, None] * kernel[None, :]).reshape(1, 1, kernel_size, kernel_size)
    input = F.pad(input, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), mode='replicate')
    input = F.conv2d(input, kernel, groups=input.shape[1])
    return input


def focal_to_fov(focal: torch.Tensor):
    return 2 * torch.atan(0.5 / focal)


def fov_to_focal(fov: torch.Tensor):
    return 0.5 / torch.tan(fov / 2)


def angle_diff_vec3(v1: torch.Tensor, v2: torch.Tensor, eps: float = 1e-12):
    return torch.atan2(torch.cross(v1, v2, dim=-1).norm(dim=-1) + eps, (v1 * v2).sum(dim=-1))

def intrinsics_to_fov(intrinsics: torch.Tensor):
    """
    Returns field of view in radians from normalized intrinsics matrix.
    ### Parameters:
    - intrinsics: torch.Tensor of shape (..., 3, 3)

    ### Returns:
    - fov_x: torch.Tensor of shape (...)
    - fov_y: torch.Tensor of shape (...)
    """
    focal_x = intrinsics[..., 0, 0]
    focal_y = intrinsics[..., 1, 1]
    return 2 * torch.atan(0.5 / focal_x), 2 * torch.atan(0.5 / focal_y)


def point_map_to_depth_legacy(points: torch.Tensor):
    height, width = points.shape[-3:-1]
    diagonal = (height ** 2 + width ** 2) ** 0.5
    uv = normalized_view_plane_uv(width, height, dtype=points.dtype, device=points.device)  # (H, W, 2)

    # Solve least squares problem
    b = (uv * points[..., 2:]).flatten(-3, -1)  # (..., H * W * 2)
    A = torch.stack([points[..., :2], -uv.expand_as(points[..., :2])], dim=-1).flatten(-4, -2)  # (..., H * W * 2, 2)

    M = A.transpose(-2, -1) @ A
    solution = (torch.inverse(M + 1e-6 * torch.eye(2).to(A)) @ (A.transpose(-2, -1) @ b[..., None])).squeeze(-1)
    focal, shift = solution.unbind(-1)

    depth = points[..., 2] + shift[..., None, None]
    fov_x = torch.atan(width / diagonal / focal) * 2
    fov_y = torch.atan(height / diagonal / focal) * 2
    return depth, fov_x, fov_y, shift


def view_plane_uv_to_focal(uv: torch.Tensor):
    normed_uv = normalized_view_plane_uv(width=uv.shape[-2], height=uv.shape[-3], device=uv.device, dtype=uv.dtype)
    focal = (uv * normed_uv).sum() / uv.square().sum().add(1e-12)
    return focal


def recover_focal_shift(points: torch.Tensor, mask: torch.Tensor = None, focal: torch.Tensor = None, downsample_size: Tuple[int, int] = (64, 64)):
    """
    Recover the depth map and FoV from a point map with unknown z shift and focal.

    Note that it assumes:
    - the optical center is at the center of the map
    - the map is undistorted
    - the map is isometric in the x and y directions

    ### Parameters:
    - `points: torch.Tensor` of shape (..., H, W, 3)
    - `downsample_size: Tuple[int, int]` in (height, width), the size of the downsampled map. Downsampling produces approximate solution and is efficient for large maps.

    ### Returns:
    - `focal`: torch.Tensor of shape (...) the estimated focal length, relative to the half diagonal of the map
    - `shift`: torch.Tensor of shape (...) Z-axis shift to translate the point map to camera space
    """
    shape = points.shape
    height, width = points.shape[-3], points.shape[-2]
    diagonal = (height ** 2 + width ** 2) ** 0.5

    points = points.reshape(-1, *shape[-3:])
    mask = None if mask is None else mask.reshape(-1, *shape[-3:-1])
    focal = focal.reshape(-1) if focal is not None else None
    uv = normalized_view_plane_uv(width, height, dtype=points.dtype, device=points.device)  # (H, W, 2)

    points_lr = F.interpolate(points.permute(0, 3, 1, 2), downsample_size, mode='nearest').permute(0, 2, 3, 1)
    uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode='nearest').squeeze(0).permute(1, 2, 0)
    mask_lr = None if mask is None else F.interpolate(mask.to(torch.float32).unsqueeze(1), downsample_size, mode='nearest').squeeze(1) > 0

    uv_lr_np = uv_lr.cpu().numpy()
    points_lr_np = points_lr.detach().cpu().numpy()
    focal_np = focal.cpu().numpy() if focal is not None else None
    mask_lr_np = None if mask is None else mask_lr.cpu().numpy()
    optim_shift, optim_focal = [], []
    for i in range(points.shape[0]):
        points_lr_i_np = points_lr_np[i] if mask is None else points_lr_np[i][mask_lr_np[i]]
        uv_lr_i_np = uv_lr_np if mask is None else uv_lr_np[mask_lr_np[i]]
        if uv_lr_i_np.shape[0] < 2:
            optim_focal.append(1)
            optim_shift.append(0)
            continue
        if focal is None:
            optim_shift_i, optim_focal_i = solve_optimal_focal_shift(uv_lr_i_np, points_lr_i_np)
            optim_focal.append(float(optim_focal_i))
        else:
            optim_shift_i = solve_optimal_shift(uv_lr_i_np, points_lr_i_np, focal_np[i])
        optim_shift.append(float(optim_shift_i))
    optim_shift = torch.tensor(optim_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3])

    if focal is None:
        optim_focal = torch.tensor(optim_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3])
    else:
        optim_focal = focal.reshape(shape[:-3])

    return optim_focal, optim_shift


def mask_aware_nearest_resize(
    inputs: Union[torch.Tensor, Sequence[torch.Tensor], None],
    mask: torch.BoolTensor,
    size: Tuple[int, int],
    return_index: bool = False
) -> Tuple[Union[torch.Tensor, Sequence[torch.Tensor], None], torch.BoolTensor, Tuple[torch.LongTensor, ...]]:
    """
    Resize 2D map by nearest interpolation. Return the nearest neighbor index and mask of the resized map.

    ### Parameters
    - `inputs`: a single or a list of input 2D map(s) of shape (..., H, W, ...).
    - `mask`: input 2D mask of shape (..., H, W)
    - `size`: target size (target_width, target_height)

    ### Returns
    - `*resized_maps`: resized map(s) of shape (..., target_height, target_width, ...).
    - `resized_mask`: mask of the resized map of shape (..., target_height, target_width)
    - `nearest_idx`: if return_index is True, nearest neighbor index of the resized map of shape (..., target_height, target_width) for each dimension, .
    """
    height, width = mask.shape[-2:]
    target_width, target_height = size
    device = mask.device
    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f), math.ceil(filter_w_f)
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv
    uv = utils3d.torch.image_pixel_center(width=width, height=height, dtype=torch.float32, device=device)
    indices = torch.arange(height * width, dtype=torch.long, device=device).reshape(height, width)
    padded_uv = torch.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=torch.float32, device=device)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = torch.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=torch.bool, device=device)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = torch.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=torch.long, device=device)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.torch.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, dim=(0, 1))
    windowed_mask = utils3d.torch.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, dim=(-2, -1))
    windowed_indices = utils3d.torch.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, dim=(0, 1))

    # Gather the target pixels's local window
    target_uv = utils3d.torch.image_uv(width=target_width, height=target_height, dtype=torch.float32, device=device) * torch.tensor([width, height], dtype=torch.float32, device=device)
    target_lefttop = target_uv - torch.tensor((filter_w_f / 2, filter_h_f / 2), dtype=torch.float32, device=device)
    target_window = torch.round(target_lefttop).long() + torch.tensor((padding_w, padding_h), dtype=torch.long, device=device)

    target_window_uv = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)
    target_window_indices = target_window_indices.expand_as(target_window_mask)

    # Compute nearest neighbor in the local window for each pixel
    dist = torch.where(target_window_mask, torch.norm(target_window_uv - target_uv[..., None], dim=-2), torch.inf)  # (..., target_height, tgt_width, filter_size)
    nearest = torch.argmin(dist, dim=-1, keepdim=True)  # (..., target_height, tgt_width, 1)
    nearest_idx = torch.gather(target_window_indices, index=nearest, dim=-1).squeeze(-1)  # (..., target_height, tgt_width)
    target_mask = torch.any(target_window_mask, dim=-1)
    nearest_i, nearest_j = nearest_idx // width, nearest_idx % width
    batch_indices = [torch.arange(n, device=device).reshape([1] * i + [n] + [1] * (mask.dim() - i - 1)) for i, n in enumerate(mask.shape[:-2])]

    index = (*batch_indices, nearest_i, nearest_j)

    if inputs is None:
        outputs = None
    elif isinstance(inputs, torch.Tensor):
        outputs = inputs[index]
    elif isinstance(inputs, Sequence):
        outputs = tuple(x[index] for x in inputs)
    else:
        raise ValueError(f'Invalid input type: {type(inputs)}')

    if return_index:
        return outputs, target_mask, index
    else:
        return outputs, target_mask


def theshold_depth_change(depth: torch.Tensor, mask: torch.Tensor, pooler: Literal['min', 'max'], rtol: float = 0.2, kernel_size: int = 3):
    *batch_shape, height, width = depth.shape
    depth = depth.reshape(-1, 1, height, width)
    mask = mask.reshape(-1, 1, height, width)
    if pooler =='max':
        pooled_depth = F.max_pool2d(torch.where(mask, depth, -torch.inf), kernel_size, stride=1, padding=kernel_size // 2)
        output_mask = pooled_depth > depth * (1 + rtol)
    elif pooler =='min':
        pooled_depth = -F.max_pool2d(-torch.where(mask, depth, torch.inf), kernel_size, stride=1, padding=kernel_size // 2)
        output_mask = pooled_depth < depth * (1 - rtol)
    else:
        raise ValueError(f'Unsupported pooler: {pooler}')
    output_mask = output_mask.reshape(*batch_shape, height, width)
    return output_mask


def depth_occlusion_edge(depth: torch.FloatTensor, mask: torch.BoolTensor, kernel_size: int = 3, tol: float = 0.1):
    device, dtype = depth.device, depth.dtype

    disp = torch.where(mask, 1 / depth, 0)
    disp_pad = F.pad(disp, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=0)
    mask_pad = F.pad(mask, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=False)
    disp_window = utils3d.torch.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, dim=(-2, -1)).flatten(-2)  # [..., H, W, kernel_size ** 2]
    mask_window = utils3d.torch.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, dim=(-2, -1)).flatten(-2)  # [..., H, W, kernel_size ** 2]

    x = torch.linspace(-kernel_size // 2, kernel_size // 2, kernel_size, device=device, dtype=dtype)
    A = torch.stack([*torch.meshgrid(x, x, indexing='xy'), torch.ones((kernel_size, kernel_size), device=device, dtype=dtype)], dim=-1).reshape(kernel_size ** 2, 3)  # [kernel_size ** 2, 3]
    A = mask_window[..., None] * A
    I = torch.eye(3, device=device, dtype=dtype)

    affine_disp_window = (disp_window[..., None, :] @ A @ torch.inverse(A.mT @ A + 1e-5 * I) @ A.mT).clamp_min(1e-12)[..., 0, :]  # [..., H, W, kernel_size ** 2]
    diff = torch.where(mask_window, torch.maximum(affine_disp_window, disp_window) / torch.minimum(affine_disp_window, disp_window) - 1, 0)

    edge_mask = mask & (diff > tol).any(dim=-1)

    disp_mean = weighted_mean(disp_window, mask_window, dim=-1)
    fg_edge_mask = edge_mask & (disp > disp_mean)
    # fg_edge_mask = edge_mask & theshold_depth_change(depth, mask, pooler='max', rtol=tol, kernel_size=kernel_size)
    bg_edge_mask = edge_mask & ~fg_edge_mask
    return fg_edge_mask, bg_edge_mask


def depth_occlusion_edge(depth: torch.FloatTensor, mask: torch.BoolTensor, kernel_size: int = 3, tol: float = 0.1):
    device, dtype = depth.device, depth.dtype

    disp = torch.where(mask, 1 / depth, 0)
    disp_pad = F.pad(disp, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=0)
    mask_pad = F.pad(mask, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=False)
    disp_window = utils3d.torch.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, dim=(-2, -1))  # [..., H, W, kernel_size ** 2]
    mask_window = utils3d.torch.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, dim=(-2, -1))  # [..., H, W, kernel_size ** 2]

    disp_mean = weighted_mean(disp_window, mask_window, dim=(-2, -1))
    fg_edge_mask = mask & (disp / disp_mean > 1 + tol)
    bg_edge_mask = mask & (disp_mean / disp > 1 + tol)

    fg_edge_mask = fg_edge_mask & F.max_pool2d(bg_edge_mask.float(), kernel_size + 2, stride=1, padding=kernel_size // 2 + 1).bool()
    bg_edge_mask = bg_edge_mask & F.max_pool2d(fg_edge_mask.float(), kernel_size + 2, stride=1, padding=kernel_size // 2 + 1).bool()

    return fg_edge_mask, bg_edge_mask


def dilate_with_mask(input: torch.Tensor, mask: torch.BoolTensor, filter: Literal['min', 'max', 'mean', 'median'] = 'mean', iterations: int = 1) -> torch.Tensor:
    kernel = torch.tensor([[False, True, False], [True, True, True], [False, True, False]], device=input.device, dtype=torch.bool)
    for _ in range(iterations):
        input_window = utils3d.torch.sliding_window_2d(F.pad(input, (1, 1, 1, 1), mode='constant', value=0), window_size=3, stride=1, dim=(-2, -1))
        mask_window = kernel & utils3d.torch.sliding_window_2d(F.pad(mask, (1, 1, 1, 1), mode='constant', value=False), window_size=3, stride=1, dim=(-2, -1))
        if filter =='min':
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.inf).min(dim=(-2, -1)).values)
        elif filter =='max':
            input = torch.where(mask, input, torch.where(mask_window, input_window, -torch.inf).max(dim=(-2, -1)).values)
        elif filter == 'mean':
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.nan).nanmean(dim=(-2, -1)))
        elif filter =='median':
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.nan).flatten(-2).nanmedian(dim=-1).values)
        mask = mask_window.any(dim=(-2, -1))
    return input, mask
models/SpaTrackV2/models/tracker3D/spatrack_modules/pointmap_updator.py
ADDED
|
@@ -0,0 +1,104 @@
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from models.SpaTrackV2.models.blocks import bilinear_sampler
|
| 4 |
+
from models.SpaTrackV2.models.tracker3D.spatrack_modules.alignment import align_points_scale, align_points_scale_xyz_shift
|
| 5 |
+
|
| 6 |
+
def compute_affine_scale_and_shift(points, pointmap, mask, weights=None, eps=1e-6):
|
| 7 |
+
"""
|
| 8 |
+
Compute global affine transform (scale * pointmap + shift = points)
|
| 9 |
+
using least-squares fitting with optional weights and mask.
|
| 10 |
+
|
| 11 |
+
Args:
|
| 12 |
+
points (BT, N, 3): Target points
|
| 13 |
+
pointmap (BT, N, 3): Source points
|
| 14 |
+
mask (BT, N): Binary mask indicating valid points
|
| 15 |
+
weights (BT, N): Optional weights per point
|
| 16 |
+
eps (float): Numerical stability
|
| 17 |
+
|
| 18 |
+
Returns:
|
| 19 |
+
scale (BT, 1): Scalar scale per batch
|
| 20 |
+
shift (BT, 3): Shift vector per batch
|
| 21 |
+
"""
|
| 22 |
+
if weights is None:
|
| 23 |
+
weights = mask.float()
|
| 24 |
+
else:
|
| 25 |
+
weights = weights * mask # combine mask
|
| 26 |
+
|
| 27 |
+
# Sum of weights
|
| 28 |
+
weight_sum = weights.sum(dim=1, keepdim=True) + eps # (BT, 1)
|
| 29 |
+
|
| 30 |
+
# Compute weighted centroids
|
| 31 |
+
centroid_p = (points * weights.unsqueeze(-1)).sum(dim=1) / weight_sum # (BT, 3)
|
| 32 |
+
centroid_m = (pointmap * weights.unsqueeze(-1)).sum(dim=1) / weight_sum # (BT, 3)
|
| 33 |
+
|
| 34 |
+
# Center the point sets
|
| 35 |
+
p_centered = points - centroid_p.unsqueeze(1) # (BT, N, 3)
|
| 36 |
+
m_centered = pointmap - centroid_m.unsqueeze(1) # (BT, N, 3)
|
| 37 |
+
|
| 38 |
+
# Compute scale: ratio of dot products
|
| 39 |
+
numerator = (weights.unsqueeze(-1) * (p_centered * m_centered)).sum(dim=1).sum(dim=-1) # (BT,)
|
| 40 |
+
denominator = (weights.unsqueeze(-1) * (m_centered ** 2)).sum(dim=1).sum(dim=-1) + eps # (BT,)
|
| 41 |
+
scale = (numerator / denominator).unsqueeze(-1) # (BT, 1)
|
| 42 |
+
|
| 43 |
+
# Compute shift: t = c_p - s * c_m
|
| 44 |
+
shift = centroid_p - scale * centroid_m # (BT, 3)
|
| 45 |
+
|
| 46 |
+
return scale, shift
|
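A minimal sketch (not part of the commit) of the least-squares fit described in the docstring: a known scale and shift applied to a random point map should be recovered. The shapes follow the (BT, N, 3) convention documented above.
import torch
BT, N = 2, 100
pointmap = torch.randn(BT, N, 3)
points = 2.5 * pointmap + torch.tensor([0.1, -0.2, 0.3])   # ground truth: s = 2.5
mask = torch.ones(BT, N)
scale, shift = compute_affine_scale_and_shift(points, pointmap, mask)
# scale is (BT, 1) and should be close to 2.5; shift is (BT, 3)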
| 47 |
+
|
| 48 |
+
def compute_weighted_std(track2d, vis_est, eps=1e-6):
|
| 49 |
+
"""
|
| 50 |
+
Compute the weighted standard deviation of 2D tracks across time.
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
track2d (Tensor): shape (B, T, N, 2), 2D tracked points.
|
| 54 |
+
vis_est (Tensor): shape (B, T, N), visibility weights (0~1).
|
| 55 |
+
eps (float): small epsilon to avoid division by zero.
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
std (Tensor): shape (B, N, 2), weighted standard deviation for each point.
|
| 59 |
+
"""
|
| 60 |
+
B, T, N, _ = track2d.shape
|
| 61 |
+
|
| 62 |
+
# Compute weighted mean
|
| 63 |
+
weighted_sum = (track2d * vis_est[..., None]).sum(dim=1) # (B, N, 2)
|
| 64 |
+
weight_sum = vis_est.sum(dim=1)[..., None] + eps # (B, N, 1)
|
| 65 |
+
track_mean = weighted_sum / weight_sum # (B, N, 2)
|
| 66 |
+
|
| 67 |
+
# Compute squared residuals
|
| 68 |
+
residuals = track2d - track_mean[:, None, :, :] # (B, T, N, 2)
|
| 69 |
+
weighted_sq_res = (residuals ** 2) * vis_est[..., None] # (B, T, N, 2)
|
| 70 |
+
|
| 71 |
+
# Compute weighted variance and std
|
| 72 |
+
var = weighted_sq_res.sum(dim=1) / (weight_sum + eps) # (B, N, 2)
|
| 73 |
+
std = var.sqrt() # (B, N, 2)
|
| 74 |
+
|
| 75 |
+
return std
|
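A short usage sketch with assumed toy shapes (not part of the commit): the per-track spread of 2D positions, weighted by visibility, which is later used to normalise the reprojection loss.
import torch
B, T, N = 1, 8, 16
track2d = torch.randn(B, T, N, 2)
vis_est = torch.rand(B, T, N)                 # soft visibility in [0, 1]
std = compute_weighted_std(track2d, vis_est)  # (B, N, 2)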
| 76 |
+
|
| 77 |
+
class PointMapUpdator(nn.Module):
|
| 78 |
+
def __init__(self, stablizer):
|
| 79 |
+
super(PointMapUpdator, self).__init__()
|
| 80 |
+
self.stablizer = stablizer()
|
| 81 |
+
|
| 82 |
+
def init_pointmap(self, points_map):
|
| 83 |
+
|
| 84 |
+
pass
|
| 85 |
+
|
| 86 |
+
def scale_update_from_tracks(self, cam_pts_est, coords_append, point_map_org, vis_est, reproj_loss):
|
| 87 |
+
B, T, N, _ = coords_append.shape
|
| 88 |
+
track2d = coords_append[...,:2].view(B*T, N, 2)
|
| 89 |
+
|
| 90 |
+
track_len_std = compute_weighted_std(track2d.view(B, T, N, 2), vis_est.view(B, T, N)).norm(dim=-1)
|
| 91 |
+
|
| 92 |
+
point_samp = bilinear_sampler(point_map_org, track2d[:,None], mode="nearest")
|
| 93 |
+
point_samp = point_samp.permute(0,3,1,2).view(B*T, N, 3)
|
| 94 |
+
cam_pts_est = cam_pts_est.view(B*T, N, 3)
|
| 95 |
+
# mask
|
| 96 |
+
mask = vis_est.view(B*T, N)
|
| 97 |
+
# using gaussian weights, mean is 2 pixels
|
| 98 |
+
nm_reproj_loss = (reproj_loss.view(B*T, N) / (track_len_std.view(B, N) + 1e-6)).clamp(0, 5)
|
| 99 |
+
std = nm_reproj_loss.std(dim=-1).view(B*T, 1) # B*T 1
|
| 100 |
+
weights = torch.exp(-(0.5-nm_reproj_loss.view(B*T, N))**2 / (2*std**2))
|
| 101 |
+
mask = mask*(point_samp[...,2]>0)*(cam_pts_est[...,2]>0)*weights
|
| 102 |
+
scales, shift = align_points_scale_xyz_shift(point_samp, cam_pts_est, mask)
|
| 103 |
+
|
| 104 |
+
return scales, shift
|
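For orientation, a hedged sketch of how the scale and shift returned by scale_update_from_tracks could be applied to rescale a point map. The shapes below are assumptions for illustration only; the exact alignment convention is defined by align_points_scale_xyz_shift.
import torch
scales = torch.rand(4, 1)               # assumed (B*T, 1)
shift = torch.randn(4, 3)               # assumed (B*T, 3)
point_map = torch.randn(4, 1000, 3)     # (B*T, N, 3)
point_map_aligned = scales[:, None] * point_map + shift[:, None, :]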
models/SpaTrackV2/models/tracker3D/spatrack_modules/simple_vit_1d.py
ADDED
|
@@ -0,0 +1,125 @@
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
|
| 4 |
+
from einops import rearrange
|
| 5 |
+
from einops.layers.torch import Rearrange
|
| 6 |
+
|
| 7 |
+
# helpers
|
| 8 |
+
|
| 9 |
+
def posemb_sincos_1d(patches, temperature = 10000, dtype = torch.float32):
|
| 10 |
+
_, n, dim, device, dtype = *patches.shape, patches.device, patches.dtype
|
| 11 |
+
|
| 12 |
+
n = torch.arange(n, device = device)
|
| 13 |
+
assert (dim % 2) == 0, 'feature dimension must be multiple of 2 for sincos emb'
|
| 14 |
+
omega = torch.arange(dim // 2, device = device) / (dim // 2 - 1)
|
| 15 |
+
omega = 1. / (temperature ** omega)
|
| 16 |
+
|
| 17 |
+
n = n.flatten()[:, None] * omega[None, :]
|
| 18 |
+
pe = torch.cat((n.sin(), n.cos()), dim = 1)
|
| 19 |
+
return pe.type(dtype)
|
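A quick sketch (not part of the commit) showing that the embedding depends only on the token count and the (even) feature dimension of its input.
import torch
patches = torch.zeros(2, 16, 128)   # (batch, n, dim); dim must be even
pe = posemb_sincos_1d(patches)      # (16, 128); added to the tokens in SimpleViT.forward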
| 20 |
+
|
| 21 |
+
# classes
|
| 22 |
+
|
| 23 |
+
class FeedForward(nn.Module):
|
| 24 |
+
def __init__(self, dim, hidden_dim):
|
| 25 |
+
super().__init__()
|
| 26 |
+
self.net = nn.Sequential(
|
| 27 |
+
nn.LayerNorm(dim),
|
| 28 |
+
nn.Linear(dim, hidden_dim),
|
| 29 |
+
nn.GELU(),
|
| 30 |
+
nn.Linear(hidden_dim, dim),
|
| 31 |
+
)
|
| 32 |
+
def forward(self, x):
|
| 33 |
+
return self.net(x)
|
| 34 |
+
|
| 35 |
+
class Attention(nn.Module):
|
| 36 |
+
def __init__(self, dim, heads = 8, dim_head = 64):
|
| 37 |
+
super().__init__()
|
| 38 |
+
inner_dim = dim_head * heads
|
| 39 |
+
self.heads = heads
|
| 40 |
+
self.scale = dim_head ** -0.5
|
| 41 |
+
self.norm = nn.LayerNorm(dim)
|
| 42 |
+
|
| 43 |
+
self.attend = nn.Softmax(dim = -1)
|
| 44 |
+
|
| 45 |
+
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
|
| 46 |
+
self.to_out = nn.Linear(inner_dim, dim, bias = False)
|
| 47 |
+
|
| 48 |
+
def forward(self, x):
|
| 49 |
+
x = self.norm(x)
|
| 50 |
+
|
| 51 |
+
qkv = self.to_qkv(x).chunk(3, dim = -1)
|
| 52 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)
|
| 53 |
+
|
| 54 |
+
dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
|
| 55 |
+
|
| 56 |
+
attn = self.attend(dots)
|
| 57 |
+
|
| 58 |
+
out = torch.matmul(attn, v)
|
| 59 |
+
out = rearrange(out, 'b h n d -> b n (h d)')
|
| 60 |
+
return self.to_out(out)
|
| 61 |
+
|
| 62 |
+
class Transformer(nn.Module):
|
| 63 |
+
def __init__(self, dim, depth, heads, dim_head, mlp_dim):
|
| 64 |
+
super().__init__()
|
| 65 |
+
self.norm = nn.LayerNorm(dim)
|
| 66 |
+
self.layers = nn.ModuleList([])
|
| 67 |
+
for _ in range(depth):
|
| 68 |
+
self.layers.append(nn.ModuleList([
|
| 69 |
+
Attention(dim, heads = heads, dim_head = dim_head),
|
| 70 |
+
FeedForward(dim, mlp_dim)
|
| 71 |
+
]))
|
| 72 |
+
def forward(self, x):
|
| 73 |
+
for attn, ff in self.layers:
|
| 74 |
+
x = attn(x) + x
|
| 75 |
+
x = ff(x) + x
|
| 76 |
+
return self.norm(x)
|
| 77 |
+
|
| 78 |
+
class SimpleViT(nn.Module):
|
| 79 |
+
def __init__(self, *, seq_len, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
|
| 80 |
+
super().__init__()
|
| 81 |
+
|
| 82 |
+
assert seq_len % patch_size == 0
|
| 83 |
+
|
| 84 |
+
num_patches = seq_len // patch_size
|
| 85 |
+
patch_dim = channels * patch_size
|
| 86 |
+
|
| 87 |
+
self.to_patch_embedding = nn.Sequential(
|
| 88 |
+
Rearrange('b c (n p) -> b n (p c)', p = patch_size),
|
| 89 |
+
nn.LayerNorm(patch_dim),
|
| 90 |
+
nn.Linear(patch_dim, dim),
|
| 91 |
+
nn.LayerNorm(dim),
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
|
| 95 |
+
|
| 96 |
+
self.to_latent = nn.Identity()
|
| 97 |
+
self.linear_head = nn.Linear(dim, num_classes)
|
| 98 |
+
|
| 99 |
+
def forward(self, series):
|
| 100 |
+
*_, n, dtype = *series.shape, series.dtype
|
| 101 |
+
|
| 102 |
+
x = self.to_patch_embedding(series)
|
| 103 |
+
pe = posemb_sincos_1d(x)
|
| 104 |
+
x = rearrange(x, 'b ... d -> b (...) d') + pe
|
| 105 |
+
|
| 106 |
+
x = self.transformer(x)
|
| 107 |
+
x = x.mean(dim = 1)
|
| 108 |
+
|
| 109 |
+
x = self.to_latent(x)
|
| 110 |
+
return self.linear_head(x)
|
| 111 |
+
|
| 112 |
+
if __name__ == '__main__':
|
| 113 |
+
|
| 114 |
+
v = SimpleViT(
|
| 115 |
+
seq_len = 256,
|
| 116 |
+
patch_size = 16,
|
| 117 |
+
num_classes = 1000,
|
| 118 |
+
dim = 1024,
|
| 119 |
+
depth = 6,
|
| 120 |
+
heads = 8,
|
| 121 |
+
mlp_dim = 2048
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
time_series = torch.randn(4, 3, 256)
|
| 125 |
+
logits = v(time_series) # (4, 1000)
|
models/SpaTrackV2/models/tracker3D/spatrack_modules/tools.py
ADDED
|
@@ -0,0 +1,289 @@
|
| 1 |
+
from typing import *
|
| 2 |
+
import time
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from numbers import Number
|
| 5 |
+
from functools import wraps
|
| 6 |
+
import warnings
|
| 7 |
+
import math
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import importlib
|
| 11 |
+
import importlib.util
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def catch_exception(fn):
|
| 15 |
+
@wraps(fn)
|
| 16 |
+
def wrapper(*args, **kwargs):
|
| 17 |
+
try:
|
| 18 |
+
return fn(*args, **kwargs)
|
| 19 |
+
except Exception as e:
|
| 20 |
+
import traceback
|
| 21 |
+
print(f"Exception in {fn.__name__}", end='r')
|
| 22 |
+
# print({', '.join(repr(arg) for arg in args)}, {', '.join(f'{k}={v!r}' for k, v in kwargs.items())})
|
| 23 |
+
traceback.print_exc(chain=False)
|
| 24 |
+
time.sleep(0.1)
|
| 25 |
+
return None
|
| 26 |
+
return wrapper
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class CallbackOnException:
|
| 30 |
+
def __init__(self, callback: Callable, exception: type):
|
| 31 |
+
self.exception = exception
|
| 32 |
+
self.callback = callback
|
| 33 |
+
|
| 34 |
+
def __enter__(self):
|
| 35 |
+
return self
|
| 36 |
+
|
| 37 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 38 |
+
if isinstance(exc_val, self.exception):
|
| 39 |
+
self.callback()
|
| 40 |
+
return True
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
def traverse_nested_dict_keys(d: Dict[str, Dict]) -> Generator[Tuple[str, ...], None, None]:
|
| 44 |
+
for k, v in d.items():
|
| 45 |
+
if isinstance(v, dict):
|
| 46 |
+
for sub_key in traverse_nested_dict_keys(v):
|
| 47 |
+
yield (k, ) + sub_key
|
| 48 |
+
else:
|
| 49 |
+
yield (k, )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def get_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], default: Any = None):
|
| 53 |
+
for k in keys:
|
| 54 |
+
d = d.get(k, default)
|
| 55 |
+
if d is None:
|
| 56 |
+
break
|
| 57 |
+
return d
|
| 58 |
+
|
| 59 |
+
def set_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], value: Any):
|
| 60 |
+
for k in keys[:-1]:
|
| 61 |
+
d = d.setdefault(k, {})
|
| 62 |
+
d[keys[-1]] = value
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def key_average(list_of_dicts: list) -> Dict[str, Any]:
|
| 66 |
+
"""
|
| 67 |
+
Returns a dictionary with the average value of each key in the input list of dictionaries.
|
| 68 |
+
"""
|
| 69 |
+
_nested_dict_keys = set()
|
| 70 |
+
for d in list_of_dicts:
|
| 71 |
+
_nested_dict_keys.update(traverse_nested_dict_keys(d))
|
| 72 |
+
_nested_dict_keys = sorted(_nested_dict_keys)
|
| 73 |
+
result = {}
|
| 74 |
+
for k in _nested_dict_keys:
|
| 75 |
+
values = []
|
| 76 |
+
for d in list_of_dicts:
|
| 77 |
+
v = get_nested_dict(d, k)
|
| 78 |
+
if v is not None and not math.isnan(v):
|
| 79 |
+
values.append(v)
|
| 80 |
+
avg = sum(values) / len(values) if values else float('nan')
|
| 81 |
+
set_nested_dict(result, k, avg)
|
| 82 |
+
return result
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def flatten_nested_dict(d: Dict[str, Any], parent_key: Tuple[str, ...] = None) -> Dict[Tuple[str, ...], Any]:
|
| 86 |
+
"""
|
| 87 |
+
Flattens a nested dictionary into a single-level dictionary, with keys as tuples.
|
| 88 |
+
"""
|
| 89 |
+
items = []
|
| 90 |
+
if parent_key is None:
|
| 91 |
+
parent_key = ()
|
| 92 |
+
for k, v in d.items():
|
| 93 |
+
new_key = parent_key + (k, )
|
| 94 |
+
if isinstance(v, MutableMapping):
|
| 95 |
+
items.extend(flatten_nested_dict(v, new_key).items())
|
| 96 |
+
else:
|
| 97 |
+
items.append((new_key, v))
|
| 98 |
+
return dict(items)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def unflatten_nested_dict(d: Dict[str, Any]) -> Dict[str, Any]:
|
| 102 |
+
"""
|
| 103 |
+
Unflattens a single-level dictionary into a nested dictionary, with keys as tuples.
|
| 104 |
+
"""
|
| 105 |
+
result = {}
|
| 106 |
+
for k, v in d.items():
|
| 107 |
+
sub_dict = result
|
| 108 |
+
for k_ in k[:-1]:
|
| 109 |
+
if k_ not in sub_dict:
|
| 110 |
+
sub_dict[k_] = {}
|
| 111 |
+
sub_dict = sub_dict[k_]
|
| 112 |
+
sub_dict[k[-1]] = v
|
| 113 |
+
return result
|
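A small round-trip example (not part of the commit) for the two helpers above:
nested = {'a': {'b': 1, 'c': 2}, 'd': 3}
flat = flatten_nested_dict(nested)   # {('a', 'b'): 1, ('a', 'c'): 2, ('d',): 3}
assert unflatten_nested_dict(flat) == nested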
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def read_jsonl(file):
|
| 117 |
+
import json
|
| 118 |
+
with open(file, 'r') as f:
|
| 119 |
+
data = f.readlines()
|
| 120 |
+
return [json.loads(line) for line in data]
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def write_jsonl(data: List[dict], file):
|
| 124 |
+
import json
|
| 125 |
+
with open(file, 'w') as f:
|
| 126 |
+
for item in data:
|
| 127 |
+
f.write(json.dumps(item) + '\n')
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def to_hierachical_dataframe(data: List[Dict[Tuple[str, ...], Any]]):
|
| 131 |
+
import pandas as pd
|
| 132 |
+
data = [flatten_nested_dict(d) for d in data]
|
| 133 |
+
df = pd.DataFrame(data)
|
| 134 |
+
df = df.sort_index(axis=1)
|
| 135 |
+
df.columns = pd.MultiIndex.from_tuples(df.columns)
|
| 136 |
+
return df
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def recursive_replace(d: Union[List, Dict, str], mapping: Dict[str, str]):
|
| 140 |
+
if isinstance(d, str):
|
| 141 |
+
for old, new in mapping.items():
|
| 142 |
+
d = d.replace(old, new)
|
| 143 |
+
elif isinstance(d, list):
|
| 144 |
+
for i, item in enumerate(d):
|
| 145 |
+
d[i] = recursive_replace(item, mapping)
|
| 146 |
+
elif isinstance(d, dict):
|
| 147 |
+
for k, v in d.items():
|
| 148 |
+
d[k] = recursive_replace(v, mapping)
|
| 149 |
+
return d
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class timeit:
|
| 153 |
+
_history: Dict[str, List['timeit']] = {}
|
| 154 |
+
|
| 155 |
+
def __init__(self, name: str = None, verbose: bool = True, average: bool = False):
|
| 156 |
+
self.name = name
|
| 157 |
+
self.verbose = verbose
|
| 158 |
+
self.start = None
|
| 159 |
+
self.end = None
|
| 160 |
+
self.average = average
|
| 161 |
+
if average and name not in timeit._history:
|
| 162 |
+
timeit._history[name] = []
|
| 163 |
+
|
| 164 |
+
def __call__(self, func: Callable):
|
| 165 |
+
import inspect
|
| 166 |
+
if inspect.iscoroutinefunction(func):
|
| 167 |
+
async def wrapper(*args, **kwargs):
|
| 168 |
+
with timeit(self.name or func.__qualname__):
|
| 169 |
+
ret = await func(*args, **kwargs)
|
| 170 |
+
return ret
|
| 171 |
+
return wrapper
|
| 172 |
+
else:
|
| 173 |
+
def wrapper(*args, **kwargs):
|
| 174 |
+
with timeit(self.name or func.__qualname__):
|
| 175 |
+
ret = func(*args, **kwargs)
|
| 176 |
+
return ret
|
| 177 |
+
return wrapper
|
| 178 |
+
|
| 179 |
+
def __enter__(self):
|
| 180 |
+
self.start = time.time()
|
| 181 |
+
return self
|
| 182 |
+
|
| 183 |
+
@property
|
| 184 |
+
def time(self) -> float:
|
| 185 |
+
assert self.start is not None, "Time not yet started."
|
| 186 |
+
assert self.end is not None, "Time not yet ended."
|
| 187 |
+
return self.end - self.start
|
| 188 |
+
|
| 189 |
+
@property
|
| 190 |
+
def average_time(self) -> float:
|
| 191 |
+
assert self.average, "Average time not available."
|
| 192 |
+
return sum(t.time for t in timeit._history[self.name]) / len(timeit._history[self.name])
|
| 193 |
+
|
| 194 |
+
@property
|
| 195 |
+
def history(self) -> List['timeit']:
|
| 196 |
+
return timeit._history.get(self.name, [])
|
| 197 |
+
|
| 198 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 199 |
+
self.end = time.time()
|
| 200 |
+
if self.average:
|
| 201 |
+
timeit._history[self.name].append(self)
|
| 202 |
+
if self.verbose:
|
| 203 |
+
if self.average:
|
| 204 |
+
avg = self.average_time
|
| 205 |
+
print(f"{self.name or 'It'} took {avg:.6f} seconds in average.")
|
| 206 |
+
else:
|
| 207 |
+
print(f"{self.name or 'It'} took {self.time:.6f} seconds.")
|
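A usage sketch (not part of the commit): timeit works both as a context manager and as a decorator; with average=True, repeated runs under the same name are pooled.
with timeit("toy block"):
    sum(range(1000))            # prints "toy block took ... seconds."

@timeit("toy function")
def f():
    return sum(range(1000))

f()                             # prints "toy function took ... seconds."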
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def strip_common_prefix_suffix(strings: List[str]) -> List[str]:
|
| 211 |
+
first = strings[0]
|
| 212 |
+
|
| 213 |
+
for start in range(len(first)):
|
| 214 |
+
if any(s[start] != strings[0][start] for s in strings):
|
| 215 |
+
break
|
| 216 |
+
|
| 217 |
+
for end in range(1, min(len(s) for s in strings)):
|
| 218 |
+
if any(s[-end] != first[-end] for s in strings):
|
| 219 |
+
break
|
| 220 |
+
|
| 221 |
+
return [s[start:len(s) - end + 1] for s in strings]
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def multithead_execute(inputs: List[Any], num_workers: int, pbar = None):
|
| 225 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 226 |
+
from contextlib import nullcontext
|
| 227 |
+
from tqdm import tqdm
|
| 228 |
+
|
| 229 |
+
if pbar is not None:
|
| 230 |
+
pbar.total = len(inputs) if hasattr(inputs, '__len__') else None
|
| 231 |
+
else:
|
| 232 |
+
pbar = tqdm(total=len(inputs) if hasattr(inputs, '__len__') else None)
|
| 233 |
+
|
| 234 |
+
def decorator(fn: Callable):
|
| 235 |
+
with (
|
| 236 |
+
ThreadPoolExecutor(max_workers=num_workers) as executor,
|
| 237 |
+
pbar
|
| 238 |
+
):
|
| 239 |
+
pbar.refresh()
|
| 240 |
+
@catch_exception
|
| 241 |
+
@suppress_traceback
|
| 242 |
+
def _fn(input):
|
| 243 |
+
ret = fn(input)
|
| 244 |
+
pbar.update()
|
| 245 |
+
return ret
|
| 246 |
+
executor.map(_fn, inputs)
|
| 247 |
+
executor.shutdown(wait=True)
|
| 248 |
+
|
| 249 |
+
return decorator
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def suppress_traceback(fn):
|
| 253 |
+
@wraps(fn)
|
| 254 |
+
def wrapper(*args, **kwargs):
|
| 255 |
+
try:
|
| 256 |
+
return fn(*args, **kwargs)
|
| 257 |
+
except Exception as e:
|
| 258 |
+
e.__traceback__ = e.__traceback__.tb_next.tb_next
|
| 259 |
+
raise
|
| 260 |
+
return wrapper
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class no_warnings:
|
| 264 |
+
def __init__(self, action: str = 'ignore', **kwargs):
|
| 265 |
+
self.action = action
|
| 266 |
+
self.filter_kwargs = kwargs
|
| 267 |
+
|
| 268 |
+
def __call__(self, fn):
|
| 269 |
+
@wraps(fn)
|
| 270 |
+
def wrapper(*args, **kwargs):
|
| 271 |
+
with warnings.catch_warnings():
|
| 272 |
+
warnings.simplefilter(self.action, **self.filter_kwargs)
|
| 273 |
+
return fn(*args, **kwargs)
|
| 274 |
+
return wrapper
|
| 275 |
+
|
| 276 |
+
def __enter__(self):
|
| 277 |
+
self.warnings_manager = warnings.catch_warnings()
|
| 278 |
+
self.warnings_manager.__enter__()
|
| 279 |
+
warnings.simplefilter(self.action, **self.filter_kwargs)
|
| 280 |
+
|
| 281 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 282 |
+
self.warnings_manager.__exit__(exc_type, exc_val, exc_tb)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def import_file_as_module(file_path: Union[str, os.PathLike], module_name: str):
|
| 286 |
+
spec = importlib.util.spec_from_file_location(module_name, file_path)
|
| 287 |
+
module = importlib.util.module_from_spec(spec)
|
| 288 |
+
spec.loader.exec_module(module)
|
| 289 |
+
return module
|
models/SpaTrackV2/models/tracker3D/spatrack_modules/utils.py
ADDED
|
@@ -0,0 +1,1006 @@
|
| 1 |
+
import os, sys
|
| 2 |
+
import torch
|
| 3 |
+
import torch.amp
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
|
| 7 |
+
EfficientUpdateFormer, AttnBlock, Attention, CrossAttnBlock,
|
| 8 |
+
sequence_BCE_loss, sequence_loss, sequence_prob_loss, sequence_dyn_prob_loss
|
| 9 |
+
)
|
| 10 |
+
import math
|
| 11 |
+
from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
|
| 12 |
+
Mlp, BasicEncoder, EfficientUpdateFormer, GeometryEncoder, NeighborTransformer
|
| 13 |
+
)
|
| 14 |
+
import numpy as np
|
| 15 |
+
from models.SpaTrackV2.models.tracker3D.spatrack_modules.simple_vit_1d import Transformer,posemb_sincos_1d
|
| 16 |
+
from einops import rearrange
|
| 17 |
+
|
| 18 |
+
def self_grid_pos_embedding(B, T, H, W, level=None):
|
| 19 |
+
import pdb; pdb.set_trace()
|
| 20 |
+
|
| 21 |
+
def random_se3_transformation(
|
| 22 |
+
batch_size: int = 1,
|
| 23 |
+
max_rotation_angle: float = math.pi,
|
| 24 |
+
max_translation: float = 1.0,
|
| 25 |
+
device: str = "cpu",
|
| 26 |
+
dtype: torch.dtype = torch.float32,
|
| 27 |
+
) -> torch.Tensor:
|
| 28 |
+
"""
|
| 29 |
+
Randomly generate rigid-body transformation matrices (SE(3) transformation matrices).
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
batch_size (int): Batch size, default 1.
|
| 33 |
+
max_rotation_angle (float): Maximum rotation angle in radians, default π (180°).
|
| 34 |
+
max_translation (float): Maximum translation magnitude, default 1.0.
|
| 35 |
+
device (str): Device ('cpu' or 'cuda').
|
| 36 |
+
dtype (torch.dtype): Data type (float32 recommended).
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
torch.Tensor: Homogeneous transformation matrices of shape (batch_size, 4, 4).
|
| 40 |
+
"""
|
| 41 |
+
# Randomly generate rotation matrices R (batch_size, 3, 3)
|
| 42 |
+
# Method 1: use an axis-angle representation and convert it to a rotation matrix
|
| 43 |
+
axis = torch.randn(batch_size, 3, device=device, dtype=dtype) # random rotation axis
|
| 44 |
+
axis = axis / torch.norm(axis, dim=1, keepdim=True) # normalize
|
| 45 |
+
angle = torch.rand(batch_size, 1, device=device, dtype=dtype) * max_rotation_angle # random angle in [0, max_angle]
|
| 46 |
+
|
| 47 |
+
# Compute the rotation matrix (Rodrigues' rotation formula)
|
| 48 |
+
K = torch.zeros(batch_size, 3, 3, device=device, dtype=dtype)
|
| 49 |
+
K[:, 0, 1] = -axis[:, 2]
|
| 50 |
+
K[:, 0, 2] = axis[:, 1]
|
| 51 |
+
K[:, 1, 0] = axis[:, 2]
|
| 52 |
+
K[:, 1, 2] = -axis[:, 0]
|
| 53 |
+
K[:, 2, 0] = -axis[:, 1]
|
| 54 |
+
K[:, 2, 1] = axis[:, 0]
|
| 55 |
+
|
| 56 |
+
I = torch.eye(3, device=device, dtype=dtype).unsqueeze(0).expand(batch_size, -1, -1)
|
| 57 |
+
R = I + torch.sin(angle).unsqueeze(-1) * K + (1 - torch.cos(angle).unsqueeze(-1)) * (K @ K)
|
| 58 |
+
|
| 59 |
+
# Randomly generate translation vectors t (batch_size, 3)
|
| 60 |
+
t = (torch.rand(batch_size, 3, device=device, dtype=dtype) - 0.5) * 2 * max_translation
|
| 61 |
+
|
| 62 |
+
# Assemble the homogeneous transformation matrices T (batch_size, 4, 4)
|
| 63 |
+
T = torch.eye(4, device=device, dtype=dtype).unsqueeze(0).expand(batch_size, -1, -1)
|
| 64 |
+
T[:, :3, :3] = R
|
| 65 |
+
T[:, :3, 3] = t
|
| 66 |
+
|
| 67 |
+
return T
|
| 68 |
+
|
| 69 |
+
def weighted_procrustes_torch(X, Y, W=None, RT=None):
|
| 70 |
+
"""
|
| 71 |
+
Weighted Procrustes Analysis in PyTorch (batched).
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
X: (B, 1, N, 3), source point cloud.
|
| 75 |
+
Y: (B, T, N, 3), target point cloud.
|
| 76 |
+
W: (B, T, N) or (B, 1, N), optional weights for each point.
|
| 77 |
+
|
| 78 |
+
Returns:
|
| 79 |
+
t: (B, T, 3), optimal translation vectors.
|
| 80 |
+
R: (B, T, 3, 3), optimal rotation matrices.
|
| 81 |
+
"""
|
| 82 |
+
device = X.device
|
| 83 |
+
B, T, N, _ = Y.shape
|
| 84 |
+
|
| 85 |
+
# Default weights: uniform
|
| 86 |
+
if W is None:
|
| 87 |
+
W = torch.ones(B, 1, N, device=device)
|
| 88 |
+
elif W.dim() == 3: # (B, T, N) -> expand to match Y
|
| 89 |
+
W = W.unsqueeze(-1) # (B, T, N, 1)
|
| 90 |
+
else: # (B, 1, N)
|
| 91 |
+
W = W.unsqueeze(-1).expand(B, T, N, 1)
|
| 92 |
+
|
| 93 |
+
# Reshape X to (B, T, N, 3) by broadcasting
|
| 94 |
+
X = X.expand(B, T, N, 3)
|
| 95 |
+
|
| 96 |
+
# Compute weighted centroids
|
| 97 |
+
sum_W = torch.sum(W, dim=2, keepdim=True) # (B, T, 1, 1)
|
| 98 |
+
centroid_X = torch.sum(W * X, dim=2) / sum_W.squeeze(-1) # (B, T, 3)
|
| 99 |
+
centroid_Y = torch.sum(W * Y, dim=2) / sum_W.squeeze(-1) # (B, T, 3)
|
| 100 |
+
|
| 101 |
+
# Center the point clouds
|
| 102 |
+
X_centered = X - centroid_X.unsqueeze(2) # (B, T, N, 3)
|
| 103 |
+
Y_centered = Y - centroid_Y.unsqueeze(2) # (B, T, N, 3)
|
| 104 |
+
|
| 105 |
+
# Compute weighted covariance matrix H = X^T W Y
|
| 106 |
+
X_weighted = X_centered * W # (B, T, N, 3)
|
| 107 |
+
H = torch.matmul(X_weighted.transpose(2, 3), Y_centered) # (B, T, 3, 3)
|
| 108 |
+
|
| 109 |
+
# SVD decomposition
|
| 110 |
+
U, S, Vt = torch.linalg.svd(H) # U/Vt: (B, T, 3, 3)
|
| 111 |
+
|
| 112 |
+
# Ensure right-handed rotation (det(R) = +1)
|
| 113 |
+
det = torch.det(torch.matmul(U, Vt)) # (B, T)
|
| 114 |
+
Vt_corrected = Vt.clone()
|
| 115 |
+
mask = det < 0
|
| 116 |
+
B_idx, T_idx = torch.nonzero(mask, as_tuple=True)
|
| 117 |
+
Vt_corrected[B_idx, T_idx, -1, :] *= -1 # Flip last row for those needing correction
|
| 118 |
+
|
| 119 |
+
# Optimal rotation and translation
|
| 120 |
+
R = torch.matmul(U, Vt_corrected).inverse() # (B, T, 3, 3)
|
| 121 |
+
t = centroid_Y - torch.matmul(R, centroid_X.unsqueeze(-1)).squeeze(-1) # (B, T, 3)
|
| 122 |
+
w2c = torch.eye(4, device=device).unsqueeze(0).unsqueeze(0).repeat(B, T, 1, 1)
|
| 123 |
+
if (torch.det(R) - 1).abs().max() < 1e-3:
|
| 124 |
+
w2c[:, :, :3, :3] = R
|
| 125 |
+
else:
|
| 126 |
+
import pdb; pdb.set_trace()
|
| 127 |
+
w2c[:, :, :3, 3] = t
|
| 128 |
+
try:
|
| 129 |
+
c2w_traj = torch.inverse(w2c) # or torch.linalg.inv()
|
| 130 |
+
except:
|
| 131 |
+
c2w_traj = torch.eye(4, device=device).unsqueeze(0).unsqueeze(0).repeat(B, T, 1, 1)
|
| 132 |
+
|
| 133 |
+
return c2w_traj
|
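A sanity-check sketch (not part of the commit): with the target equal to the source, the recovered camera-to-world transforms should be close to the identity; W down-weights unreliable points.
import torch
B, T, N = 1, 4, 200
X = torch.randn(B, 1, N, 3)
Y = X.expand(B, T, N, 3).clone()
W = torch.ones(B, T, N)
c2w = weighted_procrustes_torch(X, Y, W)   # (B, T, 4, 4), approximately identity here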
| 134 |
+
|
| 135 |
+
def key_fr_wprocrustes(cam_pts, graph_matrix, dyn_weight, vis_mask,slide_len=16, overlap=8, K=3, mode="keyframe"):
|
| 136 |
+
"""
|
| 137 |
+
cam_pts: (B, T, N, 3)
|
| 138 |
+
graph_matrix: (B, 1, N)
|
| 139 |
+
dyn_weight: (B, T, N)
|
| 140 |
+
K: number of keyframes to select (including start and end)
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
c2w_traj: (B, T, 4, 4)
|
| 144 |
+
"""
|
| 145 |
+
B, T, N, _ = cam_pts.shape
|
| 146 |
+
device = cam_pts.device
|
| 147 |
+
|
| 148 |
+
if mode == "keyframe":
|
| 149 |
+
# Step 1: Keyframe selection
|
| 150 |
+
ky_fr_idx = [0, T - 1]
|
| 151 |
+
graph_sum = torch.sum(graph_matrix, dim=-1) # (B, T, T)
|
| 152 |
+
dist = torch.max(graph_sum[:, 0, :], graph_sum[:, T - 1, :]) # (B, T)
|
| 153 |
+
dist[:, [0, T - 1]] = float('inf')
|
| 154 |
+
for _ in range(K - 2): # already have 2
|
| 155 |
+
last_idx = ky_fr_idx[-1]
|
| 156 |
+
dist = torch.max(dist, graph_sum[:, last_idx, :])
|
| 157 |
+
dist[:, last_idx] = float('inf')
|
| 158 |
+
next_id = torch.argmin(dist, dim=1)[0].item() # Assuming batch=1 or shared
|
| 159 |
+
ky_fr_idx.append(next_id)
|
| 160 |
+
|
| 161 |
+
ky_fr_idx = sorted(ky_fr_idx)
|
| 162 |
+
elif mode == "slide":
|
| 163 |
+
id_slide = torch.arange(0, T)
|
| 164 |
+
id_slide = id_slide.unfold(0, slide_len, overlap)
|
| 165 |
+
vis_mask_slide = vis_mask.unfold(1, slide_len, overlap)
|
| 166 |
+
cam_pts_slide = cam_pts.unfold(1, slide_len, overlap)
|
| 167 |
+
ky_fr_idx = torch.arange(0, T - slide_len + 1, overlap)
|
| 168 |
+
if ky_fr_idx[-1] + slide_len < T:
|
| 169 |
+
# if the last keyframe does not cover the whole sequence, add one more keyframe
|
| 170 |
+
ky_fr_idx = torch.cat([ky_fr_idx, ky_fr_idx[-1:] + overlap])
|
| 171 |
+
id_add = torch.arange(ky_fr_idx[-1], ky_fr_idx[-1] + slide_len).clamp(max=T-1)
|
| 172 |
+
id_slide = torch.cat([id_slide, id_add[None, :]], dim=0)
|
| 173 |
+
cam_pts_add = cam_pts[:, id_add, :, :]
|
| 174 |
+
cam_pts_slide = torch.cat([cam_pts_slide, cam_pts_add.permute(0,2,3,1)[:, None, ...]], dim=1)
|
| 175 |
+
vis_mask_add = vis_mask[:, id_add, :]
|
| 176 |
+
vis_mask_slide = torch.cat([vis_mask_slide, vis_mask_add.permute(0,2,3,1)[:, None, ...]], dim=1)
|
| 177 |
+
|
| 178 |
+
if mode == "keyframe":
|
| 179 |
+
# Step 2: Weighted Procrustes in windows
|
| 180 |
+
base_pose = torch.eye(4, device=cam_pts.device).view(1, 1, 4, 4).repeat(B, 1, 1, 1) # (B, 1, 4, 4)
|
| 181 |
+
c2w_traj_out = []
|
| 182 |
+
for i in range(len(ky_fr_idx) - 1):
|
| 183 |
+
start_idx = ky_fr_idx[i]
|
| 184 |
+
end_idx = ky_fr_idx[i + 1]
|
| 185 |
+
|
| 186 |
+
# Visibility mask
|
| 187 |
+
vis_mask_i = graph_matrix[:, start_idx, end_idx, :] # (B, N) or (N,)
|
| 188 |
+
if vis_mask_i.dim() == 1:
|
| 189 |
+
vis_mask_i = vis_mask_i.unsqueeze(0) # (1, N)
|
| 190 |
+
|
| 191 |
+
# Broadcast cam_pts and dyn_weight
|
| 192 |
+
cam_ref = cam_pts[:, start_idx:start_idx+1, :, :] # (B, 1, M, 3)
|
| 193 |
+
cam_win = cam_pts[:, start_idx:end_idx+1, :, :] # (B, W, M, 3)
|
| 194 |
+
weight = dyn_weight[:, :, :] * vis_mask_i[:, None, :] # (B, W, M)
|
| 195 |
+
|
| 196 |
+
# Compute relative transformations
|
| 197 |
+
if weight.sum() < 50:
|
| 198 |
+
weight = weight.clamp(min=5e-2)
|
| 199 |
+
relative_tfms = weighted_procrustes_torch(cam_ref, cam_win, weight) # (B, W, 4, 4)
|
| 200 |
+
|
| 201 |
+
# Apply to original c2w_traj
|
| 202 |
+
updated_pose = base_pose.detach() @ relative_tfms # (B, W, 4, 4)
|
| 203 |
+
base_pose = relative_tfms[:, -1:, :, :].detach() # (B, 1, 4, 4)
|
| 204 |
+
|
| 205 |
+
# Assign to output trajectory (avoid in-place on autograd path)
|
| 206 |
+
c2w_traj_out.append(updated_pose[:, 1:, ...])
|
| 207 |
+
|
| 208 |
+
c2w_traj_out = torch.cat(c2w_traj_out, dim=1)
|
| 209 |
+
c2w_traj_out = torch.cat([torch.eye(4, device=device).repeat(B, 1, 1, 1), c2w_traj_out], dim=1)
|
| 210 |
+
elif mode == "slide":
|
| 211 |
+
c2w_traj_out = torch.eye(4, device=device).repeat(B, T, 1, 1)
|
| 212 |
+
for i in range(cam_pts_slide.shape[1]):
|
| 213 |
+
cam_pts_slide_i = cam_pts_slide[:, i, :, :].permute(0,3,1,2)
|
| 214 |
+
id_slide_i = id_slide[i, :]
|
| 215 |
+
vis_mask_i = vis_mask_slide[:, i, :, 0, :].permute(0,2,1) # (B, N) or (N,)
|
| 216 |
+
vis_mask_i = vis_mask_i[:,:1] * vis_mask_i
|
| 217 |
+
weight_i = dyn_weight * vis_mask_i
|
| 218 |
+
if weight_i.sum() < 50:
|
| 219 |
+
weight_i = weight_i.clamp(min=5e-2)
|
| 220 |
+
if i == 0:
|
| 221 |
+
c2w_traj_out[:, id_slide_i, :, :] = weighted_procrustes_torch(cam_pts_slide_i[:,:1], cam_pts_slide_i, weight_i)
|
| 222 |
+
else:
|
| 223 |
+
campts_update = torch.einsum("btij,btnj->btni", c2w_traj_out[:,id_slide_i][...,:3,:3], cam_pts_slide_i) + c2w_traj_out[:,id_slide_i][...,None,:3,3]
|
| 224 |
+
c2w_traj_update = weighted_procrustes_torch(campts_update[:,:1], campts_update, weight_i)
|
| 225 |
+
c2w_traj_out[:, id_slide_i, :, :] = c2w_traj_update@c2w_traj_out[:,id_slide_i]
|
| 226 |
+
|
| 227 |
+
return c2w_traj_out
|
| 228 |
+
|
| 229 |
+
def posenc(x, min_deg, max_deg):
|
| 230 |
+
"""Cat x with a positional encoding of x with scales 2^[min_deg, max_deg-1].
|
| 231 |
+
Instead of computing [sin(x), cos(x)], we use the trig identity
|
| 232 |
+
cos(x) = sin(x + pi/2) and do one vectorized call to sin([x, x+pi/2]).
|
| 233 |
+
Args:
|
| 234 |
+
x: torch.Tensor, variables to be encoded. Note that x should be in [-pi, pi].
|
| 235 |
+
min_deg: int, the minimum (inclusive) degree of the encoding.
|
| 236 |
+
max_deg: int, the maximum (exclusive) degree of the encoding.
|
| 237 |
+
legacy_posenc_order: bool, keep the same ordering as the original tf code.
|
| 238 |
+
Returns:
|
| 239 |
+
encoded: torch.Tensor, encoded variables.
|
| 240 |
+
"""
|
| 241 |
+
if min_deg == max_deg:
|
| 242 |
+
return x
|
| 243 |
+
scales = torch.tensor(
|
| 244 |
+
[2**i for i in range(min_deg, max_deg)], dtype=x.dtype, device=x.device
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
xb = (x[..., None, :] * scales[:, None]).reshape(list(x.shape[:-1]) + [-1])
|
| 248 |
+
four_feat = torch.sin(torch.cat([xb, xb + 0.5 * torch.pi], dim=-1))
|
| 249 |
+
return torch.cat([x] + [four_feat], dim=-1)
|
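A minimal sketch (not part of the commit) of the output size: the raw input is kept and sin/cos features are appended at scales 2^min_deg .. 2^(max_deg - 1).
import torch
x = torch.rand(8, 3) * 2 * torch.pi - torch.pi   # values in [-pi, pi]
enc = posenc(x, min_deg=0, max_deg=4)            # (8, 3 + 3*4*2) = (8, 27)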
| 250 |
+
|
| 251 |
+
|
| 252 |
+
class EfficientUpdateFormer3D(nn.Module):
|
| 253 |
+
"""
|
| 254 |
+
Transformer model that updates track in 3D
|
| 255 |
+
"""
|
| 256 |
+
|
| 257 |
+
def __init__(
|
| 258 |
+
self,
|
| 259 |
+
EFormer: EfficientUpdateFormer,
|
| 260 |
+
update_points=True
|
| 261 |
+
):
|
| 262 |
+
super().__init__()
|
| 263 |
+
|
| 264 |
+
hidden_size = EFormer.hidden_size
|
| 265 |
+
num_virtual_tracks = EFormer.num_virtual_tracks
|
| 266 |
+
num_heads = EFormer.num_heads
|
| 267 |
+
mlp_ratio = 4.0
|
| 268 |
+
|
| 269 |
+
#NOTE: we design a switcher to bridge the camera pose, 3d tracks and 2d tracks
|
| 270 |
+
|
| 271 |
+
# interact with pretrained 2d tracking
|
| 272 |
+
self.switcher_tokens = nn.Parameter(
|
| 273 |
+
torch.randn(1, num_virtual_tracks, 1, hidden_size)
|
| 274 |
+
)
|
| 275 |
+
# cross attention
|
| 276 |
+
space_depth=len(EFormer.space_virtual_blocks)
|
| 277 |
+
self.space_switcher_blocks = nn.ModuleList(
|
| 278 |
+
[
|
| 279 |
+
AttnBlock(
|
| 280 |
+
hidden_size,
|
| 281 |
+
num_heads,
|
| 282 |
+
mlp_ratio=mlp_ratio,
|
| 283 |
+
attn_class=Attention,
|
| 284 |
+
)
|
| 285 |
+
for _ in range(space_depth)
|
| 286 |
+
]
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
# config 3d tracks blocks
|
| 290 |
+
self.space_track3d2switcher_blocks = nn.ModuleList(
|
| 291 |
+
[
|
| 292 |
+
CrossAttnBlock(
|
| 293 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 294 |
+
)
|
| 295 |
+
for _ in range(space_depth)
|
| 296 |
+
]
|
| 297 |
+
)
|
| 298 |
+
self.space_switcher2track3d_blocks = nn.ModuleList(
|
| 299 |
+
[
|
| 300 |
+
CrossAttnBlock(
|
| 301 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 302 |
+
)
|
| 303 |
+
for _ in range(space_depth)
|
| 304 |
+
]
|
| 305 |
+
)
|
| 306 |
+
# config switcher blocks
|
| 307 |
+
self.space_virtual2switcher_blocks = nn.ModuleList(
|
| 308 |
+
[
|
| 309 |
+
CrossAttnBlock(
|
| 310 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 311 |
+
)
|
| 312 |
+
for _ in range(space_depth)
|
| 313 |
+
]
|
| 314 |
+
)
|
| 315 |
+
self.space_switcher2virtual_blocks = nn.ModuleList(
|
| 316 |
+
[
|
| 317 |
+
CrossAttnBlock(
|
| 318 |
+
hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 319 |
+
)
|
| 320 |
+
for _ in range(space_depth)
|
| 321 |
+
]
|
| 322 |
+
)
|
| 323 |
+
# config the temporal blocks
|
| 324 |
+
self.time_blocks_new = nn.ModuleList(
|
| 325 |
+
[
|
| 326 |
+
AttnBlock(
|
| 327 |
+
hidden_size,
|
| 328 |
+
num_heads,
|
| 329 |
+
mlp_ratio=mlp_ratio,
|
| 330 |
+
attn_class=Attention,
|
| 331 |
+
)
|
| 332 |
+
for _ in range(len(EFormer.time_blocks))
|
| 333 |
+
]
|
| 334 |
+
)
|
| 335 |
+
# scale and shift cross attention
|
| 336 |
+
self.scale_shift_cross_attn = nn.ModuleList(
|
| 337 |
+
[
|
| 338 |
+
CrossAttnBlock(
|
| 339 |
+
128, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 340 |
+
)
|
| 341 |
+
for _ in range(len(EFormer.time_blocks))
|
| 342 |
+
]
|
| 343 |
+
)
|
| 344 |
+
self.scale_shift_self_attn = nn.ModuleList(
|
| 345 |
+
[
|
| 346 |
+
AttnBlock(
|
| 347 |
+
128, num_heads, mlp_ratio=mlp_ratio, attn_class=Attention
|
| 348 |
+
)
|
| 349 |
+
for _ in range(len(EFormer.time_blocks))
|
| 350 |
+
]
|
| 351 |
+
)
|
| 352 |
+
self.scale_shift_dec = torch.nn.Linear(128, 128+1, bias=True)
|
| 353 |
+
|
| 354 |
+
# dense cross attention
|
| 355 |
+
self.dense_res_cross_attn = nn.ModuleList(
|
| 356 |
+
[
|
| 357 |
+
CrossAttnBlock(
|
| 358 |
+
128, hidden_size, num_heads, mlp_ratio=mlp_ratio
|
| 359 |
+
)
|
| 360 |
+
for _ in range(len(EFormer.time_blocks))
|
| 361 |
+
]
|
| 362 |
+
)
|
| 363 |
+
self.dense_res_self_attn = nn.ModuleList(
|
| 364 |
+
[
|
| 365 |
+
AttnBlock(
|
| 366 |
+
128, num_heads, mlp_ratio=mlp_ratio, attn_class=Attention
|
| 367 |
+
)
|
| 368 |
+
for _ in range(len(EFormer.time_blocks))
|
| 369 |
+
]
|
| 370 |
+
)
|
| 371 |
+
self.dense_res_dec = torch.nn.Conv2d(128, 3+128, kernel_size=1, stride=1, padding=0)
|
| 372 |
+
|
| 373 |
+
# set different heads
|
| 374 |
+
self.update_points = update_points
|
| 375 |
+
if update_points:
|
| 376 |
+
self.point_head = torch.nn.Linear(hidden_size, 4, bias=True)
|
| 377 |
+
else:
|
| 378 |
+
self.depth_head = torch.nn.Linear(hidden_size, 1, bias=True)
|
| 379 |
+
self.pro_analysis_w_head = torch.nn.Linear(hidden_size, 1, bias=True)
|
| 380 |
+
self.vis_conf_head = torch.nn.Linear(hidden_size, 2, bias=True)
|
| 381 |
+
self.residual_head = torch.nn.Linear(hidden_size,
|
| 382 |
+
hidden_size, bias=True)
|
| 383 |
+
|
| 384 |
+
self.initialize_weights()
|
| 385 |
+
|
| 386 |
+
def initialize_weights(self):
|
| 387 |
+
def _basic_init(module):
|
| 388 |
+
if isinstance(module, nn.Linear):
|
| 389 |
+
torch.nn.init.xavier_uniform_(module.weight)
|
| 390 |
+
if module.bias is not None:
|
| 391 |
+
nn.init.constant_(module.bias, 0)
|
| 392 |
+
if getattr(self, "point_head", None) is not None:
|
| 393 |
+
torch.nn.init.trunc_normal_(self.point_head.weight, std=1e-6)
|
| 394 |
+
torch.nn.init.constant_(self.point_head.bias, 0)
|
| 395 |
+
if getattr(self, "depth_head", None) is not None:
|
| 396 |
+
torch.nn.init.trunc_normal_(self.depth_head.weight, std=0.001)
|
| 397 |
+
if getattr(self, "vis_conf_head", None) is not None:
|
| 398 |
+
torch.nn.init.trunc_normal_(self.vis_conf_head.weight, std=1e-6)
|
| 399 |
+
if getattr(self, "scale_shift_dec", None) is not None:
|
| 400 |
+
torch.nn.init.trunc_normal_(self.scale_shift_dec.weight, std=0.001)
|
| 401 |
+
if getattr(self, "residual_head", None) is not None:
|
| 402 |
+
torch.nn.init.trunc_normal_(self.residual_head.weight, std=0.001)
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def _trunc_init(module):
|
| 406 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
| 407 |
+
if isinstance(module, nn.Linear):
|
| 408 |
+
torch.nn.init.trunc_normal_(module.weight, std=0.02)
|
| 409 |
+
if module.bias is not None:
|
| 410 |
+
nn.init.zeros_(module.bias)
|
| 411 |
+
|
| 412 |
+
self.apply(_basic_init)
|
| 413 |
+
|
| 414 |
+
def forward(self, input_tensor, input_tensor3d, EFormer: EfficientUpdateFormer,
|
| 415 |
+
mask=None, add_space_attn=True, extra_sparse_tokens=None, extra_dense_tokens=None):
|
| 416 |
+
|
| 417 |
+
#NOTE: prepare the pose and 3d tracks features
|
| 418 |
+
tokens3d = EFormer.input_transform(input_tensor3d)
|
| 419 |
+
|
| 420 |
+
tokens = EFormer.input_transform(input_tensor)
|
| 421 |
+
B, _, T, _ = tokens.shape
|
| 422 |
+
virtual_tokens = EFormer.virual_tracks.repeat(B, 1, T, 1)
|
| 423 |
+
switcher_tokens = self.switcher_tokens.repeat(B, 1, T, 1)
|
| 424 |
+
|
| 425 |
+
tokens = torch.cat([tokens, virtual_tokens], dim=1)
|
| 426 |
+
tokens3d = torch.cat([tokens3d, switcher_tokens], dim=1)
|
| 427 |
+
|
| 428 |
+
_, N, _, _ = tokens.shape
|
| 429 |
+
j = 0
|
| 430 |
+
layers = []
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
for i in range(len(EFormer.time_blocks)):
|
| 434 |
+
if extra_sparse_tokens is not None:
|
| 435 |
+
extra_sparse_tokens = rearrange(extra_sparse_tokens, 'b n t c -> (b t) n c')
|
| 436 |
+
extra_sparse_tokens = self.scale_shift_cross_attn[i](extra_sparse_tokens, rearrange(tokens3d, 'b n t c -> (b t) n c'))
|
| 437 |
+
extra_sparse_tokens = rearrange(extra_sparse_tokens, '(b t) n c -> (b n) t c', b=B, t=T)
|
| 438 |
+
extra_sparse_tokens = self.scale_shift_self_attn[i](extra_sparse_tokens)
|
| 439 |
+
extra_sparse_tokens = rearrange(extra_sparse_tokens, '(b n) t c -> b n t c', b=B, n=2, t=T)
|
| 440 |
+
|
| 441 |
+
if extra_dense_tokens is not None:
|
| 442 |
+
h_p, w_p = extra_dense_tokens.shape[-2:]
|
| 443 |
+
extra_dense_tokens = rearrange(extra_dense_tokens, 'b t c h w -> (b t) (h w) c')
|
| 444 |
+
extra_dense_tokens = self.dense_res_cross_attn[i](extra_dense_tokens, rearrange(tokens3d, 'b n t c -> (b t) n c'))
|
| 445 |
+
extra_dense_tokens = rearrange(extra_dense_tokens, '(b t) n c -> (b n) t c', b=B, t=T)
|
| 446 |
+
extra_dense_tokens = self.dense_res_self_attn[i](extra_dense_tokens)
|
| 447 |
+
extra_dense_tokens = rearrange(extra_dense_tokens, '(b h w) t c -> b t c h w', b=B, h=h_p, w=w_p)
|
| 448 |
+
|
| 449 |
+
# temporal
|
| 450 |
+
time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
|
| 451 |
+
time_tokens = EFormer.time_blocks[i](time_tokens)
|
| 452 |
+
|
| 453 |
+
# temporal 3d
|
| 454 |
+
time_tokens3d = tokens3d.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
|
| 455 |
+
time_tokens3d = self.time_blocks_new[i](time_tokens3d)
|
| 456 |
+
|
| 457 |
+
tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
|
| 458 |
+
tokens3d = time_tokens3d.view(B, N, T, -1)
|
| 459 |
+
|
| 460 |
+
if (
|
| 461 |
+
add_space_attn
|
| 462 |
+
and hasattr(EFormer, "space_virtual_blocks")
|
| 463 |
+
and (i % (len(EFormer.time_blocks) // len(EFormer.space_virtual_blocks)) == 0)
|
| 464 |
+
):
|
| 465 |
+
space_tokens = (
|
| 466 |
+
tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
|
| 467 |
+
) # B N T C -> (B T) N C
|
| 468 |
+
space_tokens3d = (
|
| 469 |
+
tokens3d.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
|
| 470 |
+
) # B N T C -> (B T) N C
|
| 471 |
+
|
| 472 |
+
point_tokens = space_tokens[:, : N - EFormer.num_virtual_tracks]
|
| 473 |
+
virtual_tokens = space_tokens[:, N - EFormer.num_virtual_tracks :]
|
| 474 |
+
# get the 3d relevant tokens
|
| 475 |
+
track3d_tokens = space_tokens3d[:, : N - EFormer.num_virtual_tracks]
|
| 476 |
+
switcher_tokens = space_tokens[:, N - EFormer.num_virtual_tracks + 1:]
|
| 477 |
+
|
| 478 |
+
# interact switcher with pose and tracks3d
|
| 479 |
+
switcher_tokens = self.space_track3d2switcher_blocks[j](
|
| 480 |
+
switcher_tokens, track3d_tokens, mask=mask
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
virtual_tokens = EFormer.space_virtual2point_blocks[j](
|
| 485 |
+
virtual_tokens, point_tokens, mask=mask
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
# get the switcher_tokens
|
| 489 |
+
switcher_tokens = self.space_virtual2switcher_blocks[j](
|
| 490 |
+
switcher_tokens, virtual_tokens
|
| 491 |
+
)
|
| 492 |
+
virtual_tokens_res = self.residual_head(
|
| 493 |
+
self.space_switcher2virtual_blocks[j](
|
| 494 |
+
virtual_tokens, switcher_tokens
|
| 495 |
+
)
|
| 496 |
+
)
|
| 497 |
+
switcher_tokens_res = self.residual_head(
|
| 498 |
+
self.space_switcher2virtual_blocks[j](
|
| 499 |
+
switcher_tokens, virtual_tokens
|
| 500 |
+
)
|
| 501 |
+
)
|
| 502 |
+
# add residual
|
| 503 |
+
virtual_tokens = virtual_tokens + virtual_tokens_res
|
| 504 |
+
switcher_tokens = switcher_tokens + switcher_tokens_res
|
| 505 |
+
|
| 506 |
+
virtual_tokens = EFormer.space_virtual_blocks[j](virtual_tokens)
|
| 507 |
+
switcher_tokens = self.space_switcher_blocks[j](switcher_tokens)
|
| 508 |
+
# decode
|
| 509 |
+
point_tokens = EFormer.space_point2virtual_blocks[j](
|
| 510 |
+
point_tokens, virtual_tokens, mask=mask
|
| 511 |
+
)
|
| 512 |
+
track3d_tokens = self.space_switcher2track3d_blocks[j](
|
| 513 |
+
track3d_tokens, switcher_tokens, mask=mask
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
|
| 517 |
+
space_tokens3d = torch.cat([track3d_tokens, virtual_tokens], dim=1)
|
| 518 |
+
tokens = space_tokens.view(B, T, N, -1).permute(
|
| 519 |
+
0, 2, 1, 3
|
| 520 |
+
) # (B T) N C -> B N T C
|
| 521 |
+
tokens3d = space_tokens3d.view(B, T, N, -1).permute(
|
| 522 |
+
0, 2, 1, 3
|
| 523 |
+
) # (B T) N C -> B N T C
|
| 524 |
+
|
| 525 |
+
j += 1
|
| 526 |
+
|
| 527 |
+
tokens = tokens[:, : N - EFormer.num_virtual_tracks]
|
| 528 |
+
track3d_tokens = tokens3d[:, : N - EFormer.num_virtual_tracks]
|
| 529 |
+
|
| 530 |
+
if self.update_points:
|
| 531 |
+
depth_update, dynamic_prob_update = self.point_head(track3d_tokens)[..., :3], self.point_head(track3d_tokens)[..., 3:]
|
| 532 |
+
else:
|
| 533 |
+
depth_update, dynamic_prob_update = self.depth_head(track3d_tokens)[..., :1], self.depth_head(track3d_tokens)[..., 1:]
|
| 534 |
+
pro_analysis_w = self.pro_analysis_w_head(track3d_tokens)
|
| 535 |
+
flow = EFormer.flow_head(tokens)
|
| 536 |
+
if EFormer.linear_layer_for_vis_conf:
|
| 537 |
+
vis_conf = EFormer.vis_conf_head(tokens)
|
| 538 |
+
flow = torch.cat([flow, vis_conf], dim=-1)
|
| 539 |
+
if extra_sparse_tokens is not None:
|
| 540 |
+
scale_shift_out = self.scale_shift_dec(extra_sparse_tokens)
|
| 541 |
+
dense_res_out = self.dense_res_dec(extra_dense_tokens.view(B*T, -1, h_p, w_p)).view(B, T, -1, h_p, w_p)
|
| 542 |
+
return flow, depth_update, dynamic_prob_update, pro_analysis_w, scale_shift_out, dense_res_out
|
| 543 |
+
else:
|
| 544 |
+
return flow, depth_update, dynamic_prob_update, pro_analysis_w, None, None
|
| 545 |
+
|
| 546 |
+
def recover_global_translations_batch(global_rot, c2w_traj, graph_weight):
|
| 547 |
+
B, T = global_rot.shape[:2]
|
| 548 |
+
device = global_rot.device
|
| 549 |
+
|
| 550 |
+
# Compute R_i @ t_ij
|
| 551 |
+
t_rel = c2w_traj[:, :, :, :3, 3] # (B, T, T, 3)
|
| 552 |
+
R_i = global_rot[:, :, None, :, :] # (B, T, 1, 3, 3)
|
| 553 |
+
t_rhs = torch.matmul(R_i, t_rel.unsqueeze(-1)).squeeze(-1) # (B, T, T, 3)
|
| 554 |
+
|
| 555 |
+
# Mask: exclude self-loops and small weights
|
| 556 |
+
valid_mask = (graph_weight > 1e-5) & (~torch.eye(T, dtype=bool, device=device)[None, :, :]) # (B, T, T)
|
| 557 |
+
|
| 558 |
+
# Get all valid (i, j) edge indices
|
| 559 |
+
i_idx, j_idx = torch.meshgrid(
|
| 560 |
+
torch.arange(T, device=device),
|
| 561 |
+
torch.arange(T, device=device),
|
| 562 |
+
indexing="ij"
|
| 563 |
+
)
|
| 564 |
+
i_idx = i_idx.reshape(-1) # (T*T,)
|
| 565 |
+
j_idx = j_idx.reshape(-1)
|
| 566 |
+
|
| 567 |
+
# Expand to batch (B, T*T)
|
| 568 |
+
i_idx = i_idx[None, :].repeat(B, 1)
|
| 569 |
+
j_idx = j_idx[None, :].repeat(B, 1)
|
| 570 |
+
|
| 571 |
+
# Flatten everything
|
| 572 |
+
valid_mask_flat = valid_mask.view(B, -1) # (B, T*T)
|
| 573 |
+
w_flat = graph_weight.view(B, -1) # (B, T*T)
|
| 574 |
+
rhs_flat = t_rhs.view(B, -1, 3) # (B, T*T, 3)
|
| 575 |
+
|
| 576 |
+
# Initialize output translations
|
| 577 |
+
global_translations = torch.zeros(B, T, 3, device=device)
|
| 578 |
+
|
| 579 |
+
for b_id in range(B):
|
| 580 |
+
mask = valid_mask_flat[b_id]
|
| 581 |
+
i_valid = i_idx[b_id][mask]
|
| 582 |
+
j_valid = j_idx[b_id][mask]
|
| 583 |
+
w_valid = w_flat[b_id][mask]
|
| 584 |
+
rhs_valid = rhs_flat[b_id][mask]
|
| 585 |
+
|
| 586 |
+
n_edges = i_valid.shape[0]
|
| 587 |
+
|
| 588 |
+
# Build A matrix: (n_edges*3, T*3)
|
| 589 |
+
A = torch.zeros(n_edges*3, T*3, device=device)
|
| 590 |
+
|
| 591 |
+
# Build b vector: (n_edges*3,)
|
| 592 |
+
b = torch.zeros(n_edges*3, device=device)
|
| 593 |
+
|
| 594 |
+
for k in range(n_edges):
|
| 595 |
+
i, j = i_valid[k], j_valid[k]
|
| 596 |
+
weight = w_valid[k]
|
| 597 |
+
|
| 598 |
+
# Fill A matrix for x,y,z components
|
| 599 |
+
for dim in range(3):
|
| 600 |
+
row = k*3 + dim
|
| 601 |
+
A[row, i*3 + dim] = -weight
|
| 602 |
+
A[row, j*3 + dim] = weight
|
| 603 |
+
|
| 604 |
+
# Fill b vector
|
| 605 |
+
b[row] = rhs_valid[k, dim] * weight
|
| 606 |
+
|
| 607 |
+
# Solve least squares
|
| 608 |
+
try:
|
| 609 |
+
# Add small regularization for stability
|
| 610 |
+
AtA = A.transpose(-1, -2) @ A + 1e-4 * torch.eye(A.shape[-1], device=A.device)
|
| 611 |
+
Atb = A.transpose(-1, -2) @ b.unsqueeze(-1)
|
| 612 |
+
|
| 613 |
+
solution = torch.linalg.solve(AtA, Atb).squeeze(-1) # (3*T,)
|
| 614 |
+
t_batch = solution.view(T, 3)
|
| 615 |
+
|
| 616 |
+
# Fix scale by setting first frame to origin
|
| 617 |
+
t_batch = t_batch - t_batch[0:1]
|
| 618 |
+
global_translations[b_id] = t_batch
|
| 619 |
+
|
| 620 |
+
except RuntimeError as e:
|
| 621 |
+
print(f"Error in batch {b_id}: {e}")
|
| 622 |
+
global_translations[b_id] = torch.zeros(T, 3, device=device)
|
| 623 |
+
return global_translations
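# Hedged usage sketch, added for illustration only (not part of the original commit). It assumes
# recover_global_translations_batch above is in scope and torch is imported; identity relative
# poses with uniform graph weights should recover (near-)zero global translations.
def _example_recover_global_translations():
    B, T = 1, 4
    global_rot = torch.eye(3).expand(B, T, 3, 3).contiguous()       # per-frame global rotations
    c2w_traj = torch.eye(4).expand(B, T, T, 4, 4).contiguous()      # pairwise relative poses
    graph_weight = torch.ones(B, T, T)                              # fully connected, uniform weights
    t_glob = recover_global_translations_batch(global_rot, c2w_traj, graph_weight)
    assert t_glob.shape == (B, T, 3)
    return t_glob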
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
def global_graph_motion_average(c2w_traj, graph_weight):
|
| 627 |
+
"""
|
| 628 |
+
Average the pairwise c2w_traj estimates into per-frame global poses, weighted by graph_weight.
|
| 629 |
+
"""
|
| 630 |
+
B, T, T, _, _ = c2w_traj.shape
|
| 631 |
+
mask = graph_weight[..., 0, 0] < 1e-5 # (B, T, T)
|
| 632 |
+
mask = mask.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, 4, 4) # (B, T, T, 4, 4)
|
| 633 |
+
identity = torch.eye(4, device=c2w_traj.device).view(1, 1, 1, 4, 4).expand(B, T, T, 4, 4)
|
| 634 |
+
c2w_traj = torch.where(mask, identity, c2w_traj)
|
| 635 |
+
|
| 636 |
+
Rot_rel_weighted = c2w_traj[:,:,:,:3,:3].contiguous() * graph_weight # B T T 3 3
|
| 637 |
+
Rot_big = Rot_rel_weighted.permute(0, 1, 3, 2, 4).reshape(B, 3*T, 3*T) # B 3T 3T
|
| 638 |
+
epsilon = 1e-8
|
| 639 |
+
I_big = torch.eye(3*T, device=Rot_big.device).unsqueeze(0) # (1, 3T, 3T)
|
| 640 |
+
Rot_big_reg = Rot_big + epsilon * I_big # (B, 3T, 3T)
|
| 641 |
+
#NOTE: cal the global rotation
|
| 642 |
+
# Step 1: batch eigendecomposition
|
| 643 |
+
try:
|
| 644 |
+
eigvals, eigvecs = torch.linalg.eigh(Rot_big_reg) # eigvecs: (B, 3T, 3T)
|
| 645 |
+
except RuntimeError as e:
raise RuntimeError(f"eigendecomposition of the weighted rotation graph failed: {e}") from e
|
| 647 |
+
# Step 2: get the largest 3 eigenvectors
|
| 648 |
+
X = eigvecs[:, :, -3:] # (B, 3T, 3)
|
| 649 |
+
# Step 3: split into (B, T, 3, 3)
|
| 650 |
+
X = X.view(B, T, 3, 3) # each frame's rotation block (non-orthogonal)
|
| 651 |
+
# Step 4: project to SO(3), using SVD
|
| 652 |
+
U, _, Vh = torch.linalg.svd(X) # (B, T, 3, 3)
|
| 653 |
+
R = U @ Vh
|
| 654 |
+
# Step 5: ensure det(R)=1 (right-handed coordinate system)
|
| 655 |
+
det = torch.linalg.det(R) # (B, T)
|
| 656 |
+
neg_det_mask = det < 0
|
| 657 |
+
# if det < 0, flip the sign of U's last column and recompute R
|
| 658 |
+
U_flip = U.clone()
|
| 659 |
+
U_flip[neg_det_mask, :, -1] *= -1
|
| 660 |
+
R = U_flip @ Vh
|
| 661 |
+
# global rotation
|
| 662 |
+
Rot_glob = R[:,:1].inverse() @ R
|
| 663 |
+
# global translation
|
| 664 |
+
t_glob = recover_global_translations_batch(Rot_glob,
|
| 665 |
+
c2w_traj, graph_weight[...,0,0])
|
| 666 |
+
c2w_traj_final = torch.eye(4, device=c2w_traj.device)[None,None].repeat(B, T, 1, 1)
|
| 667 |
+
c2w_traj_final[:,:,:3,:3] = Rot_glob
|
| 668 |
+
c2w_traj_final[:,:,:3,3] = t_glob
|
| 669 |
+
|
| 670 |
+
return c2w_traj_final
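# Hedged usage sketch, added for illustration only (not part of the original commit). It assumes
# global_graph_motion_average above is in scope; graph_weight is expected to broadcast against the
# (B, T, T, 3, 3) rotation blocks, so a (B, T, T, 1, 1) weight tensor is used here.
def _example_global_graph_motion_average():
    B, T = 1, 4
    c2w_traj = torch.eye(4).expand(B, T, T, 4, 4).contiguous()      # identity pairwise poses
    graph_weight = torch.ones(B, T, T, 1, 1)
    c2w_traj_final = global_graph_motion_average(c2w_traj, graph_weight)
    assert c2w_traj_final.shape == (B, T, 4, 4)                     # expect (near-)identity global poses
    return c2w_traj_final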
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
def depth_to_points_colmap(metric_depth: torch.Tensor,
|
| 674 |
+
intrinsics: torch.Tensor) -> torch.Tensor:
|
| 675 |
+
"""
|
| 676 |
+
Unproject a depth map to a point cloud in COLMAP convention.
|
| 677 |
+
|
| 678 |
+
Args:
|
| 679 |
+
metric_depth: (B, H, W) depth map, meters.
|
| 680 |
+
intrinsics: (B, 3, 3) COLMAP-style K matrix.
|
| 681 |
+
Returns:
|
| 682 |
+
points_map: (B, H, W, 3) point cloud in camera coordinates.
|
| 683 |
+
"""
|
| 684 |
+
# metric_depth has shape (B, H, W)
|
| 685 |
+
B, H, W = metric_depth.shape
|
| 686 |
+
|
| 687 |
+
# build homogeneous pixel coordinates [u, v, 1] for every pixel
|
| 688 |
+
u = torch.arange(W, device=metric_depth.device, dtype=metric_depth.dtype)
|
| 689 |
+
v = torch.arange(H, device=metric_depth.device, dtype=metric_depth.dtype)
|
| 690 |
+
uu, vv = torch.meshgrid(u, v, indexing='xy')
|
| 691 |
+
pix = torch.stack([uu, vv, torch.ones_like(uu)], dim=-1)
|
| 692 |
+
pix = pix.reshape(-1, 3) # (H*W, 3)
|
| 693 |
+
# repeat the pixel grid for all B images
|
| 694 |
+
pix = pix.unsqueeze(0).expand(B, -1, -1) # (B, H*W, 3)
|
| 695 |
+
# import pdb; pdb.set_trace()
|
| 696 |
+
# K is (B, 3, 3)
|
| 697 |
+
K_inv = torch.inverse(intrinsics) # (B, 3, 3)
|
| 698 |
+
|
| 699 |
+
# back-projection directions: X_cam = K^{-1} * pix
|
| 700 |
+
dirs = torch.einsum('bij,bkj->bki', K_inv, pix) # (B, H*W, 3)
|
| 701 |
+
|
| 702 |
+
# scale each ray direction by its depth
|
| 703 |
+
depths = metric_depth.reshape(B, -1) # (B, H*W)
|
| 704 |
+
pts = dirs * depths.unsqueeze(-1) # (B, H*W, 3)
|
| 705 |
+
|
| 706 |
+
# reshape the output to (B, H, W, 3)
|
| 707 |
+
points_map = pts.view(B, H, W, 3) # (B, H, W, 3)
|
| 708 |
+
|
| 709 |
+
return points_map
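# Hedged usage sketch, added for illustration only (not part of the original commit). It builds a toy
# COLMAP-style intrinsic matrix and checks that the z-component of the unprojected points equals the depth.
def _example_depth_to_points_colmap():
    B, H, W = 1, 4, 6
    depth = torch.ones(B, H, W)                                     # constant 1 m depth
    K = torch.tensor([[[10.0, 0.0, W / 2.0],
                       [0.0, 10.0, H / 2.0],
                       [0.0, 0.0, 1.0]]])                           # (B, 3, 3)
    pts = depth_to_points_colmap(depth, K)
    assert pts.shape == (B, H, W, 3)
    assert torch.allclose(pts[..., 2], depth)
    return pts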
|
| 710 |
+
|
| 711 |
+
def vec6d_to_R(vector_6D):
|
| 712 |
+
v1=vector_6D[:,:3]/vector_6D[:,:3].norm(dim=-1,keepdim=True)
|
| 713 |
+
v2=vector_6D[:,3:]-(vector_6D[:,3:]*v1).sum(dim=-1,keepdim=True)*v1
|
| 714 |
+
v2=v2/v2.norm(dim=-1,keepdim=True)
|
| 715 |
+
v3=torch.cross(v1,v2,dim=-1)
|
| 716 |
+
return torch.concatenate((v1.unsqueeze(1),v2.unsqueeze(1),v3.unsqueeze(1)),dim=1)
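# Hedged sanity check, added for illustration only (not part of the original commit): the rows
# returned by vec6d_to_R form an orthonormal basis, so R @ R^T should be the identity.
def _example_vec6d_to_R():
    v6 = torch.randn(5, 6)
    R = vec6d_to_R(v6)                                              # (5, 3, 3)
    eye = torch.eye(3).expand(5, 3, 3)
    assert torch.allclose(R @ R.transpose(1, 2), eye, atol=1e-5)
    return R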
|
| 717 |
+
|
| 718 |
+
class MyTransformerHead(nn.Module):
|
| 719 |
+
def __init__(self,input_dim,dim,use_positional_encoding_transformer):
|
| 720 |
+
super(MyTransformerHead,self).__init__()
|
| 721 |
+
|
| 722 |
+
patch_dim=input_dim+1
|
| 723 |
+
self.layers=3
|
| 724 |
+
# dim=128
|
| 725 |
+
self.use_positional_encoding_transformer=use_positional_encoding_transformer
|
| 726 |
+
self.to_patch_embedding = nn.Sequential(
|
| 727 |
+
nn.LayerNorm(patch_dim),
|
| 728 |
+
nn.Linear(patch_dim, dim),
|
| 729 |
+
nn.LayerNorm(dim),
|
| 730 |
+
)
|
| 731 |
+
self.transformer_frames=[]
|
| 732 |
+
self.transformer_points=[]
|
| 733 |
+
|
| 734 |
+
for i in range(self.layers):
|
| 735 |
+
self.transformer_frames.append(Transformer(dim, 1, 16, 64, 2048))
|
| 736 |
+
self.transformer_points.append(Transformer(dim, 1, 16, 64, 2048))
|
| 737 |
+
self.transformer_frames=nn.ModuleList(self.transformer_frames)
|
| 738 |
+
self.transformer_points=nn.ModuleList(self.transformer_points)
|
| 739 |
+
|
| 740 |
+
def forward(self, x):
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
x = torch.cat((x, torch.ones(x.shape[0], x.shape[1], 1, x.shape[3], device=x.device, dtype=x.dtype)), dim=2)
|
| 744 |
+
|
| 745 |
+
x=x.transpose(2,3)
|
| 746 |
+
|
| 747 |
+
b,n,f,c=x.shape
|
| 748 |
+
x=self.to_patch_embedding(x)
|
| 749 |
+
|
| 750 |
+
x=x.view(b*n,f,-1) # x.shape [390, 33, 256]
|
| 751 |
+
if self.use_positional_encoding_transformer:
|
| 752 |
+
pe = posemb_sincos_1d(x) #pe.shape= [33,256] (33 frame, 256 embedding dim)
|
| 753 |
+
x=pe.unsqueeze(0)+x
|
| 754 |
+
for i in range(self.layers):
|
| 755 |
+
#frames aggregation
|
| 756 |
+
x=self.transformer_frames[i](x)
|
| 757 |
+
|
| 758 |
+
#point sets aggregation
|
| 759 |
+
x=x.view(b,n,f,-1).transpose(1,2).reshape(b*f,n,-1)
|
| 760 |
+
|
| 761 |
+
x=self.transformer_points[i](x)
|
| 762 |
+
|
| 763 |
+
x=x.view(b,f,n,-1)
|
| 764 |
+
x=x.transpose(1,2).reshape(b*n,f,-1)
|
| 765 |
+
|
| 766 |
+
x=x.view(b,n,f,-1)
|
| 767 |
+
x=x.transpose(2,3)
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
return x
|
| 771 |
+
|
| 772 |
+
def positionalEncoding_vec(in_tensor, b):
|
| 773 |
+
proj = torch.einsum('ij, k -> ijk', in_tensor, b)
|
| 774 |
+
mapped_coords = torch.cat((torch.sin(proj), torch.cos(proj)), dim=1)
|
| 775 |
+
output = mapped_coords.transpose(2, 1).contiguous().view(mapped_coords.size(0), -1)
|
| 776 |
+
return output
|
| 777 |
+
|
| 778 |
+
class TrackFusion(nn.Module):
|
| 779 |
+
def __init__(self,width1=320,conv2_kernel_size=31,K=12,
|
| 780 |
+
conv_kernel_size=3,inputdim=2,use_positionl_encoding=True,
|
| 781 |
+
positional_dim=4,use_transformer=True,detach_cameras_dynamic=True,
|
| 782 |
+
use_positional_encoding_transformer=True,use_set_of_sets=False,predict_focal_length=False):
|
| 783 |
+
super(TrackFusion, self).__init__()
|
| 784 |
+
self.predict_focal_length=predict_focal_length
|
| 785 |
+
self.inputdim = inputdim
|
| 786 |
+
self.n1 = width1
|
| 787 |
+
|
| 788 |
+
self.K=K
|
| 789 |
+
self.n2 = 6+3+1+self.K+2
|
| 790 |
+
self.detach_cameras_dynamic=detach_cameras_dynamic
|
| 791 |
+
l=conv_kernel_size
|
| 792 |
+
# layers
|
| 793 |
+
self.use_set_of_sets=use_set_of_sets
|
| 794 |
+
self.use_positionl_encoding=use_positionl_encoding
|
| 795 |
+
self.positional_dim=positional_dim
|
| 796 |
+
actual_input_dim=inputdim
|
| 797 |
+
if self.use_positionl_encoding:
|
| 798 |
+
actual_input_dim=2 * inputdim * self.positional_dim+inputdim
|
| 799 |
+
|
| 800 |
+
self.use_transformer=use_transformer
|
| 801 |
+
|
| 802 |
+
if self.use_positionl_encoding:
|
| 803 |
+
self.b = torch.tensor([(2 ** j) * np.pi for j in range(self.positional_dim)],requires_grad = False)
|
| 804 |
+
|
| 805 |
+
if True:
|
| 806 |
+
if self.use_transformer:
|
| 807 |
+
self.transformer_my=MyTransformerHead(actual_input_dim,width1,use_positional_encoding_transformer)
|
| 808 |
+
|
| 809 |
+
self.conv_final = nn.Conv1d(self.n1, self.n2, kernel_size=conv2_kernel_size,stride=1, padding=conv2_kernel_size//2, padding_mode='circular')
|
| 810 |
+
|
| 811 |
+
self.fc1 = nn.Linear(self.n1,3*self.K+1)
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
|
| 815 |
+
torch.nn.init.xavier_uniform_(self.conv_final.weight)
|
| 816 |
+
|
| 817 |
+
torch.nn.init.xavier_uniform_(self.fc1.weight)
|
| 818 |
+
|
| 819 |
+
def forward(self, x, pts_miu=None, pts_radis=None, simple_return=True):
|
| 820 |
+
|
| 821 |
+
B, N, C, T = x.shape
|
| 822 |
+
if self.use_positionl_encoding:
|
| 823 |
+
x_original_shape=x.shape
|
| 824 |
+
x=x.transpose(2,3)
|
| 825 |
+
x=x.reshape(-1,x.shape[-1])
|
| 826 |
+
if self.b.device!=x.device:
|
| 827 |
+
self.b=self.b.to(x.device)
|
| 828 |
+
pos = positionalEncoding_vec(x,self.b)
|
| 829 |
+
x=torch.cat((x,pos),dim=1)
|
| 830 |
+
x=x.view(x_original_shape[0],x_original_shape[1],x_original_shape[3],x.shape[-1]).transpose(2,3)
|
| 831 |
+
|
| 832 |
+
b = len(x)
|
| 833 |
+
n= x.shape[1]
|
| 834 |
+
l= x.shape[-1]
|
| 835 |
+
if self.use_set_of_sets:
|
| 836 |
+
cameras,perpoint_features=self.set_of_sets_my(x)
|
| 837 |
+
else:
|
| 838 |
+
if self.use_transformer:
|
| 839 |
+
x=self.transformer_my(x)
|
| 840 |
+
else:
|
| 841 |
+
for i in range(len( self.conv1)):
|
| 842 |
+
if i==0:
|
| 843 |
+
x = x.reshape(n*b, x.shape[2],l)
|
| 844 |
+
else:
|
| 845 |
+
x = x.view(n * b, self.n1, l)
|
| 846 |
+
x1 = self.bn1[i](self.conv1[i](x)).view(b,n,self.n1,l)
|
| 847 |
+
x2 = self.bn1s[i](self.conv1s[i](x)).view(b,n,self.n1,l).mean(dim=1).view(b,1,self.n1,l).repeat(1,n,1,1)
|
| 848 |
+
x = F.relu(x1 + x2)
|
| 849 |
+
|
| 850 |
+
cameras=torch.mean(x,dim=1)
|
| 851 |
+
cameras=self.conv_final(cameras)
|
| 852 |
+
perpoint_features = torch.mean(x,dim=3)
|
| 853 |
+
perpoint_features = self.fc1(perpoint_features.view(n*b,self.n1))
|
| 854 |
+
|
| 855 |
+
B=perpoint_features[:,:self.K*3].view(b,n,3,self.K) # motion basis
|
| 856 |
+
NR=F.elu(perpoint_features[:,-1].view(b,n))+1+0.00001
|
| 857 |
+
|
| 858 |
+
position_params=cameras[:,:3,:]
|
| 859 |
+
if self.predict_focal_length:
|
| 860 |
+
focal_params=1+0.05*cameras[:,3:4,:].clone().transpose(1,2)
|
| 861 |
+
else:
|
| 862 |
+
focal_params=1.0
|
| 863 |
+
basis_params=cameras[:,4:4+self.K]
|
| 864 |
+
basis_params[:,0,:]=torch.clamp(basis_params[:,0,:].clone(),min=1.0,max=1.0)
|
| 865 |
+
basis_params.transpose(1,2).unsqueeze(1).unsqueeze(1)
|
| 866 |
+
rotation_params=cameras[:,4+self.K:4+self.K+6]
|
| 867 |
+
# Converting rotation parameters into a valid rotation matrix (probably better to move to 6d representation)
|
| 868 |
+
rotation_params=vec6d_to_R(rotation_params.transpose(1,2).reshape(b*l,6)).view(b,l,3,3)
|
| 869 |
+
|
| 870 |
+
# Transferring global 3D points into each camera's coordinates (using per-camera rotation and translation)
|
| 871 |
+
points3D_static=((basis_params.transpose(1,2).unsqueeze(1).unsqueeze(1))[:,:,:,:,:1]*B.unsqueeze(-2)[:,:,:,:,:1]).sum(-1)
|
| 872 |
+
|
| 873 |
+
if self.detach_cameras_dynamic==False:
|
| 874 |
+
points3D=((basis_params.transpose(1,2).unsqueeze(1).unsqueeze(1))[:,:,:,:,1:]*B.unsqueeze(-2)[:,:,:,:,1:]).sum(-1)+points3D_static
|
| 875 |
+
else:
|
| 876 |
+
points3D=((basis_params.transpose(1,2).unsqueeze(1).unsqueeze(1))[:,:,:,:,1:]*B.unsqueeze(-2)[:,:,:,:,1:]).sum(-1)+points3D_static.detach()
|
| 877 |
+
|
| 878 |
+
points3D=points3D.transpose(1,3)
|
| 879 |
+
points3D_static=points3D_static.transpose(1,3)
|
| 880 |
+
position_params=position_params.transpose(1,2)
|
| 881 |
+
if pts_miu is not None:
|
| 882 |
+
position_params=position_params*pts_radis.squeeze(-1)+pts_miu.squeeze(-2)
|
| 883 |
+
points3D_static = points3D_static*pts_radis.squeeze(-1)+pts_miu.permute(0,1,3,2)
|
| 884 |
+
points3D = points3D*pts_radis.squeeze(-1)+pts_miu.permute(0,1,3,2)
|
| 885 |
+
|
| 886 |
+
if self.detach_cameras_dynamic==False:
|
| 887 |
+
points3D_camera=(torch.bmm(rotation_params.view(b*l,3,3).transpose(1,2),points3D.reshape(b*l,3,n)-position_params.reshape(b*l,3).unsqueeze(-1)))
|
| 888 |
+
points3D_camera=points3D_camera.view(b,l,3,n)
|
| 889 |
+
else:
|
| 890 |
+
points3D_camera=(torch.bmm(rotation_params.view(b*l,3,3).transpose(1,2).detach(),points3D.reshape(b*l,3,n)-position_params.detach().reshape(b*l,3).unsqueeze(-1)))
|
| 891 |
+
points3D_camera=points3D_camera.view(b,l,3,n)
|
| 892 |
+
points3D_static_camera=(torch.bmm(rotation_params.view(b*l,3,3).transpose(1,2),points3D_static.reshape(b*l,3,n)-position_params.reshape(b*l,3).unsqueeze(-1)))
|
| 893 |
+
points3D_static_camera=points3D_static_camera.view(b,l,3,n)
|
| 894 |
+
|
| 895 |
+
# Projecting from 3D to 2D
|
| 896 |
+
projections=points3D_camera.clone()
|
| 897 |
+
projections_static=points3D_static_camera.clone()
|
| 898 |
+
|
| 899 |
+
depths=projections[:,:,2,:]
|
| 900 |
+
depths_static=projections_static[:,:,2,:]
|
| 901 |
+
|
| 902 |
+
projectionx=focal_params*projections[:,:,0,:]/torch.clamp(projections[:,:,2,:].clone(),min=0.01)
|
| 903 |
+
projectiony=focal_params*projections[:,:,1,:]/torch.clamp(projections[:,:,2,:].clone(),min=0.01)
|
| 904 |
+
|
| 905 |
+
projectionx_static=focal_params*projections_static[:,:,0,:]/torch.clamp(projections_static[:,:,2,:].clone(),min=0.01)
|
| 906 |
+
projectiony_static=focal_params*projections_static[:,:,1,:]/torch.clamp(projections_static[:,:,2,:].clone(),min=0.01)
|
| 907 |
+
|
| 908 |
+
projections2=torch.cat((projectionx.unsqueeze(2),projectiony.unsqueeze(2)),dim=2)
|
| 909 |
+
projections2_static=torch.cat((projectionx_static.unsqueeze(2),projectiony_static.unsqueeze(2)),dim=2)
|
| 910 |
+
|
| 911 |
+
if simple_return:
|
| 912 |
+
c2w_traj = torch.eye(4, device=x.device)[None,None].repeat(b,T,1,1)
|
| 913 |
+
c2w_traj[:,:,:3,:3] = rotation_params
|
| 914 |
+
c2w_traj[:,:,:3,3] = position_params
|
| 915 |
+
return c2w_traj, points3D, points3D_camera
|
| 916 |
+
else:
|
| 917 |
+
return focal_params,projections2,projections2_static,rotation_params,position_params,B,points3D,points3D_static,depths,depths_static,0,basis_params,0,0,points3D_camera,NR
|
| 918 |
+
|
| 919 |
+
|
| 920 |
+
def get_nth_visible_time_index(vis_gt: torch.Tensor, n: torch.Tensor) -> torch.Tensor:
|
| 921 |
+
"""
|
| 922 |
+
vis_gt: [B, T, N] 0/1 binary tensor
|
| 923 |
+
n: [B, N] int tensor, the n-th visible time index to get (1-based)
|
| 924 |
+
Returns: [B, N] tensor of time indices into T, or -1 if not enough visible steps
|
| 925 |
+
"""
|
| 926 |
+
B, T, N = vis_gt.shape
|
| 927 |
+
|
| 928 |
+
# Create a tensor [0, 1, ..., T-1] for time indices
|
| 929 |
+
time_idx = torch.arange(T, device=vis_gt.device).view(1, T, 1).expand(B, T, N) # [B, T, N]
|
| 930 |
+
|
| 931 |
+
# Mask invisible steps with a large number (T)
|
| 932 |
+
masked_time = torch.where(vis_gt.bool(), time_idx, torch.full_like(time_idx, T))
|
| 933 |
+
|
| 934 |
+
# Sort along time dimension
|
| 935 |
+
sorted_time, _ = masked_time.sort(dim=1) # [B, T, N]
|
| 936 |
+
|
| 937 |
+
# Prepare index tensor for gather: [B, N] -> [B, 1, N]
|
| 938 |
+
gather_idx = (n - 1).clamp(min=0, max=T-1).unsqueeze(1) # shape: [B, 1, N]
|
| 939 |
+
assert gather_idx.shape == sorted_time.shape[:1] + (1, sorted_time.shape[2]) # [B, 1, N]
|
| 940 |
+
|
| 941 |
+
# Gather from sorted_time: result is [B, 1, N]
|
| 942 |
+
nth_time = sorted_time.gather(dim=1, index=gather_idx).squeeze(1) # [B, N]
|
| 943 |
+
|
| 944 |
+
# If value is T (i.e., masked), then not enough visible → set to -1
|
| 945 |
+
nth_time = torch.where(nth_time == T, torch.full_like(nth_time, -1), nth_time)
|
| 946 |
+
|
| 947 |
+
return nth_time # [B, N]
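# Hedged usage sketch, added for illustration only (not part of the original commit), showing the
# 1-based indexing convention and the -1 fallback when a track has fewer visible frames than n.
def _example_get_nth_visible_time_index():
    vis_gt = torch.tensor([[[1, 0],
                            [0, 1],
                            [1, 0],
                            [0, 0]]])                               # (B=1, T=4, N=2)
    n = torch.tensor([[2, 2]])                                      # ask for the 2nd visible frame
    idx = get_nth_visible_time_index(vis_gt, n)
    assert idx.tolist() == [[2, -1]]                                # track 0: t=2; track 1: not enough
    return idx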
|
| 948 |
+
|
| 949 |
+
def knn_torch(x, k):
|
| 950 |
+
"""
|
| 951 |
+
x: (B, T, N, 2)
|
| 952 |
+
return: indices of k-NN, shape (B, T, N, k)
|
| 953 |
+
"""
|
| 954 |
+
B, T, N, C = x.shape
|
| 955 |
+
# Reshape to (B*T, N, 2)
|
| 956 |
+
x = x.view(B*T, N, C) # Merge the first two dimensions for easier processing
|
| 957 |
+
# Calculate pairwise distance: (B*T, N, N)
|
| 958 |
+
dist = torch.cdist(x, x, p=2) # Euclidean distance
|
| 959 |
+
|
| 960 |
+
# Exclude self: set diagonal to a large number (to prevent self from being a neighbor)
|
| 961 |
+
mask = torch.eye(N, device=x.device).bool()[None, :, :] # (1, N, N)
|
| 962 |
+
dist.masked_fill_(mask, float('inf'))
|
| 963 |
+
|
| 964 |
+
# Get indices of top k smallest distances
|
| 965 |
+
knn_idx = dist.topk(k, largest=False).indices # (B*T, N, k)
|
| 966 |
+
# Restore dimensions (B, T, N, k)
|
| 967 |
+
knn_idx = knn_idx.view(B, T, N, k)
|
| 968 |
+
return knn_idx
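# Hedged usage sketch, added for illustration only (not part of the original commit): with the
# diagonal distances masked to infinity, a point is never returned as its own neighbour.
def _example_knn_torch():
    B, T, N, k = 1, 1, 5, 2
    pts = torch.rand(B, T, N, 2)
    idx = knn_torch(pts, k)                                         # (B, T, N, k)
    assert idx.shape == (B, T, N, k)
    assert not (idx == torch.arange(N).view(1, 1, N, 1)).any()
    return idx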
|
| 969 |
+
|
| 970 |
+
def get_topo_mask(coords_xyz_append: torch.Tensor,
|
| 971 |
+
coords_2d_lift: torch.Tensor, replace_ratio: float = 0.6) -> torch.Tensor:
|
| 972 |
+
"""
|
| 973 |
+
coords_xyz_append: [B, T, N, 3] 3d coordinates
|
| 974 |
+
coords_2d_lift: [B*T, N] depth map
|
| 975 |
+
replace_ratio: float, the ratio of the depth change to be considered as a topological change
|
| 976 |
+
"""
|
| 977 |
+
B, T, N, _ = coords_xyz_append.shape
depth_unproj = coords_2d_lift  # unprojected per-point depth (the coords_2d_lift argument), used below
|
| 978 |
+
# if N > 1024:
|
| 979 |
+
# pick_idx = torch.randperm(N)[:1024]
|
| 980 |
+
# else:
|
| 981 |
+
pick_idx = torch.arange(N, device=coords_xyz_append.device)
|
| 982 |
+
coords_xyz_append = coords_xyz_append[:,:,pick_idx,:]
|
| 983 |
+
knn_idx = knn_torch(coords_xyz_append, 49)
|
| 984 |
+
knn_idx = pick_idx[knn_idx]
|
| 985 |
+
# raw topology
|
| 986 |
+
raw_depth = coords_xyz_append[...,2:] # B T N 1 knn_idx B T N K
|
| 987 |
+
knn_depth = torch.gather(
|
| 988 |
+
raw_depth.expand(-1, -1, -1, knn_idx.shape[-1]), # (B, T, N, K)
|
| 989 |
+
dim=2,
|
| 990 |
+
index=knn_idx # (B, T, N, K)
|
| 991 |
+
).squeeze(-1) # → (B, T, N, K)
|
| 992 |
+
depth_rel_neg_raw = (knn_depth - raw_depth)
|
| 993 |
+
# unproj depth
|
| 994 |
+
knn_depth_unproj = torch.gather(
|
| 995 |
+
depth_unproj.view(B,T,N,1).expand(-1, -1, -1, knn_idx.shape[-1]), # (B, T, N, K)
|
| 996 |
+
dim=2,
|
| 997 |
+
index=knn_idx # (B, T, N, K)
|
| 998 |
+
).squeeze(-1) # → (B, T, N, K)
|
| 999 |
+
depth_rel_neg_unproj = (knn_depth_unproj - depth_unproj.view(B,T,N,1))
|
| 1000 |
+
# topological change threshold
|
| 1001 |
+
mask_topo = (depth_rel_neg_raw.abs() / (depth_rel_neg_unproj.abs()+1e-8) - 1).abs() < 0.4
|
| 1002 |
+
mask_topo = mask_topo.sum(dim=-1) > 9
|
| 1003 |
+
|
| 1004 |
+
return mask_topo
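# Hedged usage sketch, added for illustration only (not part of the original commit). It assumes the
# coords_2d_lift argument carries the unprojected per-point depth (see the alias above); when that
# depth matches the raw track depth, essentially every point passes the topology check.
def _example_get_topo_mask():
    B, T, N = 1, 2, 64                                              # N must exceed the K=49 neighbours
    coords = torch.rand(B, T, N, 3)
    depth_lift = coords[..., 2].reshape(B * T, N).clone()
    mask_topo = get_topo_mask(coords, depth_lift)
    assert mask_topo.shape == (B, T, N)
    return mask_topo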
|
| 1005 |
+
|
| 1006 |
+
|
models/SpaTrackV2/models/utils.py
ADDED
|
@@ -0,0 +1,1221 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Modified from https://github.com/facebookresearch/PoseDiffusion
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
|
| 13 |
+
from typing import Optional, Tuple, Union, List
|
| 14 |
+
from einops import rearrange, repeat
|
| 15 |
+
|
| 16 |
+
import cv2
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
# from torchmetrics.functional.regression import pearson_corrcoef
|
| 20 |
+
from easydict import EasyDict as edict
|
| 21 |
+
from enum import Enum
|
| 22 |
+
import torch.distributed as dist  # all_reduce / ReduceOp used in AverageMeter live in torch.distributed
|
| 23 |
+
from typing import Literal, Union, List, Tuple, Dict
|
| 24 |
+
from models.monoD.depth_anything_v2.util.transform import Resize
|
| 25 |
+
from models.SpaTrackV2.utils.model_utils import sample_features5d
|
| 26 |
+
EPS = 1e-9
|
| 27 |
+
|
| 28 |
+
class Summary(Enum):
|
| 29 |
+
NONE = 0
|
| 30 |
+
AVERAGE = 1
|
| 31 |
+
SUM = 2
|
| 32 |
+
COUNT = 3
|
| 33 |
+
|
| 34 |
+
class AverageMeter(object):
|
| 35 |
+
"""Computes and stores the average and current value"""
|
| 36 |
+
|
| 37 |
+
def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
|
| 38 |
+
self.name = name
|
| 39 |
+
self.fmt = fmt
|
| 40 |
+
self.summary_type = summary_type
|
| 41 |
+
self.reset()
|
| 42 |
+
|
| 43 |
+
def reset(self):
|
| 44 |
+
self.val = 0
|
| 45 |
+
self.avg = 0
|
| 46 |
+
self.sum = 0
|
| 47 |
+
self.count = 0
|
| 48 |
+
|
| 49 |
+
def update(self, val, n=1):
|
| 50 |
+
self.val = val
|
| 51 |
+
self.sum += val * n
|
| 52 |
+
self.count += n
|
| 53 |
+
self.avg = self.sum / self.count
|
| 54 |
+
|
| 55 |
+
def all_reduce(self):
|
| 56 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 57 |
+
if isinstance(self.sum, np.ndarray):
|
| 58 |
+
total = torch.tensor(
|
| 59 |
+
self.sum.tolist()
|
| 60 |
+
+ [
|
| 61 |
+
self.count,
|
| 62 |
+
],
|
| 63 |
+
dtype=torch.float32,
|
| 64 |
+
device=device,
|
| 65 |
+
)
|
| 66 |
+
else:
|
| 67 |
+
total = torch.tensor(
|
| 68 |
+
[self.sum, self.count], dtype=torch.float32, device=device
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
|
| 72 |
+
if total.shape[0] > 2:
|
| 73 |
+
self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
|
| 74 |
+
else:
|
| 75 |
+
self.sum, self.count = total.tolist()
|
| 76 |
+
self.avg = self.sum / (self.count + 1e-5)
|
| 77 |
+
|
| 78 |
+
def __str__(self):
|
| 79 |
+
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
|
| 80 |
+
return fmtstr.format(**self.__dict__)
|
| 81 |
+
|
| 82 |
+
def summary(self):
|
| 83 |
+
fmtstr = ""
|
| 84 |
+
if self.summary_type is Summary.NONE:
|
| 85 |
+
fmtstr = ""
|
| 86 |
+
elif self.summary_type is Summary.AVERAGE:
|
| 87 |
+
fmtstr = "{name} {avg:.3f}"
|
| 88 |
+
elif self.summary_type is Summary.SUM:
|
| 89 |
+
fmtstr = "{name} {sum:.3f}"
|
| 90 |
+
elif self.summary_type is Summary.COUNT:
|
| 91 |
+
fmtstr = "{name} {count:.3f}"
|
| 92 |
+
else:
|
| 93 |
+
raise ValueError("invalid summary type %r" % self.summary_type)
|
| 94 |
+
|
| 95 |
+
return fmtstr.format(**self.__dict__)
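# Hedged usage sketch, added for illustration only (not part of the original commit): plain
# single-process bookkeeping; all_reduce() is only meaningful once torch.distributed is initialised.
def _example_average_meter():
    meter = AverageMeter("loss", fmt=":.4f")
    for loss in [0.9, 0.7, 0.5]:
        meter.update(loss, n=1)
    assert abs(meter.avg - 0.7) < 1e-6
    return str(meter)                                               # "loss 0.5000 (0.7000)"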
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def procrustes_analysis(X0,X1): # [N,3]
|
| 99 |
+
# translation
|
| 100 |
+
t0 = X0.mean(dim=0,keepdim=True)
|
| 101 |
+
t1 = X1.mean(dim=0,keepdim=True)
|
| 102 |
+
X0c = X0-t0
|
| 103 |
+
X1c = X1-t1
|
| 104 |
+
# scale
|
| 105 |
+
s0 = (X0c**2).sum(dim=-1).mean().sqrt()
|
| 106 |
+
s1 = (X1c**2).sum(dim=-1).mean().sqrt()
|
| 107 |
+
X0cs = X0c/s0
|
| 108 |
+
X1cs = X1c/s1
|
| 109 |
+
# rotation (use double for SVD, float loses precision)
|
| 110 |
+
U,S,V = (X0cs.t()@X1cs).double().svd(some=True)
|
| 111 |
+
R = (U@V.t()).float()
|
| 112 |
+
if R.det()<0: R[2] *= -1
|
| 113 |
+
# align X1 to X0: X1to0 = (X1-t1)/s1@R.t()*s0+t0
|
| 114 |
+
sim3 = edict(t0=t0[0],t1=t1[0],s0=s0,s1=s1,R=R)
|
| 115 |
+
return sim3
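# Hedged usage sketch, added for illustration only (not part of the original commit). It builds a
# known similarity transform of X0 and applies the alignment formula from the comment above.
def _example_procrustes_analysis():
    import math
    X0 = torch.randn(100, 3)
    c, s = math.cos(0.3), math.sin(0.3)
    R_true = torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
    X1 = 2.0 * X0 @ R_true.t() + torch.tensor([1.0, 2.0, 3.0])      # scale 2, rotate, translate
    sim3 = procrustes_analysis(X0, X1)
    X1to0 = (X1 - sim3.t1) / sim3.s1 @ sim3.R.t() * sim3.s0 + sim3.t0
    assert torch.allclose(X1to0, X0, atol=1e-4)
    return sim3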
|
| 116 |
+
|
| 117 |
+
def create_intri_matrix(focal_length, principal_point):
|
| 118 |
+
"""
|
| 119 |
+
Creates an intrinsic matrix from focal length and principal point.
|
| 120 |
+
|
| 121 |
+
Args:
|
| 122 |
+
focal_length (torch.Tensor): A Bx2 or BxSx2 tensor containing the focal lengths (fx, fy) for each image.
|
| 123 |
+
principal_point (torch.Tensor): A Bx2 or BxSx2 tensor containing the principal point coordinates (cx, cy) for each image.
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
torch.Tensor: A Bx3x3 or BxSx3x3 tensor containing the camera matrix for each image.
|
| 127 |
+
"""
|
| 128 |
+
|
| 129 |
+
if len(focal_length.shape) == 2:
|
| 130 |
+
B = focal_length.shape[0]
|
| 131 |
+
intri_matrix = torch.zeros(B, 3, 3, dtype=focal_length.dtype, device=focal_length.device)
|
| 132 |
+
intri_matrix[:, 0, 0] = focal_length[:, 0]
|
| 133 |
+
intri_matrix[:, 1, 1] = focal_length[:, 1]
|
| 134 |
+
intri_matrix[:, 2, 2] = 1.0
|
| 135 |
+
intri_matrix[:, 0, 2] = principal_point[:, 0]
|
| 136 |
+
intri_matrix[:, 1, 2] = principal_point[:, 1]
|
| 137 |
+
else:
|
| 138 |
+
B, S = focal_length.shape[0], focal_length.shape[1]
|
| 139 |
+
intri_matrix = torch.zeros(B, S, 3, 3, dtype=focal_length.dtype, device=focal_length.device)
|
| 140 |
+
intri_matrix[:, :, 0, 0] = focal_length[:, :, 0]
|
| 141 |
+
intri_matrix[:, :, 1, 1] = focal_length[:, :, 1]
|
| 142 |
+
intri_matrix[:, :, 2, 2] = 1.0
|
| 143 |
+
intri_matrix[:, :, 0, 2] = principal_point[:, :, 0]
|
| 144 |
+
intri_matrix[:, :, 1, 2] = principal_point[:, :, 1]
|
| 145 |
+
|
| 146 |
+
return intri_matrix
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def closed_form_inverse_OpenCV(se3, R=None, T=None):
|
| 150 |
+
"""
|
| 151 |
+
Computes the inverse of each 4x4 SE3 matrix in the batch.
|
| 152 |
+
|
| 153 |
+
Args:
|
| 154 |
+
- se3 (Tensor): Nx4x4 tensor of SE3 matrices.
|
| 155 |
+
|
| 156 |
+
Returns:
|
| 157 |
+
- Tensor: Nx4x4 tensor of inverted SE3 matrices.
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
| R t |
|
| 161 |
+
| 0 1 |
|
| 162 |
+
-->
|
| 163 |
+
| R^T -R^T t|
|
| 164 |
+
| 0 1 |
|
| 165 |
+
"""
|
| 166 |
+
if R is None:
|
| 167 |
+
R = se3[:, :3, :3]
|
| 168 |
+
|
| 169 |
+
if T is None:
|
| 170 |
+
T = se3[:, :3, 3:]
|
| 171 |
+
|
| 172 |
+
# Compute the transpose of the rotation
|
| 173 |
+
R_transposed = R.transpose(1, 2)
|
| 174 |
+
|
| 175 |
+
# -R^T t
|
| 176 |
+
top_right = -R_transposed.bmm(T)
|
| 177 |
+
|
| 178 |
+
inverted_matrix = torch.eye(4, 4)[None].repeat(len(se3), 1, 1)
|
| 179 |
+
inverted_matrix = inverted_matrix.to(R.dtype).to(R.device)
|
| 180 |
+
|
| 181 |
+
inverted_matrix[:, :3, :3] = R_transposed
|
| 182 |
+
inverted_matrix[:, :3, 3:] = top_right
|
| 183 |
+
|
| 184 |
+
return inverted_matrix
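# Hedged sanity check, added for illustration only (not part of the original commit): composing each
# SE3 with its closed-form inverse should give the identity.
def _example_closed_form_inverse():
    se3 = torch.eye(4).repeat(2, 1, 1)
    se3[:, :3, 3] = torch.tensor([[1.0, 2.0, 3.0], [-1.0, 0.0, 2.0]])
    inv = closed_form_inverse_OpenCV(se3)
    assert torch.allclose(torch.bmm(se3, inv), torch.eye(4).expand(2, 4, 4), atol=1e-6)
    return inv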
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def get_EFP(pred_cameras, image_size, B, S, default_focal=False):
|
| 188 |
+
"""
|
| 189 |
+
Converting PyTorch3D cameras to extrinsics, intrinsics matrix
|
| 190 |
+
|
| 191 |
+
Return extrinsics, intrinsics, focal_length, principal_point
|
| 192 |
+
"""
|
| 193 |
+
scale = image_size.min()
|
| 194 |
+
|
| 195 |
+
focal_length = pred_cameras.focal_length
|
| 196 |
+
|
| 197 |
+
principal_point = torch.zeros_like(focal_length)
|
| 198 |
+
|
| 199 |
+
focal_length = focal_length * scale / 2
|
| 200 |
+
principal_point = (image_size[None] - principal_point * scale) / 2
|
| 201 |
+
|
| 202 |
+
Rots = pred_cameras.R.clone()
|
| 203 |
+
Trans = pred_cameras.T.clone()
|
| 204 |
+
|
| 205 |
+
extrinsics = torch.cat([Rots, Trans[..., None]], dim=-1)
|
| 206 |
+
|
| 207 |
+
# reshape
|
| 208 |
+
extrinsics = extrinsics.reshape(B, S, 3, 4)
|
| 209 |
+
focal_length = focal_length.reshape(B, S, 2)
|
| 210 |
+
principal_point = principal_point.reshape(B, S, 2)
|
| 211 |
+
|
| 212 |
+
# only one dof focal length
|
| 213 |
+
if default_focal:
|
| 214 |
+
focal_length[:] = scale
|
| 215 |
+
else:
|
| 216 |
+
focal_length = focal_length.mean(dim=-1, keepdim=True).expand(-1, -1, 2)
|
| 217 |
+
focal_length = focal_length.clamp(0.2 * scale, 5 * scale)
|
| 218 |
+
|
| 219 |
+
intrinsics = create_intri_matrix(focal_length, principal_point)
|
| 220 |
+
return extrinsics, intrinsics
|
| 221 |
+
|
| 222 |
+
def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
|
| 223 |
+
"""
|
| 224 |
+
Convert rotations given as quaternions to rotation matrices.
|
| 225 |
+
|
| 226 |
+
Args:
|
| 227 |
+
quaternions: quaternions with real part first,
|
| 228 |
+
as tensor of shape (..., 4).
|
| 229 |
+
|
| 230 |
+
Returns:
|
| 231 |
+
Rotation matrices as tensor of shape (..., 3, 3).
|
| 232 |
+
"""
|
| 233 |
+
r, i, j, k = torch.unbind(quaternions, -1)
|
| 234 |
+
# pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
|
| 235 |
+
two_s = 2.0 / (quaternions * quaternions).sum(-1)
|
| 236 |
+
|
| 237 |
+
o = torch.stack(
|
| 238 |
+
(
|
| 239 |
+
1 - two_s * (j * j + k * k),
|
| 240 |
+
two_s * (i * j - k * r),
|
| 241 |
+
two_s * (i * k + j * r),
|
| 242 |
+
two_s * (i * j + k * r),
|
| 243 |
+
1 - two_s * (i * i + k * k),
|
| 244 |
+
two_s * (j * k - i * r),
|
| 245 |
+
two_s * (i * k - j * r),
|
| 246 |
+
two_s * (j * k + i * r),
|
| 247 |
+
1 - two_s * (i * i + j * j),
|
| 248 |
+
),
|
| 249 |
+
-1,
|
| 250 |
+
)
|
| 251 |
+
return o.reshape(quaternions.shape[:-1] + (3, 3))
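# Hedged sanity check, added for illustration only (not part of the original commit): the identity
# quaternion maps to the identity matrix, and (0, 0, 0, 1) is a 180-degree rotation about z.
def _example_quaternion_to_matrix():
    q = torch.tensor([[1.0, 0.0, 0.0, 0.0],
                      [0.0, 0.0, 0.0, 1.0]])                        # real part first
    R = quaternion_to_matrix(q)
    assert torch.allclose(R[0], torch.eye(3), atol=1e-6)
    assert torch.allclose(R[1], torch.diag(torch.tensor([-1.0, -1.0, 1.0])), atol=1e-6)
    return R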
|
| 252 |
+
|
| 253 |
+
def pose_encoding_to_camera(
|
| 254 |
+
pose_encoding,
|
| 255 |
+
pose_encoding_type="absT_quaR_logFL",
|
| 256 |
+
log_focal_length_bias=1.8,
|
| 257 |
+
min_focal_length=0.1,
|
| 258 |
+
max_focal_length=30,
|
| 259 |
+
return_dict=False,
|
| 260 |
+
to_OpenCV=True,
|
| 261 |
+
):
|
| 262 |
+
"""
|
| 263 |
+
Args:
|
| 264 |
+
pose_encoding: A tensor of shape `BxNxC`, containing a batch of
|
| 265 |
+
`BxN` `C`-dimensional pose encodings.
|
| 266 |
+
pose_encoding_type: The type of pose encoding,
|
| 267 |
+
"""
|
| 268 |
+
pose_encoding_reshaped = pose_encoding.reshape(-1, pose_encoding.shape[-1]) # Reshape to BNxC
|
| 269 |
+
|
| 270 |
+
if pose_encoding_type == "absT_quaR_logFL":
|
| 271 |
+
# 3 for absT, 4 for quaR, 2 for absFL
|
| 272 |
+
abs_T = pose_encoding_reshaped[:, :3]
|
| 273 |
+
quaternion_R = pose_encoding_reshaped[:, 3:7]
|
| 274 |
+
R = quaternion_to_matrix(quaternion_R)
|
| 275 |
+
log_focal_length = pose_encoding_reshaped[:, 7:9]
|
| 276 |
+
# log_focal_length_bias was the hyperparameter
|
| 277 |
+
# to ensure the mean of logFL close to 0 during training
|
| 278 |
+
# Now converted back
|
| 279 |
+
focal_length = (log_focal_length + log_focal_length_bias).exp()
|
| 280 |
+
# clamp to avoid weird fl values
|
| 281 |
+
focal_length = torch.clamp(focal_length,
|
| 282 |
+
min=min_focal_length, max=max_focal_length)
|
| 283 |
+
elif pose_encoding_type == "absT_quaR_OneFL":
|
| 284 |
+
# 3 for absT, 4 for quaR, 1 for absFL
|
| 285 |
+
# [absolute translation, quaternion rotation, normalized focal length]
|
| 286 |
+
abs_T = pose_encoding_reshaped[:, :3]
|
| 287 |
+
quaternion_R = pose_encoding_reshaped[:, 3:7]
|
| 288 |
+
R = quaternion_to_matrix(quaternion_R)
|
| 289 |
+
focal_length = pose_encoding_reshaped[:, 7:8]
|
| 290 |
+
focal_length = torch.clamp(focal_length,
|
| 291 |
+
min=min_focal_length, max=max_focal_length)
|
| 292 |
+
else:
|
| 293 |
+
raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
|
| 294 |
+
|
| 295 |
+
if to_OpenCV:
|
| 296 |
+
### From Pytorch3D coordinate to OpenCV coordinate:
|
| 297 |
+
# I hate coordinate conversion
|
| 298 |
+
R = R.clone()
|
| 299 |
+
abs_T = abs_T.clone()
|
| 300 |
+
R[:, :, :2] *= -1
|
| 301 |
+
abs_T[:, :2] *= -1
|
| 302 |
+
R = R.permute(0, 2, 1)
|
| 303 |
+
|
| 304 |
+
extrinsics_4x4 = torch.eye(4, 4).to(R.dtype).to(R.device)
|
| 305 |
+
extrinsics_4x4 = extrinsics_4x4[None].repeat(len(R), 1, 1)
|
| 306 |
+
|
| 307 |
+
extrinsics_4x4[:, :3, :3] = R.clone()
|
| 308 |
+
extrinsics_4x4[:, :3, 3] = abs_T.clone()
|
| 309 |
+
|
| 310 |
+
rel_transform = closed_form_inverse_OpenCV(extrinsics_4x4[0:1])
|
| 311 |
+
rel_transform = rel_transform.expand(len(extrinsics_4x4), -1, -1)
|
| 312 |
+
|
| 313 |
+
# relative to the first camera
|
| 314 |
+
# NOTE it is extrinsics_4x4 x rel_transform instead of rel_transform x extrinsics_4x4
|
| 315 |
+
extrinsics_4x4 = torch.bmm(extrinsics_4x4, rel_transform)
|
| 316 |
+
|
| 317 |
+
R = extrinsics_4x4[:, :3, :3].clone()
|
| 318 |
+
abs_T = extrinsics_4x4[:, :3, 3].clone()
|
| 319 |
+
|
| 320 |
+
if return_dict:
|
| 321 |
+
return {"focal_length": focal_length, "R": R, "T": abs_T}
|
| 322 |
+
|
| 323 |
+
pred_cameras = PerspectiveCameras(focal_length=focal_length,
|
| 324 |
+
R=R, T=abs_T, device=R.device, in_ndc=False)
|
| 325 |
+
return pred_cameras
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def camera_to_pose_encoding(
|
| 329 |
+
camera, pose_encoding_type="absT_quaR_logFL",
|
| 330 |
+
log_focal_length_bias=1.8, min_focal_length=0.1, max_focal_length=30
|
| 331 |
+
):
|
| 332 |
+
"""
|
| 333 |
+
Inverse to pose_encoding_to_camera
|
| 334 |
+
"""
|
| 335 |
+
if pose_encoding_type == "absT_quaR_logFL":
|
| 336 |
+
# Convert rotation matrix to quaternion
|
| 337 |
+
quaternion_R = matrix_to_quaternion(camera.R)
|
| 338 |
+
|
| 339 |
+
# Calculate log_focal_length
|
| 340 |
+
log_focal_length = (
|
| 341 |
+
torch.log(torch.clamp(camera.focal_length,
|
| 342 |
+
min=min_focal_length, max=max_focal_length))
|
| 343 |
+
- log_focal_length_bias
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
# Concatenate to form pose_encoding
|
| 347 |
+
pose_encoding = torch.cat([camera.T, quaternion_R, log_focal_length], dim=-1)
|
| 348 |
+
|
| 349 |
+
elif pose_encoding_type == "absT_quaR_OneFL":
|
| 350 |
+
# [absolute translation, quaternion rotation, normalized focal length]
|
| 351 |
+
quaternion_R = matrix_to_quaternion(camera.R)
|
| 352 |
+
focal_length = (torch.clamp(camera.focal_length,
|
| 353 |
+
min=min_focal_length,
|
| 354 |
+
max=max_focal_length))[..., 0:1]
|
| 355 |
+
pose_encoding = torch.cat([camera.T, quaternion_R, focal_length], dim=-1)
|
| 356 |
+
else:
|
| 357 |
+
raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
|
| 358 |
+
|
| 359 |
+
return pose_encoding
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def init_pose_enc(B: int,
|
| 363 |
+
S: int, pose_encoding_type: str="absT_quaR_logFL",
|
| 364 |
+
device: Optional[torch.device]=None):
|
| 365 |
+
"""
|
| 366 |
+
Initialize the pose encoding tensor
|
| 367 |
+
args:
|
| 368 |
+
B: batch size
|
| 369 |
+
S: number of frames
|
| 370 |
+
pose_encoding_type: the type of pose encoding
|
| 371 |
+
device: device to put the tensor
|
| 372 |
+
return:
|
| 373 |
+
pose_enc: [B S C]
|
| 374 |
+
"""
|
| 375 |
+
if pose_encoding_type == "absT_quaR_logFL":
|
| 376 |
+
C = 9
|
| 377 |
+
elif pose_encoding_type == "absT_quaR_OneFL":
|
| 378 |
+
C = 8
|
| 379 |
+
else:
|
| 380 |
+
raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
|
| 381 |
+
|
| 382 |
+
pose_enc = torch.zeros(B, S, C, device=device)
|
| 383 |
+
pose_enc[..., :3] = 0 # absT
|
| 384 |
+
pose_enc[..., 3] = 1 # quaR
|
| 385 |
+
pose_enc[..., 7:] = 1 # logFL
|
| 386 |
+
return pose_enc
|
| 387 |
+
|
| 388 |
+
def first_pose_enc_norm(pose_enc: torch.Tensor,
|
| 389 |
+
pose_encoding_type: str="absT_quaR_OneFL",
|
| 390 |
+
pose_mode: str = "W2C"):
|
| 391 |
+
"""
|
| 392 |
+
make sure the poses in one window are normalized by the first frame, where the
|
| 393 |
+
first frame transformation is the Identity Matrix.
|
| 394 |
+
NOTE: Poses are all W2C
|
| 395 |
+
args:
|
| 396 |
+
pose_enc: [B S C]
|
| 397 |
+
return:
|
| 398 |
+
pose_enc_norm: [B S C]
|
| 399 |
+
"""
|
| 400 |
+
B, S, C = pose_enc.shape
|
| 401 |
+
# Pose encoding to Cameras (Pytorch3D coordinate)
|
| 402 |
+
pred_cameras = pose_encoding_to_camera(
|
| 403 |
+
pose_enc, pose_encoding_type=pose_encoding_type,
|
| 404 |
+
to_OpenCV=False
|
| 405 |
+
) #NOTE: the camera parameters are not in NDC
|
| 406 |
+
|
| 407 |
+
R = pred_cameras.R # [B*S, 3, 3]
|
| 408 |
+
T = pred_cameras.T # [B*S, 3]
|
| 409 |
+
|
| 410 |
+
Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B*S, 3, 4]
|
| 411 |
+
extra_ = torch.tensor([[[0, 0, 0, 1]]],
|
| 412 |
+
device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)
|
| 413 |
+
Tran_M = torch.cat([Tran_M, extra_
|
| 414 |
+
], dim=1)
|
| 415 |
+
Tran_M = rearrange(Tran_M, '(b s) c d -> b s c d', b=B)
|
| 416 |
+
|
| 417 |
+
# Take the first frame as the base of world coordinate
|
| 418 |
+
if pose_mode == "C2W":
|
| 419 |
+
Tran_M_new = (Tran_M[:,:1,...].inverse())@Tran_M
|
| 420 |
+
elif pose_mode == "W2C":
|
| 421 |
+
Tran_M_new = Tran_M@(Tran_M[:,:1,...].inverse())
|
| 422 |
+
|
| 423 |
+
Tran_M_new = rearrange(Tran_M_new, 'b s c d -> (b s) c d')
|
| 424 |
+
|
| 425 |
+
R_ = Tran_M_new[:, :3, :3]
|
| 426 |
+
T_ = Tran_M_new[:, :3, 3]
|
| 427 |
+
|
| 428 |
+
# Cameras to Pose encoding
|
| 429 |
+
pred_cameras.R = R_
|
| 430 |
+
pred_cameras.T = T_
|
| 431 |
+
pose_enc_norm = camera_to_pose_encoding(pred_cameras,
|
| 432 |
+
pose_encoding_type=pose_encoding_type)
|
| 433 |
+
pose_enc_norm = rearrange(pose_enc_norm, '(b s) c -> b s c', b=B)
|
| 434 |
+
return pose_enc_norm
|
| 435 |
+
|
| 436 |
+
def first_pose_enc_denorm(
|
| 437 |
+
pose_enc: torch.Tensor,
|
| 438 |
+
pose_enc_1st: torch.Tensor,
|
| 439 |
+
pose_encoding_type: str="absT_quaR_OneFL",
|
| 440 |
+
pose_mode: str = "W2C"):
|
| 441 |
+
"""
|
| 442 |
+
make sure the poses in one window are de-normalized by the first frame, where the
|
| 443 |
+
first frame transformation is the Identity Matrix.
|
| 444 |
+
args:
|
| 445 |
+
pose_enc: [B S C]
|
| 446 |
+
pose_enc_1st: [B 1 C]
|
| 447 |
+
return:
|
| 448 |
+
pose_enc_denorm: [B S C]
|
| 449 |
+
"""
|
| 450 |
+
B, S, C = pose_enc.shape
|
| 451 |
+
pose_enc_all = torch.cat([pose_enc_1st, pose_enc], dim=1)
|
| 452 |
+
|
| 453 |
+
# Pose encoding to Cameras (Pytorch3D coordinate)
|
| 454 |
+
pred_cameras = pose_encoding_to_camera(
|
| 455 |
+
pose_enc_all, pose_encoding_type=pose_encoding_type,
|
| 456 |
+
to_OpenCV=False
|
| 457 |
+
) #NOTE: the camera parameters are not in NDC
|
| 458 |
+
R = pred_cameras.R # [B*(1+S), 3, 3]
|
| 459 |
+
T = pred_cameras.T # [B*(1+S), 3]
|
| 460 |
+
|
| 461 |
+
Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B*(1+S), 3, 4]
|
| 462 |
+
extra_ = torch.tensor([[[0, 0, 0, 1]]],
|
| 463 |
+
device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)
|
| 464 |
+
Tran_M = torch.cat([Tran_M, extra_
|
| 465 |
+
], dim=1)
|
| 466 |
+
Tran_M_new = rearrange(Tran_M, '(b s) c d -> b s c d', b=B)[:, 1:]
|
| 467 |
+
Tran_M_1st = rearrange(Tran_M, '(b s) c d -> b s c d', b=B)[:,:1]
|
| 468 |
+
|
| 469 |
+
if pose_mode == "C2W":
|
| 470 |
+
Tran_M_new = Tran_M_1st@Tran_M_new
|
| 471 |
+
elif pose_mode == "W2C":
|
| 472 |
+
Tran_M_new = Tran_M_new@Tran_M_1st
|
| 473 |
+
|
| 474 |
+
Tran_M_new_ = torch.cat([Tran_M_1st, Tran_M_new], dim=1)
|
| 475 |
+
R_ = Tran_M_new_[..., :3, :3].view(-1, 3, 3)
|
| 476 |
+
T_ = Tran_M_new_[..., :3, 3].view(-1, 3)
|
| 477 |
+
|
| 478 |
+
# Cameras to Pose encoding
|
| 479 |
+
pred_cameras.R = R_
|
| 480 |
+
pred_cameras.T = T_
|
| 481 |
+
|
| 482 |
+
# Cameras to Pose encoding
|
| 483 |
+
pose_enc_denorm = camera_to_pose_encoding(pred_cameras,
|
| 484 |
+
pose_encoding_type=pose_encoding_type)
|
| 485 |
+
pose_enc_denorm = rearrange(pose_enc_denorm, '(b s) c -> b s c', b=B)
|
| 486 |
+
return pose_enc_denorm[:, 1:]
|
| 487 |
+
|
| 488 |
+
def compute_scale_and_shift(prediction, target, mask):
|
| 489 |
+
# system matrix: A = [[a_00, a_01], [a_10, a_11]]
|
| 490 |
+
a_00 = torch.sum(mask * prediction * prediction, (1, 2))
|
| 491 |
+
a_01 = torch.sum(mask * prediction, (1, 2))
|
| 492 |
+
a_11 = torch.sum(mask, (1, 2))
|
| 493 |
+
|
| 494 |
+
# right hand side: b = [b_0, b_1]
|
| 495 |
+
b_0 = torch.sum(mask * prediction * target, (1, 2))
|
| 496 |
+
b_1 = torch.sum(mask * target, (1, 2))
|
| 497 |
+
|
| 498 |
+
# solution: x = A^-1 . b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b
|
| 499 |
+
x_0 = torch.zeros_like(b_0)
|
| 500 |
+
x_1 = torch.zeros_like(b_1)
|
| 501 |
+
|
| 502 |
+
det = a_00 * a_11 - a_01 * a_01
|
| 503 |
+
# A needs to be a positive definite matrix.
|
| 504 |
+
valid = det > 0
|
| 505 |
+
|
| 506 |
+
x_0[valid] = (a_11[valid] * b_0[valid] - a_01[valid] * b_1[valid]) / det[valid]
|
| 507 |
+
x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / det[valid]
|
| 508 |
+
|
| 509 |
+
return x_0, x_1
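# Hedged usage sketch, added for illustration only (not part of the original commit): when the target
# is an exact affine map of the prediction, the closed-form solution recovers that scale and shift.
def _example_compute_scale_and_shift():
    pred = torch.rand(2, 8, 8)
    target = 3.0 * pred + 0.5
    mask = torch.ones_like(pred)
    scale, shift = compute_scale_and_shift(pred, target, mask)
    assert torch.allclose(scale, torch.full((2,), 3.0), atol=1e-4)
    assert torch.allclose(shift, torch.full((2,), 0.5), atol=1e-4)
    return scale, shift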
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
def normalize_prediction_robust(target, mask, Bs):
|
| 513 |
+
ssum = torch.sum(mask, (1, 2))
|
| 514 |
+
valid = ssum > 0
|
| 515 |
+
|
| 516 |
+
m = torch.zeros_like(ssum).to(target.dtype)
|
| 517 |
+
s = torch.ones_like(ssum).to(target.dtype)
|
| 518 |
+
m[valid] = torch.median(
|
| 519 |
+
(mask[valid] * target[valid]).view(valid.sum(), -1), dim=1
|
| 520 |
+
).values
|
| 521 |
+
target = rearrange(target, '(b c) h w -> b c h w', b=Bs)
|
| 522 |
+
m_vid = rearrange(m, '(b c) -> b c 1 1', b=Bs) #.mean(dim=1, keepdim=True)
|
| 523 |
+
mask = rearrange(mask, '(b c) h w -> b c h w', b=Bs)
|
| 524 |
+
|
| 525 |
+
target = target - m_vid
|
| 526 |
+
|
| 527 |
+
sq = torch.sum(mask * target.abs(), (2, 3))
|
| 528 |
+
sq = rearrange(sq, 'b c -> (b c)')
|
| 529 |
+
s[valid] = torch.clamp((sq[valid] / ssum[valid]), min=1e-6)
|
| 530 |
+
s_vid = rearrange(s, '(b c) -> b c 1 1', b=Bs) #.mean(dim=1, keepdim=True)
|
| 531 |
+
target = target / s_vid
|
| 532 |
+
target = rearrange(target, 'b c h w -> (b c) h w', b=Bs)
|
| 533 |
+
|
| 534 |
+
return target, m_vid, s_vid
|
| 535 |
+
|
| 536 |
+
def normalize_video_robust(target, mask, Bs):
|
| 537 |
+
|
| 538 |
+
vid_valid = target[mask]
|
| 539 |
+
# randomly subsample the valid values to 1/5 before computing quantiles
|
| 540 |
+
with torch.no_grad():
|
| 541 |
+
vid_valid = vid_valid[torch.randperm(vid_valid.shape[0], device=vid_valid.device)[:vid_valid.shape[0]//5]]
|
| 542 |
+
t_2, t_98 = torch.quantile(vid_valid, 0.02), torch.quantile(vid_valid, 0.98)
|
| 543 |
+
# normalize
|
| 544 |
+
target = (target - t_2) / (t_98 - t_2)*2 - 1
|
| 545 |
+
return target, t_2, t_98
|
| 546 |
+
|
| 547 |
+
def video_loss(prediction, target, mask, Bs):
|
| 548 |
+
# median norm
|
| 549 |
+
prediction_nm, a_norm, b_norm = normalize_video_robust(prediction, mask, Bs)
|
| 550 |
+
target_nm, a_norm_gt, b_norm_gt = normalize_video_robust(target.float(), mask, Bs)
|
| 551 |
+
depth_loss = nn.functional.l1_loss(prediction_nm[mask], target_nm[mask])
|
| 552 |
+
# rel depth 2 metric --> (pred - a')/(b'-a')*(b-a) + a
|
| 553 |
+
scale = (b_norm_gt - a_norm_gt) / (b_norm - a_norm)
|
| 554 |
+
shift = a_norm_gt - a_norm*scale
|
| 555 |
+
return depth_loss, scale, shift, prediction_nm, target_nm
|
| 556 |
+
|
| 557 |
+
def median_loss(prediction, target, mask, Bs):
|
| 558 |
+
# median norm
|
| 559 |
+
prediction_nm, a_norm, b_norm = normalize_prediction_robust(prediction, mask, Bs)
|
| 560 |
+
target_nm, a_norm_gt, b_norm_gt = normalize_prediction_robust(target.float(), mask, Bs)
|
| 561 |
+
depth_loss = nn.functional.l1_loss(prediction_nm[mask], target_nm[mask])
|
| 562 |
+
scale = b_norm_gt/b_norm
|
| 563 |
+
shift = a_norm_gt - a_norm*scale
|
| 564 |
+
return depth_loss, scale, shift, prediction_nm, target_nm
|
| 565 |
+
|
| 566 |
+
def reduction_batch_based(image_loss, M):
|
| 567 |
+
# average of all valid pixels of the batch
|
| 568 |
+
|
| 569 |
+
# avoid division by 0 (if sum(M) = sum(sum(mask)) = 0: sum(image_loss) = 0)
|
| 570 |
+
divisor = torch.sum(M)
|
| 571 |
+
|
| 572 |
+
if divisor == 0:
|
| 573 |
+
return 0
|
| 574 |
+
else:
|
| 575 |
+
return torch.sum(image_loss) / divisor
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
def reduction_image_based(image_loss, M):
|
| 579 |
+
# mean of average of valid pixels of an image
|
| 580 |
+
|
| 581 |
+
# avoid division by 0 (if M = sum(mask) = 0: image_loss = 0)
|
| 582 |
+
valid = M.nonzero()
|
| 583 |
+
|
| 584 |
+
image_loss[valid] = image_loss[valid] / M[valid]
|
| 585 |
+
|
| 586 |
+
return torch.mean(image_loss)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
class ScaleAndShiftInvariantLoss(nn.Module):
|
| 590 |
+
def __init__(self):
|
| 591 |
+
super().__init__()
|
| 592 |
+
self.name = "SSILoss"
|
| 593 |
+
|
| 594 |
+
def forward(self, prediction, target, mask, Bs,
|
| 595 |
+
interpolate=True, return_interpolated=False):
|
| 596 |
+
|
| 597 |
+
if prediction.shape[-1] != target.shape[-1] and interpolate:
|
| 598 |
+
prediction = nn.functional.interpolate(prediction, target.shape[-2:], mode='bilinear', align_corners=True)
|
| 599 |
+
intr_input = prediction
|
| 600 |
+
else:
|
| 601 |
+
intr_input = prediction
|
| 602 |
+
|
| 603 |
+
prediction, target, mask = prediction.squeeze(), target.squeeze(), mask.squeeze()
|
| 604 |
+
assert prediction.shape == target.shape, f"Shape mismatch: Expected same shape but got {prediction.shape} and {target.shape}."
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
scale, shift = compute_scale_and_shift(prediction, target, mask)
|
| 608 |
+
a_norm = scale.view(Bs, -1, 1, 1).mean(dim=1, keepdim=True)
|
| 609 |
+
b_norm = shift.view(Bs, -1, 1, 1).mean(dim=1, keepdim=True)
|
| 610 |
+
prediction = rearrange(prediction, '(b c) h w -> b c h w', b=Bs)
|
| 611 |
+
target = rearrange(target, '(b c) h w -> b c h w', b=Bs)
|
| 612 |
+
mask = rearrange(mask, '(b c) h w -> b c h w', b=Bs)
|
| 613 |
+
scaled_prediction = a_norm * prediction + b_norm
|
| 614 |
+
loss = nn.functional.l1_loss(scaled_prediction[mask], target[mask])
|
| 615 |
+
# both branches return the same values, so return_interpolated currently has no effect
return loss, a_norm, b_norm
|
| 618 |
+
|
| 619 |
+
ScaleAndShiftInvariantLoss_fn = ScaleAndShiftInvariantLoss()
|
| 620 |
+
|
| 621 |
+
class GradientLoss(nn.Module):
|
| 622 |
+
def __init__(self, scales=4, reduction='batch-based'):
|
| 623 |
+
super().__init__()
|
| 624 |
+
|
| 625 |
+
if reduction == 'batch-based':
|
| 626 |
+
self.__reduction = reduction_batch_based
|
| 627 |
+
else:
|
| 628 |
+
self.__reduction = reduction_image_based
|
| 629 |
+
|
| 630 |
+
self.__scales = scales
|
| 631 |
+
|
| 632 |
+
def forward(self, prediction, target, mask):
|
| 633 |
+
total = 0
|
| 634 |
+
|
| 635 |
+
for scale in range(self.__scales):
|
| 636 |
+
step = pow(2, scale)
|
| 637 |
+
l1_ln, a_nm, b_nm = ScaleAndShiftInvariantLoss_fn(prediction[:, ::step, ::step],
|
| 638 |
+
target[:, ::step, ::step], mask[:, ::step, ::step], 1)
|
| 639 |
+
total += l1_ln
|
| 640 |
+
a_nm = a_nm.squeeze().detach() # [B, 1, 1]
|
| 641 |
+
b_nm = b_nm.squeeze().detach() # [B, 1, 1]
|
| 642 |
+
total += 2*gradient_loss(a_nm*prediction[:, ::step, ::step]+b_nm, target[:, ::step, ::step],
|
| 643 |
+
mask[:, ::step, ::step], reduction=self.__reduction)
|
| 644 |
+
|
| 645 |
+
return total
|
| 646 |
+
|
| 647 |
+
Grad_fn = GradientLoss()
|
| 648 |
+
|
| 649 |
+
def gradient_loss(prediction, target, mask, reduction=reduction_batch_based):
|
| 650 |
+
|
| 651 |
+
M = torch.sum(mask, (1, 2))
|
| 652 |
+
|
| 653 |
+
diff = prediction - target
|
| 654 |
+
diff = torch.mul(mask, diff)
|
| 655 |
+
grad_x = torch.abs(diff[:, :, 1:] - diff[:, :, :-1])
|
| 656 |
+
mask_x = torch.mul(mask[:, :, 1:], mask[:, :, :-1])
|
| 657 |
+
grad_x = torch.mul(mask_x, grad_x)
|
| 658 |
+
|
| 659 |
+
grad_y = torch.abs(diff[:, 1:, :] - diff[:, :-1, :])
|
| 660 |
+
mask_y = torch.mul(mask[:, 1:, :], mask[:, :-1, :])
|
| 661 |
+
grad_y = torch.mul(mask_y, grad_y)
|
| 662 |
+
|
| 663 |
+
image_loss = torch.sum(grad_x, (1, 2)) + torch.sum(grad_y, (1, 2))
|
| 664 |
+
|
| 665 |
+
return reduction(image_loss, M)
|
| 666 |
+
|
| 667 |
+
def loss_fn(
|
| 668 |
+
poses_preds: List[torch.Tensor],
|
| 669 |
+
poses_pred_all: List[torch.Tensor],
|
| 670 |
+
poses_gt: torch.Tensor,
|
| 671 |
+
inv_depth_preds: List[torch.Tensor],
|
| 672 |
+
inv_depth_raw: List[torch.Tensor],
|
| 673 |
+
depths_gt: torch.Tensor,
|
| 674 |
+
S: int = 16,
|
| 675 |
+
gamma: float = 0.8,
|
| 676 |
+
logger=None,
|
| 677 |
+
logger_tf=None,
|
| 678 |
+
global_step=0,
|
| 679 |
+
):
|
| 680 |
+
"""
|
| 681 |
+
Args:
|
| 682 |
+
poses_preds: list of predicted poses
|
| 683 |
+
poses_gt: ground truth poses
|
| 684 |
+
inv_depth_preds: list of predicted inverse depth maps
|
| 685 |
+
depths_gt: ground truth depth maps
|
| 686 |
+
S: length of sliding window
|
| 687 |
+
"""
|
| 688 |
+
B, T, _, H, W = depths_gt.shape
|
| 689 |
+
|
| 690 |
+
loss_total = 0
|
| 691 |
+
for i in range(len(poses_preds)):
|
| 692 |
+
poses_preds_i = poses_preds[i][0]
|
| 693 |
+
poses_unc_i = poses_preds[i][1]
|
| 694 |
+
poses_gt_i = poses_gt[:, i*S//2:i*S//2+S,:]
|
| 695 |
+
poses_gt_i_norm = first_pose_enc_norm(poses_gt_i,
|
| 696 |
+
pose_encoding_type="absT_quaR_OneFL")
|
| 697 |
+
pose_loss = 0.0
|
| 698 |
+
for idx, poses_preds_ij in enumerate(poses_preds_i):
|
| 699 |
+
i_weight = gamma ** (len(poses_preds_i) - idx - 1)
|
| 700 |
+
if logger is not None:
|
| 701 |
+
if poses_preds_ij.max()>5e1:
|
| 702 |
+
logger.info(f"pose_pred_max_and_mean: {poses_preds_ij.max(), poses_preds_ij.mean()}")
|
| 703 |
+
|
| 704 |
+
trans_loss = (poses_preds_ij[...,:3] - poses_gt_i_norm[...,:3]).abs().sum(dim=-1).mean()
|
| 705 |
+
rot_loss = (poses_preds_ij[...,3:7] - poses_gt_i_norm[...,3:7]).abs().sum(dim=-1).mean()
|
| 706 |
+
focal_loss = (poses_preds_ij[...,7:] - poses_gt_i_norm[...,7:]).abs().sum(dim=-1).mean()
|
| 707 |
+
if torch.isnan((trans_loss + rot_loss + focal_loss)).any():
|
| 708 |
+
pose_loss += 0
|
| 709 |
+
else:
|
| 710 |
+
pose_loss += i_weight*(trans_loss + rot_loss + focal_loss)
|
| 711 |
+
if (logger_tf is not None)&(i==len(poses_preds)-1):
|
| 712 |
+
logger_tf.add_scalar(f"loss@pose/trans_iter{idx}",
|
| 713 |
+
trans_loss, global_step=global_step)
|
| 714 |
+
logger_tf.add_scalar(f"loss@pose/rot_iter{idx}",
|
| 715 |
+
rot_loss, global_step=global_step)
|
| 716 |
+
logger_tf.add_scalar(f"loss@pose/focal_iter{idx}",
|
| 717 |
+
focal_loss, global_step=global_step)
|
| 718 |
+
# compute the uncertainty loss
|
| 719 |
+
with torch.no_grad():
|
| 720 |
+
pose_loss_dist = (poses_preds_ij-poses_gt_i_norm).detach().abs()
|
| 721 |
+
pose_loss_std = 3*pose_loss_dist.view(-1,8).std(dim=0)[None,None,:]
|
| 722 |
+
gt_dist = F.relu(pose_loss_std - pose_loss_dist) / (pose_loss_std + 1e-3)
|
| 723 |
+
unc_loss = (gt_dist - poses_unc_i).abs().mean()
|
| 724 |
+
if (logger_tf is not None)&(i==len(poses_preds)-1):
|
| 725 |
+
logger_tf.add_scalar(f"loss@uncertainty/unc",
|
| 726 |
+
unc_loss,
|
| 727 |
+
global_step=global_step)
|
| 728 |
+
# if logger is not None:
|
| 729 |
+
# logger.info(f"pose_loss: {pose_loss}, unc_loss: {unc_loss}")
|
| 730 |
+
# total loss
|
| 731 |
+
loss_total += 0.1*unc_loss + 2*pose_loss
|
| 732 |
+
|
| 733 |
+
poses_gt_norm = poses_gt
|
| 734 |
+
pose_all_loss = 0.0
|
| 735 |
+
prev_loss = None
|
| 736 |
+
for idx, poses_preds_all_j in enumerate(poses_pred_all):
|
| 737 |
+
i_weight = gamma ** (len(poses_pred_all) - idx - 1)
|
| 738 |
+
trans_loss = (poses_preds_all_j[...,:3] - poses_gt_norm[...,:3]).abs().sum(dim=-1).mean()
|
| 739 |
+
rot_loss = (poses_preds_all_j[...,3:7] - poses_gt_norm[...,3:7]).abs().sum(dim=-1).mean()
|
| 740 |
+
focal_loss = (poses_preds_all_j[...,7:] - poses_gt_norm[...,7:]).abs().sum(dim=-1).mean()
|
| 741 |
+
if (logger_tf is not None):
|
| 742 |
+
if prev_loss is None:
|
| 743 |
+
prev_loss = (trans_loss + rot_loss + focal_loss)
|
| 744 |
+
else:
|
| 745 |
+
des_loss = (trans_loss + rot_loss + focal_loss) - prev_loss
|
| 746 |
+
prev_loss = trans_loss + rot_loss + focal_loss
|
| 747 |
+
logger_tf.add_scalar(f"loss@global_pose/des_iter{idx}",
|
| 748 |
+
des_loss, global_step=global_step)
|
| 749 |
+
logger_tf.add_scalar(f"loss@global_pose/trans_iter{idx}",
|
| 750 |
+
trans_loss, global_step=global_step)
|
| 751 |
+
logger_tf.add_scalar(f"loss@global_pose/rot_iter{idx}",
|
| 752 |
+
rot_loss, global_step=global_step)
|
| 753 |
+
logger_tf.add_scalar(f"loss@global_pose/focal_iter{idx}",
|
| 754 |
+
focal_loss, global_step=global_step)
|
| 755 |
+
if torch.isnan((trans_loss + rot_loss + focal_loss)).any():
|
| 756 |
+
pose_all_loss += 0
|
| 757 |
+
else:
|
| 758 |
+
pose_all_loss += i_weight*(trans_loss + rot_loss + focal_loss)
|
| 759 |
+
|
| 760 |
+
# if logger is not None:
|
| 761 |
+
# logger.info(f"global_pose_loss: {pose_all_loss}")
|
| 762 |
+
|
| 763 |
+
# compute the depth loss
|
| 764 |
+
if inv_depth_preds[0] is not None:
|
| 765 |
+
depths_gt = depths_gt[:,:,0]
|
| 766 |
+
msk = depths_gt > 5e-2
|
| 767 |
+
inv_gt = 1.0 / (depths_gt.clamp(1e-3, 1e16))
|
| 768 |
+
inv_gt_reshp = rearrange(inv_gt, 'b t h w -> (b t) h w')
|
| 769 |
+
inv_depth_preds_reshp = rearrange(inv_depth_preds[0], 'b t h w -> (b t) h w')
|
| 770 |
+
inv_raw_reshp = rearrange(inv_depth_raw[0], 'b t h w -> (b t) h w')
|
| 771 |
+
msk_reshp = rearrange(msk, 'b t h w -> (b t) h w')
|
| 772 |
+
huber_loss = ScaleAndShiftInvariantLoss_fn(inv_depth_preds_reshp, inv_gt_reshp, msk_reshp)
|
| 773 |
+
huber_loss_raw = ScaleAndShiftInvariantLoss_fn(inv_raw_reshp, inv_gt_reshp, msk_reshp)
|
| 774 |
+
# huber_loss = (inv_depth_preds[0][msk]-inv_gt[msk]).abs().mean()
|
| 775 |
+
# cal perason loss
|
| 776 |
+
perason_loss = 0
|
| 777 |
+
# for i in range(B):
|
| 778 |
+
# perason_loss += (1 - pearson_corrcoef(inv_depth_preds[0].view(B*T,-1), inv_gt.view(B*T,-1))).mean()
|
| 779 |
+
# perason_loss = perason_loss/B
|
| 780 |
+
if torch.isnan(huber_loss).any():
|
| 781 |
+
huber_loss = 0
|
| 782 |
+
depth_loss = huber_loss + perason_loss
|
| 783 |
+
if (logger_tf is not None)&(i==len(poses_preds)-1):
|
| 784 |
+
logger_tf.add_scalar(f"loss@depth/huber_iter{idx}",
|
| 785 |
+
depth_loss,
|
| 786 |
+
global_step=global_step)
|
| 787 |
+
# if logger is not None:
|
| 788 |
+
# logger.info(f"opt_depth: {huber_loss_raw - huber_loss}")
|
| 789 |
+
else:
|
| 790 |
+
depth_loss = 0.0
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
loss_total = loss_total/(len(poses_preds)) + 20*depth_loss + pose_all_loss
|
| 794 |
+
|
| 795 |
+
return loss_total, (huber_loss_raw - huber_loss)
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
def vis_depth(x: torch.tensor,
|
| 799 |
+
logger_tf = None, title: str = "depth", step: int = 0):
|
| 800 |
+
"""
|
| 801 |
+
args:
|
| 802 |
+
x: H W
|
| 803 |
+
"""
|
| 804 |
+
assert len(x.shape) == 2
|
| 805 |
+
|
| 806 |
+
depth_map_normalized = cv2.normalize(x.cpu().numpy(),
|
| 807 |
+
None, 0, 255, cv2.NORM_MINMAX)
|
| 808 |
+
depth_map_colored = cv2.applyColorMap(depth_map_normalized.astype(np.uint8),
|
| 809 |
+
cv2.COLORMAP_JET)
|
| 810 |
+
depth_map_tensor = torch.from_numpy(depth_map_colored).permute(2, 0, 1).unsqueeze(0)
|
| 811 |
+
if logger_tf is not None:
|
| 812 |
+
logger_tf.add_image(title, depth_map_tensor[0], step)
|
| 813 |
+
else:
|
| 814 |
+
return depth_map_tensor
|
| 815 |
+
|
| 816 |
+
def vis_pcd(
|
| 817 |
+
rgbs: torch.Tensor,
|
| 818 |
+
R: torch.Tensor,
|
| 819 |
+
T: torch.Tensor,
|
| 820 |
+
xy_depth: torch.Tensor,
|
| 821 |
+
focal_length: torch.Tensor,
|
| 822 |
+
pick_idx: List = [0]
|
| 823 |
+
):
|
| 824 |
+
"""
|
| 825 |
+
args:
|
| 826 |
+
rgbs: [S C H W]
|
| 827 |
+
R: [S 3 3]
|
| 828 |
+
T: [S 3]
|
| 829 |
+
xy_depth: [S H W 3]
|
| 830 |
+
focal_length: [S]
|
| 831 |
+
pick_idx: list of the index to pick
|
| 832 |
+
"""
|
| 833 |
+
S, C, H, W = rgbs.shape
|
| 834 |
+
|
| 835 |
+
rgbs_pick = rgbs[pick_idx]
|
| 836 |
+
R_pick = R[pick_idx]
|
| 837 |
+
T_pick = T[pick_idx]
|
| 838 |
+
xy_depth_pick = xy_depth[pick_idx]
|
| 839 |
+
focal_length_pick = focal_length[pick_idx]
|
| 840 |
+
pcd_world = depth2pcd(xy_depth_pick.clone(),
|
| 841 |
+
focal_length_pick, R_pick.clone(), T_pick.clone(),
|
| 842 |
+
device=xy_depth.device, H=H, W=W)
|
| 843 |
+
pcd_world = pcd_world.permute(0, 2, 1) #[...,[1,0,2]]
|
| 844 |
+
mask = pcd_world.reshape(-1,3)[:,2] < 20
|
| 845 |
+
rgb_world = rgbs_pick.view(len(pick_idx), 3, -1).permute(0, 2, 1)
|
| 846 |
+
pcl = Pointclouds(points=[pcd_world.reshape(-1,3)[mask]],
|
| 847 |
+
features=[rgb_world.reshape(-1,3)[mask]/255])
|
| 848 |
+
return pcl
|
| 849 |
+
|
| 850 |
+
def vis_result(rgbs, poses_pred, poses_gt,
|
| 851 |
+
depth_gt, depth_pred, iter_num=0,
|
| 852 |
+
vis=None, logger_tf=None, cfg=None):
|
| 853 |
+
"""
|
| 854 |
+
Args:
|
| 855 |
+
rgbs: [S C H W]
|
| 856 |
+
depths_gt: [S C H W]
|
| 857 |
+
poses_gt: [S C]
|
| 858 |
+
poses_pred: [S C]
|
| 859 |
+
depth_pred: [S H W]
|
| 860 |
+
"""
|
| 861 |
+
assert len(rgbs.shape) == 4, "only support one sequence, T 3 H W of rbg"
|
| 862 |
+
|
| 863 |
+
if vis is None:
|
| 864 |
+
return
|
| 865 |
+
S, _, H, W = depth_gt.shape
|
| 866 |
+
# get the xy
|
| 867 |
+
yx = torch.meshgrid(torch.arange(H).to(depth_pred.device),
|
| 868 |
+
torch.arange(W).to(depth_pred.device),indexing='ij')
|
| 869 |
+
xy = torch.stack(yx[::-1], dim=0).float().to(depth_pred.device)
|
| 870 |
+
xy_norm = (xy / torch.tensor([W, H],
|
| 871 |
+
device=depth_pred.device).view(2, 1, 1) - 0.5)*2
|
| 872 |
+
xy = xy[None].repeat(S, 1, 1, 1)
|
| 873 |
+
xy_depth = torch.cat([xy, depth_pred[:,None]], dim=1).permute(0, 2, 3, 1)
|
| 874 |
+
xy_depth_gt = torch.cat([xy, depth_gt], dim=1).permute(0, 2, 3, 1)
|
| 875 |
+
# get the focal length
|
| 876 |
+
focal_length = poses_gt[:,-1]*max(H, W)
|
| 877 |
+
|
| 878 |
+
# vis the camera poses
|
| 879 |
+
poses_gt_vis = pose_encoding_to_camera(poses_gt,
|
| 880 |
+
pose_encoding_type="absT_quaR_OneFL",to_OpenCV=False)
|
| 881 |
+
poses_pred_vis = pose_encoding_to_camera(poses_pred,
|
| 882 |
+
pose_encoding_type="absT_quaR_OneFL",to_OpenCV=False)
|
| 883 |
+
|
| 884 |
+
R_gt = poses_gt_vis.R.float()
|
| 885 |
+
R_pred = poses_pred_vis.R.float()
|
| 886 |
+
T_gt = poses_gt_vis.T.float()
|
| 887 |
+
T_pred = poses_pred_vis.T.float()
|
| 888 |
+
# C2W poses
|
| 889 |
+
R_gt_c2w = R_gt.permute(0,2,1)
|
| 890 |
+
T_gt_c2w = (-R_gt_c2w @ T_gt[:, :, None]).squeeze(-1)
|
| 891 |
+
R_pred_c2w = R_pred.permute(0,2,1)
|
| 892 |
+
T_pred_c2w = (-R_pred_c2w @ T_pred[:, :, None]).squeeze(-1)
|
| 893 |
+
with torch.cuda.amp.autocast(enabled=False):
|
| 894 |
+
pick_idx = torch.randperm(S)[:min(24, S)]
|
| 895 |
+
# pick_idx = [1]
|
| 896 |
+
#NOTE: very strange that the camera need C2W Rotation and W2C translation as input
|
| 897 |
+
poses_gt_vis = PerspectiveCamerasVisual(
|
| 898 |
+
R=R_gt_c2w[pick_idx], T=T_gt[pick_idx],
|
| 899 |
+
device=poses_gt_vis.device, image_size=((H, W),)
|
| 900 |
+
)
|
| 901 |
+
poses_pred_vis = PerspectiveCamerasVisual(
|
| 902 |
+
R=R_pred_c2w[pick_idx], T=T_pred[pick_idx],
|
| 903 |
+
device=poses_pred_vis.device
|
| 904 |
+
)
|
| 905 |
+
visual_dict = {"scenes": {"cameras": poses_pred_vis, "cameras_gt": poses_gt_vis}}
|
| 906 |
+
env_name = f"train_visualize_iter_{iter_num:05d}"
|
| 907 |
+
print(f"Visualizing the scene by visdom at env: {env_name}")
|
| 908 |
+
# visualize the depth map
|
| 909 |
+
vis_depth(depth_pred[0].detach(), logger_tf, title="vis/depth_pred",step=iter_num)
|
| 910 |
+
msk = depth_pred[0] > 1e-3
|
| 911 |
+
vis_depth(depth_gt[0,0].detach(), logger_tf, title="vis/depth_gt",step=iter_num)
|
| 912 |
+
depth_res = (depth_gt[0,0] - depth_pred[0]).abs()
|
| 913 |
+
vis_depth(depth_res.detach(), logger_tf, title="vis/depth_res",step=iter_num)
|
| 914 |
+
# visualize the point cloud
|
| 915 |
+
if cfg.debug.vis_pcd:
|
| 916 |
+
visual_dict["scenes"]["points_gt"] = vis_pcd(rgbs, R_gt, T_gt,
|
| 917 |
+
xy_depth_gt, focal_length, pick_idx)
|
| 918 |
+
else:
|
| 919 |
+
visual_dict["scenes"]["points_pred"] = vis_pcd(rgbs, R_pred, T_pred,
|
| 920 |
+
xy_depth, focal_length, pick_idx)
|
| 921 |
+
# visualize in visdom
|
| 922 |
+
fig = plot_scene(visual_dict, camera_scale=0.05)
|
| 923 |
+
vis.plotlyplot(fig, env=env_name, win="3D")
|
| 924 |
+
vis.save([env_name])
|
| 925 |
+
|
| 926 |
+
return
|
| 927 |
+
|
| 928 |
+
def depth2pcd(
|
| 929 |
+
xy_depth: torch.Tensor,
|
| 930 |
+
focal_length: torch.Tensor,
|
| 931 |
+
R: torch.Tensor,
|
| 932 |
+
T: torch.Tensor,
|
| 933 |
+
device: torch.device = None,
|
| 934 |
+
H: int = 518,
|
| 935 |
+
W: int = 518
|
| 936 |
+
):
|
| 937 |
+
"""
|
| 938 |
+
args:
|
| 939 |
+
xy_depth: [S H W 3]
|
| 940 |
+
focal_length: [S]
|
| 941 |
+
R: [S 3 3] W2C
|
| 942 |
+
T: [S 3] W2C
|
| 943 |
+
return:
|
| 944 |
+
xyz: [S 3 (H W)]
|
| 945 |
+
"""
|
| 946 |
+
S, H, W, _ = xy_depth.shape
|
| 947 |
+
# get the intrinsic
|
| 948 |
+
K = torch.eye(3, device=device)[None].repeat(len(focal_length), 1, 1).to(device)
|
| 949 |
+
K[:, 0, 0] = focal_length
|
| 950 |
+
K[:, 1, 1] = focal_length
|
| 951 |
+
K[:, 0, 2] = 0.5 * W
|
| 952 |
+
K[:, 1, 2] = 0.5 * H
|
| 953 |
+
K_inv = K.inverse()
|
| 954 |
+
# xyz
|
| 955 |
+
xyz = xy_depth.view(S, -1, 3).permute(0, 2, 1) # S 3 (H W)
|
| 956 |
+
depth = xyz[:, 2:].clone() # S (H W) 1
|
| 957 |
+
xyz[:, 2] = 1
|
| 958 |
+
xyz = K_inv @ xyz # S 3 (H W)
|
| 959 |
+
xyz = xyz * depth
|
| 960 |
+
# to world coordinate
|
| 961 |
+
xyz = R.permute(0,2,1) @ (xyz - T[:, :, None])
|
| 962 |
+
|
| 963 |
+
return xyz
|
| 964 |
+
|
| 965 |
+
|
| 966 |
+
def pose_enc2mat(poses_pred,
|
| 967 |
+
H_resize, W_resize, resolution=336):
|
| 968 |
+
"""
|
| 969 |
+
This function convert the pose encoding into `intrinsic` and `extrinsic`
|
| 970 |
+
|
| 971 |
+
Args:
|
| 972 |
+
poses_pred: B T 8
|
| 973 |
+
Return:
|
| 974 |
+
Intrinsic B T 3 3
|
| 975 |
+
Extrinsic B T 4 4
|
| 976 |
+
"""
|
| 977 |
+
B, T, _ = poses_pred.shape
|
| 978 |
+
focal_pred = poses_pred[:, :, -1].clone()
|
| 979 |
+
pos_quat_preds = poses_pred[:, :, :7].clone()
|
| 980 |
+
pos_quat_preds = pos_quat_preds.view(B*T, -1)
|
| 981 |
+
# get extrinsic
|
| 982 |
+
c2w_rot = quaternion_to_matrix(pos_quat_preds[:, 3:])
|
| 983 |
+
c2w_tran = pos_quat_preds[:, :3]
|
| 984 |
+
c2w_traj = torch.eye(4)[None].repeat(B*T, 1, 1).to(poses_pred.device)
|
| 985 |
+
c2w_traj[:, :3, :3], c2w_traj[:, :3, 3] = c2w_rot, c2w_tran
|
| 986 |
+
c2w_traj = c2w_traj.view(B, T, 4, 4)
|
| 987 |
+
# get intrinsic
|
| 988 |
+
fxs, fys = focal_pred*resolution, focal_pred*resolution
|
| 989 |
+
intrs = torch.eye(3).to(c2w_traj.device).to(c2w_traj.dtype)[None, None].repeat(B, T, 1, 1)
|
| 990 |
+
intrs[:,:,0,0], intrs[:,:,1,1] = fxs, fys
|
| 991 |
+
intrs[:,:,0,2], intrs[:,:,1,2] = W_resize/2, H_resize/2
|
| 992 |
+
|
| 993 |
+
return intrs, c2w_traj
|
| 994 |
+
|
| 995 |
+
def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
|
| 996 |
+
"""
|
| 997 |
+
Returns torch.sqrt(torch.max(0, x))
|
| 998 |
+
but with a zero subgradient where x is 0.
|
| 999 |
+
"""
|
| 1000 |
+
ret = torch.zeros_like(x)
|
| 1001 |
+
positive_mask = x > 0
|
| 1002 |
+
ret[positive_mask] = torch.sqrt(x[positive_mask])
|
| 1003 |
+
return ret
|
| 1004 |
+
|
| 1005 |
+
def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
|
| 1006 |
+
"""
|
| 1007 |
+
Convert a unit quaternion to a standard form: one in which the real
|
| 1008 |
+
part is non negative.
|
| 1009 |
+
|
| 1010 |
+
Args:
|
| 1011 |
+
quaternions: Quaternions with real part first,
|
| 1012 |
+
as tensor of shape (..., 4).
|
| 1013 |
+
|
| 1014 |
+
Returns:
|
| 1015 |
+
Standardized quaternions as tensor of shape (..., 4).
|
| 1016 |
+
"""
|
| 1017 |
+
return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)
|
| 1018 |
+
|
| 1019 |
+
def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
|
| 1020 |
+
"""
|
| 1021 |
+
Convert rotations given as rotation matrices to quaternions.
|
| 1022 |
+
|
| 1023 |
+
Args:
|
| 1024 |
+
matrix: Rotation matrices as tensor of shape (..., 3, 3).
|
| 1025 |
+
|
| 1026 |
+
Returns:
|
| 1027 |
+
quaternions with real part first, as tensor of shape (..., 4).
|
| 1028 |
+
"""
|
| 1029 |
+
if matrix.size(-1) != 3 or matrix.size(-2) != 3:
|
| 1030 |
+
raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
|
| 1031 |
+
|
| 1032 |
+
batch_dim = matrix.shape[:-2]
|
| 1033 |
+
m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(matrix.reshape(batch_dim + (9,)), dim=-1)
|
| 1034 |
+
|
| 1035 |
+
q_abs = _sqrt_positive_part(
|
| 1036 |
+
torch.stack(
|
| 1037 |
+
[1.0 + m00 + m11 + m22, 1.0 + m00 - m11 - m22, 1.0 - m00 + m11 - m22, 1.0 - m00 - m11 + m22], dim=-1
|
| 1038 |
+
)
|
| 1039 |
+
)
|
| 1040 |
+
|
| 1041 |
+
# we produce the desired quaternion multiplied by each of r, i, j, k
|
| 1042 |
+
quat_by_rijk = torch.stack(
|
| 1043 |
+
[
|
| 1044 |
+
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
|
| 1045 |
+
# `int`.
|
| 1046 |
+
torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
|
| 1047 |
+
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
|
| 1048 |
+
# `int`.
|
| 1049 |
+
torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
|
| 1050 |
+
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
|
| 1051 |
+
# `int`.
|
| 1052 |
+
torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
|
| 1053 |
+
# pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
|
| 1054 |
+
# `int`.
|
| 1055 |
+
torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
|
| 1056 |
+
],
|
| 1057 |
+
dim=-2,
|
| 1058 |
+
)
|
| 1059 |
+
|
| 1060 |
+
# We floor here at 0.1 but the exact level is not important; if q_abs is small,
|
| 1061 |
+
# the candidate won't be picked.
|
| 1062 |
+
flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
|
| 1063 |
+
quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
|
| 1064 |
+
|
| 1065 |
+
# if not for numerical problems, quat_candidates[i] should be same (up to a sign),
|
| 1066 |
+
# forall i; we pick the best-conditioned one (with the largest denominator)
|
| 1067 |
+
out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(batch_dim + (4,))
|
| 1068 |
+
return standardize_quaternion(out)
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
def meshgrid2d(B, Y, X, stack=False, norm=False, device="cuda"):
|
| 1072 |
+
# returns a meshgrid sized B x Y x X
|
| 1073 |
+
|
| 1074 |
+
grid_y = torch.linspace(0.0, Y - 1, Y, device=torch.device(device))
|
| 1075 |
+
grid_y = torch.reshape(grid_y, [1, Y, 1])
|
| 1076 |
+
grid_y = grid_y.repeat(B, 1, X)
|
| 1077 |
+
|
| 1078 |
+
grid_x = torch.linspace(0.0, X - 1, X, device=torch.device(device))
|
| 1079 |
+
grid_x = torch.reshape(grid_x, [1, 1, X])
|
| 1080 |
+
grid_x = grid_x.repeat(B, Y, 1)
|
| 1081 |
+
|
| 1082 |
+
if stack:
|
| 1083 |
+
# note we stack in xy order
|
| 1084 |
+
# (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
|
| 1085 |
+
grid = torch.stack([grid_x, grid_y], dim=-1)
|
| 1086 |
+
return grid
|
| 1087 |
+
else:
|
| 1088 |
+
return grid_y, grid_x
|
| 1089 |
+
|
| 1090 |
+
def get_points_on_a_grid(grid_size, interp_shape,
|
| 1091 |
+
grid_center=(0, 0), device="cuda"):
|
| 1092 |
+
if grid_size == 1:
|
| 1093 |
+
return torch.tensor([interp_shape[1] / 2,
|
| 1094 |
+
interp_shape[0] / 2], device=device)[
|
| 1095 |
+
None, None
|
| 1096 |
+
]
|
| 1097 |
+
|
| 1098 |
+
grid_y, grid_x = meshgrid2d(
|
| 1099 |
+
1, grid_size, grid_size, stack=False, norm=False, device=device
|
| 1100 |
+
)
|
| 1101 |
+
step = interp_shape[1] // 64
|
| 1102 |
+
if grid_center[0] != 0 or grid_center[1] != 0:
|
| 1103 |
+
grid_y = grid_y - grid_size / 2.0
|
| 1104 |
+
grid_x = grid_x - grid_size / 2.0
|
| 1105 |
+
grid_y = step + grid_y.reshape(1, -1) / float(grid_size - 1) * (
|
| 1106 |
+
interp_shape[0] - step * 2
|
| 1107 |
+
)
|
| 1108 |
+
grid_x = step + grid_x.reshape(1, -1) / float(grid_size - 1) * (
|
| 1109 |
+
interp_shape[1] - step * 2
|
| 1110 |
+
)
|
| 1111 |
+
|
| 1112 |
+
grid_y = grid_y + grid_center[0]
|
| 1113 |
+
grid_x = grid_x + grid_center[1]
|
| 1114 |
+
xy = torch.stack([grid_x, grid_y], dim=-1).to(device)
|
| 1115 |
+
return xy
|
| 1116 |
+
|
| 1117 |
+
def normalize_rgb(x,input_size=224,
|
| 1118 |
+
resize_mode: Literal['resize', 'padding'] = 'resize',
|
| 1119 |
+
if_da=False):
|
| 1120 |
+
"""
|
| 1121 |
+
normalize the image for depth anything input
|
| 1122 |
+
|
| 1123 |
+
args:
|
| 1124 |
+
x: the input images [B T C H W]
|
| 1125 |
+
"""
|
| 1126 |
+
if isinstance(x, np.ndarray):
|
| 1127 |
+
x = torch.from_numpy(x) / 255.0
|
| 1128 |
+
elif isinstance(x, torch.Tensor):
|
| 1129 |
+
x = x / 255.0
|
| 1130 |
+
B, T, C, H, W = x.shape
|
| 1131 |
+
x = x.view(B * T, C, H, W)
|
| 1132 |
+
Resizer = Resize(
|
| 1133 |
+
width=input_size,
|
| 1134 |
+
height=input_size,
|
| 1135 |
+
resize_target=False,
|
| 1136 |
+
keep_aspect_ratio=True,
|
| 1137 |
+
ensure_multiple_of=14,
|
| 1138 |
+
resize_method='lower_bound',
|
| 1139 |
+
)
|
| 1140 |
+
if resize_mode == 'padding':
|
| 1141 |
+
# zero padding to make the input size to be multiple of 14
|
| 1142 |
+
if H > W:
|
| 1143 |
+
H_scale = input_size
|
| 1144 |
+
W_scale = W * input_size // H
|
| 1145 |
+
else:
|
| 1146 |
+
W_scale = input_size
|
| 1147 |
+
H_scale = H * input_size // W
|
| 1148 |
+
# resize the image
|
| 1149 |
+
x = F.interpolate(x, size=(H_scale, W_scale),
|
| 1150 |
+
mode='bilinear', align_corners=False)
|
| 1151 |
+
# central padding the image
|
| 1152 |
+
padding_x = (input_size - W_scale) // 2
|
| 1153 |
+
padding_y = (input_size - H_scale) // 2
|
| 1154 |
+
extra_x = (input_size - W_scale) % 2
|
| 1155 |
+
extra_y = (input_size - H_scale) % 2
|
| 1156 |
+
x = F.pad(x, (padding_x, padding_x+extra_x,
|
| 1157 |
+
padding_y, padding_y+extra_y), value=0.)
|
| 1158 |
+
elif resize_mode == 'resize':
|
| 1159 |
+
H_scale, W_scale = Resizer.get_size(H, W)
|
| 1160 |
+
x = F.interpolate(x, size=(int(H_scale), int(W_scale)),
|
| 1161 |
+
mode='bicubic', align_corners=True)
|
| 1162 |
+
# get the mean and std
|
| 1163 |
+
__mean__ = torch.tensor([0.485,
|
| 1164 |
+
0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
|
| 1165 |
+
__std__ = torch.tensor([0.229,
|
| 1166 |
+
0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
|
| 1167 |
+
# normalize the image
|
| 1168 |
+
if if_da:
|
| 1169 |
+
x = (x - __mean__) / __std__
|
| 1170 |
+
else:
|
| 1171 |
+
x = x
|
| 1172 |
+
return x.view(B, T, C, x.shape[-2], x.shape[-1])
|
| 1173 |
+
|
| 1174 |
+
def get_track_points(H, W, T, device, size=100, support_frame=0,
|
| 1175 |
+
query_size=768, unc_metric=None, mode="mixed"):
|
| 1176 |
+
"""
|
| 1177 |
+
This function is used to get the points on the grid
|
| 1178 |
+
args:
|
| 1179 |
+
H: the height of the grid.
|
| 1180 |
+
W: the width of the grid.
|
| 1181 |
+
T: the number of frames.
|
| 1182 |
+
device: the device of the points.
|
| 1183 |
+
size: the size of the grid.
|
| 1184 |
+
"""
|
| 1185 |
+
grid_pts = get_points_on_a_grid(size, (H, W), device=device)
|
| 1186 |
+
grid_pts = grid_pts.round()
|
| 1187 |
+
if mode == "incremental":
|
| 1188 |
+
queries = torch.cat(
|
| 1189 |
+
[torch.randint_like(grid_pts[:, :, :1], T), grid_pts],
|
| 1190 |
+
dim=2,
|
| 1191 |
+
)
|
| 1192 |
+
elif mode == "first":
|
| 1193 |
+
queries_first = torch.cat(
|
| 1194 |
+
[torch.zeros_like(grid_pts[:, :, :1]), grid_pts],
|
| 1195 |
+
dim=2,
|
| 1196 |
+
)
|
| 1197 |
+
queries_support = torch.cat(
|
| 1198 |
+
[torch.randint_like(grid_pts[:, :, :1], T), grid_pts],
|
| 1199 |
+
dim=2,
|
| 1200 |
+
)
|
| 1201 |
+
queries = torch.cat([queries_first, queries_support, queries_support], dim=1)
|
| 1202 |
+
elif mode == "mixed":
|
| 1203 |
+
queries = torch.cat(
|
| 1204 |
+
[torch.randint_like(grid_pts[:, :, :1], T), grid_pts],
|
| 1205 |
+
dim=2,
|
| 1206 |
+
)
|
| 1207 |
+
queries_first = torch.cat(
|
| 1208 |
+
[torch.ones_like(grid_pts[:, :, :1]) * support_frame, grid_pts],
|
| 1209 |
+
dim=2,
|
| 1210 |
+
)
|
| 1211 |
+
queries = torch.cat([queries_first, queries, queries], dim=1)
|
| 1212 |
+
if unc_metric is not None:
|
| 1213 |
+
# filter the points with high uncertainty
|
| 1214 |
+
sample_unc = sample_features5d(unc_metric[None], queries[:,None]).squeeze()
|
| 1215 |
+
if ((sample_unc>0.5).sum() < 20):
|
| 1216 |
+
queries = queries
|
| 1217 |
+
else:
|
| 1218 |
+
queries = queries[:,sample_unc>0.5,:]
|
| 1219 |
+
idx_ = torch.randperm(queries.shape[1], device=device)[:query_size]
|
| 1220 |
+
queries = queries[:, idx_]
|
| 1221 |
+
return queries
|
models/SpaTrackV2/utils/embeddings.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
def get_3d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
|
| 11 |
+
"""
|
| 12 |
+
grid_size: int of the grid height and width
|
| 13 |
+
return:
|
| 14 |
+
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
|
| 15 |
+
"""
|
| 16 |
+
if isinstance(grid_size, tuple):
|
| 17 |
+
grid_size_h, grid_size_w = grid_size
|
| 18 |
+
else:
|
| 19 |
+
grid_size_h = grid_size_w = grid_size
|
| 20 |
+
grid_h = np.arange(grid_size_h, dtype=np.float32)
|
| 21 |
+
grid_w = np.arange(grid_size_w, dtype=np.float32)
|
| 22 |
+
grid = np.meshgrid(grid_w, grid_h) # here w goes first
|
| 23 |
+
grid = np.stack(grid, axis=0)
|
| 24 |
+
|
| 25 |
+
grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
|
| 26 |
+
pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
|
| 27 |
+
if cls_token and extra_tokens > 0:
|
| 28 |
+
pos_embed = np.concatenate(
|
| 29 |
+
[np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
|
| 30 |
+
)
|
| 31 |
+
return pos_embed
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
|
| 35 |
+
assert embed_dim % 3 == 0
|
| 36 |
+
|
| 37 |
+
# use half of dimensions to encode grid_h
|
| 38 |
+
B, S, N, _ = grid.shape
|
| 39 |
+
gridx = grid[..., 0].view(B*S*N).detach().cpu().numpy()
|
| 40 |
+
gridy = grid[..., 1].view(B*S*N).detach().cpu().numpy()
|
| 41 |
+
gridz = grid[..., 2].view(B*S*N).detach().cpu().numpy()
|
| 42 |
+
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridx) # (N, D/3)
|
| 43 |
+
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridy) # (N, D/3)
|
| 44 |
+
emb_z = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridz) # (N, D/3)
|
| 45 |
+
|
| 46 |
+
emb = np.concatenate([emb_h, emb_w, emb_z], axis=1) # (N, D)
|
| 47 |
+
emb = torch.from_numpy(emb).to(grid.device)
|
| 48 |
+
return emb.view(B, S, N, embed_dim)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
|
| 52 |
+
"""
|
| 53 |
+
grid_size: int of the grid height and width
|
| 54 |
+
return:
|
| 55 |
+
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
|
| 56 |
+
"""
|
| 57 |
+
if isinstance(grid_size, tuple):
|
| 58 |
+
grid_size_h, grid_size_w = grid_size
|
| 59 |
+
else:
|
| 60 |
+
grid_size_h = grid_size_w = grid_size
|
| 61 |
+
grid_h = np.arange(grid_size_h, dtype=np.float32)
|
| 62 |
+
grid_w = np.arange(grid_size_w, dtype=np.float32)
|
| 63 |
+
grid = np.meshgrid(grid_w, grid_h) # here w goes first
|
| 64 |
+
grid = np.stack(grid, axis=0)
|
| 65 |
+
|
| 66 |
+
grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
|
| 67 |
+
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
|
| 68 |
+
if cls_token and extra_tokens > 0:
|
| 69 |
+
pos_embed = np.concatenate(
|
| 70 |
+
[np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
|
| 71 |
+
)
|
| 72 |
+
return pos_embed
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
|
| 76 |
+
assert embed_dim % 2 == 0
|
| 77 |
+
|
| 78 |
+
# use half of dimensions to encode grid_h
|
| 79 |
+
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
|
| 80 |
+
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
|
| 81 |
+
|
| 82 |
+
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
|
| 83 |
+
return emb
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
|
| 87 |
+
"""
|
| 88 |
+
embed_dim: output dimension for each position
|
| 89 |
+
pos: a list of positions to be encoded: size (M,)
|
| 90 |
+
out: (M, D)
|
| 91 |
+
"""
|
| 92 |
+
assert embed_dim % 2 == 0
|
| 93 |
+
omega = np.arange(embed_dim // 2, dtype=np.float64)
|
| 94 |
+
omega /= embed_dim / 2.0
|
| 95 |
+
omega = 1.0 / 10000 ** omega # (D/2,)
|
| 96 |
+
pos = pos.reshape(-1) # (M,)
|
| 97 |
+
out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
|
| 98 |
+
|
| 99 |
+
emb_sin = np.sin(out) # (M, D/2)
|
| 100 |
+
emb_cos = np.cos(out) # (M, D/2)
|
| 101 |
+
|
| 102 |
+
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
|
| 103 |
+
return emb
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def get_2d_embedding(xy, C, cat_coords=True):
|
| 107 |
+
B, N, D = xy.shape
|
| 108 |
+
assert D == 2
|
| 109 |
+
|
| 110 |
+
x = xy[:, :, 0:1]
|
| 111 |
+
y = xy[:, :, 1:2]
|
| 112 |
+
div_term = (
|
| 113 |
+
torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)
|
| 114 |
+
).reshape(1, 1, int(C / 2))
|
| 115 |
+
|
| 116 |
+
pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
|
| 117 |
+
pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
|
| 118 |
+
|
| 119 |
+
pe_x[:, :, 0::2] = torch.sin(x * div_term)
|
| 120 |
+
pe_x[:, :, 1::2] = torch.cos(x * div_term)
|
| 121 |
+
|
| 122 |
+
pe_y[:, :, 0::2] = torch.sin(y * div_term)
|
| 123 |
+
pe_y[:, :, 1::2] = torch.cos(y * div_term)
|
| 124 |
+
|
| 125 |
+
pe = torch.cat([pe_x, pe_y], dim=2) # B, N, C*3
|
| 126 |
+
if cat_coords:
|
| 127 |
+
pe = torch.cat([xy, pe], dim=2) # B, N, C*3+3
|
| 128 |
+
return pe
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def get_3d_embedding(xyz, C, cat_coords=True):
|
| 132 |
+
B, N, D = xyz.shape
|
| 133 |
+
assert D == 3
|
| 134 |
+
|
| 135 |
+
x = xyz[:, :, 0:1]
|
| 136 |
+
y = xyz[:, :, 1:2]
|
| 137 |
+
z = xyz[:, :, 2:3]
|
| 138 |
+
div_term = (
|
| 139 |
+
torch.arange(0, C, 2, device=xyz.device, dtype=torch.float32) * (1000.0 / C)
|
| 140 |
+
).reshape(1, 1, int(C / 2))
|
| 141 |
+
|
| 142 |
+
pe_x = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
|
| 143 |
+
pe_y = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
|
| 144 |
+
pe_z = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
|
| 145 |
+
|
| 146 |
+
pe_x[:, :, 0::2] = torch.sin(x * div_term)
|
| 147 |
+
pe_x[:, :, 1::2] = torch.cos(x * div_term)
|
| 148 |
+
|
| 149 |
+
pe_y[:, :, 0::2] = torch.sin(y * div_term)
|
| 150 |
+
pe_y[:, :, 1::2] = torch.cos(y * div_term)
|
| 151 |
+
|
| 152 |
+
pe_z[:, :, 0::2] = torch.sin(z * div_term)
|
| 153 |
+
pe_z[:, :, 1::2] = torch.cos(z * div_term)
|
| 154 |
+
|
| 155 |
+
pe = torch.cat([pe_x, pe_y, pe_z], dim=2) # B, N, C*3
|
| 156 |
+
if cat_coords:
|
| 157 |
+
pe = torch.cat([pe, xyz], dim=2) # B, N, C*3+3
|
| 158 |
+
return pe
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def get_4d_embedding(xyzw, C, cat_coords=True):
|
| 162 |
+
B, N, D = xyzw.shape
|
| 163 |
+
assert D == 4
|
| 164 |
+
|
| 165 |
+
x = xyzw[:, :, 0:1]
|
| 166 |
+
y = xyzw[:, :, 1:2]
|
| 167 |
+
z = xyzw[:, :, 2:3]
|
| 168 |
+
w = xyzw[:, :, 3:4]
|
| 169 |
+
div_term = (
|
| 170 |
+
torch.arange(0, C, 2, device=xyzw.device, dtype=torch.float32) * (1000.0 / C)
|
| 171 |
+
).reshape(1, 1, int(C / 2))
|
| 172 |
+
|
| 173 |
+
pe_x = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
|
| 174 |
+
pe_y = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
|
| 175 |
+
pe_z = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
|
| 176 |
+
pe_w = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
|
| 177 |
+
|
| 178 |
+
pe_x[:, :, 0::2] = torch.sin(x * div_term)
|
| 179 |
+
pe_x[:, :, 1::2] = torch.cos(x * div_term)
|
| 180 |
+
|
| 181 |
+
pe_y[:, :, 0::2] = torch.sin(y * div_term)
|
| 182 |
+
pe_y[:, :, 1::2] = torch.cos(y * div_term)
|
| 183 |
+
|
| 184 |
+
pe_z[:, :, 0::2] = torch.sin(z * div_term)
|
| 185 |
+
pe_z[:, :, 1::2] = torch.cos(z * div_term)
|
| 186 |
+
|
| 187 |
+
pe_w[:, :, 0::2] = torch.sin(w * div_term)
|
| 188 |
+
pe_w[:, :, 1::2] = torch.cos(w * div_term)
|
| 189 |
+
|
| 190 |
+
pe = torch.cat([pe_x, pe_y, pe_z, pe_w], dim=2) # B, N, C*3
|
| 191 |
+
if cat_coords:
|
| 192 |
+
pe = torch.cat([pe, xyzw], dim=2) # B, N, C*3+3
|
| 193 |
+
return pe
|
| 194 |
+
|
| 195 |
+
import torch.nn as nn
|
| 196 |
+
class Embedder_Fourier(nn.Module):
|
| 197 |
+
def __init__(self, input_dim, max_freq_log2, N_freqs,
|
| 198 |
+
log_sampling=True, include_input=True,
|
| 199 |
+
periodic_fns=(torch.sin, torch.cos)):
|
| 200 |
+
'''
|
| 201 |
+
:param input_dim: dimension of input to be embedded
|
| 202 |
+
:param max_freq_log2: log2 of max freq; min freq is 1 by default
|
| 203 |
+
:param N_freqs: number of frequency bands
|
| 204 |
+
:param log_sampling: if True, frequency bands are linerly sampled in log-space
|
| 205 |
+
:param include_input: if True, raw input is included in the embedding
|
| 206 |
+
:param periodic_fns: periodic functions used to embed input
|
| 207 |
+
'''
|
| 208 |
+
super(Embedder_Fourier, self).__init__()
|
| 209 |
+
|
| 210 |
+
self.input_dim = input_dim
|
| 211 |
+
self.include_input = include_input
|
| 212 |
+
self.periodic_fns = periodic_fns
|
| 213 |
+
|
| 214 |
+
self.out_dim = 0
|
| 215 |
+
if self.include_input:
|
| 216 |
+
self.out_dim += self.input_dim
|
| 217 |
+
|
| 218 |
+
self.out_dim += self.input_dim * N_freqs * len(self.periodic_fns)
|
| 219 |
+
|
| 220 |
+
if log_sampling:
|
| 221 |
+
self.freq_bands = 2. ** torch.linspace(0., max_freq_log2, N_freqs)
|
| 222 |
+
else:
|
| 223 |
+
self.freq_bands = torch.linspace(
|
| 224 |
+
2. ** 0., 2. ** max_freq_log2, N_freqs)
|
| 225 |
+
|
| 226 |
+
self.freq_bands = self.freq_bands.numpy().tolist()
|
| 227 |
+
|
| 228 |
+
def forward(self,
|
| 229 |
+
input: torch.Tensor,
|
| 230 |
+
rescale: float = 1.0):
|
| 231 |
+
'''
|
| 232 |
+
:param input: tensor of shape [..., self.input_dim]
|
| 233 |
+
:return: tensor of shape [..., self.out_dim]
|
| 234 |
+
'''
|
| 235 |
+
assert (input.shape[-1] == self.input_dim)
|
| 236 |
+
out = []
|
| 237 |
+
if self.include_input:
|
| 238 |
+
out.append(input/rescale)
|
| 239 |
+
|
| 240 |
+
for i in range(len(self.freq_bands)):
|
| 241 |
+
freq = self.freq_bands[i]
|
| 242 |
+
for p_fn in self.periodic_fns:
|
| 243 |
+
out.append(p_fn(input * freq))
|
| 244 |
+
out = torch.cat(out, dim=-1)
|
| 245 |
+
|
| 246 |
+
assert (out.shape[-1] == self.out_dim)
|
| 247 |
+
return out
|
models/SpaTrackV2/utils/model_utils.py
ADDED
|
@@ -0,0 +1,444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from easydict import EasyDict as edict
|
| 10 |
+
from sklearn.decomposition import PCA
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
|
| 13 |
+
EPS = 1e-6
|
| 14 |
+
|
| 15 |
+
def nearest_sample2d(im, x, y, return_inbounds=False):
|
| 16 |
+
# x and y are each B, N
|
| 17 |
+
# output is B, C, N
|
| 18 |
+
if len(im.shape) == 5:
|
| 19 |
+
B, N, C, H, W = list(im.shape)
|
| 20 |
+
else:
|
| 21 |
+
B, C, H, W = list(im.shape)
|
| 22 |
+
N = list(x.shape)[1]
|
| 23 |
+
|
| 24 |
+
x = x.float()
|
| 25 |
+
y = y.float()
|
| 26 |
+
H_f = torch.tensor(H, dtype=torch.float32)
|
| 27 |
+
W_f = torch.tensor(W, dtype=torch.float32)
|
| 28 |
+
|
| 29 |
+
# inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()
|
| 30 |
+
|
| 31 |
+
max_y = (H_f - 1).int()
|
| 32 |
+
max_x = (W_f - 1).int()
|
| 33 |
+
|
| 34 |
+
x0 = torch.floor(x).int()
|
| 35 |
+
x1 = x0 + 1
|
| 36 |
+
y0 = torch.floor(y).int()
|
| 37 |
+
y1 = y0 + 1
|
| 38 |
+
|
| 39 |
+
x0_clip = torch.clamp(x0, 0, max_x)
|
| 40 |
+
x1_clip = torch.clamp(x1, 0, max_x)
|
| 41 |
+
y0_clip = torch.clamp(y0, 0, max_y)
|
| 42 |
+
y1_clip = torch.clamp(y1, 0, max_y)
|
| 43 |
+
dim2 = W
|
| 44 |
+
dim1 = W * H
|
| 45 |
+
|
| 46 |
+
base = torch.arange(0, B, dtype=torch.int64, device=x.device) * dim1
|
| 47 |
+
base = torch.reshape(base, [B, 1]).repeat([1, N])
|
| 48 |
+
|
| 49 |
+
base_y0 = base + y0_clip * dim2
|
| 50 |
+
base_y1 = base + y1_clip * dim2
|
| 51 |
+
|
| 52 |
+
idx_y0_x0 = base_y0 + x0_clip
|
| 53 |
+
idx_y0_x1 = base_y0 + x1_clip
|
| 54 |
+
idx_y1_x0 = base_y1 + x0_clip
|
| 55 |
+
idx_y1_x1 = base_y1 + x1_clip
|
| 56 |
+
|
| 57 |
+
# use the indices to lookup pixels in the flat image
|
| 58 |
+
# im is B x C x H x W
|
| 59 |
+
# move C out to last dim
|
| 60 |
+
if len(im.shape) == 5:
|
| 61 |
+
im_flat = (im.permute(0, 3, 4, 1, 2)).reshape(B * H * W, N, C)
|
| 62 |
+
i_y0_x0 = torch.diagonal(im_flat[idx_y0_x0.long()], dim1=1, dim2=2).permute(
|
| 63 |
+
0, 2, 1
|
| 64 |
+
)
|
| 65 |
+
i_y0_x1 = torch.diagonal(im_flat[idx_y0_x1.long()], dim1=1, dim2=2).permute(
|
| 66 |
+
0, 2, 1
|
| 67 |
+
)
|
| 68 |
+
i_y1_x0 = torch.diagonal(im_flat[idx_y1_x0.long()], dim1=1, dim2=2).permute(
|
| 69 |
+
0, 2, 1
|
| 70 |
+
)
|
| 71 |
+
i_y1_x1 = torch.diagonal(im_flat[idx_y1_x1.long()], dim1=1, dim2=2).permute(
|
| 72 |
+
0, 2, 1
|
| 73 |
+
)
|
| 74 |
+
else:
|
| 75 |
+
im_flat = (im.permute(0, 2, 3, 1)).reshape(B * H * W, C)
|
| 76 |
+
i_y0_x0 = im_flat[idx_y0_x0.long()]
|
| 77 |
+
i_y0_x1 = im_flat[idx_y0_x1.long()]
|
| 78 |
+
i_y1_x0 = im_flat[idx_y1_x0.long()]
|
| 79 |
+
i_y1_x1 = im_flat[idx_y1_x1.long()]
|
| 80 |
+
|
| 81 |
+
# Finally calculate interpolated values.
|
| 82 |
+
x0_f = x0.float()
|
| 83 |
+
x1_f = x1.float()
|
| 84 |
+
y0_f = y0.float()
|
| 85 |
+
y1_f = y1.float()
|
| 86 |
+
|
| 87 |
+
w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
|
| 88 |
+
w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
|
| 89 |
+
w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
|
| 90 |
+
w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)
|
| 91 |
+
|
| 92 |
+
# w_yi_xo is B * N * 1
|
| 93 |
+
max_idx = torch.cat([w_y0_x0, w_y0_x1, w_y1_x0, w_y1_x1], dim=-1).max(dim=-1)[1]
|
| 94 |
+
output = torch.stack([i_y0_x0, i_y0_x1, i_y1_x0, i_y1_x1], dim=-1).gather(-1, max_idx[...,None,None].repeat(1,1,C,1)).squeeze(-1)
|
| 95 |
+
|
| 96 |
+
# output is B*N x C
|
| 97 |
+
output = output.view(B, -1, C)
|
| 98 |
+
output = output.permute(0, 2, 1)
|
| 99 |
+
# output is B x C x N
|
| 100 |
+
|
| 101 |
+
if return_inbounds:
|
| 102 |
+
x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
|
| 103 |
+
y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
|
| 104 |
+
inbounds = (x_valid & y_valid).float()
|
| 105 |
+
inbounds = inbounds.reshape(
|
| 106 |
+
B, N
|
| 107 |
+
) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
|
| 108 |
+
return output, inbounds
|
| 109 |
+
|
| 110 |
+
return output # B, C, N
|
| 111 |
+
|
| 112 |
+
def smart_cat(tensor1, tensor2, dim):
|
| 113 |
+
if tensor1 is None:
|
| 114 |
+
return tensor2
|
| 115 |
+
return torch.cat([tensor1, tensor2], dim=dim)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def normalize_single(d):
|
| 119 |
+
# d is a whatever shape torch tensor
|
| 120 |
+
dmin = torch.min(d)
|
| 121 |
+
dmax = torch.max(d)
|
| 122 |
+
d = (d - dmin) / (EPS + (dmax - dmin))
|
| 123 |
+
return d
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def normalize(d):
|
| 127 |
+
# d is B x whatever. normalize within each element of the batch
|
| 128 |
+
out = torch.zeros(d.size())
|
| 129 |
+
if d.is_cuda:
|
| 130 |
+
out = out.cuda()
|
| 131 |
+
B = list(d.size())[0]
|
| 132 |
+
for b in list(range(B)):
|
| 133 |
+
out[b] = normalize_single(d[b])
|
| 134 |
+
return out
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def meshgrid2d(B, Y, X, stack=False, norm=False, device="cuda"):
|
| 138 |
+
# returns a meshgrid sized B x Y x X
|
| 139 |
+
|
| 140 |
+
grid_y = torch.linspace(0.0, Y - 1, Y, device=torch.device(device))
|
| 141 |
+
grid_y = torch.reshape(grid_y, [1, Y, 1])
|
| 142 |
+
grid_y = grid_y.repeat(B, 1, X)
|
| 143 |
+
|
| 144 |
+
grid_x = torch.linspace(0.0, X - 1, X, device=torch.device(device))
|
| 145 |
+
grid_x = torch.reshape(grid_x, [1, 1, X])
|
| 146 |
+
grid_x = grid_x.repeat(B, Y, 1)
|
| 147 |
+
|
| 148 |
+
if stack:
|
| 149 |
+
# note we stack in xy order
|
| 150 |
+
# (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
|
| 151 |
+
grid = torch.stack([grid_x, grid_y], dim=-1)
|
| 152 |
+
return grid
|
| 153 |
+
else:
|
| 154 |
+
return grid_y, grid_x
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def reduce_masked_mean(x, mask, dim=None, keepdim=False):
|
| 158 |
+
# x and mask are the same shape, or at least broadcastably so < actually it's safer if you disallow broadcasting
|
| 159 |
+
# returns shape-1
|
| 160 |
+
# axis can be a list of axes
|
| 161 |
+
for (a, b) in zip(x.size(), mask.size()):
|
| 162 |
+
assert a == b # some shape mismatch!
|
| 163 |
+
prod = x * mask
|
| 164 |
+
if dim is None:
|
| 165 |
+
numer = torch.sum(prod)
|
| 166 |
+
denom = EPS + torch.sum(mask)
|
| 167 |
+
else:
|
| 168 |
+
numer = torch.sum(prod, dim=dim, keepdim=keepdim)
|
| 169 |
+
denom = EPS + torch.sum(mask, dim=dim, keepdim=keepdim)
|
| 170 |
+
|
| 171 |
+
mean = numer / denom
|
| 172 |
+
return mean
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def bilinear_sample2d(im, x, y, return_inbounds=False):
|
| 176 |
+
# x and y are each B, N
|
| 177 |
+
# output is B, C, N
|
| 178 |
+
if len(im.shape) == 5:
|
| 179 |
+
B, N, C, H, W = list(im.shape)
|
| 180 |
+
else:
|
| 181 |
+
B, C, H, W = list(im.shape)
|
| 182 |
+
N = list(x.shape)[1]
|
| 183 |
+
|
| 184 |
+
x = x.float()
|
| 185 |
+
y = y.float()
|
| 186 |
+
H_f = torch.tensor(H, dtype=torch.float32)
|
| 187 |
+
W_f = torch.tensor(W, dtype=torch.float32)
|
| 188 |
+
|
| 189 |
+
# inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()
|
| 190 |
+
|
| 191 |
+
max_y = (H_f - 1).int()
|
| 192 |
+
max_x = (W_f - 1).int()
|
| 193 |
+
|
| 194 |
+
x0 = torch.floor(x).int()
|
| 195 |
+
x1 = x0 + 1
|
| 196 |
+
y0 = torch.floor(y).int()
|
| 197 |
+
y1 = y0 + 1
|
| 198 |
+
|
| 199 |
+
x0_clip = torch.clamp(x0, 0, max_x)
|
| 200 |
+
x1_clip = torch.clamp(x1, 0, max_x)
|
| 201 |
+
y0_clip = torch.clamp(y0, 0, max_y)
|
| 202 |
+
y1_clip = torch.clamp(y1, 0, max_y)
|
| 203 |
+
dim2 = W
|
| 204 |
+
dim1 = W * H
|
| 205 |
+
|
| 206 |
+
base = torch.arange(0, B, dtype=torch.int64, device=x.device) * dim1
|
| 207 |
+
base = torch.reshape(base, [B, 1]).repeat([1, N])
|
| 208 |
+
|
| 209 |
+
base_y0 = base + y0_clip * dim2
|
| 210 |
+
base_y1 = base + y1_clip * dim2
|
| 211 |
+
|
| 212 |
+
idx_y0_x0 = base_y0 + x0_clip
|
| 213 |
+
idx_y0_x1 = base_y0 + x1_clip
|
| 214 |
+
idx_y1_x0 = base_y1 + x0_clip
|
| 215 |
+
idx_y1_x1 = base_y1 + x1_clip
|
| 216 |
+
|
| 217 |
+
# use the indices to lookup pixels in the flat image
|
| 218 |
+
# im is B x C x H x W
|
| 219 |
+
# move C out to last dim
|
| 220 |
+
if len(im.shape) == 5:
|
| 221 |
+
im_flat = (im.permute(0, 3, 4, 1, 2)).reshape(B * H * W, N, C)
|
| 222 |
+
i_y0_x0 = torch.diagonal(im_flat[idx_y0_x0.long()], dim1=1, dim2=2).permute(
|
| 223 |
+
0, 2, 1
|
| 224 |
+
)
|
| 225 |
+
i_y0_x1 = torch.diagonal(im_flat[idx_y0_x1.long()], dim1=1, dim2=2).permute(
|
| 226 |
+
0, 2, 1
|
| 227 |
+
)
|
| 228 |
+
i_y1_x0 = torch.diagonal(im_flat[idx_y1_x0.long()], dim1=1, dim2=2).permute(
|
| 229 |
+
0, 2, 1
|
| 230 |
+
)
|
| 231 |
+
i_y1_x1 = torch.diagonal(im_flat[idx_y1_x1.long()], dim1=1, dim2=2).permute(
|
| 232 |
+
0, 2, 1
|
| 233 |
+
)
|
| 234 |
+
else:
|
| 235 |
+
im_flat = (im.permute(0, 2, 3, 1)).reshape(B * H * W, C)
|
| 236 |
+
i_y0_x0 = im_flat[idx_y0_x0.long()]
|
| 237 |
+
i_y0_x1 = im_flat[idx_y0_x1.long()]
|
| 238 |
+
i_y1_x0 = im_flat[idx_y1_x0.long()]
|
| 239 |
+
i_y1_x1 = im_flat[idx_y1_x1.long()]
|
| 240 |
+
|
| 241 |
+
# Finally calculate interpolated values.
|
| 242 |
+
x0_f = x0.float()
|
| 243 |
+
x1_f = x1.float()
|
| 244 |
+
y0_f = y0.float()
|
| 245 |
+
y1_f = y1.float()
|
| 246 |
+
|
| 247 |
+
w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
|
| 248 |
+
w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
|
| 249 |
+
w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
|
| 250 |
+
w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)
|
| 251 |
+
|
| 252 |
+
output = (
|
| 253 |
+
w_y0_x0 * i_y0_x0 + w_y0_x1 * i_y0_x1 + w_y1_x0 * i_y1_x0 + w_y1_x1 * i_y1_x1
|
| 254 |
+
)
|
| 255 |
+
# output is B*N x C
|
| 256 |
+
output = output.view(B, -1, C)
|
| 257 |
+
output = output.permute(0, 2, 1)
|
| 258 |
+
# output is B x C x N
|
| 259 |
+
|
| 260 |
+
if return_inbounds:
|
| 261 |
+
x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
|
| 262 |
+
y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
|
| 263 |
+
inbounds = (x_valid & y_valid).float()
|
| 264 |
+
inbounds = inbounds.reshape(
|
| 265 |
+
B, N
|
| 266 |
+
) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
|
| 267 |
+
return output, inbounds
|
| 268 |
+
|
| 269 |
+
return output # B, C, N
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def procrustes_analysis(X0,X1,Weight): # [B,N,3]
|
| 273 |
+
# translation
|
| 274 |
+
t0 = X0.mean(dim=1,keepdim=True)
|
| 275 |
+
t1 = X1.mean(dim=1,keepdim=True)
|
| 276 |
+
X0c = X0-t0
|
| 277 |
+
X1c = X1-t1
|
| 278 |
+
# scale
|
| 279 |
+
# s0 = (X0c**2).sum(dim=-1).mean().sqrt()
|
| 280 |
+
# s1 = (X1c**2).sum(dim=-1).mean().sqrt()
|
| 281 |
+
# X0cs = X0c/s0
|
| 282 |
+
# X1cs = X1c/s1
|
| 283 |
+
# rotation (use double for SVD, float loses precision)
|
| 284 |
+
U,_,V = (X0c.t()@X1c).double().svd(some=True)
|
| 285 |
+
R = (U@V.t()).float()
|
| 286 |
+
if R.det()<0: R[2] *= -1
|
| 287 |
+
# align X1 to X0: X1to0 = (X1-t1)/@R.t()+t0
|
| 288 |
+
se3 = edict(t0=t0[0],t1=t1[0],R=R)
|
| 289 |
+
|
| 290 |
+
return se3
|
| 291 |
+
|
| 292 |
+
def bilinear_sampler(input, coords, align_corners=True, padding_mode="border", interp_mode="bilinear"):
|
| 293 |
+
r"""Sample a tensor using bilinear interpolation
|
| 294 |
+
|
| 295 |
+
`bilinear_sampler(input, coords)` samples a tensor :attr:`input` at
|
| 296 |
+
coordinates :attr:`coords` using bilinear interpolation. It is the same
|
| 297 |
+
as `torch.nn.functional.grid_sample()` but with a different coordinate
|
| 298 |
+
convention.
|
| 299 |
+
|
| 300 |
+
The input tensor is assumed to be of shape :math:`(B, C, H, W)`, where
|
| 301 |
+
:math:`B` is the batch size, :math:`C` is the number of channels,
|
| 302 |
+
:math:`H` is the height of the image, and :math:`W` is the width of the
|
| 303 |
+
image. The tensor :attr:`coords` of shape :math:`(B, H_o, W_o, 2)` is
|
| 304 |
+
interpreted as an array of 2D point coordinates :math:`(x_i,y_i)`.
|
| 305 |
+
|
| 306 |
+
Alternatively, the input tensor can be of size :math:`(B, C, T, H, W)`,
|
| 307 |
+
in which case sample points are triplets :math:`(t_i,x_i,y_i)`. Note
|
| 308 |
+
that in this case the order of the components is slightly different
|
| 309 |
+
from `grid_sample()`, which would expect :math:`(x_i,y_i,t_i)`.
|
| 310 |
+
|
| 311 |
+
If `align_corners` is `True`, the coordinate :math:`x` is assumed to be
|
| 312 |
+
in the range :math:`[0,W-1]`, with 0 corresponding to the center of the
|
| 313 |
+
left-most image pixel :math:`W-1` to the center of the right-most
|
| 314 |
+
pixel.
|
| 315 |
+
|
| 316 |
+
If `align_corners` is `False`, the coordinate :math:`x` is assumed to
|
| 317 |
+
be in the range :math:`[0,W]`, with 0 corresponding to the left edge of
|
| 318 |
+
the left-most pixel :math:`W` to the right edge of the right-most
|
| 319 |
+
pixel.
|
| 320 |
+
|
| 321 |
+
Similar conventions apply to the :math:`y` for the range
|
| 322 |
+
:math:`[0,H-1]` and :math:`[0,H]` and to :math:`t` for the range
|
| 323 |
+
:math:`[0,T-1]` and :math:`[0,T]`.
|
| 324 |
+
|
| 325 |
+
Args:
|
| 326 |
+
input (Tensor): batch of input images.
|
| 327 |
+
coords (Tensor): batch of coordinates.
|
| 328 |
+
align_corners (bool, optional): Coordinate convention. Defaults to `True`.
|
| 329 |
+
padding_mode (str, optional): Padding mode. Defaults to `"border"`.
|
| 330 |
+
|
| 331 |
+
Returns:
|
| 332 |
+
Tensor: sampled points.
|
| 333 |
+
"""
|
| 334 |
+
|
| 335 |
+
sizes = input.shape[2:]
|
| 336 |
+
|
| 337 |
+
assert len(sizes) in [2, 3]
|
| 338 |
+
|
| 339 |
+
if len(sizes) == 3:
|
| 340 |
+
# t x y -> x y t to match dimensions T H W in grid_sample
|
| 341 |
+
coords = coords[..., [1, 2, 0]]
|
| 342 |
+
|
| 343 |
+
if align_corners:
|
| 344 |
+
coords = coords * torch.tensor(
|
| 345 |
+
[2 / max(size - 1, 1) for size in reversed(sizes)], device=coords.device
|
| 346 |
+
)
|
| 347 |
+
else:
|
| 348 |
+
coords = coords * torch.tensor([2 / size for size in reversed(sizes)], device=coords.device)
|
| 349 |
+
|
| 350 |
+
coords -= 1
|
| 351 |
+
|
| 352 |
+
return F.grid_sample(input, coords, align_corners=align_corners, padding_mode=padding_mode, mode=interp_mode)
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def sample_features4d(input, coords, interp_mode="bilinear"):
|
| 356 |
+
r"""Sample spatial features
|
| 357 |
+
|
| 358 |
+
`sample_features4d(input, coords)` samples the spatial features
|
| 359 |
+
:attr:`input` represented by a 4D tensor :math:`(B, C, H, W)`.
|
| 360 |
+
|
| 361 |
+
The field is sampled at coordinates :attr:`coords` using bilinear
|
| 362 |
+
interpolation. :attr:`coords` is assumed to be of shape :math:`(B, R,
|
| 363 |
+
3)`, where each sample has the format :math:`(x_i, y_i)`. This uses the
|
| 364 |
+
same convention as :func:`bilinear_sampler` with `align_corners=True`.
|
| 365 |
+
|
| 366 |
+
The output tensor has one feature per point, and has shape :math:`(B,
|
| 367 |
+
R, C)`.
|
| 368 |
+
|
| 369 |
+
Args:
|
| 370 |
+
input (Tensor): spatial features.
|
| 371 |
+
coords (Tensor): points.
|
| 372 |
+
|
| 373 |
+
Returns:
|
| 374 |
+
Tensor: sampled features.
|
| 375 |
+
"""
|
| 376 |
+
|
| 377 |
+
B, _, _, _ = input.shape
|
| 378 |
+
|
| 379 |
+
# B R 2 -> B R 1 2
|
| 380 |
+
coords = coords.unsqueeze(2)
|
| 381 |
+
|
| 382 |
+
# B C R 1
|
| 383 |
+
feats = bilinear_sampler(input, coords, interp_mode=interp_mode)
|
| 384 |
+
|
| 385 |
+
return feats.permute(0, 2, 1, 3).view(
|
| 386 |
+
B, -1, feats.shape[1] * feats.shape[3]
|
| 387 |
+
) # B C R 1 -> B R C
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def sample_features5d(input, coords, interp_mode="bilinear"):
|
| 391 |
+
r"""Sample spatio-temporal features
|
| 392 |
+
|
| 393 |
+
`sample_features5d(input, coords)` works in the same way as
|
| 394 |
+
:func:`sample_features4d` but for spatio-temporal features and points:
|
| 395 |
+
:attr:`input` is a 5D tensor :math:`(B, T, C, H, W)`, :attr:`coords` is
|
| 396 |
+
a :math:`(B, R1, R2, 3)` tensor of spatio-temporal point :math:`(t_i,
|
| 397 |
+
x_i, y_i)`. The output tensor has shape :math:`(B, R1, R2, C)`.
|
| 398 |
+
|
| 399 |
+
Args:
|
| 400 |
+
input (Tensor): spatio-temporal features.
|
| 401 |
+
coords (Tensor): spatio-temporal points.
|
| 402 |
+
|
| 403 |
+
Returns:
|
| 404 |
+
Tensor: sampled features.
|
| 405 |
+
"""
|
| 406 |
+
|
| 407 |
+
B, T, _, _, _ = input.shape
|
| 408 |
+
|
| 409 |
+
# B T C H W -> B C T H W
|
| 410 |
+
input = input.permute(0, 2, 1, 3, 4)
|
| 411 |
+
|
| 412 |
+
# B R1 R2 3 -> B R1 R2 1 3
|
| 413 |
+
coords = coords.unsqueeze(3)
|
| 414 |
+
|
| 415 |
+
# B C R1 R2 1
|
| 416 |
+
feats = bilinear_sampler(input, coords, interp_mode=interp_mode)
|
| 417 |
+
|
| 418 |
+
return feats.permute(0, 2, 3, 1, 4).view(
|
| 419 |
+
B, feats.shape[2], feats.shape[3], feats.shape[1]
|
| 420 |
+
) # B C R1 R2 1 -> B R1 R2 C
|
| 421 |
+
|
| 422 |
+
def vis_PCA(fmaps, save_dir):
|
| 423 |
+
"""
|
| 424 |
+
visualize the PCA of the feature maps
|
| 425 |
+
args:
|
| 426 |
+
fmaps: feature maps 1 C H W
|
| 427 |
+
save_dir: the directory to save the PCA visualization
|
| 428 |
+
"""
|
| 429 |
+
|
| 430 |
+
pca = PCA(n_components=3)
|
| 431 |
+
fmap_vis = fmaps[0,...]
|
| 432 |
+
fmap_vnorm = (
|
| 433 |
+
(fmap_vis-fmap_vis.min())/
|
| 434 |
+
(fmap_vis.max()-fmap_vis.min()))
|
| 435 |
+
H_vis, W_vis = fmap_vis.shape[1:]
|
| 436 |
+
fmap_vnorm = fmap_vnorm.reshape(fmap_vnorm.shape[0],
|
| 437 |
+
-1).permute(1,0)
|
| 438 |
+
fmap_pca = pca.fit_transform(fmap_vnorm.detach().cpu().numpy())
|
| 439 |
+
pca = fmap_pca.reshape(H_vis,W_vis,3)
|
| 440 |
+
plt.imsave(save_dir,
|
| 441 |
+
(
|
| 442 |
+
(pca-pca.min())/
|
| 443 |
+
(pca.max()-pca.min())
|
| 444 |
+
))
|
models/SpaTrackV2/utils/visualizer.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import os
import numpy as np
import cv2
import torch
import flow_vis

from matplotlib import cm
import torch.nn.functional as F
import torchvision.transforms as transforms
import moviepy
from moviepy.editor import ImageSequenceClip
import matplotlib.pyplot as plt


def read_video_from_path(path):
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        print("Error opening video file")
    else:
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if ret == True:
                frames.append(np.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            else:
                break
        cap.release()
    return np.stack(frames)


class Visualizer:
    def __init__(
        self,
        save_dir: str = "./results",
        grayscale: bool = False,
        pad_value: int = 0,
        fps: int = 10,
        mode: str = "rainbow",  # 'cool', 'optical_flow'
        linewidth: int = 2,
        show_first_frame: int = 10,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        self.mode = mode
        self.save_dir = save_dir
        if mode == "rainbow":
            self.color_map = cm.get_cmap("gist_rainbow")
        elif mode == "cool":
            self.color_map = cm.get_cmap(mode)
        self.show_first_frame = show_first_frame
        self.grayscale = grayscale
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth
        self.fps = fps

    def visualize(
        self,
        video: torch.Tensor,  # (B,T,C,H,W)
        tracks: torch.Tensor,  # (B,T,N,2)
        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
        segm_mask: torch.Tensor = None,  # (B,1,H,W)
        filename: str = "video",
        writer=None,  # tensorboard Summary Writer, used for visualization during training
        step: int = 0,
        query_frame: int = 0,
        save_video: bool = True,
        compensate_for_camera_motion: bool = False,
        rigid_part=None,
        video_depth=None,  # (B,T,C,H,W)
    ):
        if compensate_for_camera_motion:
            assert segm_mask is not None
        if segm_mask is not None:
            coords = tracks[0, query_frame].round().long()
            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()

        video = F.pad(
            video,
            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
            "constant",
            255,
        )

        if video_depth is not None:
            video_depth = (video_depth * 255).cpu().numpy().astype(np.uint8)
            video_depth = ([cv2.applyColorMap(video_depth[0, i, 0], cv2.COLORMAP_INFERNO)
                            for i in range(video_depth.shape[1])])
            video_depth = np.stack(video_depth, axis=0)
            video_depth = torch.from_numpy(video_depth).permute(0, 3, 1, 2)[None]

        tracks = tracks + self.pad_value

        if self.grayscale:
            transform = transforms.Grayscale()
            video = transform(video)
            video = video.repeat(1, 1, 3, 1, 1)

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            segm_mask=segm_mask,
            gt_tracks=gt_tracks,
            query_frame=query_frame,
            compensate_for_camera_motion=compensate_for_camera_motion,
            rigid_part=rigid_part
        )

        if save_video:
            self.save_video(res_video, filename=filename,
                            writer=writer, step=step)
            if video_depth is not None:
                self.save_video(video_depth, filename=filename + "_depth",
                                writer=writer, step=step)
        return res_video

    def save_video(self, video, filename, writer=None, step=0):
        if writer is not None:
            writer.add_video(
                f"{filename}_pred_track",
                video.to(torch.uint8),
                global_step=step,
                fps=self.fps,
            )
        else:
            os.makedirs(self.save_dir, exist_ok=True)
            wide_list = list(video.unbind(1))
            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]
            clip = ImageSequenceClip(wide_list[2:-1], fps=self.fps)

            # Write the video file
            save_path = os.path.join(self.save_dir, f"{filename}_pred_track.mp4")
            clip.write_videofile(save_path, codec="libx264", fps=self.fps, logger=None)

            print(f"Video saved to {save_path}")

    def draw_tracks_on_video(
        self,
        video: torch.Tensor,
        tracks: torch.Tensor,
        visibility: torch.Tensor = None,
        segm_mask: torch.Tensor = None,
        gt_tracks=None,
        query_frame: int = 0,
        compensate_for_camera_motion=False,
        rigid_part=None,
    ):
        B, T, C, H, W = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3
        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
        if gt_tracks is not None:
            gt_tracks = gt_tracks.detach().cpu().numpy()

        res_video = []

        # process input video
        for rgb in video:
            res_video.append(rgb.copy())

        vector_colors = np.zeros((T, N, 3))
        if self.mode == "optical_flow":
            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
        elif segm_mask is None:
            if self.mode == "rainbow":
                y_min, y_max = (
                    tracks[query_frame, :, 1].min(),
                    tracks[query_frame, :, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    color = self.color_map(norm(tracks[query_frame, n, 1]))
                    color = np.array(color[:3])[None] * 255
                    vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with time
                for t in range(T):
                    color = np.array(self.color_map(t / T)[:3])[None] * 255
                    vector_colors[t] = np.repeat(color, N, axis=0)
        else:
            if self.mode == "rainbow":
                vector_colors[:, segm_mask <= 0, :] = 255

                y_min, y_max = (
                    tracks[0, segm_mask > 0, 1].min(),
                    tracks[0, segm_mask > 0, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    if segm_mask[n] > 0:
                        color = self.color_map(norm(tracks[0, n, 1]))
                        color = np.array(color[:3])[None] * 255
                        vector_colors[:, n] = np.repeat(color, T, axis=0)

            else:
                # color changes with segm class
                segm_mask = segm_mask.cpu()
                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
                vector_colors = np.repeat(color[None], T, axis=0)

        # draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(1, T):
                first_ind = (
                    max(0, t - self.tracks_leave_trace)
                    if self.tracks_leave_trace >= 0
                    else 0
                )
                curr_tracks = tracks[first_ind : t + 1]
                curr_colors = vector_colors[first_ind : t + 1]
                if compensate_for_camera_motion:
                    diff = (
                        tracks[first_ind : t + 1, segm_mask <= 0]
                        - tracks[t : t + 1, segm_mask <= 0]
                    ).mean(1)[:, None]

                    curr_tracks = curr_tracks - diff
                    curr_tracks = curr_tracks[:, segm_mask > 0]
                    curr_colors = curr_colors[:, segm_mask > 0]

                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    curr_tracks,
                    curr_colors,
                )
                if gt_tracks is not None:
                    res_video[t] = self._draw_gt_tracks(
                        res_video[t], gt_tracks[first_ind : t + 1]
                    )

        if rigid_part is not None:
            cls_label = torch.unique(rigid_part)
            cls_num = len(torch.unique(rigid_part))
            # visualize the clustering results
            cmap = plt.get_cmap('jet')  # get the color mapping
            colors = cmap(np.linspace(0, 1, cls_num))
            colors = (colors[:, :3] * 255)
            color_map = {lable.item(): color for lable, color in zip(cls_label, colors)}

        # draw points
        for t in range(T):
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                visibile = True
                if visibility is not None:
                    visibile = visibility[0, t, i] > 0.5
                if coord[0] != 0 and coord[1] != 0:
                    if not compensate_for_camera_motion or (
                        compensate_for_camera_motion and segm_mask[i] > 0
                    ):
                        if rigid_part is not None:
                            color = color_map[rigid_part.squeeze()[i].item()]
                            cv2.circle(
                                res_video[t],
                                coord,
                                int(self.linewidth * 2),
                                color.tolist(),
                                thickness=-1 if visibile else 2 - 1,
                            )
                        else:
                            cv2.circle(
                                res_video[t],
                                coord,
                                int(self.linewidth * 2),
                                vector_colors[t, i].tolist(),
                                thickness=-1 if visibile else 2 - 1,
                            )

        # construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()

    def _draw_pred_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3
        tracks: np.ndarray,  # T x 2
        vector_colors: np.ndarray,
        alpha: float = 0.5,
    ):
        T, N, _ = tracks.shape

        for s in range(T - 1):
            vector_color = vector_colors[s]
            original = rgb.copy()
            alpha = (s / T) ** 2
            for i in range(N):
                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                if coord_y[0] != 0 and coord_y[1] != 0:
                    cv2.line(
                        rgb,
                        coord_y,
                        coord_x,
                        vector_color[i].tolist(),
                        self.linewidth,
                        cv2.LINE_AA,
                    )
            if self.tracks_leave_trace > 0:
                rgb = cv2.addWeighted(rgb, alpha, original, 1 - alpha, 0)
        return rgb

    def _draw_gt_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3,
        gt_tracks: np.ndarray,  # T x 2
    ):
        T, N, _ = gt_tracks.shape
        color = np.array((211.0, 0.0, 0.0))

        for t in range(T):
            for i in range(N):
                gt_tracks_i = gt_tracks[t][i]
                # draw a red cross
                if gt_tracks_i[0] > 0 and gt_tracks_i[1] > 0:
                    length = self.linewidth * 3
                    coord_y = (int(gt_tracks_i[0]) + length, int(gt_tracks_i[1]) + length)
                    coord_x = (int(gt_tracks_i[0]) - length, int(gt_tracks_i[1]) - length)
                    cv2.line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                        cv2.LINE_AA,
                    )
                    coord_y = (int(gt_tracks_i[0]) - length, int(gt_tracks_i[1]) + length)
                    coord_x = (int(gt_tracks_i[0]) + length, int(gt_tracks_i[1]) - length)
                    cv2.line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                        cv2.LINE_AA,
                    )
        return rgb
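A minimal usage sketch for the Visualizer added above (not part of this commit; the import path assumes the repo root is on PYTHONPATH, demo.mp4 is a placeholder, and the track tensor is random dummy data):

import torch
from models.SpaTrackV2.utils.visualizer import Visualizer, read_video_from_path

frames = read_video_from_path("demo.mp4")                           # (T, H, W, 3) uint8 frames
video = torch.from_numpy(frames).permute(0, 3, 1, 2)[None].float()  # (1, T, 3, H, W)
tracks = torch.rand(1, video.shape[1], 16, 2) * 100 + 1             # (1, T, N, 2) dummy 2D tracks
vis = Visualizer(save_dir="./results", mode="rainbow", fps=10)
vis.visualize(video=video, tracks=tracks, filename="demo")          # writes ./results/demo_pred_track.mp4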
models/moge/__init__.py
ADDED
File without changes
models/moge/model/__init__.py
ADDED
@@ -0,0 +1,18 @@
import importlib
from typing import *

if TYPE_CHECKING:
    from .v1 import MoGeModel as MoGeModelV1
    from .v2 import MoGeModel as MoGeModelV2


def import_model_class_by_version(version: str) -> Type[Union['MoGeModelV1', 'MoGeModelV2']]:
    assert version in ['v1', 'v2'], f'Unsupported model version: {version}'

    try:
        module = importlib.import_module(f'.{version}', __package__)
    except ModuleNotFoundError:
        raise ValueError(f'Model version "{version}" not found.')

    cls = getattr(module, 'MoGeModel')
    return cls
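A quick sketch of how this lazy loader is meant to be used (not part of this commit):

from models.moge.model import import_model_class_by_version

MoGeModelV2 = import_model_class_by_version("v2")  # imports models.moge.model.v2 and returns its MoGeModel class
# Any other version string fails the assertion: only 'v1' and 'v2' are accepted.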
models/moge/model/dinov2/__init__.py
ADDED
@@ -0,0 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

__version__ = "0.0.1"
models/moge/model/dinov2/hub/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
models/moge/model/dinov2/hub/backbones.py
ADDED
@@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Union

import torch

from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name


class Weights(Enum):
    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model


def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
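A hedged usage sketch of the factory functions above (not part of this commit). With pretrained=True the state dict is downloaded from _DINOV2_BASE_URL, so pretrained=False is used here to build the architecture only; the import path assumes the sibling models/vision_transformer module ships with this DINOv2 copy:

from models.moge.model.dinov2.hub.backbones import dinov2_vitl14, dinov2_vitl14_reg

backbone = dinov2_vitl14(pretrained=False)          # plain ViT-L/14
backbone_reg = dinov2_vitl14_reg(pretrained=False)  # ViT-L/14 with 4 register tokens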
models/moge/model/dinov2/hub/utils.py
ADDED
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import itertools
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"


def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
    compact_arch_name = arch_name.replace("_", "")[:4]
    registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
    return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"


class CenterPadding(nn.Module):
    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output
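CenterPadding pads the trailing spatial dimensions up to the next multiple of `multiple`, splitting the padding as evenly as possible on both sides, which is how inputs are made divisible by the ViT patch size. A small sketch (not part of this commit):

import torch
from models.moge.model.dinov2.hub.utils import CenterPadding

pad = CenterPadding(multiple=14)
x = torch.randn(1, 3, 518, 515)
y = pad(x)
print(y.shape)  # torch.Size([1, 3, 518, 518]): 515 -> 518, with 1 px added on the left and 2 px on the right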
models/moge/model/dinov2/layers/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from .dino_head import DINOHead
from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlock
from .attention import MemEffAttention
models/moge/model/dinov2/layers/attention.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging
import os
import warnings

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import memory_efficient_attention, unbind

        XFORMERS_AVAILABLE = True
        # warnings.warn("xFormers is available (Attention)")
    else:
        # warnings.warn("xFormers is disabled (Attention)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    # warnings.warn("xFormers is not available (Attention)")


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
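Finally, a small sketch exercising the attention blocks above (not part of this commit). MemEffAttention falls back to the plain Attention.forward when xFormers is unavailable, so the snippet runs either way; the import path assumes the repo root is on PYTHONPATH:

import torch
from models.moge.model.dinov2.layers.attention import MemEffAttention

attn = MemEffAttention(dim=384, num_heads=6)
tokens = torch.randn(2, 257, 384)  # (batch, sequence length, embedding dim)
out = attn(tokens)                 # same shape as the input: (2, 257, 384)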