Zhen Ye committed on
Commit
5e832fe
·
1 Parent(s): 012b29b

added depth view

Browse files
LaserPerception/LaserPerception.html CHANGED
@@ -291,6 +291,10 @@
291
  <button id="btnRecompute" class="btn secondary">Recompute HEL</button>
292
  <button id="btnClear" class="btn secondary">Clear</button>
293
  </div>
 
 
 
 
294
  </div>
295
 
296
  <div class="panel panel-objects">
@@ -419,6 +423,7 @@
419
  <span class="chip" id="chipBeam">BEAM:OFF</span>
420
  <span class="chip" id="chipHz">DET:6Hz</span>
421
  <span class="chip" id="chipFeed" title="Toggle raw vs HF-processed feed (if available)">FEED:RAW</span>
 
422
  </div>
423
 
424
  <div class="mt-md">
 
291
  <button id="btnRecompute" class="btn secondary">Recompute HEL</button>
292
  <button id="btnClear" class="btn secondary">Clear</button>
293
  </div>
294
+
295
+ <div class="strip mt-md">
296
+ <span class="chip" id="chipFrameDepth" title="Toggle depth view of first frame (if available)">VIEW:DEFAULT</span>
297
+ </div>
298
  </div>
299
 
300
  <div class="panel panel-objects">
 
423
  <span class="chip" id="chipBeam">BEAM:OFF</span>
424
  <span class="chip" id="chipHz">DET:6Hz</span>
425
  <span class="chip" id="chipFeed" title="Toggle raw vs HF-processed feed (if available)">FEED:RAW</span>
426
+ <span class="chip" id="chipDepth" title="Toggle depth view (if available)">VIEW:DEFAULT</span>
427
  </div>
428
 
429
  <div class="mt-md">
LaserPerception/LaserPerception.js CHANGED
@@ -25,6 +25,8 @@
25
  videoFile: null,
26
  videoLoaded: false,
27
  useProcessedFeed: false,
 
 
28
  hasReasoned: false,
29
  isReasoning: false, // Flag to prevent concurrent Reason executions
30
 
@@ -42,6 +44,10 @@
42
  queries: [], // Mission objective used as query
43
  processedUrl: null,
44
  processedBlob: null,
 
 
 
 
45
  summary: null,
46
  busy: false,
47
  lastError: null
@@ -179,6 +185,8 @@
179
  const chipTracks = $("#chipTracks");
180
  const chipBeam = $("#chipBeam");
181
  const chipHz = $("#chipHz");
 
 
182
 
183
  const dwellText = $("#dwellText");
184
  const dwellBar = $("#dwellBar");
@@ -246,6 +254,29 @@
246
  log(`Engage feed set to: ${state.useProcessedFeed ? "HF" : "RAW"}`, "t");
247
  });
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  // Refresh intel summary (unbiased)
251
  if (btnIntelRefresh) {
@@ -378,15 +409,27 @@
378
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
379
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
380
  }
 
 
 
 
 
 
381
  state.videoUrl = null;
382
  state.videoFile = null;
383
  state.videoLoaded = false;
384
  state.useProcessedFeed = false;
 
 
385
 
386
  state.hf.missionId = null;
387
  state.hf.plan = null;
388
  state.hf.processedUrl = null;
389
  state.hf.processedBlob = null;
 
 
 
 
390
  state.hf.summary = null;
391
  state.hf.busy = false;
392
  state.hf.lastError = null;
@@ -469,8 +512,18 @@
469
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
470
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
471
  }
 
 
 
 
 
 
472
  state.hf.processedUrl = null;
473
  state.hf.processedBlob = null;
 
 
 
 
474
  state.hf.asyncJobId = null;
475
  state.hf.firstFrameUrl = null;
476
  state.hf.firstFrameDetections = null;
@@ -483,6 +536,8 @@
483
  state.hf.lastError = null;
484
  state.hf.busy = false;
485
  state.useProcessedFeed = false;
 
 
486
  setHfStatus("idle");
487
  renderMissionContext();
488
  videoHidden.src = state.videoUrl;
@@ -702,6 +757,7 @@
702
  if (chipFeed) {
703
  chipFeed.textContent = state.useProcessedFeed ? "FEED:HF" : "FEED:RAW";
704
  }
 
705
  }
706
 
707
  function normalizeToken(s) {
@@ -812,6 +868,9 @@
812
  }
813
  // drone_detection uses drone_yolo automatically
814
 
 
 
 
815
  // Submit async job
816
  setHfStatus(`submitting ${mode} job...`);
817
  log(`Submitting ${mode} to ${state.hf.baseUrl || "(same-origin)"} (detector=${detector || "n/a"})`, "t");
@@ -837,6 +896,16 @@
837
  state.hf.videoUrl = `${state.hf.baseUrl}${data.video_url}`;
838
  state.hf.asyncStatus = data.status;
839
 
 
 
 
 
 
 
 
 
 
 
840
  // Display first frame immediately (if object detection, segmentation, or drone)
841
  if ((mode === "object_detection" || mode === "segmentation" || mode === "drone_detection") && state.hf.firstFrameUrl) {
842
  const count = Array.isArray(data.first_frame_detections) ? data.first_frame_detections.length : null;
@@ -961,6 +1030,8 @@
961
  setHfStatus("job completed, fetching video...");
962
  try {
963
  await fetchProcessedVideo();
 
 
964
  clearInterval(state.hf.asyncPollInterval);
965
  // Clear job ID to prevent cancel attempts after completion
966
  state.hf.asyncJobId = null;
@@ -1037,6 +1108,82 @@
1037
  log(`Processed video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB)`);
1038
  }
1039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1040
  function stopAsyncPolling() {
1041
  if (state.hf.asyncPollInterval) {
1042
  clearInterval(state.hf.asyncPollInterval);
@@ -1075,6 +1222,118 @@
1075
  }
1076
  }
1077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1078
  async function startHfPipeline() {
1079
  if (state.hf.busy) {
1080
  log("HF pipeline already running");
 
25
  videoFile: null,
26
  videoLoaded: false,
27
  useProcessedFeed: false,
28
+ useDepthFeed: false, // Flag for depth view (Tab 2 video)
29
+ useFrameDepthView: false, // Flag for first frame depth view (Tab 1)
30
  hasReasoned: false,
31
  isReasoning: false, // Flag to prevent concurrent Reason executions
32
 
 
44
  queries: [], // Mission objective used as query
45
  processedUrl: null,
46
  processedBlob: null,
47
+ depthVideoUrl: null, // Depth video URL
48
+ depthFirstFrameUrl: null, // First frame depth URL
49
+ depthBlob: null, // Depth video blob
50
+ depthFirstFrameBlob: null, // Depth first frame blob
51
  summary: null,
52
  busy: false,
53
  lastError: null
 
185
  const chipTracks = $("#chipTracks");
186
  const chipBeam = $("#chipBeam");
187
  const chipHz = $("#chipHz");
188
+ const chipDepth = $("#chipDepth");
189
+ const chipFrameDepth = $("#chipFrameDepth");
190
 
191
  const dwellText = $("#dwellText");
192
  const dwellBar = $("#dwellBar");
 
254
  log(`Engage feed set to: ${state.useProcessedFeed ? "HF" : "RAW"}`, "t");
255
  });
256
 
257
+ // Toggle depth view
258
+ chipDepth.addEventListener("click", async () => {
259
+ if (!state.videoLoaded) return;
260
+ if (!state.hf.depthVideoUrl) {
261
+ log("Depth video not ready yet. Run Reason and wait for depth processing.", "w");
262
+ return;
263
+ }
264
+ await toggleDepthView();
265
+ log(`View set to: ${state.useDepthFeed ? "DEPTH" : "DEFAULT"}`, "t");
266
+ });
267
+
268
+ // Toggle first frame depth view (Tab 1)
269
+ if (chipFrameDepth) {
270
+ chipFrameDepth.addEventListener("click", () => {
271
+ if (!state.videoLoaded) return;
272
+ if (!state.hf.depthFirstFrameUrl) {
273
+ log("First frame depth not ready yet. Run Reason and wait for depth processing.", "w");
274
+ return;
275
+ }
276
+ toggleFirstFrameDepthView();
277
+ log(`First frame view set to: ${state.useFrameDepthView ? "DEPTH" : "DEFAULT"}`, "t");
278
+ });
279
+ }
280
 
281
  // Refresh intel summary (unbiased)
282
  if (btnIntelRefresh) {
 
409
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
410
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
411
  }
412
+ if (state.hf.depthVideoUrl && state.hf.depthVideoUrl.startsWith("blob:")) {
413
+ try { URL.revokeObjectURL(state.hf.depthVideoUrl); } catch (_) { }
414
+ }
415
+ if (state.hf.depthFirstFrameUrl && state.hf.depthFirstFrameUrl.startsWith("blob:")) {
416
+ try { URL.revokeObjectURL(state.hf.depthFirstFrameUrl); } catch (_) { }
417
+ }
418
  state.videoUrl = null;
419
  state.videoFile = null;
420
  state.videoLoaded = false;
421
  state.useProcessedFeed = false;
422
+ state.useDepthFeed = false;
423
+ state.useFrameDepthView = false;
424
 
425
  state.hf.missionId = null;
426
  state.hf.plan = null;
427
  state.hf.processedUrl = null;
428
  state.hf.processedBlob = null;
429
+ state.hf.depthVideoUrl = null;
430
+ state.hf.depthBlob = null;
431
+ state.hf.depthFirstFrameUrl = null;
432
+ state.hf.depthFirstFrameBlob = null;
433
  state.hf.summary = null;
434
  state.hf.busy = false;
435
  state.hf.lastError = null;
 
512
  if (state.hf.processedUrl && state.hf.processedUrl.startsWith("blob:")) {
513
  try { URL.revokeObjectURL(state.hf.processedUrl); } catch (_) { }
514
  }
515
+ if (state.hf.depthVideoUrl && state.hf.depthVideoUrl.startsWith("blob:")) {
516
+ try { URL.revokeObjectURL(state.hf.depthVideoUrl); } catch (_) { }
517
+ }
518
+ if (state.hf.depthFirstFrameUrl && state.hf.depthFirstFrameUrl.startsWith("blob:")) {
519
+ try { URL.revokeObjectURL(state.hf.depthFirstFrameUrl); } catch (_) { }
520
+ }
521
  state.hf.processedUrl = null;
522
  state.hf.processedBlob = null;
523
+ state.hf.depthVideoUrl = null;
524
+ state.hf.depthBlob = null;
525
+ state.hf.depthFirstFrameUrl = null;
526
+ state.hf.depthFirstFrameBlob = null;
527
  state.hf.asyncJobId = null;
528
  state.hf.firstFrameUrl = null;
529
  state.hf.firstFrameDetections = null;
 
536
  state.hf.lastError = null;
537
  state.hf.busy = false;
538
  state.useProcessedFeed = false;
539
+ state.useDepthFeed = false;
540
+ state.useFrameDepthView = false;
541
  setHfStatus("idle");
542
  renderMissionContext();
543
  videoHidden.src = state.videoUrl;
 
757
  if (chipFeed) {
758
  chipFeed.textContent = state.useProcessedFeed ? "FEED:HF" : "FEED:RAW";
759
  }
760
+ updateDepthChip();
761
  }
762
 
763
  function normalizeToken(s) {
 
868
  }
869
  // drone_detection uses drone_yolo automatically
870
 
871
+ // Add depth_estimator parameter for depth processing
872
+ form.append("depth_estimator", "depth");
873
+
874
  // Submit async job
875
  setHfStatus(`submitting ${mode} job...`);
876
  log(`Submitting ${mode} to ${state.hf.baseUrl || "(same-origin)"} (detector=${detector || "n/a"})`, "t");
 
896
  state.hf.videoUrl = `${state.hf.baseUrl}${data.video_url}`;
897
  state.hf.asyncStatus = data.status;
898
 
899
+ // Store depth URLs if provided
900
+ if (data.depth_video_url) {
901
+ state.hf.depthVideoUrl = `${state.hf.baseUrl}${data.depth_video_url}`;
902
+ log("Depth video URL received", "t");
903
+ }
904
+ if (data.first_frame_depth_url) {
905
+ state.hf.depthFirstFrameUrl = `${state.hf.baseUrl}${data.first_frame_depth_url}`;
906
+ log("First frame depth URL received (will fetch when ready)", "t");
907
+ }
908
+
909
  // Display first frame immediately (if object detection, segmentation, or drone)
910
  if ((mode === "object_detection" || mode === "segmentation" || mode === "drone_detection") && state.hf.firstFrameUrl) {
911
  const count = Array.isArray(data.first_frame_detections) ? data.first_frame_detections.length : null;
 
1030
  setHfStatus("job completed, fetching video...");
1031
  try {
1032
  await fetchProcessedVideo();
1033
+ await fetchDepthVideo();
1034
+ await fetchDepthFirstFrame();
1035
  clearInterval(state.hf.asyncPollInterval);
1036
  // Clear job ID to prevent cancel attempts after completion
1037
  state.hf.asyncJobId = null;
 
1108
  log(`Processed video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB)`);
1109
  }
1110
 
1111
+ async function fetchDepthVideo() {
1112
+ if (!state.hf.depthVideoUrl) {
1113
+ log("No depth video URL available", "w");
1114
+ return;
1115
+ }
1116
+
1117
+ try {
1118
+ const resp = await fetch(state.hf.depthVideoUrl, { cache: "no-store" });
1119
+
1120
+ if (!resp.ok) {
1121
+ if (resp.status === 202) {
1122
+ log("Depth video still processing", "w");
1123
+ return;
1124
+ }
1125
+ throw new Error(`Failed to fetch depth video: ${resp.statusText}`);
1126
+ }
1127
+
1128
+ const nullOrigin = (window.location && window.location.origin) === "null";
1129
+ if (nullOrigin) {
1130
+ state.hf.depthBlob = null;
1131
+ state.hf.depthVideoUrl = `${state.hf.depthVideoUrl}?t=${Date.now()}`;
1132
+ log("Depth video ready (streaming URL)");
1133
+ return;
1134
+ }
1135
+
1136
+ const blob = await resp.blob();
1137
+
1138
+ // Store the original URL before creating blob
1139
+ const originalUrl = state.hf.depthVideoUrl;
1140
+
1141
+ state.hf.depthBlob = blob;
1142
+ const blobUrl = URL.createObjectURL(blob);
1143
+ state.hf.depthVideoUrl = blobUrl;
1144
+
1145
+ log(`Depth video ready (${(blob.size / 1024 / 1024).toFixed(1)} MB) - Click VIEW chip to toggle`, "g");
1146
+ updateDepthChip();
1147
+ } catch (err) {
1148
+ log(`Error fetching depth video: ${err.message}`, "e");
1149
+ }
1150
+ }
1151
+
1152
+ async function fetchDepthFirstFrame() {
1153
+ if (!state.hf.depthFirstFrameUrl) {
1154
+ log("No depth first frame URL available", "w");
1155
+ return;
1156
+ }
1157
+
1158
+ try {
1159
+ const resp = await fetch(state.hf.depthFirstFrameUrl, { cache: "no-store" });
1160
+
1161
+ if (!resp.ok) {
1162
+ if (resp.status === 202) {
1163
+ log("Depth first frame still processing", "w");
1164
+ return;
1165
+ }
1166
+ throw new Error(`Failed to fetch depth first frame: ${resp.statusText}`);
1167
+ }
1168
+
1169
+ // Fetch as blob and create blob URL
1170
+ const blob = await resp.blob();
1171
+
1172
+ // Store the blob and create a blob URL
1173
+ state.hf.depthFirstFrameBlob = blob;
1174
+ const blobUrl = URL.createObjectURL(blob);
1175
+
1176
+ // Replace the server URL with the blob URL
1177
+ const originalUrl = state.hf.depthFirstFrameUrl;
1178
+ state.hf.depthFirstFrameUrl = blobUrl;
1179
+
1180
+ log(`✓ Depth first frame ready (${(blob.size / 1024).toFixed(1)} KB) - Click VIEW chip on Tab 1 to toggle`, "g");
1181
+ updateFirstFrameDepthChip();
1182
+ } catch (err) {
1183
+ log(`Error fetching depth first frame: ${err.message}`, "e");
1184
+ }
1185
+ }
1186
+
1187
  function stopAsyncPolling() {
1188
  if (state.hf.asyncPollInterval) {
1189
  clearInterval(state.hf.asyncPollInterval);
 
1222
  }
1223
  }
1224
 
1225
+ async function toggleDepthView() {
1226
+ state.useDepthFeed = !state.useDepthFeed;
1227
+ updateDepthChip();
1228
+
1229
+ if (!state.videoLoaded) return;
1230
+
1231
+ const wasPlaying = !videoEngage.paused;
1232
+ const t = videoEngage.currentTime || 0;
1233
+
1234
+ try { videoEngage.pause(); } catch (_) { }
1235
+
1236
+ let desiredSrc;
1237
+ if (state.useDepthFeed && state.hf.depthVideoUrl) {
1238
+ desiredSrc = state.hf.depthVideoUrl;
1239
+ } else if (state.useProcessedFeed && state.hf.processedUrl) {
1240
+ desiredSrc = state.hf.processedUrl;
1241
+ } else {
1242
+ desiredSrc = state.videoUrl;
1243
+ }
1244
+
1245
+ if (videoEngage.src !== desiredSrc) {
1246
+ videoEngage.src = desiredSrc;
1247
+ videoEngage.setAttribute('data-depth', state.useDepthFeed ? 'true' : 'false');
1248
+ log(`Video view switched to: ${state.useDepthFeed ? 'depth' : 'default'}`, "t");
1249
+ videoEngage.load();
1250
+ await waitVideoReady(videoEngage);
1251
+ try { videoEngage.currentTime = Math.min(t, (videoEngage.duration || t)); } catch (_) { }
1252
+ }
1253
+
1254
+ resizeOverlays();
1255
+ if (wasPlaying) {
1256
+ try { await videoEngage.play(); } catch (_) { }
1257
+ }
1258
+ }
1259
+
1260
+ function updateDepthChip() {
1261
+ if (chipDepth) {
1262
+ chipDepth.textContent = state.useDepthFeed ? "VIEW:DEPTH" : "VIEW:DEFAULT";
1263
+ }
1264
+ }
1265
+
1266
+ function toggleFirstFrameDepthView() {
1267
+ state.useFrameDepthView = !state.useFrameDepthView;
1268
+ updateFirstFrameDepthChip();
1269
+ displayFirstFrameWithDepth();
1270
+ }
1271
+
1272
+ function updateFirstFrameDepthChip() {
1273
+ if (chipFrameDepth) {
1274
+ chipFrameDepth.textContent = state.useFrameDepthView ? "VIEW:DEPTH" : "VIEW:DEFAULT";
1275
+ }
1276
+ }
1277
+
1278
+ function displayFirstFrameWithDepth() {
1279
+ // Determine which URL to use based on state
1280
+ let frameUrl;
1281
+ if (state.useFrameDepthView && state.hf.depthFirstFrameUrl) {
1282
+ // Check if we have a blob URL (starts with 'blob:')
1283
+ if (state.hf.depthFirstFrameUrl.startsWith('blob:')) {
1284
+ frameUrl = state.hf.depthFirstFrameUrl;
1285
+ } else {
1286
+ log("Depth first frame not ready yet. Please wait for processing to complete.", "w");
1287
+ state.useFrameDepthView = false; // Revert to default view
1288
+ updateFirstFrameDepthChip();
1289
+ frameUrl = state.hf.firstFrameUrl;
1290
+ }
1291
+ } else if (state.hf.firstFrameUrl) {
1292
+ frameUrl = state.hf.firstFrameUrl;
1293
+ } else {
1294
+ log("No first frame URL available", "w");
1295
+ return;
1296
+ }
1297
+
1298
+ if (!frameUrl) {
1299
+ log("No valid frame URL to display", "w");
1300
+ return;
1301
+ }
1302
+
1303
+ log(`Displaying ${state.useFrameDepthView ? 'depth' : 'default'} first frame`, "t");
1304
+
1305
+ // Load and display the frame
1306
+ const img = new Image();
1307
+ img.crossOrigin = "anonymous";
1308
+ img.src = frameUrl;
1309
+
1310
+ img.onload = () => {
1311
+ frameCanvas.width = img.width;
1312
+ frameCanvas.height = img.height;
1313
+ frameOverlay.width = img.width;
1314
+ frameOverlay.height = img.height;
1315
+
1316
+ const ctx = frameCanvas.getContext("2d");
1317
+ ctx.clearRect(0, 0, img.width, img.height);
1318
+ ctx.drawImage(img, 0, 0);
1319
+
1320
+ frameEmpty.style.display = "none";
1321
+ log(`✓ ${state.useFrameDepthView ? 'Depth' : 'Default'} first frame displayed (${img.width}×${img.height})`, "g");
1322
+ };
1323
+
1324
+ img.onerror = (err) => {
1325
+ console.error(`Failed to load ${state.useFrameDepthView ? 'depth' : 'default'} first frame:`, err);
1326
+ log(`✗ ${state.useFrameDepthView ? 'Depth' : 'Default'} first frame load failed - reverting to default view`, "e");
1327
+
1328
+ // If depth frame fails, revert to default
1329
+ if (state.useFrameDepthView) {
1330
+ state.useFrameDepthView = false;
1331
+ updateFirstFrameDepthChip();
1332
+ displayFirstFrameWithDepth(); // Retry with default view
1333
+ }
1334
+ };
1335
+ }
1336
+
1337
  async function startHfPipeline() {
1338
  if (state.hf.busy) {
1339
  log("HF pipeline already running");
app.py CHANGED
@@ -260,7 +260,7 @@ async def detect_async_endpoint(
260
  queries: str = Form(""),
261
  detector: str = Form("hf_yolov8"),
262
  segmenter: str = Form("sam3"),
263
- depth_estimator: str = Form("depth_pro"),
264
  ):
265
  if mode not in VALID_MODES:
266
  raise HTTPException(
 
260
  queries: str = Form(""),
261
  detector: str = Form("hf_yolov8"),
262
  segmenter: str = Form("sam3"),
263
+ depth_estimator: str = Form("depth"),
264
  ):
265
  if mode not in VALID_MODES:
266
  raise HTTPException(
demo.html CHANGED
@@ -409,8 +409,7 @@
409
  <div class="input-group">
410
  <label for="depthModel">3. Select Depth Model</label>
411
  <select id="depthModel">
412
- <option value="depth_pro">Depth Pro (Apple)</option>
413
- <option value="depth_anything">Depth Anything (LiheYoung)</option>
414
  </select>
415
  </div>
416
  </div>
 
409
  <div class="input-group">
410
  <label for="depthModel">3. Select Depth Model</label>
411
  <select id="depthModel">
412
+ <option value="depth">Depth</option>
 
413
  </select>
414
  </div>
415
  </div>
inference.py CHANGED
@@ -353,7 +353,7 @@ def run_depth_inference(
353
  input_video_path: str,
354
  output_video_path: str,
355
  max_frames: Optional[int] = None,
356
- depth_estimator_name: str = "depth_pro",
357
  first_frame_depth_path: Optional[str] = None,
358
  job_id: Optional[str] = None,
359
  ) -> str:
@@ -364,7 +364,7 @@ def run_depth_inference(
364
  input_video_path: Path to input video
365
  output_video_path: Path to write depth visualization video
366
  max_frames: Optional frame limit for testing
367
- depth_estimator_name: Depth estimator to use (default: depth_pro)
368
  first_frame_depth_path: Optional path to save the first depth visualization frame
369
  job_id: Optional job ID for cancellation support
370
 
 
353
  input_video_path: str,
354
  output_video_path: str,
355
  max_frames: Optional[int] = None,
356
+ depth_estimator_name: str = "depth",
357
  first_frame_depth_path: Optional[str] = None,
358
  job_id: Optional[str] = None,
359
  ) -> str:
 
364
  input_video_path: Path to input video
365
  output_video_path: Path to write depth visualization video
366
  max_frames: Optional frame limit for testing
367
+ depth_estimator_name: Depth estimator to use (default: depth)
368
  first_frame_depth_path: Optional path to save the first depth visualization frame
369
  job_id: Optional job ID for cancellation support
370
 
jobs/models.py CHANGED
@@ -27,7 +27,7 @@ class JobInfo:
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
- depth_estimator_name: str = "depth_pro"
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
 
27
  error: Optional[str] = None
28
  first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
29
  # Depth estimation fields
30
+ depth_estimator_name: str = "depth"
31
  depth_output_path: Optional[str] = None
32
  first_frame_depth_path: Optional[str] = None
33
  partial_success: bool = False # True if one component failed but job completed
models/depth_estimators/__init__.py CHANGED
@@ -1,13 +1,11 @@
1
  """Depth estimation models for video processing."""
2
 
3
  from .base import DepthEstimator, DepthResult
4
- from .depth_pro import DepthProEstimator
5
  from .model_loader import list_depth_estimators, load_depth_estimator
6
 
7
  __all__ = [
8
  "DepthEstimator",
9
  "DepthResult",
10
- "DepthProEstimator",
11
  "load_depth_estimator",
12
  "list_depth_estimators",
13
  ]
 
1
  """Depth estimation models for video processing."""
2
 
3
  from .base import DepthEstimator, DepthResult
 
4
  from .model_loader import list_depth_estimators, load_depth_estimator
5
 
6
  __all__ = [
7
  "DepthEstimator",
8
  "DepthResult",
 
9
  "load_depth_estimator",
10
  "list_depth_estimators",
11
  ]
models/depth_estimators/depth_anything_v2.py CHANGED
@@ -11,7 +11,7 @@ from .base import DepthEstimator, DepthResult
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
  """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
- name = "depth_anything_v2"
15
 
16
  def __init__(self) -> None:
17
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
 
11
  class DepthAnythingV2Estimator(DepthEstimator):
12
  """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
+ name = "depth"
15
 
16
  def __init__(self) -> None:
17
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
models/depth_estimators/depth_pro.py DELETED
@@ -1,132 +0,0 @@
1
- import logging
2
-
3
- import numpy as np
4
- import torch
5
- from PIL import Image
6
-
7
- from .base import DepthEstimator, DepthResult
8
-
9
-
10
- class DepthProEstimator(DepthEstimator):
11
- """Apple Depth Pro depth estimator using Hugging Face transformers."""
12
-
13
- name = "depth_pro"
14
-
15
- def __init__(self):
16
- """Initialize Depth Pro model from Hugging Face."""
17
- try:
18
- from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
19
- except ImportError as exc:
20
- raise ImportError(
21
- "transformers package not installed or doesn't include DepthPro. "
22
- "Update with: pip install transformers --upgrade"
23
- ) from exc
24
-
25
- logging.info("Loading Depth Pro model from Hugging Face...")
26
-
27
- # Set device
28
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
-
30
- # Load model and processor
31
- model_id = "apple/DepthPro-hf"
32
- self.image_processor = DepthProImageProcessorFast.from_pretrained(model_id)
33
- self.model = DepthProForDepthEstimation.from_pretrained(model_id).to(self.device)
34
- self.model.eval()
35
-
36
- if torch.cuda.is_available():
37
- logging.info("Depth Pro model loaded on GPU")
38
- else:
39
- logging.warning("Depth Pro model loaded on CPU (no CUDA available)")
40
-
41
- def predict(self, frame: np.ndarray) -> DepthResult:
42
- """
43
- Run depth estimation on a single frame.
44
-
45
- Args:
46
- frame: HxWx3 BGR uint8 numpy array (OpenCV format)
47
-
48
- Returns:
49
- DepthResult with depth_map (HxW float32 in meters) and focal_length
50
- """
51
- try:
52
- # Convert BGR to RGB
53
- rgb_frame = frame[:, :, ::-1] # BGR → RGB
54
-
55
- # Convert to PIL Image
56
- pil_image = Image.fromarray(rgb_frame)
57
- height, width = pil_image.height, pil_image.width
58
-
59
- # Preprocess image
60
- inputs = self.image_processor(images=pil_image, return_tensors="pt").to(self.device)
61
-
62
- # Run inference (no gradient needed)
63
- with torch.no_grad():
64
- outputs = self.model(**inputs)
65
-
66
- # Debug: Inspect output structure
67
- logging.debug(f"Model outputs type: {type(outputs)}")
68
- logging.debug(f"Model outputs keys: {outputs.keys() if hasattr(outputs, 'keys') else 'N/A'}")
69
-
70
- # Get raw depth prediction - the shape varies by model
71
- raw_depth = outputs.predicted_depth
72
-
73
- # Log the actual shape for debugging
74
- logging.info(f"Raw depth shape: {raw_depth.shape}, dtype: {raw_depth.dtype}")
75
-
76
- # Ensure we have a 4D tensor [B, C, H, W]
77
- if raw_depth.dim() == 2:
78
- # [H, W] -> [1, 1, H, W]
79
- raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
80
- elif raw_depth.dim() == 3:
81
- # [B, H, W] or [C, H, W] -> [1, 1, H, W]
82
- raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
83
- elif raw_depth.dim() == 1:
84
- # This is unexpected - possibly a flattened output
85
- # Try to reshape based on expected output size
86
- expected_size = 1536 # Model's default output size
87
- raw_depth = raw_depth.reshape(1, 1, expected_size, expected_size)
88
-
89
- # Now resize to target size
90
- if raw_depth.shape[-2:] != (height, width):
91
- import torch.nn.functional as F
92
- raw_depth = F.interpolate(
93
- raw_depth,
94
- size=(height, width),
95
- mode='bilinear',
96
- align_corners=False
97
- )
98
-
99
- # Convert to numpy and remove batch/channel dims
100
- depth_map = raw_depth.squeeze().cpu().numpy() # Shape: [H, W]
101
-
102
- # Get focal length from outputs if available
103
- if hasattr(outputs, 'fov_deg') and outputs.fov_deg is not None:
104
- # Convert field of view to focal length
105
- fov_rad = float(outputs.fov_deg) * np.pi / 180.0
106
- focal_length = float(width / (2.0 * np.tan(fov_rad / 2.0)))
107
- else:
108
- focal_length = 1.0
109
-
110
- # Debug: Check for NaN values
111
- if np.isnan(depth_map).any():
112
- nan_count = np.isnan(depth_map).sum()
113
- total = depth_map.size
114
- logging.warning(
115
- f"Depth map contains {nan_count}/{total} ({100*nan_count/total:.1f}%) NaN values"
116
- )
117
- logging.warning(f"Depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}")
118
- valid_depths = depth_map[np.isfinite(depth_map)]
119
- if len(valid_depths) > 0:
120
- logging.warning(
121
- f"Valid depth range: {valid_depths.min():.4f} - {valid_depths.max():.4f}"
122
- )
123
-
124
- return DepthResult(depth_map=depth_map, focal_length=focal_length)
125
-
126
- except Exception as e:
127
- logging.error(f"Depth estimation failed: {e}")
128
- logging.error(f"Frame shape: {frame.shape}")
129
- # Return a blank depth map as fallback
130
- h, w = frame.shape[:2]
131
- depth_map = np.zeros((h, w), dtype=np.float32)
132
- return DepthResult(depth_map=depth_map, focal_length=1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/depth_estimators/model_loader.py CHANGED
@@ -5,14 +5,11 @@ from typing import Callable, Dict
5
 
6
  from .base import DepthEstimator
7
  from .depth_anything_v2 import DepthAnythingV2Estimator
8
- from .depth_pro import DepthProEstimator
9
 
10
 
11
  # Registry of depth estimators
12
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
13
- "depth_anything": DepthAnythingV2Estimator,
14
- "depth_anything_v2": DepthAnythingV2Estimator,
15
- "depth_pro": DepthProEstimator,
16
  }
17
 
18
 
@@ -22,7 +19,7 @@ def _get_cached_depth_estimator(name: str) -> DepthEstimator:
22
  Create and cache depth estimator instance.
23
 
24
  Args:
25
- name: Depth estimator name (e.g., "depth_pro")
26
 
27
  Returns:
28
  Depth estimator instance
@@ -52,12 +49,12 @@ def _create_depth_estimator(name: str) -> DepthEstimator:
52
  return estimator_class()
53
 
54
 
55
- def load_depth_estimator(name: str = "depth_pro") -> DepthEstimator:
56
  """
57
  Load depth estimator by name (with caching).
58
 
59
  Args:
60
- name: Depth estimator name (default: "depth_pro")
61
 
62
  Returns:
63
  Cached depth estimator instance
 
5
 
6
  from .base import DepthEstimator
7
  from .depth_anything_v2 import DepthAnythingV2Estimator
 
8
 
9
 
10
  # Registry of depth estimators
11
  _REGISTRY: Dict[str, Callable[[], DepthEstimator]] = {
12
+ "depth": DepthAnythingV2Estimator,
 
 
13
  }
14
 
15
 
 
19
  Create and cache depth estimator instance.
20
 
21
  Args:
22
+ name: Depth estimator name (e.g., "depth")
23
 
24
  Returns:
25
  Depth estimator instance
 
49
  return estimator_class()
50
 
51
 
52
+ def load_depth_estimator(name: str = "depth") -> DepthEstimator:
53
  """
54
  Load depth estimator by name (with caching).
55
 
56
  Args:
57
+ name: Depth estimator name (default: "depth")
58
 
59
  Returns:
60
  Cached depth estimator instance