Spaces:
Paused
Paused
Zhen Ye
committed on
Commit
·
6a62982
1
Parent(s):
bbef397
Add aggressive worker debug logging and BaseException catch
Browse files- inference.py +22 -13
inference.py
CHANGED
|
@@ -1047,6 +1047,7 @@ def run_inference(
|
|
| 1047 |
writer_finished = False
|
| 1048 |
|
| 1049 |
def worker_task(gpu_idx: int):
|
|
|
|
| 1050 |
detector_instance = detectors[gpu_idx]
|
| 1051 |
depth_instance = depth_estimators[gpu_idx] if gpu_idx < len(depth_estimators) else None # Handle mismatched lists safely
|
| 1052 |
|
|
@@ -1055,6 +1056,7 @@ def run_inference(
|
|
| 1055 |
|
| 1056 |
def flush_batch():
|
| 1057 |
if not batch_accum: return
|
|
|
|
| 1058 |
|
| 1059 |
indices = [item[0] for item in batch_accum]
|
| 1060 |
frames = [item[1] for item in batch_accum]
|
|
@@ -1068,8 +1070,8 @@ def run_inference(
|
|
| 1068 |
else:
|
| 1069 |
with detector_instance.lock:
|
| 1070 |
det_results = [detector_instance.predict(f, queries) for f in frames]
|
| 1071 |
-
except
|
| 1072 |
-
logging.exception("Batch detection
|
| 1073 |
det_results = [None] * len(frames)
|
| 1074 |
|
| 1075 |
# Run depth batch (if enabled)
|
|
@@ -1081,8 +1083,8 @@ def run_inference(
|
|
| 1081 |
depth_results = depth_instance.predict_batch(frames)
|
| 1082 |
else:
|
| 1083 |
depth_results = [depth_instance.predict(f) for f in frames]
|
| 1084 |
-
except
|
| 1085 |
-
logging.exception("Batch depth
|
| 1086 |
|
| 1087 |
# --- POST PROCESSING ---
|
| 1088 |
for i, (idx, frame, d_res, dep_res) in enumerate(zip(indices, frames, det_results, depth_results)):
|
|
@@ -1103,11 +1105,6 @@ def run_inference(
|
|
| 1103 |
_attach_depth_from_result(detections, dep_res, depth_scale)
|
| 1104 |
except: pass
|
| 1105 |
|
| 1106 |
-
# B. Render Boxes - DEFERRED TO WRITER THREAD FOR SEQUENTIAL TRACKING
|
| 1107 |
-
# display_labels = [_build_display_label(d) for d in detections]
|
| 1108 |
-
# if d_res:
|
| 1109 |
-
# processed = draw_boxes(processed, d_res.boxes, label_names=display_labels)
|
| 1110 |
-
|
| 1111 |
# 3. Output
|
| 1112 |
while True:
|
| 1113 |
try:
|
|
@@ -1120,34 +1117,46 @@ def run_inference(
|
|
| 1120 |
if job_id: _check_cancellation(job_id)
|
| 1121 |
|
| 1122 |
batch_accum.clear()
|
|
|
|
| 1123 |
|
| 1124 |
while True:
|
| 1125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1126 |
try:
|
| 1127 |
if item is None:
|
|
|
|
| 1128 |
flush_batch()
|
| 1129 |
break
|
| 1130 |
|
| 1131 |
frame_idx, frame_data = item
|
|
|
|
| 1132 |
|
| 1133 |
if frame_idx % 30 == 0:
|
| 1134 |
-
logging.
|
| 1135 |
|
| 1136 |
batch_accum.append((frame_idx, frame_data))
|
| 1137 |
if len(batch_accum) >= batch_size:
|
| 1138 |
flush_batch()
|
| 1139 |
-
except
|
| 1140 |
-
logging.exception("Worker
|
| 1141 |
# Emit empty/failed frames for the batch to keep sequence alive
|
| 1142 |
for idx, frm in batch_accum:
|
| 1143 |
try:
|
| 1144 |
# Fallback: Return original frame with empty detections
|
| 1145 |
queue_out.put((idx, frm, []), timeout=5.0)
|
|
|
|
| 1146 |
except:
|
| 1147 |
pass
|
| 1148 |
batch_accum.clear()
|
| 1149 |
finally:
|
| 1150 |
queue_in.task_done()
|
|
|
|
|
|
|
| 1151 |
|
| 1152 |
# 6. Start Workers
|
| 1153 |
workers = []
|
|
|
|
| 1047 |
writer_finished = False
|
| 1048 |
|
| 1049 |
def worker_task(gpu_idx: int):
|
| 1050 |
+
logging.info(f"Worker {gpu_idx} started. PID: {os.getpid()}")
|
| 1051 |
detector_instance = detectors[gpu_idx]
|
| 1052 |
depth_instance = depth_estimators[gpu_idx] if gpu_idx < len(depth_estimators) else None # Handle mismatched lists safely
|
| 1053 |
|
|
|
|
| 1056 |
|
| 1057 |
def flush_batch():
|
| 1058 |
if not batch_accum: return
|
| 1059 |
+
logging.info(f"Worker {gpu_idx} flushing batch of {len(batch_accum)} frames")
|
| 1060 |
|
| 1061 |
indices = [item[0] for item in batch_accum]
|
| 1062 |
frames = [item[1] for item in batch_accum]
|
|
|
|
| 1070 |
else:
|
| 1071 |
with detector_instance.lock:
|
| 1072 |
det_results = [detector_instance.predict(f, queries) for f in frames]
|
| 1073 |
+
except BaseException as e:
|
| 1074 |
+
logging.exception("Batch detection crashed with critical error")
|
| 1075 |
det_results = [None] * len(frames)
|
| 1076 |
|
| 1077 |
# Run depth batch (if enabled)
|
|
|
|
| 1083 |
depth_results = depth_instance.predict_batch(frames)
|
| 1084 |
else:
|
| 1085 |
depth_results = [depth_instance.predict(f) for f in frames]
|
| 1086 |
+
except BaseException as e:
|
| 1087 |
+
logging.exception("Batch depth crashed with critical error")
|
| 1088 |
|
| 1089 |
# --- POST PROCESSING ---
|
| 1090 |
for i, (idx, frame, d_res, dep_res) in enumerate(zip(indices, frames, det_results, depth_results)):
|
|
|
|
| 1105 |
_attach_depth_from_result(detections, dep_res, depth_scale)
|
| 1106 |
except: pass
|
| 1107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
# 3. Output
|
| 1109 |
while True:
|
| 1110 |
try:
|
|
|
|
| 1117 |
if job_id: _check_cancellation(job_id)
|
| 1118 |
|
| 1119 |
batch_accum.clear()
|
| 1120 |
+
logging.info(f"Worker {gpu_idx} finished flushing batch")
|
| 1121 |
|
| 1122 |
while True:
|
| 1123 |
+
try:
|
| 1124 |
+
item = queue_in.get(timeout=2.0)
|
| 1125 |
+
except Empty:
|
| 1126 |
+
# Periodic check for cancellation if main thread is slow
|
| 1127 |
+
if job_id: _check_cancellation(job_id)
|
| 1128 |
+
continue
|
| 1129 |
+
|
| 1130 |
try:
|
| 1131 |
if item is None:
|
| 1132 |
+
logging.info(f"Worker {gpu_idx} received sentinel. Flushing and exiting.")
|
| 1133 |
flush_batch()
|
| 1134 |
break
|
| 1135 |
|
| 1136 |
frame_idx, frame_data = item
|
| 1137 |
+
# logging.info(f"Worker {gpu_idx} got frame {frame_idx}") # Verbose
|
| 1138 |
|
| 1139 |
if frame_idx % 30 == 0:
|
| 1140 |
+
logging.info("Processing frame %d on device %s", frame_idx, "cpu" if num_gpus==0 else f"cuda:{gpu_idx}")
|
| 1141 |
|
| 1142 |
batch_accum.append((frame_idx, frame_data))
|
| 1143 |
if len(batch_accum) >= batch_size:
|
| 1144 |
flush_batch()
|
| 1145 |
+
except BaseException as e:
|
| 1146 |
+
logging.exception(f"Worker {gpu_idx} CRASHED processing frame. Recovering...")
|
| 1147 |
# Emit empty/failed frames for the batch to keep sequence alive
|
| 1148 |
for idx, frm in batch_accum:
|
| 1149 |
try:
|
| 1150 |
# Fallback: Return original frame with empty detections
|
| 1151 |
queue_out.put((idx, frm, []), timeout=5.0)
|
| 1152 |
+
logging.info(f"Emitted fallback frame {idx}")
|
| 1153 |
except:
|
| 1154 |
pass
|
| 1155 |
batch_accum.clear()
|
| 1156 |
finally:
|
| 1157 |
queue_in.task_done()
|
| 1158 |
+
|
| 1159 |
+
logging.info(f"Worker {gpu_idx} thread exiting normally.")
|
| 1160 |
|
| 1161 |
# 6. Start Workers
|
| 1162 |
workers = []
|