aivertex95827
/

test

ONNX

Model card Files Files and versions

xet

Community

aivertex95827 committed on Mar 4

Commit

bf0a356

verified ·

1 Parent(s): 60f4903

Update miner.py

Browse files

Files changed (1) hide show

miner.py +28 -107

miner.py CHANGED Viewed

@@ -18,7 +18,6 @@ from collections import OrderedDict, defaultdict
 from PIL import Image
 import torchvision.transforms as T
 import time
-import onnxruntime as ort
 try:
     from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
@@ -1016,8 +1015,8 @@ TEMPLATE_F1: List[Tuple[float, float]] = [
 HOMOGRAPHY_FILL_ONLY_VALID = True
 KP_THRESHOLD = 0.2  # new-5 style (was 0.3)
 # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
-# KP_H, KP_W = 540, 960
-KP_H, KP_W = 432, 768
 def _preprocess_batch(frames):
@@ -1028,7 +1027,6 @@ def _preprocess_batch(frames):
         img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
         batch.append(np.transpose(img, (2, 0, 1)))
     return torch.from_numpy(np.stack(batch)).float()
-    # return np.stack(batch)
 def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
@@ -1062,77 +1060,19 @@ def _process_keypoints(kp_coords, threshold, w, h, batch_size):
 def _run_hrnet_batch(frames, model, threshold, batch_size=16):
     if not frames or model is None:
         return []
-    # device = next(model.parameters()).device
-    # use_amp = device.type == "cuda"
-    output_shape = (batch_size, 58, 270, 480)
-    output_tensor = torch.empty(output_shape, dtype=torch.float32, device='cuda')
-    io_binding = model.io_binding()
-    input_name = model.get_inputs()[0].name
-    output_name = model.get_outputs()[0].name
     results = []
     for i in range(0, len(frames), batch_size):
         chunk = frames[i:i + batch_size]
-        curr_bs = len(chunk)
-        # Preprocess stays on GPU
-        # Ensure _preprocess_batch returns a GPU tensor
-        batch_cuda = _preprocess_batch(chunk).to('cuda:0')
-        # --- I/O BINDING (The "Keep on GPU" secret) ---
-        io_binding.bind_input(
-            name=input_name,
-            device_type='cuda',
-            device_id=0,
-            element_type=np.float32,
-            shape=batch_cuda.shape,
-            buffer_ptr=batch_cuda.data_ptr()
-        )
-        io_binding.bind_output(
-            name=output_name,
-            device_type='cuda',
-            device_id=0,
-            element_type=np.float32,
-            shape=(curr_bs, 58, 270, 480), # Current dynamic shape
-            buffer_ptr=output_tensor.data_ptr()
-        )
-        # Sync and Run (Zero CPU usage here)
-        model.run_with_iobinding(io_binding)
-        # Access the output directly from GPU memory
-        # We slice it to match current batch size
-        heatmaps = output_tensor[:curr_bs]
-        # 3. GPU Post-processing
-        # Ensure _extract_keypoints uses torch functions (not cv2/numpy)
         kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
-        batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, curr_bs)
         results.extend(batch_kps)
-    # device = 'cuda'
-    # results = []
-    # torch.cuda.empty_cache()
-    # for i in range(0, len(frames), batch_size):
-    #     chunk = frames[i:i + batch_size]
-    #     batch = _preprocess_batch(chunk)#.to(device, non_blocking=True)
-    #     print(batch.shape, flush=True)
-    #     input_name = model.get_inputs()[0].name
-    #     outputs = model.run(None, {input_name: batch})
-    #     print(f"Inference output shape: {outputs[0].shape}")
-    #     heatmaps = torch.from_numpy(outputs[0]).to(device)
-    #     del outputs
-    #     gc.collect()
-    #     # with torch.inference_mode():
-    #         # with torch.amp.autocast("cuda", enabled=use_amp):
-    #             # heatmaps = model(batch)
-    #     kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
-    #     batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
-    #     results.extend(batch_kps)
-    #     del heatmaps, kp_coords, batch
     if results:
         gc.collect()
     return results
@@ -1316,7 +1256,6 @@ def _smooth_boxes(
 # ── Miner ─────────────────────────────────────────────────────────────────────
-HRNET_BATCH_SIZE = 8  # larger batch = faster (if GPU mem allows)
 class Miner:
     def __init__(self, path_hf_repo: Path) -> None:
         self.path_hf_repo = Path(path_hf_repo)
@@ -1348,30 +1287,16 @@ class Miner:
             print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
         # Keypoints model: HRNet
-        # kp_config_file  = "hrnetv2_w48.yaml"
-        # kp_weights_file = "keypoint_detect.pt"
-        # config_path  = Path(kp_config_file)  if Path(kp_config_file).exists()  else self.path_hf_repo / kp_config_file
-        # weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
-        # cfg = yaml.safe_load(open(config_path, 'r'))
-        # hrnet = get_cls_net(cfg)
-        # state = torch.load(weights_path, map_location=device, weights_only=False)
-        # hrnet.load_state_dict(state)
-        # hrnet.to(device).eval()
-        # self.keypoints_model = hrnet
-        available = ort.get_available_providers()
-        print(f"Available Providers: {available}")
-        if 'CUDAExecutionProvider' not in ort.get_available_providers():
-            raise RuntimeError("ONNX Runtime cannot find CUDA! Check your onnxruntime-gpu installation.")
-        providers = [
-            ('CUDAExecutionProvider', {
-                'device_id': 0,
-                'arena_extend_strategy': 'kSameAsRequested',
-            })
-        ]
-        session = ort.InferenceSession(self.path_hf_repo / "hrnet_final.onnx", providers=providers)
-        self.keypoints_model = session
         print("✅ HRNet Keypoints Model Loaded")
         # Person detection state (new-2 style)
@@ -1630,11 +1555,11 @@ class Miner:
             return []
         if self.keypoints_model is None:
             return [[(0, 0)] * n_keypoints for _ in images]
-        # try:
-        raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
-        # except Exception as e:
-            # print(f"Error in _keypoint_task: {e}")
-            # return [[(0, 0)] * n_keypoints for _ in images]
         raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
         keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
         keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
@@ -1667,16 +1592,12 @@ class Miner:
                 torch.cuda.empty_cache()
         # Run bbox (batched YOLO) and keypoints in parallel
-        # future_bbox = self._executor.submit(self._bbox_task, images, offset)
         future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
-        # bbox_per_frame = future_bbox.result()
         keypoints = future_kp.result()
         return [
-            TVFrameResult(
-                frame_id=offset + i,
-                # boxes=bbox_per_frame[i],
-                boxes=[],
-                keypoints=keypoints[i])
             for i in range(len(images))
         ]

 from PIL import Image
 import torchvision.transforms as T
 import time
 try:
     from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
 HOMOGRAPHY_FILL_ONLY_VALID = True
 KP_THRESHOLD = 0.2  # new-5 style (was 0.3)
 # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
+KP_H, KP_W = 360, 640
+HRNET_BATCH_SIZE = 8  # larger batch = faster (if GPU mem allows)
 def _preprocess_batch(frames):
         img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
         batch.append(np.transpose(img, (2, 0, 1)))
     return torch.from_numpy(np.stack(batch)).float()
 def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
 def _run_hrnet_batch(frames, model, threshold, batch_size=16):
     if not frames or model is None:
         return []
+    device = next(model.parameters()).device
+    use_amp = device.type == "cuda"
     results = []
     for i in range(0, len(frames), batch_size):
         chunk = frames[i:i + batch_size]
+        batch = _preprocess_batch(chunk).to(device, non_blocking=True)
+        with torch.inference_mode():
+            with torch.amp.autocast("cuda", enabled=use_amp):
+                heatmaps = model(batch)
         kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
+        batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
         results.extend(batch_kps)
+        del heatmaps, kp_coords, batch
     if results:
         gc.collect()
     return results
 # ── Miner ─────────────────────────────────────────────────────────────────────
 class Miner:
     def __init__(self, path_hf_repo: Path) -> None:
         self.path_hf_repo = Path(path_hf_repo)
             print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
         # Keypoints model: HRNet
+        kp_config_file  = "hrnetv2_w48.yaml"
+        kp_weights_file = "keypoint_detect.pt"
+        config_path  = Path(kp_config_file)  if Path(kp_config_file).exists()  else self.path_hf_repo / kp_config_file
+        weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
+        cfg = yaml.safe_load(open(config_path, 'r'))
+        hrnet = get_cls_net(cfg)
+        state = torch.load(weights_path, map_location=device, weights_only=False)
+        hrnet.load_state_dict(state)
+        hrnet.to(device).eval()
+        self.keypoints_model = hrnet
         print("✅ HRNet Keypoints Model Loaded")
         # Person detection state (new-2 style)
             return []
         if self.keypoints_model is None:
             return [[(0, 0)] * n_keypoints for _ in images]
+        try:
+            raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
+        except Exception as e:
+            print(f"Error in _keypoint_task: {e}")
+            return [[(0, 0)] * n_keypoints for _ in images]
         raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
         keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
         keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
                 torch.cuda.empty_cache()
         # Run bbox (batched YOLO) and keypoints in parallel
+        future_bbox = self._executor.submit(self._bbox_task, images, offset)
         future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
+        bbox_per_frame = future_bbox.result()
         keypoints = future_kp.result()
         return [
+            TVFrameResult(frame_id=offset + i, boxes=bbox_per_frame[i], keypoints=keypoints[i])
             for i in range(len(images))
         ]