aivertex95827 committed on
Commit
60f4903
·
verified Β·
1 Parent(s): e1d0c86

Update miner.py

Browse files
Files changed (1) hide show
  1. miner.py +107 -28
miner.py CHANGED
@@ -18,6 +18,7 @@ from collections import OrderedDict, defaultdict
18
  from PIL import Image
19
  import torchvision.transforms as T
20
  import time
 
21
 
22
  try:
23
  from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
@@ -1015,8 +1016,8 @@ TEMPLATE_F1: List[Tuple[float, float]] = [
1015
  HOMOGRAPHY_FILL_ONLY_VALID = True
1016
  KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
1017
  # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
1018
- KP_H, KP_W = 540, 960
1019
- HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
1020
 
1021
 
1022
  def _preprocess_batch(frames):
@@ -1027,6 +1028,7 @@ def _preprocess_batch(frames):
1027
  img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
1028
  batch.append(np.transpose(img, (2, 0, 1)))
1029
  return torch.from_numpy(np.stack(batch)).float()
 
1030
 
1031
 
1032
  def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
@@ -1060,19 +1062,77 @@ def _process_keypoints(kp_coords, threshold, w, h, batch_size):
1060
  def _run_hrnet_batch(frames, model, threshold, batch_size=16):
1061
  if not frames or model is None:
1062
  return []
1063
- device = next(model.parameters()).device
1064
- use_amp = device.type == "cuda"
 
 
 
 
 
 
1065
  results = []
 
1066
  for i in range(0, len(frames), batch_size):
1067
  chunk = frames[i:i + batch_size]
1068
- batch = _preprocess_batch(chunk).to(device, non_blocking=True)
1069
- with torch.inference_mode():
1070
- with torch.amp.autocast("cuda", enabled=use_amp):
1071
- heatmaps = model(batch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1072
  kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1073
- batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
 
1074
  results.extend(batch_kps)
1075
- del heatmaps, kp_coords, batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1076
  if results:
1077
  gc.collect()
1078
  return results
@@ -1256,6 +1316,7 @@ def _smooth_boxes(
1256
 
1257
  # ── Miner ─────────────────────────────────────────────────────────────────────
1258
 
 
1259
  class Miner:
1260
  def __init__(self, path_hf_repo: Path) -> None:
1261
  self.path_hf_repo = Path(path_hf_repo)
@@ -1287,16 +1348,30 @@ class Miner:
1287
  print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
1288
 
1289
  # Keypoints model: HRNet
1290
- kp_config_file = "hrnetv2_w48.yaml"
1291
- kp_weights_file = "keypoint_detect.pt"
1292
- config_path = Path(kp_config_file) if Path(kp_config_file).exists() else self.path_hf_repo / kp_config_file
1293
- weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
1294
- cfg = yaml.safe_load(open(config_path, 'r'))
1295
- hrnet = get_cls_net(cfg)
1296
- state = torch.load(weights_path, map_location=device, weights_only=False)
1297
- hrnet.load_state_dict(state)
1298
- hrnet.to(device).eval()
1299
- self.keypoints_model = hrnet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1300
  print("✅ HRNet Keypoints Model Loaded")
1301
 
1302
  # Person detection state (new-2 style)
@@ -1555,11 +1630,11 @@ class Miner:
1555
  return []
1556
  if self.keypoints_model is None:
1557
  return [[(0, 0)] * n_keypoints for _ in images]
1558
- try:
1559
- raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
1560
- except Exception as e:
1561
- print(f"Error in _keypoint_task: {e}")
1562
- return [[(0, 0)] * n_keypoints for _ in images]
1563
  raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
1564
  keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
1565
  keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
@@ -1592,12 +1667,16 @@ class Miner:
1592
  torch.cuda.empty_cache()
1593
 
1594
  # Run bbox (batched YOLO) and keypoints in parallel
1595
- future_bbox = self._executor.submit(self._bbox_task, images, offset)
1596
  future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
1597
- bbox_per_frame = future_bbox.result()
1598
  keypoints = future_kp.result()
1599
 
1600
  return [
1601
- TVFrameResult(frame_id=offset + i, boxes=bbox_per_frame[i], keypoints=keypoints[i])
 
 
 
 
1602
  for i in range(len(images))
1603
  ]
 
18
  from PIL import Image
19
  import torchvision.transforms as T
20
  import time
21
+ import onnxruntime as ort
22
 
23
  try:
24
  from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
 
1016
  HOMOGRAPHY_FILL_ONLY_VALID = True
1017
  KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
1018
  # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
1019
+ # KP_H, KP_W = 540, 960
1020
+ KP_H, KP_W = 432, 768
1021
 
1022
 
1023
  def _preprocess_batch(frames):
 
1028
  img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
1029
  batch.append(np.transpose(img, (2, 0, 1)))
1030
  return torch.from_numpy(np.stack(batch)).float()
1031
+ # return np.stack(batch)
1032
 
1033
 
1034
  def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
 
1062
  def _run_hrnet_batch(frames, model, threshold, batch_size=16):
1063
  if not frames or model is None:
1064
  return []
1065
+ # device = next(model.parameters()).device
1066
+ # use_amp = device.type == "cuda"
1067
+ output_shape = (batch_size, 58, 270, 480)
1068
+ output_tensor = torch.empty(output_shape, dtype=torch.float32, device='cuda')
1069
+ io_binding = model.io_binding()
1070
+
1071
+ input_name = model.get_inputs()[0].name
1072
+ output_name = model.get_outputs()[0].name
1073
  results = []
1074
+
1075
  for i in range(0, len(frames), batch_size):
1076
  chunk = frames[i:i + batch_size]
1077
+ curr_bs = len(chunk)
1078
+
1079
+ # Preprocess stays on GPU
1080
+ # Ensure _preprocess_batch returns a GPU tensor
1081
+ batch_cuda = _preprocess_batch(chunk).to('cuda:0')
1082
+
1083
+ # --- I/O BINDING (The "Keep on GPU" secret) ---
1084
+ io_binding.bind_input(
1085
+ name=input_name,
1086
+ device_type='cuda',
1087
+ device_id=0,
1088
+ element_type=np.float32,
1089
+ shape=batch_cuda.shape,
1090
+ buffer_ptr=batch_cuda.data_ptr()
1091
+ )
1092
+
1093
+ io_binding.bind_output(
1094
+ name=output_name,
1095
+ device_type='cuda',
1096
+ device_id=0,
1097
+ element_type=np.float32,
1098
+ shape=(curr_bs, 58, 270, 480), # Current dynamic shape
1099
+ buffer_ptr=output_tensor.data_ptr()
1100
+ )
1101
+
1102
+ # Sync and Run (Zero CPU usage here)
1103
+ model.run_with_iobinding(io_binding)
1104
+
1105
+ # Access the output directly from GPU memory
1106
+ # We slice it to match current batch size
1107
+ heatmaps = output_tensor[:curr_bs]
1108
+
1109
+ # 3. GPU Post-processing
1110
+ # Ensure _extract_keypoints uses torch functions (not cv2/numpy)
1111
  kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1112
+ batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, curr_bs)
1113
+
1114
  results.extend(batch_kps)
1115
+ # device = 'cuda'
1116
+ # results = []
1117
+ # torch.cuda.empty_cache()
1118
+ # for i in range(0, len(frames), batch_size):
1119
+ # chunk = frames[i:i + batch_size]
1120
+ # batch = _preprocess_batch(chunk)#.to(device, non_blocking=True)
1121
+ # print(batch.shape, flush=True)
1122
+
1123
+ # input_name = model.get_inputs()[0].name
1124
+ # outputs = model.run(None, {input_name: batch})
1125
+ # print(f"Inference output shape: {outputs[0].shape}")
1126
+ # heatmaps = torch.from_numpy(outputs[0]).to(device)
1127
+ # del outputs
1128
+ # gc.collect()
1129
+ # # with torch.inference_mode():
1130
+ # # with torch.amp.autocast("cuda", enabled=use_amp):
1131
+ # # heatmaps = model(batch)
1132
+ # kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1133
+ # batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
1134
+ # results.extend(batch_kps)
1135
+ # del heatmaps, kp_coords, batch
1136
  if results:
1137
  gc.collect()
1138
  return results
 
1316
 
1317
  # ── Miner ─────────────────────────────────────────────────────────────────────
1318
 
1319
+ HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
1320
  class Miner:
1321
  def __init__(self, path_hf_repo: Path) -> None:
1322
  self.path_hf_repo = Path(path_hf_repo)
 
1348
  print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
1349
 
1350
  # Keypoints model: HRNet
1351
+ # kp_config_file = "hrnetv2_w48.yaml"
1352
+ # kp_weights_file = "keypoint_detect.pt"
1353
+ # config_path = Path(kp_config_file) if Path(kp_config_file).exists() else self.path_hf_repo / kp_config_file
1354
+ # weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
1355
+ # cfg = yaml.safe_load(open(config_path, 'r'))
1356
+ # hrnet = get_cls_net(cfg)
1357
+ # state = torch.load(weights_path, map_location=device, weights_only=False)
1358
+ # hrnet.load_state_dict(state)
1359
+ # hrnet.to(device).eval()
1360
+ # self.keypoints_model = hrnet
1361
+
1362
+ available = ort.get_available_providers()
1363
+ print(f"Available Providers: {available}")
1364
+ if 'CUDAExecutionProvider' not in ort.get_available_providers():
1365
+ raise RuntimeError("ONNX Runtime cannot find CUDA! Check your onnxruntime-gpu installation.")
1366
+ providers = [
1367
+ ('CUDAExecutionProvider', {
1368
+ 'device_id': 0,
1369
+ 'arena_extend_strategy': 'kSameAsRequested',
1370
+ })
1371
+ ]
1372
+
1373
+ session = ort.InferenceSession(self.path_hf_repo / "hrnet_final.onnx", providers=providers)
1374
+ self.keypoints_model = session
1375
  print("✅ HRNet Keypoints Model Loaded")
1376
 
1377
  # Person detection state (new-2 style)
 
1630
  return []
1631
  if self.keypoints_model is None:
1632
  return [[(0, 0)] * n_keypoints for _ in images]
1633
+ # try:
1634
+ raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
1635
+ # except Exception as e:
1636
+ # print(f"Error in _keypoint_task: {e}")
1637
+ # return [[(0, 0)] * n_keypoints for _ in images]
1638
  raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
1639
  keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
1640
  keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
 
1667
  torch.cuda.empty_cache()
1668
 
1669
  # Run bbox (batched YOLO) and keypoints in parallel
1670
+ # future_bbox = self._executor.submit(self._bbox_task, images, offset)
1671
  future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
1672
+ # bbox_per_frame = future_bbox.result()
1673
  keypoints = future_kp.result()
1674
 
1675
  return [
1676
+ TVFrameResult(
1677
+ frame_id=offset + i,
1678
+ # boxes=bbox_per_frame[i],
1679
+ boxes=[],
1680
+ keypoints=keypoints[i])
1681
  for i in range(len(images))
1682
  ]