aivertex95827 committed on
Commit
60f4903
·
verified Β·
1 Parent(s): e1d0c86

Update miner.py

Browse files
Files changed (1) hide show
  1. miner.py +107 -28
miner.py CHANGED
@@ -18,6 +18,7 @@ from collections import OrderedDict, defaultdict
18
  from PIL import Image
19
  import torchvision.transforms as T
20
  import time
 
21
 
22
  try:
23
  from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
@@ -1015,8 +1016,8 @@ TEMPLATE_F1: List[Tuple[float, float]] = [
1015
  HOMOGRAPHY_FILL_ONLY_VALID = True
1016
  KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
1017
  # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
1018
- KP_H, KP_W = 540, 960
1019
- HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
1020
 
1021
 
1022
  def _preprocess_batch(frames):
@@ -1027,6 +1028,7 @@ def _preprocess_batch(frames):
1027
  img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
1028
  batch.append(np.transpose(img, (2, 0, 1)))
1029
  return torch.from_numpy(np.stack(batch)).float()
 
1030
 
1031
 
1032
  def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
@@ -1060,19 +1062,77 @@ def _process_keypoints(kp_coords, threshold, w, h, batch_size):
1060
  def _run_hrnet_batch(frames, model, threshold, batch_size=16):
1061
  if not frames or model is None:
1062
  return []
1063
- device = next(model.parameters()).device
1064
- use_amp = device.type == "cuda"
 
 
 
 
 
 
1065
  results = []
 
1066
  for i in range(0, len(frames), batch_size):
1067
  chunk = frames[i:i + batch_size]
1068
- batch = _preprocess_batch(chunk).to(device, non_blocking=True)
1069
- with torch.inference_mode():
1070
- with torch.amp.autocast("cuda", enabled=use_amp):
1071
- heatmaps = model(batch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1072
  kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1073
- batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
 
1074
  results.extend(batch_kps)
1075
- del heatmaps, kp_coords, batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1076
  if results:
1077
  gc.collect()
1078
  return results
@@ -1256,6 +1316,7 @@ def _smooth_boxes(
1256
 
1257
  # ── Miner ─────────────────────────────────────────────────────────────────────
1258
 
 
1259
  class Miner:
1260
  def __init__(self, path_hf_repo: Path) -> None:
1261
  self.path_hf_repo = Path(path_hf_repo)
@@ -1287,16 +1348,30 @@ class Miner:
1287
  print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
1288
 
1289
  # Keypoints model: HRNet
1290
- kp_config_file = "hrnetv2_w48.yaml"
1291
- kp_weights_file = "keypoint_detect.pt"
1292
- config_path = Path(kp_config_file) if Path(kp_config_file).exists() else self.path_hf_repo / kp_config_file
1293
- weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
1294
- cfg = yaml.safe_load(open(config_path, 'r'))
1295
- hrnet = get_cls_net(cfg)
1296
- state = torch.load(weights_path, map_location=device, weights_only=False)
1297
- hrnet.load_state_dict(state)
1298
- hrnet.to(device).eval()
1299
- self.keypoints_model = hrnet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1300
  print("✅ HRNet Keypoints Model Loaded")
1301
 
1302
  # Person detection state (new-2 style)
@@ -1555,11 +1630,11 @@ class Miner:
1555
  return []
1556
  if self.keypoints_model is None:
1557
  return [[(0, 0)] * n_keypoints for _ in images]
1558
- try:
1559
- raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
1560
- except Exception as e:
1561
- print(f"Error in _keypoint_task: {e}")
1562
- return [[(0, 0)] * n_keypoints for _ in images]
1563
  raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
1564
  keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
1565
  keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
@@ -1592,12 +1667,16 @@ class Miner:
1592
  torch.cuda.empty_cache()
1593
 
1594
  # Run bbox (batched YOLO) and keypoints in parallel
1595
- future_bbox = self._executor.submit(self._bbox_task, images, offset)
1596
  future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
1597
- bbox_per_frame = future_bbox.result()
1598
  keypoints = future_kp.result()
1599
 
1600
  return [
1601
- TVFrameResult(frame_id=offset + i, boxes=bbox_per_frame[i], keypoints=keypoints[i])
 
 
 
 
1602
  for i in range(len(images))
1603
  ]
 
18
  from PIL import Image
19
  import torchvision.transforms as T
20
  import time
21
+ import onnxruntime as ort
22
 
23
  try:
24
  from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
 
1016
  HOMOGRAPHY_FILL_ONLY_VALID = True
1017
  KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
1018
  # HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
1019
+ # KP_H, KP_W = 540, 960
1020
+ KP_H, KP_W = 432, 768
1021
 
1022
 
1023
  def _preprocess_batch(frames):
 
1028
  img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
1029
  batch.append(np.transpose(img, (2, 0, 1)))
1030
  return torch.from_numpy(np.stack(batch)).float()
1031
+ # return np.stack(batch)
1032
 
1033
 
1034
  def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
 
1062
  def _run_hrnet_batch(frames, model, threshold, batch_size=16):
1063
  if not frames or model is None:
1064
  return []
1065
+ # device = next(model.parameters()).device
1066
+ # use_amp = device.type == "cuda"
1067
+ output_shape = (batch_size, 58, 270, 480)
1068
+ output_tensor = torch.empty(output_shape, dtype=torch.float32, device='cuda')
1069
+ io_binding = model.io_binding()
1070
+
1071
+ input_name = model.get_inputs()[0].name
1072
+ output_name = model.get_outputs()[0].name
1073
  results = []
1074
+
1075
  for i in range(0, len(frames), batch_size):
1076
  chunk = frames[i:i + batch_size]
1077
+ curr_bs = len(chunk)
1078
+
1079
+ # Preprocess stays on GPU
1080
+ # Ensure _preprocess_batch returns a GPU tensor
1081
+ batch_cuda = _preprocess_batch(chunk).to('cuda:0')
1082
+
1083
+ # --- I/O BINDING (The "Keep on GPU" secret) ---
1084
+ io_binding.bind_input(
1085
+ name=input_name,
1086
+ device_type='cuda',
1087
+ device_id=0,
1088
+ element_type=np.float32,
1089
+ shape=batch_cuda.shape,
1090
+ buffer_ptr=batch_cuda.data_ptr()
1091
+ )
1092
+
1093
+ io_binding.bind_output(
1094
+ name=output_name,
1095
+ device_type='cuda',
1096
+ device_id=0,
1097
+ element_type=np.float32,
1098
+ shape=(curr_bs, 58, 270, 480), # Current dynamic shape
1099
+ buffer_ptr=output_tensor.data_ptr()
1100
+ )
1101
+
1102
+ # Sync and Run (Zero CPU usage here)
1103
+ model.run_with_iobinding(io_binding)
1104
+
1105
+ # Access the output directly from GPU memory
1106
+ # We slice it to match current batch size
1107
+ heatmaps = output_tensor[:curr_bs]
1108
+
1109
+ # 3. GPU Post-processing
1110
+ # Ensure _extract_keypoints uses torch functions (not cv2/numpy)
1111
  kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1112
+ batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, curr_bs)
1113
+
1114
  results.extend(batch_kps)
1115
+ # device = 'cuda'
1116
+ # results = []
1117
+ # torch.cuda.empty_cache()
1118
+ # for i in range(0, len(frames), batch_size):
1119
+ # chunk = frames[i:i + batch_size]
1120
+ # batch = _preprocess_batch(chunk)#.to(device, non_blocking=True)
1121
+ # print(batch.shape, flush=True)
1122
+
1123
+ # input_name = model.get_inputs()[0].name
1124
+ # outputs = model.run(None, {input_name: batch})
1125
+ # print(f"Inference output shape: {outputs[0].shape}")
1126
+ # heatmaps = torch.from_numpy(outputs[0]).to(device)
1127
+ # del outputs
1128
+ # gc.collect()
1129
+ # # with torch.inference_mode():
1130
+ # # with torch.amp.autocast("cuda", enabled=use_amp):
1131
+ # # heatmaps = model(batch)
1132
+ # kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
1133
+ # batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
1134
+ # results.extend(batch_kps)
1135
+ # del heatmaps, kp_coords, batch
1136
  if results:
1137
  gc.collect()
1138
  return results
 
1316
 
1317
  # ── Miner ─────────────────────────────────────────────────────────────────────
1318
 
1319
+ HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
1320
  class Miner:
1321
  def __init__(self, path_hf_repo: Path) -> None:
1322
  self.path_hf_repo = Path(path_hf_repo)
 
1348
  print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
1349
 
1350
  # Keypoints model: HRNet
1351
+ # kp_config_file = "hrnetv2_w48.yaml"
1352
+ # kp_weights_file = "keypoint_detect.pt"
1353
+ # config_path = Path(kp_config_file) if Path(kp_config_file).exists() else self.path_hf_repo / kp_config_file
1354
+ # weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
1355
+ # cfg = yaml.safe_load(open(config_path, 'r'))
1356
+ # hrnet = get_cls_net(cfg)
1357
+ # state = torch.load(weights_path, map_location=device, weights_only=False)
1358
+ # hrnet.load_state_dict(state)
1359
+ # hrnet.to(device).eval()
1360
+ # self.keypoints_model = hrnet
1361
+
1362
+ available = ort.get_available_providers()
1363
+ print(f"Available Providers: {available}")
1364
+ if 'CUDAExecutionProvider' not in ort.get_available_providers():
1365
+ raise RuntimeError("ONNX Runtime cannot find CUDA! Check your onnxruntime-gpu installation.")
1366
+ providers = [
1367
+ ('CUDAExecutionProvider', {
1368
+ 'device_id': 0,
1369
+ 'arena_extend_strategy': 'kSameAsRequested',
1370
+ })
1371
+ ]
1372
+
1373
+ session = ort.InferenceSession(self.path_hf_repo / "hrnet_final.onnx", providers=providers)
1374
+ self.keypoints_model = session
1375
  print("✅ HRNet Keypoints Model Loaded")
1376
 
1377
  # Person detection state (new-2 style)
 
1630
  return []
1631
  if self.keypoints_model is None:
1632
  return [[(0, 0)] * n_keypoints for _ in images]
1633
+ # try:
1634
+ raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
1635
+ # except Exception as e:
1636
+ # print(f"Error in _keypoint_task: {e}")
1637
+ # return [[(0, 0)] * n_keypoints for _ in images]
1638
  raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
1639
  keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
1640
  keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
 
1667
  torch.cuda.empty_cache()
1668
 
1669
  # Run bbox (batched YOLO) and keypoints in parallel
1670
+ # future_bbox = self._executor.submit(self._bbox_task, images, offset)
1671
  future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
1672
+ # bbox_per_frame = future_bbox.result()
1673
  keypoints = future_kp.result()
1674
 
1675
  return [
1676
+ TVFrameResult(
1677
+ frame_id=offset + i,
1678
+ # boxes=bbox_per_frame[i],
1679
+ boxes=[],
1680
+ keypoints=keypoints[i])
1681
  for i in range(len(images))
1682
  ]