Update miner.py
Browse files
miner.py
CHANGED
|
@@ -18,7 +18,6 @@ from collections import OrderedDict, defaultdict
|
|
| 18 |
from PIL import Image
|
| 19 |
import torchvision.transforms as T
|
| 20 |
import time
|
| 21 |
-
import onnxruntime as ort
|
| 22 |
|
| 23 |
try:
|
| 24 |
from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
|
|
@@ -1016,8 +1015,8 @@ TEMPLATE_F1: List[Tuple[float, float]] = [
|
|
| 1016 |
HOMOGRAPHY_FILL_ONLY_VALID = True
|
| 1017 |
KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
|
| 1018 |
# HRNet: smaller input = faster; 432x768 balances speed/accuracy (new-2 style)
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
|
| 1022 |
|
| 1023 |
def _preprocess_batch(frames):
|
|
@@ -1028,7 +1027,6 @@ def _preprocess_batch(frames):
|
|
| 1028 |
img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
|
| 1029 |
batch.append(np.transpose(img, (2, 0, 1)))
|
| 1030 |
return torch.from_numpy(np.stack(batch)).float()
|
| 1031 |
-
# return np.stack(batch)
|
| 1032 |
|
| 1033 |
|
| 1034 |
def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
|
|
@@ -1062,77 +1060,19 @@ def _process_keypoints(kp_coords, threshold, w, h, batch_size):
|
|
| 1062 |
def _run_hrnet_batch(frames, model, threshold, batch_size=16):
|
| 1063 |
if not frames or model is None:
|
| 1064 |
return []
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
output_shape = (batch_size, 58, 270, 480)
|
| 1068 |
-
output_tensor = torch.empty(output_shape, dtype=torch.float32, device='cuda')
|
| 1069 |
-
io_binding = model.io_binding()
|
| 1070 |
-
|
| 1071 |
-
input_name = model.get_inputs()[0].name
|
| 1072 |
-
output_name = model.get_outputs()[0].name
|
| 1073 |
results = []
|
| 1074 |
-
|
| 1075 |
for i in range(0, len(frames), batch_size):
|
| 1076 |
chunk = frames[i:i + batch_size]
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
batch_cuda = _preprocess_batch(chunk).to('cuda:0')
|
| 1082 |
-
|
| 1083 |
-
# --- I/O BINDING (The "Keep on GPU" secret) ---
|
| 1084 |
-
io_binding.bind_input(
|
| 1085 |
-
name=input_name,
|
| 1086 |
-
device_type='cuda',
|
| 1087 |
-
device_id=0,
|
| 1088 |
-
element_type=np.float32,
|
| 1089 |
-
shape=batch_cuda.shape,
|
| 1090 |
-
buffer_ptr=batch_cuda.data_ptr()
|
| 1091 |
-
)
|
| 1092 |
-
|
| 1093 |
-
io_binding.bind_output(
|
| 1094 |
-
name=output_name,
|
| 1095 |
-
device_type='cuda',
|
| 1096 |
-
device_id=0,
|
| 1097 |
-
element_type=np.float32,
|
| 1098 |
-
shape=(curr_bs, 58, 270, 480), # Current dynamic shape
|
| 1099 |
-
buffer_ptr=output_tensor.data_ptr()
|
| 1100 |
-
)
|
| 1101 |
-
|
| 1102 |
-
# Sync and Run (Zero CPU usage here)
|
| 1103 |
-
model.run_with_iobinding(io_binding)
|
| 1104 |
-
|
| 1105 |
-
# Access the output directly from GPU memory
|
| 1106 |
-
# We slice it to match current batch size
|
| 1107 |
-
heatmaps = output_tensor[:curr_bs]
|
| 1108 |
-
|
| 1109 |
-
# 3. GPU Post-processing
|
| 1110 |
-
# Ensure _extract_keypoints uses torch functions (not cv2/numpy)
|
| 1111 |
kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
|
| 1112 |
-
batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H,
|
| 1113 |
-
|
| 1114 |
results.extend(batch_kps)
|
| 1115 |
-
|
| 1116 |
-
# results = []
|
| 1117 |
-
# torch.cuda.empty_cache()
|
| 1118 |
-
# for i in range(0, len(frames), batch_size):
|
| 1119 |
-
# chunk = frames[i:i + batch_size]
|
| 1120 |
-
# batch = _preprocess_batch(chunk)#.to(device, non_blocking=True)
|
| 1121 |
-
# print(batch.shape, flush=True)
|
| 1122 |
-
|
| 1123 |
-
# input_name = model.get_inputs()[0].name
|
| 1124 |
-
# outputs = model.run(None, {input_name: batch})
|
| 1125 |
-
# print(f"Inference output shape: {outputs[0].shape}")
|
| 1126 |
-
# heatmaps = torch.from_numpy(outputs[0]).to(device)
|
| 1127 |
-
# del outputs
|
| 1128 |
-
# gc.collect()
|
| 1129 |
-
# # with torch.inference_mode():
|
| 1130 |
-
# # with torch.amp.autocast("cuda", enabled=use_amp):
|
| 1131 |
-
# # heatmaps = model(batch)
|
| 1132 |
-
# kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
|
| 1133 |
-
# batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
|
| 1134 |
-
# results.extend(batch_kps)
|
| 1135 |
-
# del heatmaps, kp_coords, batch
|
| 1136 |
if results:
|
| 1137 |
gc.collect()
|
| 1138 |
return results
|
|
@@ -1316,7 +1256,6 @@ def _smooth_boxes(
|
|
| 1316 |
|
| 1317 |
# ── Miner ─────────────────────────────────────────────────────────────────────
|
| 1318 |
|
| 1319 |
-
HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
|
| 1320 |
class Miner:
|
| 1321 |
def __init__(self, path_hf_repo: Path) -> None:
|
| 1322 |
self.path_hf_repo = Path(path_hf_repo)
|
|
@@ -1348,30 +1287,16 @@ class Miner:
|
|
| 1348 |
print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
|
| 1349 |
|
| 1350 |
# Keypoints model: HRNet
|
| 1351 |
-
|
| 1352 |
-
|
| 1353 |
-
|
| 1354 |
-
|
| 1355 |
-
|
| 1356 |
-
|
| 1357 |
-
|
| 1358 |
-
|
| 1359 |
-
|
| 1360 |
-
|
| 1361 |
-
|
| 1362 |
-
available = ort.get_available_providers()
|
| 1363 |
-
print(f"Available Providers: {available}")
|
| 1364 |
-
if 'CUDAExecutionProvider' not in ort.get_available_providers():
|
| 1365 |
-
raise RuntimeError("ONNX Runtime cannot find CUDA! Check your onnxruntime-gpu installation.")
|
| 1366 |
-
providers = [
|
| 1367 |
-
('CUDAExecutionProvider', {
|
| 1368 |
-
'device_id': 0,
|
| 1369 |
-
'arena_extend_strategy': 'kSameAsRequested',
|
| 1370 |
-
})
|
| 1371 |
-
]
|
| 1372 |
-
|
| 1373 |
-
session = ort.InferenceSession(self.path_hf_repo / "hrnet_final.onnx", providers=providers)
|
| 1374 |
-
self.keypoints_model = session
|
| 1375 |
print("✅ HRNet Keypoints Model Loaded")
|
| 1376 |
|
| 1377 |
# Person detection state (new-2 style)
|
|
@@ -1630,11 +1555,11 @@ class Miner:
|
|
| 1630 |
return []
|
| 1631 |
if self.keypoints_model is None:
|
| 1632 |
return [[(0, 0)] * n_keypoints for _ in images]
|
| 1633 |
-
|
| 1634 |
-
|
| 1635 |
-
|
| 1636 |
-
|
| 1637 |
-
|
| 1638 |
raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
|
| 1639 |
keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
|
| 1640 |
keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
|
|
@@ -1667,16 +1592,12 @@ class Miner:
|
|
| 1667 |
torch.cuda.empty_cache()
|
| 1668 |
|
| 1669 |
# Run bbox (batched YOLO) and keypoints in parallel
|
| 1670 |
-
|
| 1671 |
future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
|
| 1672 |
-
|
| 1673 |
keypoints = future_kp.result()
|
| 1674 |
|
| 1675 |
return [
|
| 1676 |
-
TVFrameResult(
|
| 1677 |
-
frame_id=offset + i,
|
| 1678 |
-
# boxes=bbox_per_frame[i],
|
| 1679 |
-
boxes=[],
|
| 1680 |
-
keypoints=keypoints[i])
|
| 1681 |
for i in range(len(images))
|
| 1682 |
]
|
|
|
|
| 18 |
from PIL import Image
|
| 19 |
import torchvision.transforms as T
|
| 20 |
import time
|
|
|
|
| 21 |
|
| 22 |
try:
|
| 23 |
from scipy.optimize import linear_sum_assignment as _linear_sum_assignment
|
|
|
|
| 1015 |
HOMOGRAPHY_FILL_ONLY_VALID = True
|
| 1016 |
KP_THRESHOLD = 0.2 # new-5 style (was 0.3)
|
| 1017 |
# HRNet: smaller input = faster; 360x640 (KP_H x KP_W) balances speed/accuracy (new-2 style)
|
| 1018 |
+
KP_H, KP_W = 360, 640
|
| 1019 |
+
HRNET_BATCH_SIZE = 8 # larger batch = faster (if GPU mem allows)
|
| 1020 |
|
| 1021 |
|
| 1022 |
def _preprocess_batch(frames):
|
|
|
|
| 1027 |
img = cv2.resize(img, (target_w, target_h)).astype(np.float32) / 255.0
|
| 1028 |
batch.append(np.transpose(img, (2, 0, 1)))
|
| 1029 |
return torch.from_numpy(np.stack(batch)).float()
|
|
|
|
| 1030 |
|
| 1031 |
|
| 1032 |
def _extract_keypoints(heatmap: torch.Tensor, scale: int = 2):
|
|
|
|
| 1060 |
def _run_hrnet_batch(frames, model, threshold, batch_size=16):
|
| 1061 |
if not frames or model is None:
|
| 1062 |
return []
|
| 1063 |
+
device = next(model.parameters()).device
|
| 1064 |
+
use_amp = device.type == "cuda"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1065 |
results = []
|
|
|
|
| 1066 |
for i in range(0, len(frames), batch_size):
|
| 1067 |
chunk = frames[i:i + batch_size]
|
| 1068 |
+
batch = _preprocess_batch(chunk).to(device, non_blocking=True)
|
| 1069 |
+
with torch.inference_mode():
|
| 1070 |
+
with torch.amp.autocast("cuda", enabled=use_amp):
|
| 1071 |
+
heatmaps = model(batch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1072 |
kp_coords = _extract_keypoints(heatmaps[:, :-1, :, :], scale=2)
|
| 1073 |
+
batch_kps = _process_keypoints(kp_coords, threshold, KP_W, KP_H, len(chunk))
|
|
|
|
| 1074 |
results.extend(batch_kps)
|
| 1075 |
+
del heatmaps, kp_coords, batch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
if results:
|
| 1077 |
gc.collect()
|
| 1078 |
return results
|
|
|
|
| 1256 |
|
| 1257 |
# ── Miner ─────────────────────────────────────────────────────────────────────
|
| 1258 |
|
|
|
|
| 1259 |
class Miner:
|
| 1260 |
def __init__(self, path_hf_repo: Path) -> None:
|
| 1261 |
self.path_hf_repo = Path(path_hf_repo)
|
|
|
|
| 1287 |
print(f"⚠️ OSNet weights not found at {osnet_weight_path}. Using HSV fallback.")
|
| 1288 |
|
| 1289 |
# Keypoints model: HRNet
|
| 1290 |
+
kp_config_file = "hrnetv2_w48.yaml"
|
| 1291 |
+
kp_weights_file = "keypoint_detect.pt"
|
| 1292 |
+
config_path = Path(kp_config_file) if Path(kp_config_file).exists() else self.path_hf_repo / kp_config_file
|
| 1293 |
+
weights_path = Path(kp_weights_file) if Path(kp_weights_file).exists() else self.path_hf_repo / kp_weights_file
|
| 1294 |
+
cfg = yaml.safe_load(open(config_path, 'r'))
|
| 1295 |
+
hrnet = get_cls_net(cfg)
|
| 1296 |
+
state = torch.load(weights_path, map_location=device, weights_only=False)
|
| 1297 |
+
hrnet.load_state_dict(state)
|
| 1298 |
+
hrnet.to(device).eval()
|
| 1299 |
+
self.keypoints_model = hrnet
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1300 |
print("✅ HRNet Keypoints Model Loaded")
|
| 1301 |
|
| 1302 |
# Person detection state (new-2 style)
|
|
|
|
| 1555 |
return []
|
| 1556 |
if self.keypoints_model is None:
|
| 1557 |
return [[(0, 0)] * n_keypoints for _ in images]
|
| 1558 |
+
try:
|
| 1559 |
+
raw_kps = _run_hrnet_batch(images, self.keypoints_model, KP_THRESHOLD, batch_size=HRNET_BATCH_SIZE)
|
| 1560 |
+
except Exception as e:
|
| 1561 |
+
print(f"Error in _keypoint_task: {e}")
|
| 1562 |
+
return [[(0, 0)] * n_keypoints for _ in images]
|
| 1563 |
raw_kps = [_apply_keypoint_mapping(kp) for kp in raw_kps] if raw_kps else []
|
| 1564 |
keypoints = _normalize_keypoints(raw_kps, images, n_keypoints) if raw_kps else [[(0, 0)] * n_keypoints for _ in images]
|
| 1565 |
keypoints = [_fix_keypoints(kps, n_keypoints) for kps in keypoints]
|
|
|
|
| 1592 |
torch.cuda.empty_cache()
|
| 1593 |
|
| 1594 |
# Run bbox (batched YOLO) and keypoints in parallel
|
| 1595 |
+
future_bbox = self._executor.submit(self._bbox_task, images, offset)
|
| 1596 |
future_kp = self._executor.submit(self._keypoint_task, images, n_keypoints)
|
| 1597 |
+
bbox_per_frame = future_bbox.result()
|
| 1598 |
keypoints = future_kp.result()
|
| 1599 |
|
| 1600 |
return [
|
| 1601 |
+
TVFrameResult(frame_id=offset + i, boxes=bbox_per_frame[i], keypoints=keypoints[i])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1602 |
for i in range(len(images))
|
| 1603 |
]
|