docs: cleaner end-to-end pipeline — processor handles resize/normalize, no manual scaling
Browse files
README.md
CHANGED
|
@@ -92,63 +92,57 @@ out_root = model(**inputs, coordinate_mode="root_relative")
|
|
| 92 |
|
| 93 |
### End-to-end with RTMDet person detector
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
|
|
|
| 97 |
|
| 98 |
```python
|
| 99 |
from transformers import AutoModel, AutoImageProcessor
|
| 100 |
from PIL import Image
|
| 101 |
-
import torch
|
| 102 |
|
| 103 |
-
# ── Load
|
| 104 |
-
rtmdet
|
| 105 |
rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
|
| 106 |
|
| 107 |
-
rtmw
|
|
|
|
| 108 |
|
| 109 |
-
# ──
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
|
| 113 |
|
| 114 |
-
# ──
|
| 115 |
det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
|
| 116 |
with torch.no_grad():
|
| 117 |
-
det_out = rtmdet(pixel_values=det_inputs["pixel_values"]
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
# ── Per-person pose estimation ───────────────────────────────────────────────
|
| 125 |
-
RTMW_W, RTMW_H = 192, 256
|
| 126 |
-
mean = np.array([123.675, 116.28, 103.53], np.float32)
|
| 127 |
-
std = np.array([ 58.395, 57.12, 57.375], np.float32)
|
| 128 |
-
|
| 129 |
-
for box, lbl, sc in zip(boxes, labels, scores):
|
| 130 |
-
if int(lbl) != 0 or float(sc) < 0.3: # class 0 = person
|
| 131 |
-
continue
|
| 132 |
-
|
| 133 |
-
x1, y1, x2, y2 = box[0]*sx, box[1]*sy, box[2]*sx, box[3]*sy
|
| 134 |
-
xi1, yi1 = max(0, int(x1)), max(0, int(y1))
|
| 135 |
-
xi2, yi2 = min(orig_w, int(x2)), min(orig_h, int(y2))
|
| 136 |
-
|
| 137 |
-
# Crop → resize → normalize
|
| 138 |
-
crop = img_bgr[yi1:yi2, xi1:xi2]
|
| 139 |
-
patch = cv2.resize(crop, (RTMW_W, RTMW_H), interpolation=cv2.INTER_LINEAR)
|
| 140 |
-
pv = torch.from_numpy(
|
| 141 |
-
(cv2.cvtColor(patch, cv2.COLOR_BGR2RGB).astype(np.float32) - mean) / std
|
| 142 |
-
).permute(2, 0, 1).unsqueeze(0)
|
| 143 |
-
|
| 144 |
-
# coordinate_mode="image" + bbox → keypoints in original image pixels
|
| 145 |
-
bbox = torch.tensor([[xi1, yi1, xi2, yi2]], dtype=torch.float32)
|
| 146 |
-
with torch.no_grad():
|
| 147 |
-
out = rtmw(pixel_values=pv, coordinate_mode="image", bbox=bbox)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
| 153 |
|
| 154 |
## Cocktail14 training datasets
|
|
|
|
| 92 |
|
| 93 |
### End-to-end with RTMDet person detector
|
| 94 |
|
| 95 |
+
Uses [`akore/rtmdet-tiny`](https://huggingface.co/akore/rtmdet-tiny) for detection and
|
| 96 |
+
RTMW for pose estimation. Both preprocessors handle all the resize / normalize bookkeeping
|
| 97 |
+
— no manual mean/std or scaling arithmetic required.
|
| 98 |
|
| 99 |
```python
|
| 100 |
from transformers import AutoModel, AutoImageProcessor
|
| 101 |
from PIL import Image
|
| 102 |
+
import torch
|
| 103 |
|
| 104 |
+
# ── Load once ────────────────────────────────────────────────────────────────
|
| 105 |
+
rtmdet = AutoModel.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True).eval()
|
| 106 |
rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
|
| 107 |
|
| 108 |
+
rtmw = AutoModel.from_pretrained("akore/rtmw-x-256x192", trust_remote_code=True).eval()
|
| 109 |
+
rtmw_proc = AutoImageProcessor.from_pretrained("akore/rtmw-x-256x192")
|
| 110 |
|
| 111 |
+
# ── Load image ───────────────────────────────────────────────────────────────
|
| 112 |
+
pil_img = Image.open("photo.jpg").convert("RGB")
|
| 113 |
+
orig_w, orig_h = pil_img.size # PIL gives (width, height)
|
|
|
|
| 114 |
|
| 115 |
+
# ── Detect people — boxes returned in original image pixel coords ─────────────
|
| 116 |
det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
|
| 117 |
with torch.no_grad():
|
| 118 |
+
det_out = rtmdet(pixel_values=det_inputs["pixel_values"],
|
| 119 |
+
original_size=(orig_h, orig_w)) # ← scale happens inside
|
| 120 |
+
|
| 121 |
+
boxes = det_out.boxes[0] # (N, 4) already in original image pixels
|
| 122 |
+
labels = det_out.labels[0] # (N,)
|
| 123 |
+
scores = det_out.scores[0] # (N,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
+
# ── Batch all person crops through the RTMW preprocessor ─────────────────────
|
| 126 |
+
person_boxes = [
|
| 127 |
+
(boxes[i], scores[i]) for i in range(len(labels))
|
| 128 |
+
if int(labels[i]) == 0 and float(scores[i]) > 0.3
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
if person_boxes:
|
| 132 |
+
# PIL.Image.crop extracts each person region; the processor handles resize + normalize + batching
|
| 133 |
+
crops = [pil_img.crop(b.tolist()) for b, _ in person_boxes]
|
| 134 |
+
bboxes = torch.stack([b for b, _ in person_boxes]) # (P, 4)
|
| 135 |
+
|
| 136 |
+
inputs = rtmw_proc(images=crops, return_tensors="pt") # resize + normalize
|
| 137 |
+
with torch.no_grad():
|
| 138 |
+
out = rtmw(pixel_values=inputs["pixel_values"],
|
| 139 |
+
coordinate_mode="image", bbox=bboxes)
|
| 140 |
+
|
| 141 |
+
# out.keypoints: (P, 133, 2) — [x, y] in original image pixels
|
| 142 |
+
# out.scores: (P, 133) — confidence in [0, 1]
|
| 143 |
+
for i, (_, sc) in enumerate(person_boxes):
|
| 144 |
+
visible = (out.scores[i] > 0.3).sum()
|
| 145 |
+
print(f"Person {float(sc):.2f}: {visible} / 133 keypoints visible")
|
| 146 |
```
|
| 147 |
|
| 148 |
## Cocktail14 training datasets
|