akore commited on
Commit
9cf0ceb
·
verified ·
1 Parent(s): cf254a6

docs: cleaner end-to-end pipeline — processor handles resize/normalize, no manual scaling

Browse files
Files changed (1) hide show
  1. README.md +39 -45
README.md CHANGED
@@ -92,63 +92,57 @@ out_root = model(**inputs, coordinate_mode="root_relative")
92
 
93
  ### End-to-end with RTMDet person detector
94
 
95
- This example uses [`akore/rtmdet-tiny`](https://huggingface.co/akore/rtmdet-tiny) to find
96
- people and then runs RTMW on each crop, returning keypoints back in original image coordinates.
 
97
 
98
  ```python
99
  from transformers import AutoModel, AutoImageProcessor
100
  from PIL import Image
101
- import torch, cv2, numpy as np
102
 
103
- # ── Load models ─────────────────────────────────────────────────────────────
104
- rtmdet = AutoModel.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True).eval()
105
  rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
106
 
107
- rtmw = AutoModel.from_pretrained("akore/rtmw-x-256x192", trust_remote_code=True).eval()
 
108
 
109
- # ── Prepare image ────────────────────────────────────────────────────────────
110
- img_bgr = cv2.imread("photo.jpg")
111
- orig_h, orig_w = img_bgr.shape[:2]
112
- pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
113
 
114
- # ── Person detection (RTMDet outputs in 640×640 space) ───────────────────────
115
  det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
116
  with torch.no_grad():
117
- det_out = rtmdet(pixel_values=det_inputs["pixel_values"])
118
-
119
- boxes = det_out["boxes"][0].numpy() # (N, 4)
120
- labels = det_out["labels"][0].numpy() # (N,)
121
- scores = det_out["scores"][0].numpy() # (N,)
122
- sx, sy = orig_w / 640.0, orig_h / 640.0
123
-
124
- # ── Per-person pose estimation ───────────────────────────────────────────────
125
- RTMW_W, RTMW_H = 192, 256
126
- mean = np.array([123.675, 116.28, 103.53], np.float32)
127
- std = np.array([ 58.395, 57.12, 57.375], np.float32)
128
-
129
- for box, lbl, sc in zip(boxes, labels, scores):
130
- if int(lbl) != 0 or float(sc) < 0.3: # class 0 = person
131
- continue
132
-
133
- x1, y1, x2, y2 = box[0]*sx, box[1]*sy, box[2]*sx, box[3]*sy
134
- xi1, yi1 = max(0, int(x1)), max(0, int(y1))
135
- xi2, yi2 = min(orig_w, int(x2)), min(orig_h, int(y2))
136
-
137
- # Crop → resize → normalize
138
- crop = img_bgr[yi1:yi2, xi1:xi2]
139
- patch = cv2.resize(crop, (RTMW_W, RTMW_H), interpolation=cv2.INTER_LINEAR)
140
- pv = torch.from_numpy(
141
- (cv2.cvtColor(patch, cv2.COLOR_BGR2RGB).astype(np.float32) - mean) / std
142
- ).permute(2, 0, 1).unsqueeze(0)
143
-
144
- # coordinate_mode="image" + bbox → keypoints in original image pixels
145
- bbox = torch.tensor([[xi1, yi1, xi2, yi2]], dtype=torch.float32)
146
- with torch.no_grad():
147
- out = rtmw(pixel_values=pv, coordinate_mode="image", bbox=bbox)
148
 
149
- kp = out.keypoints[0] # (133, 2) — original image coords
150
- score = out.scores[0] # (133,) — confidence in [0, 1]
151
- print(f"Person {sc:.2f}: {(score > 0.3).sum()} / 133 keypoints visible")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  ```
153
 
154
  ## Cocktail14 training datasets
 
92
 
93
  ### End-to-end with RTMDet person detector
94
 
95
+ Uses [`akore/rtmdet-tiny`](https://huggingface.co/akore/rtmdet-tiny) for detection and
96
+ RTMW for pose estimation. Both preprocessors handle all the resize / normalize bookkeeping
97
+ — no manual mean/std or scaling arithmetic required.
98
 
99
  ```python
100
  from transformers import AutoModel, AutoImageProcessor
101
  from PIL import Image
102
+ import torch
103
 
104
+ # ── Load once ────────────────────────────────────────────────────────────────
105
+ rtmdet = AutoModel.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True).eval()
106
  rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
107
 
108
+ rtmw = AutoModel.from_pretrained("akore/rtmw-x-256x192", trust_remote_code=True).eval()
109
+ rtmw_proc = AutoImageProcessor.from_pretrained("akore/rtmw-x-256x192")
110
 
111
+ # ── Load image ───────────────────────────────────────────────────────────────
112
+ pil_img = Image.open("photo.jpg").convert("RGB")
113
+ orig_w, orig_h = pil_img.size # PIL gives (width, height)
 
114
 
115
+ # ── Detect people — boxes are returned in original image pixel coords ────────
116
  det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
117
  with torch.no_grad():
118
+ det_out = rtmdet(pixel_values=det_inputs["pixel_values"],
119
+ original_size=(orig_h, orig_w)) # ← scale happens inside
120
+
121
+ boxes = det_out.boxes[0] # (N, 4) already in original image pixels
122
+ labels = det_out.labels[0] # (N,)
123
+ scores = det_out.scores[0] # (N,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
+ # ── Batch all person crops through the RTMW preprocessor ─────────────────────
126
+ person_boxes = [
127
+ (boxes[i], scores[i]) for i in range(len(labels))
128
+ if int(labels[i]) == 0 and float(scores[i]) > 0.3
129
+ ]
130
+
131
+ if person_boxes:
132
+ # PIL.Image.crop extracts each person; the processor handles resize + normalize + batching
133
+ crops = [pil_img.crop(b.tolist()) for b, _ in person_boxes]
134
+ bboxes = torch.stack([b for b, _ in person_boxes]) # (P, 4)
135
+
136
+ inputs = rtmw_proc(images=crops, return_tensors="pt") # resize + normalize
137
+ with torch.no_grad():
138
+ out = rtmw(pixel_values=inputs["pixel_values"],
139
+ coordinate_mode="image", bbox=bboxes)
140
+
141
+ # out.keypoints: (P, 133, 2) — [x, y] in original image pixels
142
+ # out.scores: (P, 133) — confidence in [0, 1]
143
+ for i, (_, sc) in enumerate(person_boxes):
144
+ visible = (out.scores[i] > 0.3).sum()
145
+ print(f"Person {float(sc):.2f}: {visible} / 133 keypoints visible")
146
  ```
147
 
148
  ## Cocktail14 training datasets