docs: cleaner end-to-end pipeline — processor handles resize/normalize, no manual scaling
Browse files
README.md
CHANGED
|
@@ -92,63 +92,57 @@ out_root = model(**inputs, coordinate_mode="root_relative")
|
|
| 92 |
|
| 93 |
### End-to-end with RTMDet person detector
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
|
|
|
| 97 |
|
| 98 |
```python
|
| 99 |
from transformers import AutoModel, AutoImageProcessor
|
| 100 |
from PIL import Image
|
| 101 |
-
import torch
|
| 102 |
|
| 103 |
-
# ── Load
|
| 104 |
-
rtmdet
|
| 105 |
rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
|
| 106 |
|
| 107 |
-
rtmw
|
|
|
|
| 108 |
|
| 109 |
-
# ──
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
|
| 113 |
|
| 114 |
-
# ──
|
| 115 |
det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
|
| 116 |
with torch.no_grad():
|
| 117 |
-
det_out = rtmdet(pixel_values=det_inputs["pixel_values"]
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
# ── Per-person pose estimation ───────────────────────────────────────────────
|
| 125 |
-
RTMW_W, RTMW_H = 192, 256
|
| 126 |
-
mean = np.array([123.675, 116.28, 103.53], np.float32)
|
| 127 |
-
std = np.array([ 58.395, 57.12, 57.375], np.float32)
|
| 128 |
-
|
| 129 |
-
for box, lbl, sc in zip(boxes, labels, scores):
|
| 130 |
-
if int(lbl) != 0 or float(sc) < 0.3: # class 0 = person
|
| 131 |
-
continue
|
| 132 |
-
|
| 133 |
-
x1, y1, x2, y2 = box[0]*sx, box[1]*sy, box[2]*sx, box[3]*sy
|
| 134 |
-
xi1, yi1 = max(0, int(x1)), max(0, int(y1))
|
| 135 |
-
xi2, yi2 = min(orig_w, int(x2)), min(orig_h, int(y2))
|
| 136 |
-
|
| 137 |
-
# Crop → resize → normalize
|
| 138 |
-
crop = img_bgr[yi1:yi2, xi1:xi2]
|
| 139 |
-
patch = cv2.resize(crop, (RTMW_W, RTMW_H), interpolation=cv2.INTER_LINEAR)
|
| 140 |
-
pv = torch.from_numpy(
|
| 141 |
-
(cv2.cvtColor(patch, cv2.COLOR_BGR2RGB).astype(np.float32) - mean) / std
|
| 142 |
-
).permute(2, 0, 1).unsqueeze(0)
|
| 143 |
-
|
| 144 |
-
# coordinate_mode="image" + bbox → keypoints in original image pixels
|
| 145 |
-
bbox = torch.tensor([[xi1, yi1, xi2, yi2]], dtype=torch.float32)
|
| 146 |
-
with torch.no_grad():
|
| 147 |
-
out = rtmw(pixel_values=pv, coordinate_mode="image", bbox=bbox)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
| 153 |
|
| 154 |
## Cocktail14 training datasets
|
|
|
|
| 92 |
|
| 93 |
### End-to-end with RTMDet person detector
|
| 94 |
|
| 95 |
+
Uses [`akore/rtmdet-tiny`](https://huggingface.co/akore/rtmdet-tiny) for detection and
|
| 96 |
+
RTMW for pose estimation. Both preprocessors handle all the resize / normalize bookkeeping
|
| 97 |
+
— no manual mean/std or scaling arithmetic required.
|
| 98 |
|
| 99 |
```python
|
| 100 |
from transformers import AutoModel, AutoImageProcessor
|
| 101 |
from PIL import Image
|
| 102 |
+
import torch
|
| 103 |
|
| 104 |
+
# ── Load once ────────────────────────────────────────────────────────────────
|
| 105 |
+
rtmdet = AutoModel.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True).eval()
|
| 106 |
rtmdet_proc = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
|
| 107 |
|
| 108 |
+
rtmw = AutoModel.from_pretrained("akore/rtmw-x-256x192", trust_remote_code=True).eval()
|
| 109 |
+
rtmw_proc = AutoImageProcessor.from_pretrained("akore/rtmw-x-256x192")
|
| 110 |
|
| 111 |
+
# ── Load image ───────────────────────────────────────────────────────────────
|
| 112 |
+
pil_img = Image.open("photo.jpg").convert("RGB")
|
| 113 |
+
orig_w, orig_h = pil_img.size # PIL gives (width, height)
|
|
|
|
| 114 |
|
| 115 |
+
# ── Detect people — boxes returned in original image pixel coords ─────────────
|
| 116 |
det_inputs = rtmdet_proc(images=pil_img, return_tensors="pt")
|
| 117 |
with torch.no_grad():
|
| 118 |
+
det_out = rtmdet(pixel_values=det_inputs["pixel_values"],
|
| 119 |
+
original_size=(orig_h, orig_w)) # ← scale happens inside
|
| 120 |
+
|
| 121 |
+
boxes = det_out.boxes[0] # (N, 4) already in original image pixels
|
| 122 |
+
labels = det_out.labels[0] # (N,)
|
| 123 |
+
scores = det_out.scores[0] # (N,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
+
# ── Batch all person crops through the RTMW preprocessor ─────────────────────
|
| 126 |
+
person_boxes = [
|
| 127 |
+
(boxes[i], scores[i]) for i in range(len(labels))
|
| 128 |
+
if int(labels[i]) == 0 and float(scores[i]) > 0.3
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
if person_boxes:
|
| 132 |
+
# PIL.Image.crop extracts each person region; the processor handles resize + normalize + batching
|
| 133 |
+
crops = [pil_img.crop(b.tolist()) for b, _ in person_boxes]
|
| 134 |
+
bboxes = torch.stack([b for b, _ in person_boxes]) # (P, 4)
|
| 135 |
+
|
| 136 |
+
inputs = rtmw_proc(images=crops, return_tensors="pt") # resize + normalize
|
| 137 |
+
with torch.no_grad():
|
| 138 |
+
out = rtmw(pixel_values=inputs["pixel_values"],
|
| 139 |
+
coordinate_mode="image", bbox=bboxes)
|
| 140 |
+
|
| 141 |
+
# out.keypoints: (P, 133, 2) — [x, y] in original image pixels
|
| 142 |
+
# out.scores: (P, 133) — confidence in [0, 1]
|
| 143 |
+
for i, (_, sc) in enumerate(person_boxes):
|
| 144 |
+
visible = (out.scores[i] > 0.3).sum()
|
| 145 |
+
print(f"Person {float(sc):.2f}: {visible} / 133 keypoints visible")
|
| 146 |
```
|
| 147 |
|
| 148 |
## Cocktail14 training datasets
|