Upload Train + exports
Browse files- .gitattributes +3 -0
- exports/mtg_4kp_s.onnx +3 -0
- exports/mtg_4kp_s.onnx.data +3 -0
- exports/mtg_4kp_s_fp16.onnx +3 -0
- exports/mtg_4kp_s_int8_conv_int4.onnx +3 -0
- exports/mtg_4kp_s_int8_conv_int4g32.onnx +3 -0
- exports/mtg_4kp_s_int8_conv_wo.onnx +3 -0
- exports/mtg_4kp_s_int8_conv_wo_fp16res.onnx +3 -0
- exports/mtg_4kp_s_int8_conv_wo_opt.onnx +3 -0
- exports/mtg_4kp_s_int8_dynamic.onnx +3 -0
- exports/mtg_4kp_s_nosim.onnx +3 -0
- exports/mtg_4kp_s_nosim.onnx.data +3 -0
- exports/mtg_4kp_s_r4b.onnx +3 -0
- exports/mtg_4kp_s_r4b.onnx.data +3 -0
- exports/mtg_4kp_s_r4b_fp16.onnx +3 -0
- run_20260519_012809/checkpoint.pth +3 -0
- run_20260519_012809/checkpoint0000.pth +3 -0
- run_20260519_012809/checkpoint0001.pth +3 -0
- run_20260519_012809/checkpoint_best_regular.pth +3 -0
- run_20260519_012809/effective_config.py +363 -0
- run_20260519_012809/eval/000.pth +3 -0
- run_20260519_012809/eval/latest.pth +3 -0
- run_20260519_012809/log.txt +2 -0
- run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458155.0 +3 -0
- run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458156.0 +3 -0
- run_20260519_012809/train.log +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
exports/mtg_4kp_s_nosim.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
exports/mtg_4kp_s_r4b.onnx.data filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
exports/mtg_4kp_s.onnx.data filter=lfs diff=lfs merge=lfs -text
|
exports/mtg_4kp_s.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c72f71bf017880ebbcde10f2ef630fa3e2fa3167124e7668434f6f726136e70
|
| 3 |
+
size 47111454
|
exports/mtg_4kp_s.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17cb72be4056e6d6a55210a448cc3bbf23610e416b355d99513a73ae73860c96
|
| 3 |
+
size 46792704
|
exports/mtg_4kp_s_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4ed2defe0403ef700cb606982ee8e2c5e190c0fb315d4106c4f9ebb404f491b
|
| 3 |
+
size 23666438
|
exports/mtg_4kp_s_int8_conv_int4.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba99236110bbad78a1ab75091a573f4136d33581cd75485b440a25c7f516ebf0
|
| 3 |
+
size 10561260
|
exports/mtg_4kp_s_int8_conv_int4g32.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:232b8daa8d1fe47934541839c01984e6c40ea78dfecc992279fe0001c4d951cd
|
| 3 |
+
size 11262351
|
exports/mtg_4kp_s_int8_conv_wo.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39e6493a2519b30ae120f2e173319564af31fb4cac12d439ed8cf3b934ae742e
|
| 3 |
+
size 13068978
|
exports/mtg_4kp_s_int8_conv_wo_fp16res.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60608af82ba1f9192782fef79719ef6b7793b23b54ae6bdab7a5dc4b4d398d26
|
| 3 |
+
size 12458839
|
exports/mtg_4kp_s_int8_conv_wo_opt.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:240b38b6365dfb94ca61f4c13c51892f8c78e95de062a52d3e63dad5ece83ce4
|
| 3 |
+
size 13045930
|
exports/mtg_4kp_s_int8_dynamic.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a567c68b46b8f8f9f147827f4ce0301e4c4a59a6d783c82ab7a4b8d9ea5eeb7
|
| 3 |
+
size 28120147
|
exports/mtg_4kp_s_nosim.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3f2c47a664f550710588efa67ec607de3d8669902b5e08e34a68e4bdaa711ed
|
| 3 |
+
size 3061750
|
exports/mtg_4kp_s_nosim.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5f8e83484053d6cc2152f5a8f1d6218e0ba19e40cacf3cbea790ae6c451081c
|
| 3 |
+
size 46792704
|
exports/mtg_4kp_s_r4b.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d33788e992e4882e37b1d2307994942552413092fdebaf998728cdbda4b5a99
|
| 3 |
+
size 14850609
|
exports/mtg_4kp_s_r4b.onnx.data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8977c772c498a74eeed131322efe13d9ce9bfde4a3bfa5242ba583615f2a3a22
|
| 3 |
+
size 14745600
|
exports/mtg_4kp_s_r4b_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e203415be3c526d486082f5a9b2de15bc03d6b443c5dc2a7f8bf5f3031a48471
|
| 3 |
+
size 7495617
|
run_20260519_012809/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a66ae6431459f3775eed0d847b8fa3f697d4d7ee5b3ffc2e72be993bc079b0d
|
| 3 |
+
size 59580875
|
run_20260519_012809/checkpoint0000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8910ba9beb071960bd6ac07e1cdf95324b03be4840a88e4769e868da281c4a0f
|
| 3 |
+
size 59590695
|
run_20260519_012809/checkpoint0001.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a939b95e08fdc272e44fcf15e02130d9ffb8d3520d463bfdd08025174a5bf268
|
| 3 |
+
size 59590695
|
run_20260519_012809/checkpoint_best_regular.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18b495cf63a29735287d2ac97299f87cb206c220b0fb0e28242feb9ff9a4b7a6
|
| 3 |
+
size 59612854
|
run_20260519_012809/effective_config.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MTG card 4-keypoint training config for DETRPose-S.
|
| 2 |
+
|
| 3 |
+
This config is a LazyCall override of the upstream DETRPose-S (HGNetV2-B0) config.
|
| 4 |
+
It inherits the full upstream architecture and only overrides what differs for
|
| 5 |
+
single-class MTG card corner detection with 4 keypoints.
|
| 6 |
+
|
| 7 |
+
Placement: detrpose/configs/mtg_card_4kp.py
|
| 8 |
+
Load with:
|
| 9 |
+
python train.py --config_file configs/mtg_card_4kp.py --device cuda --amp
|
| 10 |
+
|
| 11 |
+
NOTE (T14): Three upstream files hardcode OKS sigma dispatch tables that crash
|
| 12 |
+
for num_body_points not in {3, 14, 17}:
|
| 13 |
+
- src/models/detrpose/matcher.py:29 (HungarianMatcher.__init__)
|
| 14 |
+
- src/misc/keypoint_loss.py:49 (OKSLoss.__init__)
|
| 15 |
+
- src/models/detrpose/dn_component.py:19 (get_sigmas)
|
| 16 |
+
Task 14 must add a num_keypoints==4 branch to each using:
|
| 17 |
+
oks_sigmas = [0.025, 0.025, 0.025, 0.025]
|
| 18 |
+
Until then, training will crash at model init with NotImplementedError/ValueError.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# Base: import from the DETRPose-S include hierarchy (relative to this file's
|
| 23 |
+
# location, which is detrpose/configs/; the include files live at
|
| 24 |
+
# detrpose/configs/detrpose/include/).
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
from .detrpose.detrpose_hgnetv2_s import (
|
| 27 |
+
model,
|
| 28 |
+
criterion,
|
| 29 |
+
training_params,
|
| 30 |
+
postprocessor,
|
| 31 |
+
ema,
|
| 32 |
+
optimizer,
|
| 33 |
+
lr_scheduler,
|
| 34 |
+
)
|
| 35 |
+
from .detrpose.include.dataset import dataset_train, dataset_val, evaluator
|
| 36 |
+
|
| 37 |
+
from src.core import LazyCall as L
|
| 38 |
+
from src.data.coco_eval import CocoEvaluator
|
| 39 |
+
import src.data.transforms as T
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Spec §: Task-level constants (consumed by T14 patches and future reference)
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
# Number of card corners; drives model/criterion/postprocessor.
|
| 46 |
+
# NOTE(T14): oks sigma dispatch in matcher.py / keypoint_loss.py / dn_component.py
|
| 47 |
+
# must add a branch for this value using oks_sigmas below.
|
| 48 |
+
num_body_points = 4
|
| 49 |
+
|
| 50 |
+
# OKS sigmas — uniform for all 4 corners (no anatomical significance).
|
| 51 |
+
# Normalized to ~1/40 of image fraction following COCO convention (σ=0.025 ≈ 1/(2*20)).
|
| 52 |
+
# TODO(T14): wire these into the 3 sigma dispatch tables instead of hardcoding.
|
| 53 |
+
oks_sigmas = [0.025, 0.025, 0.025, 0.025]
|
| 54 |
+
|
| 55 |
+
# Single class: "mtg_card" (background is implicit, class 0 is the card)
|
| 56 |
+
num_classes = 1
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Spec §: Model overrides — num_body_points and num_classes in 3 places
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
# (1) Transformer: drives keypoint head output dimension
|
| 63 |
+
model.transformer.num_body_points = num_body_points
|
| 64 |
+
model.transformer.num_classes = num_classes
|
| 65 |
+
|
| 66 |
+
# (2) Criterion: drives loss computation over keypoints
|
| 67 |
+
criterion.num_classes = num_classes
|
| 68 |
+
criterion.num_body_points = num_body_points
|
| 69 |
+
|
| 70 |
+
# Loss weights — mapped from spec to upstream key names:
|
| 71 |
+
# spec "cls" → upstream "loss_vfl" (varifocal classification loss)
|
| 72 |
+
# spec "keypoints_l1" → upstream "loss_keypoints" (L1 keypoint regression)
|
| 73 |
+
# spec "keypoints_oks" → upstream "loss_oks" (OKS keypoint loss)
|
| 74 |
+
# Note: upstream DETRPose does NOT have separate bbox_l1 / bbox_giou loss keys;
|
| 75 |
+
# bounding boxes are recovered from keypoint predictions, not via a dedicated
|
| 76 |
+
# bbox branch. The spec's bbox_l1=5.0 / bbox_giou=2.0 weights have no upstream
|
| 77 |
+
# equivalent and are omitted here.
|
| 78 |
+
criterion.weight_dict = {
|
| 79 |
+
'loss_vfl': 2.0, # spec: cls=2.0
|
| 80 |
+
'loss_keypoints': 10.0, # spec: keypoints_l1=10.0
|
| 81 |
+
'loss_oks': 4.0, # spec: keypoints_oks=4.0
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
# Matcher costs — mirror weight_dict ratios so Hungarian assignment is consistent
|
| 85 |
+
criterion.matcher.cost_class = 2.0
|
| 86 |
+
criterion.matcher.cost_keypoints = 10.0
|
| 87 |
+
criterion.matcher.cost_oks = 4.0
|
| 88 |
+
criterion.matcher.num_body_points = num_body_points
|
| 89 |
+
|
| 90 |
+
# (3) PostProcessor: drives output decoding
|
| 91 |
+
postprocessor.num_body_points = num_body_points
|
| 92 |
+
|
| 93 |
+
# ---------------------------------------------------------------------------
|
| 94 |
+
# Round 4a — Model surgery for inference latency (2026-05-18)
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# DETRPose-S was tuned for COCO multi-person pose (60 queries to find ≤ K
|
| 97 |
+
# people, 6 decoder layers to refine 17-keypoint anatomy). For MTG card
|
| 98 |
+
# corner detection we have 1 object per image with 4 deterministic corners,
|
| 99 |
+
# so the upstream defaults are wildly over-provisioned. Three config-only
|
| 100 |
+
# overrides cut a meaningful chunk of the decoder cost without invasive
|
| 101 |
+
# code changes:
|
| 102 |
+
#
|
| 103 |
+
# - num_queries 60 → 10: cross-attention scales linearly with queries
|
| 104 |
+
# and we only ever consume the top-1. 10 still gives the Hungarian
|
| 105 |
+
# matcher slack (≥ 4 ensures every keypoint slot has a query) without
|
| 106 |
+
# paying for the 50 unused ones.
|
| 107 |
+
# - num_decoder_layers 6 → 3: halves decoder compute. DETR keypoint
|
| 108 |
+
# refinement converges fast on a single-object task; 3 layers is
|
| 109 |
+
# plenty.
|
| 110 |
+
# - dec_n_points 4 → 2: each decoder query used 4 sampling points
|
| 111 |
+
# per feature level via deformable attention, which is what the 10
|
| 112 |
+
# WebGPU `GridSample` ops implement. Cutting to 2 halves those
|
| 113 |
+
# dispatches — the WebGPU EP's biggest pain point on Mali / GCN.
|
| 114 |
+
#
|
| 115 |
+
# `postprocessor.num_select` must match `num_queries` (it does top-k over
|
| 116 |
+
# all available queries; if num_select > num_queries the index math
|
| 117 |
+
# silently wraps).
|
| 118 |
+
#
|
| 119 |
+
# Expected impact: -20 to -30 % inference on top of the FP16 model
|
| 120 |
+
# (Ampere 191 → ~140 ms, GCN-5 342 → ~250 ms, Apple Tahoe 218 → ~170 ms).
|
| 121 |
+
#
|
| 122 |
+
# Costs:
|
| 123 |
+
# - Existing checkpoints are NOT compatible — different head sizes,
|
| 124 |
+
# different decoder depth. Training restarts from scratch.
|
| 125 |
+
# - Smoke run before committing GPU: `python scripts/_make_small_dataset.py`
|
| 126 |
+
# + `python scripts/train.py --config detrpose/configs/mtg_card_4kp_smoke.py --single-gpu`
|
| 127 |
+
# verifies convergence in ~15 min before the 3-8 h full run.
|
| 128 |
+
#
|
| 129 |
+
# Round 4b (2026-05-19): R4a converged to AP=0.997 in a SINGLE epoch (run
|
| 130 |
+
# runs/run_20260518_210733/, checkpoint backed up as r4a_best_epoch0_AP997.pth).
|
| 131 |
+
# That's a saturated benchmark with the AP near 1.0 in epoch 0 — the model
|
| 132 |
+
# still has way too much capacity for a 4-corner-of-1-rigid-card task with
|
| 133 |
+
# synthetic data. R4b shrinks the transformer 3-4× further on top of R4a:
|
| 134 |
+
#
|
| 135 |
+
# - hidden_dim 256 → 128: quarters the attention compute (O(d²) per token).
|
| 136 |
+
# Touches BOTH encoder and transformer; both must be set or the model
|
| 137 |
+
# init fails on dim mismatch at the encoder→transformer boundary.
|
| 138 |
+
# - dim_feedforward 1024 → 512: halves FFN compute.
|
| 139 |
+
# - nhead 8 → 4: halves attention head projections.
|
| 140 |
+
# - num_decoder_layers 3 → 1: single decoder pass. With AP saturating in
|
| 141 |
+
# 1 epoch already, one decoder layer is plenty for this task.
|
| 142 |
+
# - num_queries 10 → 4: minimum that still gives Hungarian a slot per
|
| 143 |
+
# keypoint. We never use more than top-1 anyway.
|
| 144 |
+
#
|
| 145 |
+
# Param count goes 11.35 M → ~5-6 M estimated (the run's log.txt reports n_parameters=3601989, i.e. ~3.6 M; backbone HGNetv2-B0 dominates and
|
| 146 |
+
# can't shrink without a bigger surgery). FLOPS roughly halve again.
|
| 147 |
+
#
|
| 148 |
+
# Expected inference latency on FP16+WebGPU: Ampere 191 → ~90 ms,
|
| 149 |
+
# GCN-5 342 → ~200 ms. macOS Tahoe → ~120 ms.
|
| 150 |
+
model.transformer.hidden_dim = 128
|
| 151 |
+
model.transformer.dim_feedforward = 512
|
| 152 |
+
model.transformer.nhead = 4
|
| 153 |
+
model.transformer.num_decoder_layers = 1
|
| 154 |
+
model.transformer.num_queries = 4
|
| 155 |
+
model.transformer.dec_n_points = 2
|
| 156 |
+
|
| 157 |
+
model.encoder.hidden_dim = 128
|
| 158 |
+
model.encoder.dim_feedforward = 512
|
| 159 |
+
model.encoder.nhead = 4
|
| 160 |
+
|
| 161 |
+
postprocessor.num_select = 4
|
| 162 |
+
|
| 163 |
+
# ---------------------------------------------------------------------------
|
| 164 |
+
# Spec §: Training hyperparameters
|
| 165 |
+
# ---------------------------------------------------------------------------
|
| 166 |
+
|
| 167 |
+
# Upstream default is 100 epochs; spec says 150 — override.
|
| 168 |
+
training_params.epochs = 150
|
| 169 |
+
|
| 170 |
+
# Skip the full val eval that upstream fires before the first training
|
| 171 |
+
# epoch on --resume. It's 30+ min of compute on 307k imgs, and leaves
|
| 172 |
+
# evaluator state accumulated in RAM which feeds the gradual OOM growth we
|
| 173 |
+
# observed in epoch 1. The end-of-epoch eval (run every epoch anyway)
|
| 174 |
+
# provides the same signal.
|
| 175 |
+
training_params.skip_initial_eval = True
|
| 176 |
+
|
| 177 |
+
# Cap val eval at a deterministic subset of N images.
|
| 178 |
+
#
|
| 179 |
+
# Without this, every epoch's end-of-epoch eval runs over all 307k val imgs
|
| 180 |
+
# (9623 batches @ bs=32). pycocotools' CocoEvaluator.update() appends an
|
| 181 |
+
# [C, A, B] numpy array of per-image match dicts per batch — confirmed by
|
| 182 |
+
# scripts/_repro_eval_leak.py to grow at ~0.4 MB/batch (≈4 GB over the full
|
| 183 |
+
# val per rank). With 2-rank DDP on a 60 GB host, the val dataloader was
|
| 184 |
+
# OOM-killed at ~25-32 % of eval (runs/run_20260413_234839 iter 2360 and
|
| 185 |
+
# run_20260414_102022 iter 3010). Subsampling caps that accumulator without
|
| 186 |
+
# touching the source dataset on disk.
|
| 187 |
+
#
|
| 188 |
+
# N=10_000 keeps eval at ~1 min/epoch, accumulator ≤ 130 MB / rank, and gives
|
| 189 |
+
# a stable per-epoch AP signal. Run a full-set eval offline at the end of
|
| 190 |
+
# training for the headline benchmark. Set to 0/None to disable the cap.
|
| 191 |
+
training_params.max_eval_samples = 10000
|
| 192 |
+
|
| 193 |
+
# torch.compile is honored by our trainer.py patch but disabled here.
|
| 194 |
+
# Tried with mode="default", fullgraph=False — crashed in backward with:
|
| 195 |
+
# "RuntimeError: one of the variables needed for gradient computation has
|
| 196 |
+
# been modified by an inplace operation: [HalfTensor [32, 256, 80, 80]]"
|
| 197 |
+
# Root cause: inductor reordering + AMP + DDP gradient buckets clashes with
|
| 198 |
+
# the inplace activations in HGNetv2 backbone. Not worth the debugging rabbit
|
| 199 |
+
# hole; bigger speedups come from reducing epochs or dataset size.
|
| 200 |
+
training_params.compile_model = False
|
| 201 |
+
training_params.compile_mode = "default"
|
| 202 |
+
|
| 203 |
+
# Gradient clipping (spec: grad_clip=0.1; upstream default is also 0.1 —
|
| 204 |
+
# keeping explicit for clarity)
|
| 205 |
+
training_params.clip_max_norm = 0.1
|
| 206 |
+
|
| 207 |
+
# Batch size: bumped 16 → 64 (32/GPU) to saturate 2× RTX 3090 — at 16 the
|
| 208 |
+
# GPUs idled at ~50% with only 6–7 GB of 24 GB VRAM used (data-loading bound).
|
| 209 |
+
# Requires LR scaling below (square-root rule: 2× = 0.0001 → 0.0002; linear 4× = 0.0004 was tried and went unstable).
|
| 210 |
+
dataset_train.total_batch_size = 64
|
| 211 |
+
|
| 212 |
+
# Data-loading: 6 workers per rank (12 total). Reduced from 8 because we hit
|
| 213 |
+
# OOM-kill at iter 11000/11226 of epoch 1 on a 60 GB host — during epoch 1
|
| 214 |
+
# training, RAM grew from ~31 GB (epoch start) to OOM (~60+ GB) over 11k
|
| 215 |
+
# iters, a gradual leak of ~2-3 MB/iter we couldn't fully attribute in-run.
|
| 216 |
+
# Hypothesis: fragmentation + residual state from the on-resume val eval +
|
| 217 |
+
# pin_memory accumulation. Cutting workers 8→6 reclaims ~4-5 GB of headroom
|
| 218 |
+
# and doesn't affect throughput (data: 0.0002 — we're compute-bound, not
|
| 219 |
+
# loader-bound).
|
| 220 |
+
#
|
| 221 |
+
# persistent_workers=False, prefetch_factor=2 — same rationale as before,
|
| 222 |
+
# see #97432 (prefetch + pin_memory leak) and #62066 (persistent workers
|
| 223 |
+
# accumulating CoW pages).
|
| 224 |
+
dataset_train.num_workers = 6
|
| 225 |
+
dataset_train.persistent_workers = False
|
| 226 |
+
dataset_train.prefetch_factor = 2
|
| 227 |
+
|
| 228 |
+
# Val loader: keep the loader footprint small without increasing main-process
|
| 229 |
+
# pressure. The upstream default (bs=32, num_workers=4) OOM-killed a worker
|
| 230 |
+
# at ~32% of end-of-epoch val on 307k imgs.
|
| 231 |
+
#
|
| 232 |
+
# Two independent RAM pressures in val eval:
|
| 233 |
+
# (a) worker side: num_workers × (forked parent RSS + prefetch buffers).
|
| 234 |
+
# Cutting num_workers from the upstream 4 down to 1 shrinks this dominant term ~4×.
|
| 235 |
+
# (b) main-process side: CocoEvaluator.update() calls `COCO.loadRes(coco_gt,
|
| 236 |
+
# results)` once per batch — creating a fresh pycocotools index per
|
| 237 |
+
# call. Halving batch_size would DOUBLE the number of calls and the
|
| 238 |
+
# main-process allocator pressure, so we keep batch_size at 32.
|
| 239 |
+
#
|
| 240 |
+
# pin_memory=False because eval is a no_grad forward — there's no backward
|
| 241 |
+
# to benefit from pinned host buffers, and pinning ~1-2 GB of non-swappable
|
| 242 |
+
# RAM is pure overhead. PyTorch docs confirm pin_memory is page-locked and
|
| 243 |
+
# counts against the OOM killer's notion of used memory.
|
| 244 |
+
#
|
| 245 |
+
# Rationale cross-checked against:
|
| 246 |
+
# - pytorch/pytorch#8976 (SIGKILL workers = OOM)
|
| 247 |
+
# - pytorch/pytorch#13246 (num_workers × RSS replication)
|
| 248 |
+
# - Yuxin Wu "Demystify RAM Usage in Multiprocess DataLoader"
|
| 249 |
+
# - facebookresearch/detr#423, #602 (DETR-family val OOM)
|
| 250 |
+
dataset_val.num_workers = 1
|
| 251 |
+
dataset_val.pin_memory = False
|
| 252 |
+
|
| 253 |
+
# Image size 640×640 — upstream already uses (640,640); explicit for clarity.
|
| 254 |
+
# eval_spatial_size lives in include/detrpose_hgnetv2.py and is referenced by
|
| 255 |
+
# the encoder/transformer. We do NOT re-import and override it here because
|
| 256 |
+
# changing it would require re-instantiating encoder/transformer embed sizes.
|
| 257 |
+
# Training at 640×640 (the upstream default for -S) already satisfies spec.
|
| 258 |
+
|
| 259 |
+
# Optimizer: spec lr=0.0001 @ bs=16, lr_backbone=0.00001, weight_decay=0.0001.
|
| 260 |
+
# Batch size was bumped 16 → 64, so LR is scaled by √(64/16)=2× using the
|
| 261 |
+
# square-root rule instead of linear. Rationale: linear scaling (×4) is the
|
| 262 |
+
# optimum for ResNet-style conv nets, but DETR-family transformers with many
|
| 263 |
+
# aux heads are well known to go numerically unstable under aggressive LR
|
| 264 |
+
# scaling. We tried linear (lr=4e-4): training was converging (loss 192→15
|
| 265 |
+
# in 4k iters) then a single bad batch produced NaN in fp16 aux-head logits,
|
| 266 |
+
# the Hungarian matcher in scipy raised "matrix contains invalid numeric
|
| 267 |
+
# entries", and DDP timed out after 10 min waiting on the dead rank.
|
| 268 |
+
# Square-root scaling (lr=2e-4) gives back headroom against fp16 overflow
|
| 269 |
+
# while still benefitting from the larger batch — it's the standard choice
|
| 270 |
+
# for AdamW-trained transformers (square-root rule, cf. Malladi et al. 2022); note Goyal et al. 2017 §2.1 is the linear rule for SGD.
|
| 271 |
+
# lr_head : 0.0001 → 0.0002
|
| 272 |
+
# lr_backbone : 0.00001 → 0.00002
|
| 273 |
+
optimizer.lr = 0.0002
|
| 274 |
+
optimizer.weight_decay = 0.0001
|
| 275 |
+
optimizer.params.cfg = [
|
| 276 |
+
{
|
| 277 |
+
'params': '^(?=.*backbone).*$',
|
| 278 |
+
'lr': 0.00002, # √(bs 16→64)=2× of spec lr_backbone=1e-5
|
| 279 |
+
},
|
| 280 |
+
]
|
| 281 |
+
|
| 282 |
+
# LR scheduler: keep upstream MultiStepLR with no decay during training window
|
| 283 |
+
# (milestones=[1000] effectively means no step during 150 epochs).
|
| 284 |
+
|
| 285 |
+
# ---------------------------------------------------------------------------
|
| 286 |
+
# Spec §: Stop-epoch policy for augmentation ops — scaled to 150 epochs
|
| 287 |
+
# Upstream -S uses [5, 53, 96]; we scale proportionally to 150 epochs:
|
| 288 |
+
# start_mosaic=5, stop_zoomout=round(53*150/100)=80, stop_mosaic=round(96*150/100)=144
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
dataset_train.dataset.transforms.policy = {
|
| 291 |
+
'name': 'stop_epoch',
|
| 292 |
+
'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
|
| 293 |
+
'epoch': [5, 80, 144], # scaled from [5, 53, 96] @ 100ep → 150ep
|
| 294 |
+
}
|
| 295 |
+
dataset_train.collate_fn.base_size_repeat = 20
|
| 296 |
+
dataset_train.collate_fn.stop_epoch = 144
|
| 297 |
+
|
| 298 |
+
# ---------------------------------------------------------------------------
|
| 299 |
+
# Spec §: Augmentation — add HSVJitter and set flip_pairs=[[0,1],[2,3]] (0-indexed form of the spec's [[1,2],[3,4]])
|
| 300 |
+
# (swap TL↔TR and BR↔BL on horizontal flip). HSVJitter is inserted as
|
| 301 |
+
# transforms3 (after Mosaic/RandomZoomOut in transforms1-2, before the flip and resize transforms).
|
| 302 |
+
#
|
| 303 |
+
# Upstream Compose receives transforms as **kwargs keyed transforms1..transforms7.
|
| 304 |
+
# The Compose.__init__ iterates dict values in insertion order (Python 3.7+),
|
| 305 |
+
# so we can rename slots to insert HSVJitter. New pipeline:
|
| 306 |
+
# transforms1 = Mosaic
|
| 307 |
+
# transforms2 = RandomZoomOut
|
| 308 |
+
# transforms3 = HSVJitter ← NEW: color jitter before geometric ops
|
| 309 |
+
# transforms4 = RandomHorizontalFlip(flip_pairs=[[0,1],[2,3]]) ← 0-indexed corner swap
|
| 310 |
+
# transforms5 = ColorJitter (kept; can coexist with HSVJitter)
|
| 311 |
+
# transforms6 = RandomResize
|
| 312 |
+
# transforms7 = ToTensor
|
| 313 |
+
# transforms8 = Normalize
|
| 314 |
+
#
|
| 315 |
+
# Note: upstream dataset.py uses a fixed set of numbered kwargs; we override
|
| 316 |
+
# the entire transforms object on dataset_train.dataset.transforms to inject
|
| 317 |
+
# the new pipeline cleanly.
|
| 318 |
+
# ---------------------------------------------------------------------------
|
| 319 |
+
from .detrpose.include.detrpose_hgnetv2 import eval_spatial_size
|
| 320 |
+
from src.data.container import Compose
|
| 321 |
+
|
| 322 |
+
_scales = [(640, 640)]
|
| 323 |
+
_max_size = 1333
|
| 324 |
+
|
| 325 |
+
dataset_train.dataset.transforms = L(Compose)(
|
| 326 |
+
policy={
|
| 327 |
+
'name': 'stop_epoch',
|
| 328 |
+
'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
|
| 329 |
+
'epoch': [5, 80, 144],
|
| 330 |
+
},
|
| 331 |
+
mosaic_prob=0.5,
|
| 332 |
+
transforms1=L(T.Mosaic)(output_size=320, probability=1.0),
|
| 333 |
+
transforms2=L(T.RandomZoomOut)(p=0.5),
|
| 334 |
+
transforms3=L(T.HSVJitter)(h=0.015, s=0.7, v=0.4), # spec: HSVJitter early
|
| 335 |
+
transforms4=L(T.RandomHorizontalFlip)(flip_pairs=[[0, 1], [2, 3]]), # 0-indexed pairs: TL↔TR (0↔1), BR↔BL (2↔3) — preserves corner semantics after h-flip
|
| 336 |
+
transforms5=L(T.ColorJitter)(), # upstream default; coexists with HSVJitter
|
| 337 |
+
transforms6=L(T.RandomResize)(sizes=_scales, max_size=_max_size),
|
| 338 |
+
transforms7=L(T.ToTensor)(),
|
| 339 |
+
transforms8=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]),
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
+
# ---------------------------------------------------------------------------
|
| 343 |
+
# Spec §: Dataset paths — relative to cwd when training runs (train-pose-estimation-v2/)
|
| 344 |
+
# ---------------------------------------------------------------------------
|
| 345 |
+
dataset_train.dataset.img_folder = "coco_dataset/train"
|
| 346 |
+
dataset_train.dataset.ann_file = "coco_dataset/annotations/instances_train.json"
|
| 347 |
+
|
| 348 |
+
dataset_val.dataset.img_folder = "coco_dataset/val"
|
| 349 |
+
dataset_val.dataset.ann_file = "coco_dataset/annotations/instances_val.json"
|
| 350 |
+
|
| 351 |
+
# Evaluator: point to our val annotations.
|
| 352 |
+
# CocoEvaluator uses keypoints iou_type for pose eval.
|
| 353 |
+
evaluator.ann_file = "coco_dataset/annotations/instances_val.json"
|
| 354 |
+
evaluator.iou_types = ['keypoints']
|
| 355 |
+
evaluator.useCats = True
|
| 356 |
+
# Pass our 4-corner sigmas so pycocotools.COCOeval uses the right OKS scale
|
| 357 |
+
# instead of the 17-keypoint person defaults. Without this, the eval truncates to the
|
| 358 |
+
# first 4 person sigmas (nose/l_eye/r_eye/l_ear) and all AP collapses to 0.
|
| 359 |
+
# The CocoEvaluator constructor was patched (mtg-fork) to accept this kwarg.
|
| 360 |
+
evaluator.kpt_oks_sigmas = oks_sigmas
|
| 361 |
+
|
| 362 |
+
# Output directory for checkpoints
|
| 363 |
+
training_params.output_dir = "output/mtg_card_4kp"
|
run_20260519_012809/eval/000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:755cca2d9cb0112e1682b5eb60f5007abf42892e58f3d4f73bdaee2529d8c734
|
| 3 |
+
size 574357
|
run_20260519_012809/eval/latest.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d9d28b086f8dd9cf17520fa5547cdce4218eb5fee0ac86f735c2e0c023f0fe1
|
| 3 |
+
size 573799
|
run_20260519_012809/log.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"train_lr": 1.999999999999807e-05, "train_loss": 12.543476746572056, "train_loss_keypoints": 1.2924200675378752, "train_loss_keypoints_dn_0": 0.9205444374136824, "train_loss_keypoints_dn_pre": 0.9917099125578445, "train_loss_keypoints_enc_0": 2.873148705925127, "train_loss_keypoints_pre": 1.3559586186052888, "train_loss_oks": 0.3853887344912788, "train_loss_oks_dn_0": 0.5664547389831461, "train_loss_oks_dn_pre": 0.5764505134504863, "train_loss_oks_enc_0": 1.0937668736151316, "train_loss_oks_pre": 0.38551030588972934, "train_loss_vfl": 0.17695714877120275, "train_loss_vfl_dn_0": 0.533061496633356, "train_loss_vfl_dn_pre": 0.546968495083348, "train_loss_vfl_enc_0": 0.66637879066898, "train_loss_vfl_pre": 0.17875792909365126, "test_coco_eval_keypoints": [0.9900901282433162, 0.9900930550152051, 0.9900930550152051, -1.0, 0.9900901282433162, 0.9998699999999999, 0.9999, 0.9999, -1.0, 0.9998699999999999], "test_mtg_kp_l2_norm_per_corner": [0.0007670049089938402, 0.0008375109755434096, 0.0008375166798941791, 0.0008211143431253731], "test_mtg_kp_l2_norm_mean": 0.0008157867268892005, "epoch": 0, "n_parameters": 3601989, "now_time": "2026-05-19 03:46:01.686364", "epoch_time": "2:17:35"}
|
| 2 |
+
{"train_lr": 1.999999999999807e-05, "train_loss": 2.994915947076494, "train_loss_keypoints": 0.1010790098453219, "train_loss_keypoints_dn_0": 0.284792542535896, "train_loss_keypoints_dn_pre": 0.28494799813946237, "train_loss_keypoints_enc_0": 0.46997490368083944, "train_loss_keypoints_pre": 0.10092079869171004, "train_loss_oks": 0.02156980774322143, "train_loss_oks_dn_0": 0.14225913356885422, "train_loss_oks_dn_pre": 0.1424708259666102, "train_loss_oks_enc_0": 0.26801609149232236, "train_loss_oks_pre": 0.021543673624277265, "train_loss_vfl": 0.05088163607264508, "train_loss_vfl_dn_0": 0.3026131311032981, "train_loss_vfl_dn_pre": 0.3101332919516496, "train_loss_vfl_enc_0": 0.4422853824269553, "train_loss_vfl_pre": 0.05142771953485578, "test_coco_eval_keypoints": [0.9990016902607651, 0.9999910971215015, 0.9999910971215015, -1.0, 0.9990016902607651, 0.99999, 1.0, 1.0, -1.0, 0.99999], "test_mtg_kp_l2_norm_per_corner": [0.0006501294556073844, 0.0006766291335225105, 0.0006809880142100155, 0.0006852635415270925], "test_mtg_kp_l2_norm_mean": 0.0006732525362167507, "epoch": 1, "n_parameters": 3601989, "now_time": "2026-05-19 06:03:47.075803", "epoch_time": "2:17:45"}
|
run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458155.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0360420cd4a5c28fe09343bd2fef9a5cd22e37a625824ff30c8b953f2542426
|
| 3 |
+
size 5307086
|
run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458156.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be9655c314b9859070066a62af066d51db7028b63ade3b74c0d36cc2fbedbff8
|
| 3 |
+
size 88
|
run_20260519_012809/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|