dhvazquez committed on
Commit
1ff644f
·
verified ·
1 Parent(s): f6ea357

Upload Train + exports

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ exports/mtg_4kp_s_nosim.onnx.data filter=lfs diff=lfs merge=lfs -text
37
+ exports/mtg_4kp_s_r4b.onnx.data filter=lfs diff=lfs merge=lfs -text
38
+ exports/mtg_4kp_s.onnx.data filter=lfs diff=lfs merge=lfs -text
exports/mtg_4kp_s.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c72f71bf017880ebbcde10f2ef630fa3e2fa3167124e7668434f6f726136e70
3
+ size 47111454
exports/mtg_4kp_s.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17cb72be4056e6d6a55210a448cc3bbf23610e416b355d99513a73ae73860c96
3
+ size 46792704
exports/mtg_4kp_s_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4ed2defe0403ef700cb606982ee8e2c5e190c0fb315d4106c4f9ebb404f491b
3
+ size 23666438
exports/mtg_4kp_s_int8_conv_int4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba99236110bbad78a1ab75091a573f4136d33581cd75485b440a25c7f516ebf0
3
+ size 10561260
exports/mtg_4kp_s_int8_conv_int4g32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:232b8daa8d1fe47934541839c01984e6c40ea78dfecc992279fe0001c4d951cd
3
+ size 11262351
exports/mtg_4kp_s_int8_conv_wo.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e6493a2519b30ae120f2e173319564af31fb4cac12d439ed8cf3b934ae742e
3
+ size 13068978
exports/mtg_4kp_s_int8_conv_wo_fp16res.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60608af82ba1f9192782fef79719ef6b7793b23b54ae6bdab7a5dc4b4d398d26
3
+ size 12458839
exports/mtg_4kp_s_int8_conv_wo_opt.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:240b38b6365dfb94ca61f4c13c51892f8c78e95de062a52d3e63dad5ece83ce4
3
+ size 13045930
exports/mtg_4kp_s_int8_dynamic.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a567c68b46b8f8f9f147827f4ce0301e4c4a59a6d783c82ab7a4b8d9ea5eeb7
3
+ size 28120147
exports/mtg_4kp_s_nosim.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3f2c47a664f550710588efa67ec607de3d8669902b5e08e34a68e4bdaa711ed
3
+ size 3061750
exports/mtg_4kp_s_nosim.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5f8e83484053d6cc2152f5a8f1d6218e0ba19e40cacf3cbea790ae6c451081c
3
+ size 46792704
exports/mtg_4kp_s_r4b.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d33788e992e4882e37b1d2307994942552413092fdebaf998728cdbda4b5a99
3
+ size 14850609
exports/mtg_4kp_s_r4b.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8977c772c498a74eeed131322efe13d9ce9bfde4a3bfa5242ba583615f2a3a22
3
+ size 14745600
exports/mtg_4kp_s_r4b_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e203415be3c526d486082f5a9b2de15bc03d6b443c5dc2a7f8bf5f3031a48471
3
+ size 7495617
run_20260519_012809/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a66ae6431459f3775eed0d847b8fa3f697d4d7ee5b3ffc2e72be993bc079b0d
3
+ size 59580875
run_20260519_012809/checkpoint0000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8910ba9beb071960bd6ac07e1cdf95324b03be4840a88e4769e868da281c4a0f
3
+ size 59590695
run_20260519_012809/checkpoint0001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a939b95e08fdc272e44fcf15e02130d9ffb8d3520d463bfdd08025174a5bf268
3
+ size 59590695
run_20260519_012809/checkpoint_best_regular.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18b495cf63a29735287d2ac97299f87cb206c220b0fb0e28242feb9ff9a4b7a6
3
+ size 59612854
run_20260519_012809/effective_config.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MTG card 4-keypoint training config for DETRPose-S.
2
+
3
+ This config is a LazyCall override of the upstream DETRPose-S (HGNetV2-B0) config.
4
+ It inherits the full upstream architecture and only overrides what differs for
5
+ single-class MTG card corner detection with 4 keypoints.
6
+
7
+ Placement: detrpose/configs/mtg_card_4kp.py
8
+ Load with:
9
+ python train.py --config_file configs/mtg_card_4kp.py --device cuda --amp
10
+
11
+ NOTE (T14): Three upstream files hardcode OKS sigma dispatch tables that crash
12
+ for num_body_points not in {3, 14, 17}:
13
+ - src/models/detrpose/matcher.py:29 (HungarianMatcher.__init__)
14
+ - src/misc/keypoint_loss.py:49 (OKSLoss.__init__)
15
+ - src/models/detrpose/dn_component.py:19 (get_sigmas)
16
+ Task 14 must add a num_keypoints==4 branch to each using:
17
+ oks_sigmas = [0.025, 0.025, 0.025, 0.025]
18
+ Until then, training will crash at model init with NotImplementedError/ValueError.
19
+ """
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Base: import from the DETRPose-S include hierarchy (relative to this file's
23
+ # location, which is detrpose/configs/; the include files live at
24
+ # detrpose/configs/detrpose/include/).
25
+ # ---------------------------------------------------------------------------
26
+ from .detrpose.detrpose_hgnetv2_s import (
27
+ model,
28
+ criterion,
29
+ training_params,
30
+ postprocessor,
31
+ ema,
32
+ optimizer,
33
+ lr_scheduler,
34
+ )
35
+ from .detrpose.include.dataset import dataset_train, dataset_val, evaluator
36
+
37
+ from src.core import LazyCall as L
38
+ from src.data.coco_eval import CocoEvaluator
39
+ import src.data.transforms as T
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Spec §: Task-level constants (consumed by T14 patches and future reference)
43
+ # ---------------------------------------------------------------------------
44
+
45
+ # Number of card corners; drives model/criterion/postprocessor.
46
+ # NOTE(T14): oks sigma dispatch in matcher.py / keypoint_loss.py / dn_component.py
47
+ # must add a branch for this value using oks_sigmas below.
48
+ num_body_points = 4
49
+
50
+ # OKS sigmas — uniform for all 4 corners (no anatomical significance).
51
+ # Normalized to ~1/40 of image fraction following COCO convention (σ=0.025 ≈ 1/(2*20)).
52
+ # TODO(T14): wire these into the 3 sigma dispatch tables instead of hardcoding.
53
+ oks_sigmas = [0.025, 0.025, 0.025, 0.025]
54
+
55
+ # Single class: "mtg_card" (background is implicit, class 0 is the card)
56
+ num_classes = 1
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Spec §: Model overrides — num_body_points and num_classes in 3 places
60
+ # ---------------------------------------------------------------------------
61
+
62
+ # (1) Transformer: drives keypoint head output dimension
63
+ model.transformer.num_body_points = num_body_points
64
+ model.transformer.num_classes = num_classes
65
+
66
+ # (2) Criterion: drives loss computation over keypoints
67
+ criterion.num_classes = num_classes
68
+ criterion.num_body_points = num_body_points
69
+
70
+ # Loss weights — mapped from spec to upstream key names:
71
+ # spec "cls" → upstream "loss_vfl" (varifocal classification loss)
72
+ # spec "keypoints_l1" → upstream "loss_keypoints" (L1 keypoint regression)
73
+ # spec "keypoints_oks" → upstream "loss_oks" (OKS keypoint loss)
74
+ # Note: upstream DETRPose does NOT have separate bbox_l1 / bbox_giou loss keys;
75
+ # bounding boxes are recovered from keypoint predictions, not via a dedicated
76
+ # bbox branch. The spec's bbox_l1=5.0 / bbox_giou=2.0 weights have no upstream
77
+ # equivalent and are omitted here.
78
+ criterion.weight_dict = {
79
+ 'loss_vfl': 2.0, # spec: cls=2.0
80
+ 'loss_keypoints': 10.0, # spec: keypoints_l1=10.0
81
+ 'loss_oks': 4.0, # spec: keypoints_oks=4.0
82
+ }
83
+
84
+ # Matcher costs — mirror weight_dict ratios so Hungarian assignment is consistent
85
+ criterion.matcher.cost_class = 2.0
86
+ criterion.matcher.cost_keypoints = 10.0
87
+ criterion.matcher.cost_oks = 4.0
88
+ criterion.matcher.num_body_points = num_body_points
89
+
90
+ # (3) PostProcessor: drives output decoding
91
+ postprocessor.num_body_points = num_body_points
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Round 4a — Model surgery for inference latency (2026-05-18)
95
+ # ---------------------------------------------------------------------------
96
+ # DETRPose-S was tuned for COCO multi-person pose (60 queries to find ≤ K
97
+ # people, 6 decoder layers to refine 17-keypoint anatomy). For MTG card
98
+ # corner detection we have 1 object per image with 4 deterministic corners,
99
+ # so the upstream defaults are wildly over-provisioned. Three config-only
100
+ # overrides cut a meaningful chunk of the decoder cost without invasive
101
+ # code changes:
102
+ #
103
+ # - num_queries 60 → 10: cross-attention scales linearly with queries
104
+ # and we only ever consume the top-1. 10 still gives the Hungarian
105
+ # matcher slack (≥ 4 ensures every keypoint slot has a query) without
106
+ # paying for the 50 unused ones.
107
+ # - num_decoder_layers 6 → 3: halves decoder compute. DETR keypoint
108
+ # refinement converges fast on a single-object task; 3 layers is
109
+ # plenty.
110
+ # - dec_n_points 4 → 2: each decoder query sampled 4 reference points
111
+ # per feature level via deformable attention, which is what the 10
112
+ # WebGPU `GridSample` ops implement. Cutting to 2 halves those
113
+ # dispatches — the WebGPU EP's biggest pain point on Mali / GCN.
114
+ #
115
+ # `postprocessor.num_select` must match `num_queries` (it does top-k over
116
+ # all available queries; if num_select > num_queries the index math
117
+ # silently wraps).
118
+ #
119
+ # Expected impact: -20 to -30 % inference on top of the FP16 model
120
+ # (Ampere 191 → ~140 ms, GCN-5 342 → ~250 ms, Apple Tahoe 218 → ~170 ms).
121
+ #
122
+ # Costs:
123
+ # - Existing checkpoints are NOT compatible — different head sizes,
124
+ # different decoder depth. Training restarts from scratch.
125
+ # - Smoke run before committing GPU: `python scripts/_make_small_dataset.py`
126
+ # + `python scripts/train.py --config detrpose/configs/mtg_card_4kp_smoke.py --single-gpu`
127
+ # verifies convergence in ~15 min before the 3-8 h full run.
128
+ #
129
+ # Round 4b (2026-05-19): R4a converged to AP=0.997 in a SINGLE epoch (run
130
+ # runs/run_20260518_210733/, checkpoint backed up as r4a_best_epoch0_AP997.pth).
131
+ # That's a saturated benchmark with the AP near 1.0 in epoch 0 — the model
132
+ # still has way too much capacity for a 4-corner-of-1-rigid-card task with
133
+ # synthetic data. R4b shrinks the transformer 3-4× further on top of R4a:
134
+ #
135
+ # - hidden_dim 256 → 128: quarters the attention compute (O(d²) per token).
136
+ # Touches BOTH encoder and transformer; both must be set or the model
137
+ # init fails on dim mismatch at the encoder→transformer boundary.
138
+ # - dim_feedforward 1024 → 512: halves FFN compute.
139
+ # - nhead 8 → 4: halves attention head projections.
140
+ # - num_decoder_layers 3 → 1: single decoder pass. With AP saturating in
141
+ # 1 epoch already, one decoder layer is plenty for this task.
142
+ # - num_queries 10 → 4: minimum that still gives Hungarian a slot per
143
+ # keypoint. We never use more than top-1 anyway.
144
+ #
145
+ # Param count goes 11.35 M → ~5-6 M est. (run log reports 3.60 M; backbone HGNetv2-B0 dominates and
146
+ # can't shrink without a bigger surgery). FLOPS roughly halve again.
147
+ #
148
+ # Expected inference latency on FP16+WebGPU: Ampere 191 → ~90 ms,
149
+ # GCN-5 342 → ~200 ms. macOS Tahoe → ~120 ms.
150
+ model.transformer.hidden_dim = 128
151
+ model.transformer.dim_feedforward = 512
152
+ model.transformer.nhead = 4
153
+ model.transformer.num_decoder_layers = 1
154
+ model.transformer.num_queries = 4
155
+ model.transformer.dec_n_points = 2
156
+
157
+ model.encoder.hidden_dim = 128
158
+ model.encoder.dim_feedforward = 512
159
+ model.encoder.nhead = 4
160
+
161
+ postprocessor.num_select = 4
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Spec §: Training hyperparameters
165
+ # ---------------------------------------------------------------------------
166
+
167
+ # Upstream default is 100 epochs; spec says 150 — override.
168
+ training_params.epochs = 150
169
+
170
+ # Skip the full val eval that upstream fires before the first training
171
+ # epoch on --resume. It's 30+ min of compute on 307k imgs, and leaves
172
+ # evaluator state accumulated in RAM which feeds the gradual OOM growth we
173
+ # observed in epoch 1. The end-of-epoch eval (run every epoch anyway)
174
+ # provides the same signal.
175
+ training_params.skip_initial_eval = True
176
+
177
+ # Cap val eval at a deterministic subset of N images.
178
+ #
179
+ # Without this, every epoch's end-of-epoch eval runs over all 307k val imgs
180
+ # (9623 batches @ bs=32). pycocotools' CocoEvaluator.update() appends an
181
+ # [C, A, B] numpy array of per-image match dicts per batch — confirmed by
182
+ # scripts/_repro_eval_leak.py to grow at ~0.4 MB/batch (≈4 GB over the full
183
+ # val per rank). With 2-rank DDP on a 60 GB host, the val dataloader was
184
+ # OOM-killed at ~25-32 % of eval (runs/run_20260413_234839 iter 2360 and
185
+ # run_20260414_102022 iter 3010). Subsampling caps that accumulator without
186
+ # touching the source dataset on disk.
187
+ #
188
+ # N=10_000 keeps eval at ~1 min/epoch, accumulator ≤ 130 MB / rank, and gives
189
+ # a stable per-epoch AP signal. Run a full-set eval offline at the end of
190
+ # training for the headline benchmark. Set to 0/None to disable the cap.
191
+ training_params.max_eval_samples = 10000
192
+
193
+ # torch.compile is honored by our trainer.py patch but disabled here.
194
+ # Tried with mode="default", fullgraph=False — crashed in backward with:
195
+ # "RuntimeError: one of the variables needed for gradient computation has
196
+ # been modified by an inplace operation: [HalfTensor [32, 256, 80, 80]]"
197
+ # Root cause: inductor reordering + AMP + DDP gradient buckets clashes with
198
+ # the inplace activations in HGNetv2 backbone. Not worth the debugging rabbit
199
+ # hole; bigger speedups come from reducing epochs or dataset size.
200
+ training_params.compile_model = False
201
+ training_params.compile_mode = "default"
202
+
203
+ # Gradient clipping (spec: grad_clip=0.1; upstream default is also 0.1 —
204
+ # keeping explicit for clarity)
205
+ training_params.clip_max_norm = 0.1
206
+
207
+ # Batch size: bumped 16 → 64 (32/GPU) to saturate 2× RTX 3090 — at 16 the
208
+ # GPUs idled at ~50% with only 6–7 GB of 24 GB VRAM used (data-loading bound).
209
+ # LR is scaled below with the square-root rule (2× = 0.0001 → 0.0002), not linearly.
210
+ dataset_train.total_batch_size = 64
211
+
212
+ # Data-loading: 6 workers per rank (12 total). Reduced from 8 because we hit
213
+ # OOM-kill at iter 11000/11226 of epoch 1 on a 60 GB host — during epoch 1
214
+ # training, RAM grew from ~31 GB (epoch start) to OOM (~60+ GB) over 11k
215
+ # iters, a gradual leak of ~2-3 MB/iter we couldn't fully attribute in-run.
216
+ # Hypothesis: fragmentation + residual state from the on-resume val eval +
217
+ # pin_memory accumulation. Cutting workers 8→6 reclaims ~4-5 GB of headroom
218
+ # and doesn't affect throughput (data: 0.0002 — we're compute-bound, not
219
+ # loader-bound).
220
+ #
221
+ # persistent_workers=False, prefetch_factor=2 — same rationale as before,
222
+ # see #97432 (prefetch + pin_memory leak) and #62066 (persistent workers
223
+ # accumulating CoW pages).
224
+ dataset_train.num_workers = 6
225
+ dataset_train.persistent_workers = False
226
+ dataset_train.prefetch_factor = 2
227
+
228
+ # Val loader: keep the loader footprint small without increasing main-process
229
+ # pressure. The upstream default (bs=32, num_workers=4) OOM-killed a worker
230
+ # at ~32% of end-of-epoch val on 307k imgs.
231
+ #
232
+ # Two independent RAM pressures in val eval:
233
+ # (a) worker side: num_workers × (forked parent RSS + prefetch buffers).
234
+ # Cutting num_workers from 4 to 1 shrinks this dominant term ~4×.
235
+ # (b) main-process side: CocoEvaluator.update() calls `COCO.loadRes(coco_gt,
236
+ # results)` once per batch — creating a fresh pycocotools index per
237
+ # call. Halving batch_size would DOUBLE the number of calls and the
238
+ # main-process allocator pressure, so we keep batch_size at 32.
239
+ #
240
+ # pin_memory=False because eval is a no_grad forward — there's no backward
241
+ # to benefit from pinned host buffers, and pinning ~1-2 GB of non-swappable
242
+ # RAM is pure overhead. PyTorch docs confirm pin_memory is page-locked and
243
+ # counts against the OOM killer's notion of used memory.
244
+ #
245
+ # Rationale cross-checked against:
246
+ # - pytorch/pytorch#8976 (SIGKILL workers = OOM)
247
+ # - pytorch/pytorch#13246 (num_workers × RSS replication)
248
+ # - Yuxin Wu "Demystify RAM Usage in Multiprocess DataLoader"
249
+ # - facebookresearch/detr#423, #602 (DETR-family val OOM)
250
+ dataset_val.num_workers = 1
251
+ dataset_val.pin_memory = False
252
+
253
+ # Image size 640×640 — upstream already uses (640,640); explicit for clarity.
254
+ # eval_spatial_size lives in include/detrpose_hgnetv2.py and is referenced by
255
+ # the encoder/transformer. We do NOT re-import and override it here because
256
+ # changing it would require re-instantiating encoder/transformer embed sizes.
257
+ # Training at 640×640 (the upstream default for -S) already satisfies spec.
258
+
259
+ # Optimizer: spec lr=0.0001 @ bs=16, lr_backbone=0.00001, weight_decay=0.0001.
260
+ # Batch size was bumped 16 → 64, so LR is scaled by √(64/16)=2× using the
261
+ # square-root rule instead of linear. Rationale: linear scaling (×4) is the
262
+ # optimum for ResNet-style conv nets, but DETR-family transformers with many
263
+ # aux heads are well known to go numerically unstable under aggressive LR
264
+ # scaling. We tried linear (lr=4e-4): training was converging (loss 192→15
265
+ # in 4k iters) then a single bad batch produced NaN in fp16 aux-head logits,
266
+ # the Hungarian matcher in scipy raised "matrix contains invalid numeric
267
+ # entries", and DDP timed out after 10 min waiting on the dead rank.
268
+ # Square-root scaling (lr=2e-4) gives back headroom against fp16 overflow
269
+ # while still benefitting from the larger batch — it's the standard choice
270
+ # for transformers per AdamW large-batch studies (Goyal et al. 2017 §2.1 covers the linear rule).
271
+ # lr_head : 0.0001 → 0.0002
272
+ # lr_backbone : 0.00001 → 0.00002
273
+ optimizer.lr = 0.0002
274
+ optimizer.weight_decay = 0.0001
275
+ optimizer.params.cfg = [
276
+ {
277
+ 'params': '^(?=.*backbone).*$',
278
+ 'lr': 0.00002, # √(bs 16→64)=2× of spec lr_backbone=1e-5
279
+ },
280
+ ]
281
+
282
+ # LR scheduler: keep upstream MultiStepLR with no decay during training window
283
+ # (milestones=[1000] effectively means no step during 150 epochs).
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # Spec §: Stop-epoch policy for augmentation ops — scaled to 150 epochs
287
+ # Upstream -S uses [5, 53, 96]; we scale proportionally to 150 epochs:
288
+ # start_mosaic=5, stop_zoomout=round(53*150/100)=80, stop_mosaic=round(96*150/100)=144
289
+ # ---------------------------------------------------------------------------
290
+ dataset_train.dataset.transforms.policy = {
291
+ 'name': 'stop_epoch',
292
+ 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
293
+ 'epoch': [5, 80, 144], # scaled from [5, 53, 96] @ 100ep → 150ep
294
+ }
295
+ dataset_train.collate_fn.base_size_repeat = 20
296
+ dataset_train.collate_fn.stop_epoch = 144
297
+
298
+ # ---------------------------------------------------------------------------
299
+ # Spec §: Augmentation — add HSVJitter and set 0-indexed flip_pairs=[[0,1],[2,3]]
300
+ # (swap TL↔TR and BR↔BL on horizontal flip). HSVJitter is inserted as
301
+ # transforms3 (after load/mosaic transforms1-2, before geometric transforms).
302
+ #
303
+ # Upstream Compose receives transforms as **kwargs keyed transforms1..transforms7.
304
+ # The Compose.__init__ iterates dict values in insertion order (Python 3.7+),
305
+ # so we can rename slots to insert HSVJitter. New pipeline:
306
+ # transforms1 = Mosaic
307
+ # transforms2 = RandomZoomOut
308
+ # transforms3 = HSVJitter ← NEW: color jitter before geometric ops
309
+ # transforms4 = RandomHorizontalFlip(flip_pairs=[[0,1],[2,3]]) ← 0-indexed corner swap
310
+ # transforms5 = ColorJitter (kept; can coexist with HSVJitter)
311
+ # transforms6 = RandomResize
312
+ # transforms7 = ToTensor
313
+ # transforms8 = Normalize
314
+ #
315
+ # Note: upstream dataset.py uses a fixed set of numbered kwargs; we override
316
+ # the entire transforms object on dataset_train.dataset.transforms to inject
317
+ # the new pipeline cleanly.
318
+ # ---------------------------------------------------------------------------
319
+ from .detrpose.include.detrpose_hgnetv2 import eval_spatial_size
320
+ from src.data.container import Compose
321
+
322
+ _scales = [(640, 640)]
323
+ _max_size = 1333
324
+
325
+ dataset_train.dataset.transforms = L(Compose)(
326
+ policy={
327
+ 'name': 'stop_epoch',
328
+ 'ops': ['Mosaic', 'RandomCrop', 'RandomZoomOut'],
329
+ 'epoch': [5, 80, 144],
330
+ },
331
+ mosaic_prob=0.5,
332
+ transforms1=L(T.Mosaic)(output_size=320, probability=1.0),
333
+ transforms2=L(T.RandomZoomOut)(p=0.5),
334
+ transforms3=L(T.HSVJitter)(h=0.015, s=0.7, v=0.4), # spec: HSVJitter early
335
+ transforms4=L(T.RandomHorizontalFlip)(flip_pairs=[[0, 1], [2, 3]]), # 0-indexed pairs: TL↔TR (0↔1), BR↔BL (2↔3) — preserves corner semantics after h-flip
336
+ transforms5=L(T.ColorJitter)(), # upstream default; coexists with HSVJitter
337
+ transforms6=L(T.RandomResize)(sizes=_scales, max_size=_max_size),
338
+ transforms7=L(T.ToTensor)(),
339
+ transforms8=L(T.Normalize)(mean=[0, 0, 0], std=[1, 1, 1]),
340
+ )
341
+
342
+ # ---------------------------------------------------------------------------
343
+ # Spec §: Dataset paths — relative to cwd when training runs (train-pose-estimation-v2/)
344
+ # ---------------------------------------------------------------------------
345
+ dataset_train.dataset.img_folder = "coco_dataset/train"
346
+ dataset_train.dataset.ann_file = "coco_dataset/annotations/instances_train.json"
347
+
348
+ dataset_val.dataset.img_folder = "coco_dataset/val"
349
+ dataset_val.dataset.ann_file = "coco_dataset/annotations/instances_val.json"
350
+
351
+ # Evaluator: point to our val annotations.
352
+ # CocoEvaluator uses keypoints iou_type for pose eval.
353
+ evaluator.ann_file = "coco_dataset/annotations/instances_val.json"
354
+ evaluator.iou_types = ['keypoints']
355
+ evaluator.useCats = True
356
+ # Pass our 4-corner sigmas so pycocotools.COCOeval uses the right OKS scale
357
+ # instead of the 17-keypoint person defaults. Without this, the eval truncates to the
358
+ # first 4 person sigmas (nose/l_eye/r_eye/l_ear) and all AP collapses to 0.
359
+ # The CocoEvaluator constructor was patched (mtg-fork) to accept this kwarg.
360
+ evaluator.kpt_oks_sigmas = oks_sigmas
361
+
362
+ # Output directory for checkpoints
363
+ training_params.output_dir = "output/mtg_card_4kp"
run_20260519_012809/eval/000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:755cca2d9cb0112e1682b5eb60f5007abf42892e58f3d4f73bdaee2529d8c734
3
+ size 574357
run_20260519_012809/eval/latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d9d28b086f8dd9cf17520fa5547cdce4218eb5fee0ac86f735c2e0c023f0fe1
3
+ size 573799
run_20260519_012809/log.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"train_lr": 1.999999999999807e-05, "train_loss": 12.543476746572056, "train_loss_keypoints": 1.2924200675378752, "train_loss_keypoints_dn_0": 0.9205444374136824, "train_loss_keypoints_dn_pre": 0.9917099125578445, "train_loss_keypoints_enc_0": 2.873148705925127, "train_loss_keypoints_pre": 1.3559586186052888, "train_loss_oks": 0.3853887344912788, "train_loss_oks_dn_0": 0.5664547389831461, "train_loss_oks_dn_pre": 0.5764505134504863, "train_loss_oks_enc_0": 1.0937668736151316, "train_loss_oks_pre": 0.38551030588972934, "train_loss_vfl": 0.17695714877120275, "train_loss_vfl_dn_0": 0.533061496633356, "train_loss_vfl_dn_pre": 0.546968495083348, "train_loss_vfl_enc_0": 0.66637879066898, "train_loss_vfl_pre": 0.17875792909365126, "test_coco_eval_keypoints": [0.9900901282433162, 0.9900930550152051, 0.9900930550152051, -1.0, 0.9900901282433162, 0.9998699999999999, 0.9999, 0.9999, -1.0, 0.9998699999999999], "test_mtg_kp_l2_norm_per_corner": [0.0007670049089938402, 0.0008375109755434096, 0.0008375166798941791, 0.0008211143431253731], "test_mtg_kp_l2_norm_mean": 0.0008157867268892005, "epoch": 0, "n_parameters": 3601989, "now_time": "2026-05-19 03:46:01.686364", "epoch_time": "2:17:35"}
2
+ {"train_lr": 1.999999999999807e-05, "train_loss": 2.994915947076494, "train_loss_keypoints": 0.1010790098453219, "train_loss_keypoints_dn_0": 0.284792542535896, "train_loss_keypoints_dn_pre": 0.28494799813946237, "train_loss_keypoints_enc_0": 0.46997490368083944, "train_loss_keypoints_pre": 0.10092079869171004, "train_loss_oks": 0.02156980774322143, "train_loss_oks_dn_0": 0.14225913356885422, "train_loss_oks_dn_pre": 0.1424708259666102, "train_loss_oks_enc_0": 0.26801609149232236, "train_loss_oks_pre": 0.021543673624277265, "train_loss_vfl": 0.05088163607264508, "train_loss_vfl_dn_0": 0.3026131311032981, "train_loss_vfl_dn_pre": 0.3101332919516496, "train_loss_vfl_enc_0": 0.4422853824269553, "train_loss_vfl_pre": 0.05142771953485578, "test_coco_eval_keypoints": [0.9990016902607651, 0.9999910971215015, 0.9999910971215015, -1.0, 0.9990016902607651, 0.99999, 1.0, 1.0, -1.0, 0.99999], "test_mtg_kp_l2_norm_per_corner": [0.0006501294556073844, 0.0006766291335225105, 0.0006809880142100155, 0.0006852635415270925], "test_mtg_kp_l2_norm_mean": 0.0006732525362167507, "epoch": 1, "n_parameters": 3601989, "now_time": "2026-05-19 06:03:47.075803", "epoch_time": "2:17:45"}
run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458155.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0360420cd4a5c28fe09343bd2fef9a5cd22e37a625824ff30c8b953f2542426
3
+ size 5307086
run_20260519_012809/summary/events.out.tfevents.1779154095.palpatine.458156.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9655c314b9859070066a62af066d51db7028b63ade3b74c0d36cc2fbedbff8
3
+ size 88
run_20260519_012809/train.log ADDED
The diff for this file is too large to render. See raw diff