File size: 52,931 Bytes
b048bc2
 
 
 
 
 
 
 
 
 
 
 
 
 
77edebf
c040324
0501689
77edebf
 
 
 
 
 
c040324
 
 
0e3b21f
b2f3ce6
f4924d6
77edebf
 
 
b2f3ce6
c040324
3112173
77edebf
 
 
 
f4924d6
3112173
b5ad973
58ce093
 
01d67e9
b5ad973
 
a662bfa
53de73a
3112173
a662bfa
53de73a
a662bfa
c4e21b3
01d67e9
f585077
58ce093
046548a
d2161b1
c040324
01d67e9
f4924d6
58ce093
 
 
 
 
 
2893b22
 
c4b5d70
58ce093
0501689
c040324
77edebf
 
b2f3ce6
 
 
 
 
 
 
 
 
c040324
97b9a4a
 
461547b
97b9a4a
 
 
 
 
c040324
 
 
 
 
 
628bc9e
 
 
 
 
 
 
 
97b9a4a
0c44305
c040324
 
97b9a4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b224eee
 
 
 
3112173
 
a662bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6facf47
a662bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6facf47
a662bfa
 
 
 
 
 
 
6facf47
a662bfa
 
 
 
 
 
 
 
6facf47
 
c87b253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58ce093
 
 
 
 
 
 
 
 
 
 
47c86cf
 
58ce093
 
 
 
47c86cf
 
 
 
c4b5d70
 
2893b22
c4b5d70
47c86cf
 
02751ff
 
 
 
 
2893b22
 
 
47c86cf
02751ff
 
 
47c86cf
 
 
 
58ce093
47c86cf
 
 
 
 
 
02751ff
47c86cf
 
 
58ce093
 
c4b5d70
2893b22
 
 
 
47c86cf
 
 
 
58ce093
 
c4b5d70
 
 
 
 
 
 
 
58ce093
 
a662bfa
 
 
 
 
 
 
 
 
 
 
 
 
c4b5d70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47c86cf
58ce093
47c86cf
 
daae24c
 
47c86cf
 
58ce093
47c86cf
 
 
58ce093
 
 
47c86cf
58ce093
47c86cf
58ce093
47c86cf
 
58ce093
a662bfa
 
daae24c
47c86cf
 
 
58ce093
 
daae24c
 
58ce093
 
 
 
 
 
 
 
 
 
a662bfa
 
daae24c
58ce093
 
 
 
 
47c86cf
c4b5d70
 
 
 
daae24c
58ce093
c4b5d70
 
58ce093
47c86cf
 
58ce093
 
 
 
 
47c86cf
58ce093
 
47c86cf
58ce093
a662bfa
 
47c86cf
a662bfa
47c86cf
 
daae24c
58ce093
 
c4b5d70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47c86cf
 
 
2893b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77edebf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01d67e9
c1cb5e4
01d67e9
c1cb5e4
 
 
 
8eb8954
 
 
01d67e9
 
 
 
c1cb5e4
01d67e9
 
 
 
 
 
 
 
 
 
 
c4b5d70
 
 
c1cb5e4
4a9408a
c4b5d70
 
 
 
d2161b1
 
 
 
 
 
c4b5d70
d2161b1
c4b5d70
 
e611f15
d2161b1
e611f15
 
 
d2161b1
 
e611f15
d2161b1
e611f15
 
c4b5d70
d2161b1
c4b5d70
d2161b1
 
 
c4b5d70
d2161b1
e611f15
 
c4b5d70
c1cb5e4
 
 
c4b5d70
 
c1cb5e4
 
c4b5d70
 
 
 
5140b0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01d67e9
 
 
c1cb5e4
4a9408a
 
 
 
 
01d67e9
 
 
 
 
 
e611f15
 
 
01d67e9
 
 
 
 
 
 
 
f4924d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77edebf
c040324
 
 
 
 
461547b
01d67e9
c1cb5e4
4a9408a
 
 
 
 
01d67e9
 
 
 
 
 
461547b
a662bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97b9a4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
046548a
 
 
 
a662bfa
 
046548a
 
53de73a
4e86f82
3112173
046548a
c4e21b3
046548a
 
 
53de73a
046548a
3112173
046548a
c4e21b3
f2f35be
f585077
 
 
 
 
 
 
 
 
046548a
6facf47
a662bfa
046548a
f585077
c040324
5fb3ebc
 
 
 
 
 
3112173
31854f7
 
 
 
 
 
 
 
 
 
 
c040324
 
 
 
 
461547b
 
 
c040324
 
 
 
 
952dbca
c040324
 
 
 
 
 
952dbca
8a21dae
952dbca
 
 
c040324
 
 
 
a58058c
 
c040324
 
c87b253
 
 
 
 
 
 
c040324
c87b253
 
 
b224eee
 
 
 
 
 
 
 
 
 
 
 
 
c040324
 
 
 
47c86cf
58ce093
 
 
 
 
47c86cf
 
58ce093
 
2893b22
 
 
 
47c86cf
 
 
 
 
58ce093
 
a662bfa
 
 
58ce093
a662bfa
58ce093
 
 
 
 
47c86cf
58ce093
 
47c86cf
58ce093
47c86cf
 
46e3a72
58ce093
47c86cf
 
 
 
 
 
 
58ce093
 
 
 
c4b5d70
 
 
 
58ce093
 
 
 
 
 
 
 
 
c4b5d70
 
 
 
 
 
 
 
2893b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58ce093
47c86cf
58ce093
 
 
 
 
47c86cf
 
58ce093
daae24c
47c86cf
 
 
58ce093
daae24c
58ce093
 
c4b5d70
 
 
47c86cf
58ce093
 
 
 
daae24c
c4b5d70
 
 
 
 
 
 
 
 
58ce093
47c86cf
2893b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a662bfa
47c86cf
c4b5d70
 
 
 
 
 
 
 
 
 
 
 
4e86f82
046548a
a662bfa
 
 
 
 
4e86f82
046548a
a662bfa
 
046548a
4e86f82
c87b253
 
 
 
 
daae24c
f4924d6
c87b253
47c86cf
 
 
 
 
 
58ce093
47c86cf
 
 
58ce093
 
c4b5d70
2893b22
 
 
 
47c86cf
 
 
 
c040324
77edebf
 
 
 
 
 
 
 
 
 
c1cb5e4
4a9408a
 
d2161b1
 
 
c4b5d70
c1cb5e4
c4b5d70
 
 
5140b0a
 
 
 
 
 
 
 
 
f4924d6
 
 
 
 
 
 
 
77edebf
 
 
c040324
77edebf
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.

Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
:mod:`submit`. Both are wired into the Gradio Blocks below. The
Gradio app is mounted under a FastAPI parent so the custom
``/reports/{submission_id}.html`` route can re-serve dataset HTML
with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
as ``text/plain`` by policy, which makes the browser show source
rather than render).
"""
from __future__ import annotations

import html
import logging
import mimetypes
import os
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, Response
from gradio_leaderboard import Leaderboard
from huggingface_hub import hf_hub_download, snapshot_download

from leaderboard import (
    ADMIN_COLUMNS,
    ADMIN_SELECT_COL,
    HF_DATA_GT_REPO,
    HF_DATA_REPO,
    HF_SUBMISSIONS_REPO,
    LEADERBOARD_COLS,
    LEADERBOARD_DATATYPES,
    LEADERBOARD_HIDE_COLUMNS,
    VALIDATED_LEADERBOARD_COLS,
    VALIDATED_LEADERBOARD_DATATYPES,
    LeaderboardDataError,
    _fmt_timestamp,
    _load_rows_from_hub,
    build_combined_csv,
    load_admin_table,
    load_leaderboard_split,
    render_public_url,
)
from gallery import render_gallery_page
from tasks import load_tasks_from_dir, render_tasks_page
from admin import (
    VALID_METHODS,
    delete_rows,
    demote_rows,
    is_admin,
    promote_rows,
    rescore_all,
    rescore_rows,
    stop_and_delete_rows,
)
from submit import handle_submit

# Module-level logger, named after this module so records can be traced
# back to it via the ``[%(name)s]`` field configured below.
logger = logging.getLogger(__name__)

# Surface module-level logger.info / logger.warning / logger.exception
# calls from leaderboard.py + submit.py in the Space's runtime logs.
# Otherwise they go nowhere and any refresh / worker pathology is
# silent. Format keeps timestamps + module + level + message.
# NOTE: ``basicConfig`` configures the root logger and does nothing if
# the root logger already has handlers attached.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)


# Canonical policy doc lives in the code repo so contributors reading
# the GitHub repo see it without needing to visit the Space. Linked
# from both the Detailed View tab's Validation Guidelines accordion and
# the About tab.
VALIDATION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
)

# Markdown body describing the benchmark. It is an f-string, so the
# dataset repo ids (HF_DATA_REPO, HF_SUBMISSIONS_REPO) and
# VALIDATION_DOC_URL are interpolated once, when this module is imported.
ABOUT_MD = f"""## About

**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.

- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
  runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
  [`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
  submissions and computed results in
  [`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""

# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
# Raw string so the backslash in ``\url{...}`` is kept literally.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
  author       = {Rabinovich, Michael and {Hugging Face}},
  title        = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
  year         = {2026},
  publisher    = {Hugging Face},
  howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/cadgenbench-leaderboard}},
}"""

# Short Markdown summary of the validation policy, ending with a link to
# the full policy document at VALIDATION_DOC_URL (interpolated at import).
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).

Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""

# Italic Markdown hint telling the visitor how to submit.
# NOTE(review): presumably the initial text of the submit status area
# before any submission is attempted; confirm at the use site.
SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. Progress and any "
    "errors appear here._"
)


def _data_error_banner_md(message: str | None) -> str:
    """Build the Markdown for the data-unavailable banner.

    Args:
        message: Error text describing why the live results could not be
            read, or ``None`` / an empty string when there is no error.

    Returns:
        An empty string when ``message`` is falsy. Otherwise a three-line
        Markdown blockquote: a warning headline stating that the tables
        are empty and that stale or cached data is never shown instead,
        an empty quote line, and the error text as inline code.
    """
    if message:
        headline = (
            "> ⚠️ **Leaderboard data unavailable.** The live results could not "
            "be read from the Hub, so the tables below are empty. No stale or "
            "cached data is ever shown in its place."
        )
        details = f"> Details: `{message}`"
        # A bare ">" keeps the blockquote open between the two paragraphs.
        return "\n".join((headline, ">", details))
    return ""


def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load the validated and unvalidated tables without ever raising.

    Wraps :func:`load_leaderboard_split`. When that call raises
    :class:`LeaderboardDataError`, the failure is logged with its
    traceback and replaced by two empty frames carrying the expected
    column sets, so callers can keep rendering and report the problem.

    Returns:
        ``(validated, unvalidated, error)``. ``error`` is ``None`` on
        success and the stringified exception on failure.
    """
    try:
        validated, unvalidated = load_leaderboard_split()
    except LeaderboardDataError as exc:
        logger.exception("Leaderboard data load failed")
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated, str(exc)
    else:
        return validated, unvalidated, None


def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Load the admin table without ever raising.

    Mirrors :func:`_safe_load_split` for the admin view: a
    :class:`LeaderboardDataError` from :func:`load_admin_table` is logged
    with its traceback and converted into an empty frame that has the
    ``ADMIN_COLUMNS`` columns.

    Returns:
        ``(admin_df, error)``. ``error`` is ``None`` on success and the
        stringified exception on failure.
    """
    try:
        frame = load_admin_table()
    except LeaderboardDataError as exc:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(exc)
    return frame, None


def _refresh_leaderboard_with_toast():
    """Reload both leaderboard tables and announce the outcome with a toast.

    Shows ``gr.Info`` when the reload worked and ``gr.Warning`` when it
    did not.

    Returns:
        ``(validated, unvalidated, banner)`` where ``banner`` is a
        ``gr.Markdown`` update carrying the data-unavailable text; it is
        visible only when the load reported an error.
    """
    validated_df, unvalidated_df, load_error = _safe_load_split()
    if not load_error:
        gr.Info("Leaderboard refreshed.")
    else:
        gr.Warning(f"Leaderboard data unavailable: {load_error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(load_error),
        visible=load_error is not None,
    )
    return validated_df, unvalidated_df, banner


def _auto_refresh_leaderboard():
    """Reload both leaderboard tables quietly, warning only on failure.

    Same outputs as :func:`_refresh_leaderboard_with_toast`, but no
    success toast is shown. A failed read still raises a ``gr.Warning``
    and makes the banner visible.

    Returns:
        ``(validated, unvalidated, banner)`` with ``banner`` a
        ``gr.Markdown`` update that is visible only on error.
    """
    validated_df, unvalidated_df, load_error = _safe_load_split()
    if load_error:
        gr.Warning(f"Leaderboard data unavailable: {load_error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(load_error),
        visible=load_error is not None,
    )
    return validated_df, unvalidated_df, banner


def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Return a Submit-button update that is interactive only when logged in.

    Args:
        profile: The visitor's OAuth profile, or ``None`` when nobody is
            logged in.

    Returns:
        A ``gr.Button`` update whose ``interactive`` flag is ``True``
        exactly when ``profile`` is not ``None``.

    NOTE(review): this is a UI-side convenience only; the server-side
    login check is expected to live in the submit handler. Confirm there.
    """
    logged_in = profile is not None
    return gr.Button(interactive=logged_in)


def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Return the ``submission_id`` values of rows whose select box is ticked.

    Args:
        table_df: The admin table as delivered by the UI, or ``None``.

    Returns:
        The ids (as strings) of every row whose ``ADMIN_SELECT_COL`` cell
        is truthy, skipping falsy ids. An empty list when the frame is
        ``None``, has no rows, or lacks either required column.
    """
    if table_df is None or len(table_df) == 0:
        return []
    columns = table_df.columns
    if ADMIN_SELECT_COL not in columns or "submission_id" not in columns:
        return []
    # Coerce each cell to bool so non-bool truthy values count as ticked.
    ticked = table_df[ADMIN_SELECT_COL].apply(bool)
    chosen = table_df.loc[ticked, "submission_id"].tolist()
    return [str(value) for value in chosen if value]


def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Return the Markdown line reporting how many admin rows are ticked.

    Args:
        table_df: The admin table as delivered by the UI, or ``None``.

    Returns:
        A bold count line when at least one row is selected, otherwise an
        italic "no rows" placeholder.
    """
    count = len(_selected_ids(table_df))
    if count:
        return f"**{count}** row(s) selected."
    return "_No rows selected._"


def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
    gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
]:
    """Reload the admin table and gate every admin control on admin status.

    The table is re-read through :func:`_safe_load_admin` on each call
    (a failed read raises a ``gr.Warning``), so the returned value
    reflects current data rather than whatever existed at startup.

    Args:
        profile: The visitor's OAuth profile, or ``None`` when logged out.

    Returns:
        Twelve updates in a fixed order: the table, one radio, two
        buttons and two confirm checkboxes plus a textbox that follow the
        admin flag, four buttons that are always returned disabled, and a
        status string. Both checkboxes and the textbox are also reset to
        their empty values.

    NOTE(review): the four always-disabled buttons appear to be the
    confirmation-armed ones (delete, stop-and-delete, rescore-selected,
    rescore-all); verify against the event wiring.
    """
    table, load_error = _safe_load_admin()
    if load_error:
        gr.Warning(f"Admin table unavailable: {load_error}")
    allowed = is_admin(profile)

    if profile is None:
        status_line = "Log in with an admin account to enable the controls below."
    elif not allowed:
        status_line = (
            f"Signed in as `{profile.username}`, which is not in the admin "
            "set. Controls are disabled."
        )
    else:
        status_line = f"Signed in as `{profile.username}`. Admin controls enabled."

    def gated(component, **extra):
        """Build an update whose interactivity follows the admin flag."""
        return component(interactive=allowed, **extra)

    def disarmed():
        """Build a button update that stays disabled until re-armed."""
        return gr.Button(interactive=False)

    return (
        gated(gr.Dataframe, value=table),
        gated(gr.Radio),
        gated(gr.Button),
        gated(gr.Button),
        gated(gr.Checkbox, value=False),
        disarmed(),
        disarmed(),
        gated(gr.Checkbox, value=False),
        disarmed(),
        gated(gr.Textbox, value=""),
        disarmed(),
        status_line,
    )


def _arm_delete(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Return updates for the two destructive buttons behind one confirm box.

    Args:
        confirm: Current state of the shared confirmation checkbox.
        profile: The visitor's OAuth profile, or ``None`` when logged out.

    Returns:
        Two ``gr.Button`` updates with the same ``interactive`` flag:
        enabled only when the box is ticked and the user is an admin.
    """
    # The admin check is skipped entirely while the box is unticked.
    armed = is_admin(profile) if confirm else False
    first = gr.Button(interactive=armed)
    second = gr.Button(interactive=armed)
    return first, second


def _refresh_admin_table() -> pd.DataFrame:
    """Reload the admin table, raising a warning toast if the read failed.

    Returns:
        The frame from :func:`_safe_load_admin`: fresh data on success,
        or that helper's empty frame when the read failed.
    """
    table, load_error = _safe_load_admin()
    if load_error:
        gr.Warning(f"Admin table unavailable: {load_error}")
    return table


def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry earlier ticks over onto a freshly loaded admin frame.

    Args:
        fresh: The newly loaded admin frame. It is modified in place.
        selected: ``submission_id`` strings that were ticked before the
            reload.

    Returns:
        ``fresh`` itself. When ``selected`` is non-empty and both the
        select and ``submission_id`` columns exist, the select column is
        overwritten with a membership test against ``selected``, so ids
        that no longer exist simply drop out. Otherwise the frame is
        returned untouched.
    """
    if not selected:
        return fresh
    columns = fresh.columns
    if ADMIN_SELECT_COL not in columns or "submission_id" not in columns:
        return fresh
    # Compare as strings because ``selected`` holds stringified ids.
    ids_as_text = fresh["submission_id"].astype(str)
    fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh


def _auto_refresh_admin_table(current_df: pd.DataFrame | None) -> pd.DataFrame:
    """Reload the admin table in the background, keeping ticked rows ticked.

    No toast is shown either way.

    Args:
        current_df: The admin table currently displayed, or ``None``.

    Returns:
        On a successful read, the fresh frame with the previous selection
        re-applied. On a failed read, ``current_df`` unchanged so the
        table is not blanked, or the empty frame from
        :func:`_safe_load_admin` when there is no current frame.
    """
    reloaded, load_error = _safe_load_admin()
    if not load_error:
        previously_ticked = set(_selected_ids(current_df))
        return _reapply_selection(reloaded, previously_ticked)
    if current_df is None:
        return reloaded
    return current_df


def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote ticked rows, then refresh admin, leaderboard, and gallery.

    Re-checks :func:`admin.is_admin` server-side so a tampered client
    that re-enables the button still can't write.

    Args:
        table_df: The admin table as delivered by the UI.
        method: The chosen validation method; must be non-empty.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)`` reloaded
        after the promotion.

    Raises:
        gr.Error: The caller is not an admin, nothing is ticked, no
            method is chosen, or ``promote_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(ids, method)
    except (LookupError, ValueError) as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
    # The write succeeded; if the follow-up read fails, say so instead of
    # silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()


def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote ticked rows, then refresh admin, leaderboard, and gallery.

    Args:
        table_df: The admin table as delivered by the UI.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)`` reloaded
        after the demotion.

    Raises:
        gr.Error: The caller is not an admin, nothing is ticked, or
            ``demote_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
    # The write succeeded; if the follow-up read fails, say so instead of
    # silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()


def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.

    Resets the confirm checkbox and re-disables both destructive buttons
    on the way out so the next deletion needs a fresh, deliberate confirm.

    Args:
        table_df: The admin table as delivered by the UI.
        confirm: State of the confirmation checkbox; must be truthy.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
        button, button)`` with the checkbox cleared and both buttons
        disabled.

    Raises:
        gr.Error: The caller is not an admin, the box is unticked,
            nothing is selected, or ``delete_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(ids)
    except ValueError as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(f"Deleted {len(ids)} submission(s).")
    # The write succeeded; if the follow-up read fails, say so instead of
    # silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop running eval job(s) for ticked rows, delete them, then disarm.

    Same gating + disarm contract as :func:`_admin_delete`; the only
    difference is it calls :func:`admin.stop_and_delete_rows`, which
    best-effort cancels the submissions' in-flight HF Jobs before
    deleting. Use this for pending rows whose GPU eval is still running.

    Args:
        table_df: The admin table as delivered by the UI.
        confirm: State of the confirmation checkbox; must be truthy.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
        button, button)`` with the checkbox cleared and both buttons
        disabled.

    Raises:
        gr.Error: The caller is not an admin, the box is unticked,
            nothing is selected, or ``stop_and_delete_rows`` rejected
            the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    # The write succeeded; if the follow-up read fails, say so instead of
    # silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


# Exact phrase an admin must type to arm the board-wide rescore. A
# free-text match (not a checkbox) is the deliberate "are you sure"
# friction: it can't be tripped by a stray click and forces the admin
# to consciously type the words before the heavy, score-invalidating
# action arms.
# The typed text is stripped of surrounding whitespace before it is
# compared, and the comparison is case-sensitive.
RESCORE_ALL_PHRASE = "RESCORE ALL"


def _arm_rescore_selected(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Return the rescore-selected button update for the current confirm state.

    Args:
        confirm: Current state of the rescore confirmation checkbox.
        profile: The visitor's OAuth profile, or ``None`` when logged out.

    Returns:
        A ``gr.Button`` update that is interactive only when the box is
        ticked and the user is an admin.
    """
    # The admin check is skipped entirely while the box is unticked.
    armed = is_admin(profile) if confirm else False
    return gr.Button(interactive=armed)


def _arm_rescore_all(
    phrase: str | None, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Return the rescore-all button update for the typed confirmation text.

    Args:
        phrase: Text currently in the confirmation box; ``None`` is
            treated as empty.
        profile: The visitor's OAuth profile, or ``None`` when logged out.

    Returns:
        A ``gr.Button`` update that is interactive only when the stripped
        text equals ``RESCORE_ALL_PHRASE`` and the user is an admin.
    """
    typed = (phrase if phrase else "").strip()
    if typed != RESCORE_ALL_PHRASE:
        # Wrong phrase: stay disarmed without consulting the admin set.
        armed = False
    else:
        armed = is_admin(profile)
    return gr.Button(interactive=armed)


def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
    """Compose the toast text that summarises a rescore dispatch.

    Args:
        dispatched: Number of submissions sent for re-evaluation.
        skipped: Rows that could not be rescored; only its length is used.

    Returns:
        A sentence reporting the dispatched count, followed by a second
        sentence about skipped rows when ``skipped`` is non-empty.
    """
    sentences = [
        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
        "re-evaluating in the background. The leaderboard repopulates as "
        "each finishes."
    ]
    if skipped:
        sentences.append(
            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
            "rows can't be rescored)."
        )
    return "".join(sentences)


def _admin_rescore_selected(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
]:
    """Re-evaluate the ticked rows, refresh the views, then disarm.

    Same gating contract as the destructive handlers: server-side
    ``is_admin`` re-check, an explicit confirm tick, and a non-empty
    selection. Resets the confirm box + disarms the button on the way
    out so the next rescore needs a fresh, deliberate confirm.

    Args:
        table_df: The admin table as delivered by the UI.
        confirm: State of the confirmation checkbox; must be truthy.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
        button)`` with the checkbox cleared and the button disabled.

    Raises:
        gr.Error: The caller is not an admin, the box is unticked,
            nothing is selected, or ``rescore_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable rescore.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        dispatched, skipped = rescore_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    # The dispatch succeeded; if the follow-up read fails, say so instead
    # of silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
    )


def _admin_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
]:
    """Re-evaluate every rescoreable row, refresh the views, then disarm.

    The heavy, board-wide action: re-checks ``is_admin`` and the exact
    confirmation phrase server-side (so a tampered client that
    re-enables the button still can't fire), clears the phrase box, and
    disarms the button afterwards.

    Args:
        phrase: Text typed into the confirmation box; after stripping it
            must equal ``RESCORE_ALL_PHRASE``.
        profile: The caller's OAuth profile, or ``None`` when logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, textbox,
        button)`` with the textbox cleared and the button disabled.

    Raises:
        gr.Error: The caller is not an admin, the phrase does not match,
            or ``rescore_all`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
        raise gr.Error(
            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
        )
    try:
        dispatched, skipped = rescore_all()
    except ValueError as e:
        # Chain explicitly so logs show the original failure as the cause.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    # The dispatch succeeded; if the follow-up read fails, say so instead
    # of silently handing back empty tables.
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Textbox(value=""),
        gr.Button(interactive=False),
    )


@lru_cache(maxsize=128)
def _fetch_report_html_cached(submission_id: str) -> bytes:
    """Download ``reports/<id>.html`` and return its bytes (memoized).

    Lets any download / read exception propagate. ``lru_cache`` only
    stores return values, so a failed fetch is never memoized and the
    next request retries the Hub.
    """
    local_path = hf_hub_download(
        repo_id=HF_SUBMISSIONS_REPO,
        filename=f"reports/{submission_id}.html",
        repo_type="dataset",
    )
    return Path(local_path).read_bytes()


def _fetch_report_html(submission_id: str) -> bytes | None:
    """Pull ``reports/<id>.html`` off the submissions dataset.

    Successful fetches are cached in-process (up to 128 reports) so
    repeat clicks on the same row don't hit the Hub. Failures are
    deliberately *not* cached: previously a ``None`` result was memoized
    too, so one transient Hub error (or a request that arrived before
    the report was uploaded) kept returning 404 until the entry was
    evicted or the process restarted.

    Args:
        submission_id: Identifier used to build the ``reports/<id>.html``
            path inside ``HF_SUBMISSIONS_REPO``.

    Returns:
        The report bytes, or ``None`` on any failure so the caller can
        serve a clean 404 rather than leaking a stack trace.
    """
    try:
        return _fetch_report_html_cached(submission_id)
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id, type(e).__name__, e,
        )
        return None


# Keep the ``lru_cache`` management API reachable under the original name,
# in case other code calls ``_fetch_report_html.cache_clear()``.
_fetch_report_html.cache_clear = _fetch_report_html_cached.cache_clear
_fetch_report_html.cache_info = _fetch_report_html_cached.cache_info


def serve_report(submission_id: str) -> Response:
    """Serve a per-submission HTML report from the Space's own origin.

    The Hub returns dataset HTML under ``/resolve/`` as
    ``Content-Type: text/plain`` (dataset files can't host live HTML), so
    a direct dataset link would display the markup as source. This
    handler re-sends the same bytes as ``text/html`` so the browser
    renders them.

    Returns:
        The report with ``text/html; charset=utf-8``, or a small 404 HTML
        page when ``_fetch_report_html`` yields ``None``.
    """
    report_bytes = _fetch_report_html(submission_id)
    if report_bytes is not None:
        return Response(
            content=report_bytes,
            media_type="text/html; charset=utf-8",
        )
    return HTMLResponse(content="<h1>Report not found</h1>", status_code=404)


def _fetch_gt_render(fixture: str) -> bytes | None:
    """Download a fixture's ground-truth turntable WebP from the GT dataset.

    The file lives at ``<fixture>/renders/rotating.webp`` inside
    ``HF_DATA_GT_REPO``. GT renders belong to the data revision rather
    than to any submission, so they are read from the GT repo instead of
    being copied per submission. The result is intentionally not memoized
    in-process (GT renders can be added/updated on a data revision bump);
    ``hf_hub_download`` provides the on-disk cache. The private repo needs
    the Space ``HF_TOKEN`` to have read scope.

    Returns:
        The WebP bytes, or ``None`` on any failure (logged as a warning).
    """
    try:
        cached_copy = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/renders/rotating.webp",
            repo_type="dataset",
        )
        payload = Path(cached_copy).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture, type(exc).__name__, exc,
        )
        return None
    return payload


# Long-lived immutable caching for the Space's asset proxies. In this part
# of the file the header is attached to the GT-render, GT-file and
# task-input responses, so the browser/CDN can keep those bytes for a year
# (31536000 s): fixture swaps and repeat visits become cache hits after the
# first paint.
# NOTE(review): the fetch helpers' docstrings say GT renders / inputs can be
# added or updated on a data revision bump, while the proxied URLs carry no
# revision component -- confirm that "immutable" is still the intended
# policy for these routes.
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"


def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of a submission's plain turntable for one fixture.

    Delegates to ``render_public_url`` for the ``rotating.webp`` asset, so
    the browser loads the image from the public render bucket rather than
    through a Space proxy route. The gallery only asks for ``valid``
    fixtures; a missing upload 404s and the cell degrades to the dashed
    placeholder via ``<img onerror>``.
    """
    asset_name = "rotating.webp"
    return render_public_url(submission_id, fixture, asset_name)


def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of an editing fixture's edit-diff turntable.

    Delegates to ``render_public_url`` for the ``edit_diff.webp`` asset.
    The gallery grid uses it for editing fixtures (see
    ``gallery.build_gallery_payload``). When no diff exists (non-editing
    fixture, or an edit that never rendered one) the URL 404s and the cell
    degrades to the dashed placeholder; there is no fallback to the plain
    turntable.
    """
    asset_name = "edit_diff.webp"
    return render_public_url(submission_id, fixture, asset_name)


def _gt_proxy_url(fixture: str) -> str | None:
    """Resolver returning the cached proxy URL for a fixture's GT WebP.

    GT renders stay in the **private** GT dataset, so they cannot be public
    bucket URLs; they are still re-streamed through the Space proxy (which
    holds the read token).
    """
    return f"/gt-render/{fixture}.webp"


def serve_gt_render(fixture: str) -> Response:
    """Stream a fixture's ground-truth render WebP with long-lived caching.

    Path-traversal-guarded like the sibling ``serve_gt_file`` /
    ``serve_task_input`` handlers: a ``fixture`` containing ``..`` is
    rejected before it is interpolated into a Hub filename.

    Args:
        fixture: Fixture name taken from the ``/gt-render/{fixture}.webp``
            path parameter.

    Returns:
        The WebP bytes with ``RENDER_CACHE_CONTROL``, or an empty 404 when
        the name is rejected or the fetch fails.
    """
    # A fixture of ".." would turn the Hub filename into
    # "../renders/rotating.webp"; refuse it up front.
    if ".." in fixture:
        return Response(status_code=404)
    webp = _fetch_gt_render(fixture)
    if webp is None:
        return Response(status_code=404)
    return Response(
        content=webp,
        media_type="image/webp",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
    """Download ``<fixture>/<relpath>`` from the private GT dataset.

    Backs the hosted report's ground-truth column: the per-view PNGs
    (``renders/<view>.png``) and ``ground_truth.pdf``. Because the GT
    dataset is **private**, the Space (which holds the read token) proxies
    these files instead of linking to them. ``hf_hub_download`` supplies
    the on-disk cache.

    Returns:
        The file bytes, or ``None`` on any failure (logged as a warning);
        the report then falls back to the browser's normal missing-image
        handling.
    """
    try:
        cached_copy = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        payload = Path(cached_copy).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT file %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
    return payload


def serve_gt_file(fixture: str, relpath: str) -> Response:
    """Stream a GT asset (view PNG / PDF) with the immutable cache header.

    Any ``..`` in either path component is rejected with a 404 before the
    Hub is contacted. The hosted report points at
    ``/gt/<fixture>/<relpath>`` and the browser loads it lazily. The
    content type is guessed from ``relpath``, falling back to
    ``application/octet-stream``.

    Returns:
        The asset bytes with ``RENDER_CACHE_CONTROL``, or an empty 404 when
        the path is rejected or the fetch fails.
    """
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)

    payload = _fetch_gt_file(fixture, relpath)
    if payload is None:
        return Response(status_code=404)

    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _gallery_iframe_html() -> str:
    """Render the gallery and wrap it in a self-contained ``srcdoc`` iframe.

    Loads the live rows, renders the gallery page with the three URL
    resolvers (plain turntable, GT render, edit-diff turntable), then
    HTML-escapes the whole document into an iframe ``srcdoc`` attribute so
    it gets its own style context (no Gradio CSS collision). A
    ``LeaderboardDataError`` while loading rows is logged and degrades to
    an empty gallery instead of crashing the tab.
    """
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")
        rows = []

    page = render_gallery_page(
        rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
    )
    srcdoc = html.escape(page, quote=True)
    return "".join((
        f'<iframe srcdoc="{srcdoc}" ',
        'style="width:100%; height:90vh; border:0; display:block;" ',
        'title="CADGenBench gallery"></iframe>',
    ))


def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Download ``<fixture>/<relpath>`` from the task-inputs dataset.

    Backs the Task-browser tab's drawings / starting-shape renders. The
    inputs dataset is private, so the Space (which holds the read token)
    proxies the files rather than linking to them. The result is not
    memoized in-process because inputs can be added/updated on a data
    revision bump; ``hf_hub_download`` already caches on disk.

    Returns:
        The file bytes, or ``None`` on any failure (logged as a warning);
        the page then hides the broken tile.
    """
    try:
        cached_copy = hf_hub_download(
            repo_id=HF_DATA_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        payload = Path(cached_copy).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
    return payload


def _task_input_url(fixture: str, relpath: str) -> str:
    """Resolver returning the Space proxy URL for a task input asset.

    Returns the route string without fetching bytes (the browser
    lazy-fetches only the on-screen task's images). An absolute path
    resolves against the Space origin even inside the iframe ``srcdoc``.
    """
    return f"/task-input/{fixture}/{relpath}"


def serve_task_input(fixture: str, relpath: str) -> Response:
    """Stream a fixture input asset with the immutable cache header.

    Any ``..`` in either path component is rejected with a 404 before the
    Hub is contacted. The task browser points at
    ``/task-input/<fixture>/<relpath>`` and the browser loads it lazily;
    the Space (holding the read token) re-sends the dataset bytes. The
    content type is guessed from ``relpath``, falling back to
    ``application/octet-stream``.

    Returns:
        The asset bytes with ``RENDER_CACHE_CONTROL``, or an empty 404 when
        the path is rejected or the fetch fails.
    """
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)

    payload = _fetch_task_input(fixture, relpath)
    if payload is None:
        return Response(status_code=404)

    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _tasks_iframe_html() -> str:
    """Render the Task browser and wrap it in a ``srcdoc`` iframe.

    Snapshots only the ``<fixture>/description.yaml`` files of the inputs
    dataset (the drawings/renders load lazily through the ``/task-input``
    proxy), turns them into task cards, and HTML-escapes the rendered page
    into an iframe ``srcdoc`` so it keeps its own style context (no Gradio
    CSS collision). Any failure while downloading or parsing is logged and
    degrades to an empty browser rather than crashing the tab.
    """
    try:
        snapshot_dir = snapshot_download(
            repo_id=HF_DATA_REPO,
            repo_type="dataset",
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(snapshot_dir))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")
        tasks = []

    page = render_tasks_page(tasks, _task_input_url)
    srcdoc = html.escape(page, quote=True)
    return "".join((
        f'<iframe srcdoc="{srcdoc}" ',
        'style="width:100%; height:90vh; border:0; display:block;" ',
        'title="CADGenBench tasks"></iframe>',
    ))


# Top-level UI definition. Components are created in display order, so the
# statement order inside this `with` block is the page layout.
with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
    gr.Markdown(
        "# CADGenBench Leaderboard\n"
        "_Benchmarking AI-driven CAD generation._"
    )

    with gr.Tab("Leaderboard"):
        # Visual-first leaderboard. The bespoke surface (sticky GT row,
        # fixture picker, turntable grid, compare modal) is a
        # self-contained HTML doc inlined into an iframe `srcdoc` so it
        # keeps its own style context. Thumbnails are lazy-loaded by the
        # browser: candidate turntables via the public render-bucket URLs
        # returned by the gallery resolvers, GT renders via the
        # `/gt-render` proxy route (requires the Space to be public).
        # Built at boot, rebuilt on page load, on the button below, and
        # refreshed after admin actions.
        gallery_html = gr.HTML(value=_gallery_iframe_html())
        gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
        gallery_refresh_btn.click(
            fn=_gallery_iframe_html, outputs=gallery_html,
        )

    with gr.Tab("Detailed View"):
        # Load both tiers once at boot. `_safe_load_split` keeps a Hub
        # read failure from crashing the Space: on failure the frames
        # come up empty and `initial_error` carries the message the
        # banner renders.
        initial_validated, initial_unvalidated, initial_error = _safe_load_split()

        # Loud, persistent banner shown only when the live results
        # can't be read from the Hub (e.g. an under-scoped Space
        # HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
        # leaderboard never falls back to stale/bundled data, so this
        # banner is the signal that empty tables are a read failure,
        # not a genuinely empty leaderboard.
        data_error_banner = gr.Markdown(
            value=_data_error_banner_md(initial_error),
            visible=initial_error is not None,
        )

        # Collapsed accordions above the tables. Validation guidelines
        # gives the short two-tier story + link to the full policy
        # doc; Citation carries the verbatim BibTeX entry. Both start
        # closed so the leaderboard itself stays above the fold.
        with gr.Accordion("Validation guidelines", open=False):
            gr.Markdown(VALIDATION_GUIDELINES_MD)
        with gr.Accordion("Citation", open=False):
            # language=None -> plain monospaced render (gr.Code doesn't
            # ship a BibTeX highlighter); show_line_numbers off because
            # the entry is meant to be copy-pasted, not annotated.
            gr.Code(
                value=CITATION_BIBTEX,
                language=None,
                show_line_numbers=False,
            )

        # Two stacked tables, split by `validation_status`. Validated
        # on top so the curated results are above the fold; unvalidated
        # below carries every other row (auto-published, awaiting
        # methodology review). See decisions/validation-policy.md.
        # Initial values come from the boot-time `_safe_load_split`
        # above (empty + banner on a Hub read failure).
        # The two widgets differ only in their value, datatype list and
        # label; search and hidden columns are shared.
        validated_view = Leaderboard(
            value=initial_validated,
            datatype=VALIDATED_LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Validated Leaderboard",
            interactive=False,
        )
        unvalidated_view = Leaderboard(
            value=initial_unvalidated,
            datatype=LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Unvalidated Leaderboard",
            interactive=False,
        )
        with gr.Row():
            refresh_btn = gr.Button("Refresh", size="sm")
            # One file, both tables, `validation_status` discriminator
            # column. Fresh CSV is generated on every click so the
            # download reflects the latest data, not a stale snapshot
            # captured at boot.
            download_btn = gr.DownloadButton(
                label="Download CSV", size="sm",
            )
        # Manual refresh writes the same three outputs (both tables + the
        # error banner) as the 10-second Timer wired near the end of the
        # Blocks.
        refresh_btn.click(
            fn=_refresh_leaderboard_with_toast,
            outputs=[validated_view, unvalidated_view, data_error_banner],
        )
        # The handler's return value is routed back into the download
        # button itself.
        download_btn.click(fn=build_combined_csv, outputs=download_btn)

        # No inline row-click detail panel: the submission_name cell is a
        # deep-link that opens the self-contained per-submission report in
        # a new tab (see `_submission_name_md` in leaderboard.py). Now that
        # the Space is public, HF's edge serves `/reports/<id>.html` to
        # browser users, so we link to it directly instead of inlining the
        # (tens-to-hundreds-of-MB) report through the Gradio event payload.

    with gr.Tab("Tasks"):
        # Read-only task browser: mirrors the per-submission report's
        # summary-table -> detail-card navigation (j/k, Esc) but shows
        # only the prompt + input (drawing / starting shape), no scores
        # or ground truth. Self-contained HTML inlined into an iframe
        # `srcdoc` like the gallery; input images lazy-load from the
        # `/task-input` proxy. Built at boot, rebuilt on page load and
        # on demand via the button below.
        tasks_html = gr.HTML(value=_tasks_iframe_html())
        tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
        tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)

    with gr.Tab("Submit"):
        # Submission instructions. This is an f-string so the dataset id
        # (`HF_DATA_REPO`) is interpolated; the doubled braces around the
        # JSON example are literal braces.
        gr.Markdown(
            f"""
**Submission format.** A single zip with:

- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
  samples where your system produced a candidate. Missing `output.step`
  scores zero for that sample;
- a top-level `meta.json`:

```json
{{
  "submitter_name": "your name or team",
  "submission_name": "MyAgent v2.3 (or whatever describes your system)",
  "agent_url": "https://github.com/...   (optional)",
  "notes": "free text, optional, max 500 chars, single line, plain text",
  "agree_to_publish": true
}}
```

**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.

**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.

**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.
"""
        )
        # OAuth gate. The user must log in via the HF button before
        # the Submit button becomes interactive; the row gets the
        # canonical `hf_username` from `gr.OAuthProfile.username`
        # (not a free-text claim in meta.json). README front-matter
        # already carries `hf_oauth: true` so HF's OAuth integration
        # is wired up at the Space level.
        login_btn = gr.LoginButton()
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        # Starts disabled; the `blocks.load` handler below flips it
        # to interactive when an OAuthProfile is present.
        submit_btn = gr.Button("Submit", variant="primary", interactive=False)
        # Persistent status panel. handle_submit is a generator that
        # streams stage updates (validating -> uploading/queuing ->
        # queued) and any rejection reason here, so the outcome
        # survives instead of vanishing with a transient toast. The
        # handler also reads `gr.OAuthProfile` implicitly via its
        # parameter type annotation (Gradio's dependency-injection
        # convention).
        submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
        # Only the zip is a declared input; the OAuth profile is not
        # listed because it is injected as described above.
        submit_btn.click(
            fn=handle_submit,
            inputs=[zip_in],
            outputs=[submit_status],
        )

    with gr.Tab("About"):
        # Static Markdown only; `ABOUT_MD` is defined outside this section
        # and the tab has no event handlers.
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Admin"):
        # Maintainer-only controls. The tab is visible to everyone (a
        # hint the path exists); the table + buttons are gated to OAuth
        # users in the CADGENBENCH_ADMINS set via the `blocks.load`
        # handler below + a server-side re-check in every handler. See
        # decisions/validation-policy.md.
        gr.Markdown(
            "## Admin\n"
            "Tick rows in the **select** column, then promote them into the "
            "**Validated** tier (recording an evidence type), demote them back "
            "to **Unvalidated**, delete them, or rescore them against the "
            "current ground truth. Actions apply to every ticked row at once. "
            "Limited to maintainers in the admin set; everyone else sees the "
            "tab with the controls disabled."
        )
        admin_login_btn = gr.LoginButton()
        admin_status = gr.Markdown(
            "Log in with an admin account to enable the controls below."
        )
        # Only the leading `select` column is editable; the rest is
        # read-only context. Click-to-tick drives every action below.
        # `_safe_load_admin` keeps a Hub read failure from crashing the
        # Space at boot (the admin table loads at construction time).
        # The table itself starts non-interactive; presumably
        # `_gate_admin_controls` (wired via `blocks.load`) enables it for
        # admins -- that handler's body is not visible here.
        # NOTE(review): the `datatype` list has 9 entries while
        # `static_columns` is derived from `len(ADMIN_COLUMNS)` -- confirm
        # the two stay in step when a column is added.
        initial_admin_table, _ = _safe_load_admin()
        admin_table = gr.Dataframe(
            value=initial_admin_table,
            datatype=[
                "bool", "str", "str", "str", "str", "str", "str", "number",
                "str",
            ],
            static_columns=list(range(1, len(ADMIN_COLUMNS))),
            interactive=False,
            label="Submissions (tick select to choose rows)",
            wrap=True,
        )
        admin_selection_md = gr.Markdown("_No rows selected._")
        admin_method_radio = gr.Radio(
            choices=list(VALID_METHODS),
            value="manual",
            label="validation_method (applied to all rows on promote)",
            interactive=False,
        )
        with gr.Row():
            promote_btn = gr.Button(
                "Mark validated", variant="primary", interactive=False,
            )
            demote_btn = gr.Button("Mark unvalidated", interactive=False)
        with gr.Accordion("Danger zone: delete", open=False):
            gr.Markdown(
                "Permanently deletes the ticked rows **and** their uploaded "
                "zip + report files from the submissions dataset. This cannot "
                "be undone (only a manual revert of the dataset commit).\n\n"
                "**Stop & delete** additionally cancels any still-running "
                "evaluation job(s) for the ticked rows before deleting — use "
                "it for pending submissions whose GPU eval is in flight."
            )
            delete_confirm = gr.Checkbox(
                label=(
                    "I understand this permanently deletes the selected "
                    "submissions and their files."
                ),
                value=False,
                interactive=False,
            )
            with gr.Row():
                delete_btn = gr.Button(
                    "Delete selected", variant="stop", interactive=False,
                )
                stop_delete_btn = gr.Button(
                    "Stop & delete selected", variant="stop",
                    interactive=False,
                )
        with gr.Accordion("Danger zone: rescore", open=False):
            gr.Markdown(
                "Re-evaluates submissions against the **current** "
                "ground truth + data: each row flips back to pending, the "
                "gallery renders and the per-submission report HTML are "
                "regenerated, and the score is recomputed. Use after a "
                "ground-truth swap or a metric change that invalidates the "
                "existing scores.\n\n"
                "Rescoring is **re-runnable**: if a row's eval fails, mark it "
                "and rescore again (or rescore all) — each run is "
                "independent and converges.\n\n"
                "- **Rescore selected** re-evaluates the ticked rows.\n"
                f"- **Rescore all** re-evaluates every submission that has a "
                f"stored zip and isn't already pending — type "
                f"`{RESCORE_ALL_PHRASE}` to arm it."
            )
            rescore_confirm = gr.Checkbox(
                label=(
                    "I understand this flips the selected rows to pending and "
                    "recomputes their scores."
                ),
                value=False,
                interactive=False,
            )
            rescore_selected_btn = gr.Button(
                "Rescore selected", variant="stop", interactive=False,
            )
            rescore_all_phrase = gr.Textbox(
                label=(
                    f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
                    f"rescore"
                ),
                placeholder=RESCORE_ALL_PHRASE,
                interactive=False,
            )
            rescore_all_btn = gr.Button(
                "Rescore ALL submissions", variant="stop", interactive=False,
            )
        admin_refresh_btn = gr.Button("Refresh", size="sm")

        # --- Event wiring -------------------------------------------------
        # Every mutating handler returns the admin table, both leaderboard
        # tables and the gallery iframe, so all views are refreshed in the
        # same round trip as the action.
        admin_table.change(
            fn=_admin_selection_status,
            inputs=admin_table,
            outputs=admin_selection_md,
        )
        promote_btn.click(
            fn=_admin_promote,
            inputs=[admin_table, admin_method_radio],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        demote_btn.click(
            fn=_admin_demote,
            inputs=[admin_table],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        # The confirmation checkbox drives both delete buttons at once.
        delete_confirm.change(
            fn=_arm_delete,
            inputs=[delete_confirm],
            outputs=[delete_btn, stop_delete_btn],
        )
        # Both delete handlers also write back to the checkbox and the two
        # buttons (the trailing three outputs).
        delete_btn.click(
            fn=_admin_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        stop_delete_btn.click(
            fn=_admin_stop_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        rescore_confirm.change(
            fn=_arm_rescore_selected,
            inputs=[rescore_confirm],
            outputs=[rescore_selected_btn],
        )
        rescore_selected_btn.click(
            fn=_admin_rescore_selected,
            inputs=[admin_table, rescore_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                rescore_confirm, rescore_selected_btn,
            ],
        )
        # The phrase textbox arms the board-wide button; the click handler
        # re-checks the phrase server-side, then clears the box and disarms
        # the button through its last two outputs.
        rescore_all_phrase.change(
            fn=_arm_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[rescore_all_btn],
        )
        rescore_all_btn.click(
            fn=_admin_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                rescore_all_phrase, rescore_all_btn,
            ],
        )
        admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)

        # Keep the admin table on the same 10s cadence as the leaderboard
        # so a row that lands (or a pending row that completes) after the
        # tab loaded shows up without a manual Refresh. Selection is
        # preserved across ticks so an in-progress set of checkboxes
        # survives the reload.
        # The current table is passed in as the input so the handler can
        # see which rows are ticked.
        admin_auto_refresh_timer = gr.Timer(10)
        admin_auto_refresh_timer.tick(
            fn=_auto_refresh_admin_table,
            inputs=admin_table,
            outputs=admin_table,
        )

    # gradio_leaderboard.Leaderboard handles its own update path
    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
    # Single tick runs `_auto_refresh_leaderboard` once and pushes the
    # two halves into the validated / unvalidated widgets plus the
    # data-unavailable banner. The handler swallows a Hub read failure
    # into empty frames + a loud warning toast so a degraded read never
    # crashes the tick loop or silently blanks the tables.
    auto_refresh_timer = gr.Timer(10)
    auto_refresh_timer.tick(
        fn=_auto_refresh_leaderboard,
        outputs=[validated_view, unvalidated_view, data_error_banner],
    )

    # On page load, read the visitor's OAuth profile (None if not
    # logged in) and flip the Submit button's interactivity. Runs once
    # per page load; LoginButton clicks also re-trigger this through
    # Gradio's auth-event plumbing.
    blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
    # Rebuild both iframes on every page load so a visitor gets current
    # content rather than the markup rendered when the process booted.
    blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
    blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)

    # Same per-load OAuth read, gating the Admin tab's controls on
    # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
    # visitors get the tab with everything disabled.
    # NOTE(review): the handler presumably returns one update per entry
    # in this list, in this order -- keep the two in sync when adding a
    # control.
    blocks.load(
        fn=_gate_admin_controls,
        outputs=[
            admin_table,
            admin_method_radio,
            promote_btn,
            demote_btn,
            delete_confirm,
            delete_btn,
            stop_delete_btn,
            rescore_confirm,
            rescore_selected_btn,
            rescore_all_phrase,
            rescore_all_btn,
            admin_status,
        ],
    )


# Mount Gradio under a FastAPI parent so the custom proxy routes
# defined above live at the same origin as the UI. Direct routes on `app`
# get checked before the Gradio sub-app, so `/reports/<sid>.html`
# never gets shadowed.
app = FastAPI()
app.add_api_route(
    "/reports/{submission_id}.html",
    serve_report,
    methods=["GET"],
)
# Cached render proxies the gallery's lazy-loaded turntables point at.
# Registered before the Gradio mount so they're not shadowed by the
# catch-all sub-app.
# Candidate renders are served directly from the public render bucket (URLs
# come from the gallery resolvers), so only the private GT render still needs a
# token-holding Space proxy route.
app.add_api_route(
    "/gt-render/{fixture}.webp",
    serve_gt_render,
    methods=["GET"],
)
# Ground-truth assets the hosted report links lazily (per-view PNGs + PDF).
# GT is private, so this token-holding proxy streams them; the `:path`
# converter lets `relpath` carry a slash (e.g. renders/iso.png). Registered
# before the Gradio mount so it isn't shadowed by the catch-all sub-app.
app.add_api_route(
    "/gt/{fixture}/{relpath:path}",
    serve_gt_file,
    methods=["GET"],
)
# Task-browser input assets (drawings + starting-shape renders). The
# `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
# Registered before the Gradio mount so it's not shadowed.
app.add_api_route(
    "/task-input/{fixture}/{relpath:path}",
    serve_task_input,
    methods=["GET"],
)
# Mount last: the Gradio UI at "/" is the catch-all the comments above
# refer to. `app` is rebound to whatever `mount_gradio_app` returns, and
# that object is what the `__main__` guard hands to uvicorn.
app = gr.mount_gradio_app(app, blocks, path="/")


if __name__ == "__main__":
    # Local / container entry point. Bind address and port come from the
    # Gradio-style environment variables, defaulting to all interfaces on
    # port 7860.
    uvicorn.run(
        app,
        host=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
    )