# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.
Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
:mod:`submit`. Both are wired into the Gradio Blocks below. The
Gradio app is mounted under a FastAPI parent so the custom
``/reports/{submission_id}.html`` route can re-serve dataset HTML
with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
as ``text/plain`` by policy, which makes the browser show source
rather than render).
"""
from __future__ import annotations
import html
import logging
import mimetypes
import os
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, Response
from gradio_leaderboard import Leaderboard
from huggingface_hub import hf_hub_download, snapshot_download
from leaderboard import (
ADMIN_COLUMNS,
ADMIN_SELECT_COL,
HF_DATA_GT_REPO,
HF_DATA_REPO,
HF_SUBMISSIONS_REPO,
LEADERBOARD_COLS,
LEADERBOARD_DATATYPES,
LEADERBOARD_HIDE_COLUMNS,
VALIDATED_LEADERBOARD_COLS,
VALIDATED_LEADERBOARD_DATATYPES,
LeaderboardDataError,
_fmt_timestamp,
_load_rows_from_hub,
build_combined_csv,
load_admin_table,
load_leaderboard_split,
render_public_url,
)
from gallery import render_gallery_page
from tasks import load_tasks_from_dir, render_tasks_page
from admin import (
VALID_METHODS,
delete_rows,
demote_rows,
is_admin,
promote_rows,
rescore_all,
rescore_rows,
stop_and_delete_rows,
)
from submit import handle_submit
logger = logging.getLogger(__name__)

# Route INFO-and-above records from this module and its helpers
# (leaderboard.py + submit.py) into the Space's runtime logs. Without a
# configured root handler those logger.info / logger.warning /
# logger.exception calls go nowhere and any refresh / worker pathology is
# silent. The format carries timestamp + level + module + message.
_LOG_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
# The canonical validation-policy document is kept in the code repository,
# so contributors browsing GitHub find it without visiting the Space. Both
# the Detailed View tab's Validation Guidelines accordion and the About tab
# link to it.
VALIDATION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench"
    "/blob/main/docs/benchmark/validation.md"
)
# Markdown body for the About section. This is an f-string, so the dataset
# repo ids (HF_DATA_REPO, HF_SUBMISSIONS_REPO) and VALIDATION_DOC_URL are
# interpolated once, when the module is imported.
ABOUT_MD = f"""## About
**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.
- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
[`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
submissions and computed results in
[`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""
# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
# Kept as a raw string so the backslash in ``\url`` is emitted literally;
# do not reflow or reformat the entry.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
author = {Rabinovich, Michael and {Hugging Face}},
title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
year = {2026},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/cadgenbench-leaderboard}},
}"""
# Short Markdown summary of the Unvalidated -> Validated promotion flow,
# ending with a link to the full policy (VALIDATION_DOC_URL is interpolated
# at import time).
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).
Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
# Placeholder (Markdown, italic) for the submit status area before any
# submission has been attempted.
SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. "
    "Progress and any errors appear here._"
)
def _data_error_banner_md(message: str | None) -> str:
    """Markdown for the top-of-tab data-unavailable banner.

    Returns an empty string when there is no error (the banner is also
    hidden via ``visible=False`` in that case). When the live
    ``results.jsonl`` can't be read, the banner is the loud, persistent
    signal that the tables below are empty *by design* (we never fall back
    to stale or bundled data) rather than because the leaderboard is
    genuinely empty.

    Args:
        message: The error text to show, or ``None`` / ``""`` for no error.

    Returns:
        A Markdown blockquote describing the failure, or ``""``.
    """
    if not message:
        return ""
    # The detail is rendered as an inline code span inside a blockquote.
    # Raw exception text can contain newlines, blank lines and backticks,
    # any of which would end the code span / blockquote early and garble
    # the banner, so flatten it to one line and neutralise backticks.
    detail = " ".join(message.split()).replace("`", "'") or "unknown error"
    return (
        "> ⚠️ **Leaderboard data unavailable.** The live results could not "
        "be read from the Hub, so the tables below are empty. No stale or "
        "cached data is ever shown in its place.\n>\n"
        f"> Details: `{detail}`"
    )
def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load both leaderboard tiers without ever raising a data error.

    :func:`load_leaderboard_split` deliberately *raises* on a read failure
    (no silent fallback). The Space has to stay up and surface that failure
    loudly instead of crashing, so a :class:`LeaderboardDataError` is logged
    and converted here into two empty, correctly-shaped frames plus the
    error text for the caller to render in the banner / a toast.

    Returns:
        ``(validated, unvalidated, error)``; ``error`` is ``None`` on
        success.
    """
    try:
        validated_df, unvalidated_df = load_leaderboard_split()
    except LeaderboardDataError as exc:
        logger.exception("Leaderboard data load failed")
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated, str(exc)
    return validated_df, unvalidated_df, None
def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Admin-table counterpart to :func:`_safe_load_split`.

    Same no-crash contract: when :func:`load_admin_table` raises
    :class:`LeaderboardDataError`, the failure is logged and an empty frame
    with the admin columns is returned together with the error text, rather
    than letting the exception propagate.

    Returns:
        ``(admin_df, error)``; ``error`` is ``None`` on success.
    """
    try:
        table = load_admin_table()
    except LeaderboardDataError as exc:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(exc)
    return table, None
def _refresh_leaderboard_with_toast():
    """Manual Refresh handler: toast, fresh frames, and banner update.

    The outcome is always announced: ``gr.Info`` when the reload worked,
    ``gr.Warning`` when the live read failed. The third return value keeps
    the data-unavailable banner in sync (visible with the error text on
    failure, hidden and empty on success).

    Returns:
        ``(validated_df, unvalidated_df, banner_update)``.
    """
    validated_df, unvalidated_df, failure = _safe_load_split()
    if not failure:
        gr.Info("Leaderboard refreshed.")
    else:
        gr.Warning(f"Leaderboard data unavailable: {failure}")
    banner = gr.Markdown(
        value=_data_error_banner_md(failure),
        visible=failure is not None,
    )
    return validated_df, unvalidated_df, banner
def _auto_refresh_leaderboard():
    """Timer-tick handler: fresh frames and banner, silent on success.

    Same outputs as :func:`_refresh_leaderboard_with_toast`, minus the
    success toast (one per tick would be noise). A failed read still raises
    a ``gr.Warning`` and updates the banner, so a degraded Hub read cannot
    quietly leave the tables blank.

    Returns:
        ``(validated_df, unvalidated_df, banner_update)``.
    """
    validated_df, unvalidated_df, failure = _safe_load_split()
    if failure:
        gr.Warning(f"Leaderboard data unavailable: {failure}")
    banner = gr.Markdown(
        value=_data_error_banner_md(failure),
        visible=failure is not None,
    )
    return validated_df, unvalidated_df, banner
def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Make the Submit button interactive only for a logged-in visitor.

    Intended for the page-load event: Gradio injects ``gr.OAuthProfile``
    (``None`` when the visitor has not logged in via the LoginButton). This
    is the visible half of the gate; :func:`submit.handle_submit` is
    documented to enforce the same rule server-side.

    Args:
        profile: The visitor's OAuth profile, or ``None`` if logged out.

    Returns:
        A ``gr.Button`` update carrying the new ``interactive`` flag.
    """
    logged_in = profile is not None
    return gr.Button(interactive=logged_in)
def _is_ticked(value: object) -> bool:
    """Interpret one ``select`` cell as a checkbox state.

    Plain ``bool(value)`` is unsafe here: ``bool(float("nan"))`` and
    ``bool("false")`` are both ``True`` and ``bool(pd.NA)`` raises, so a
    blank or stringified cell would count as a ticked row. Missing values
    are treated as unticked and strings are parsed by content.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return value.strip().lower() in {"true", "1", "yes", "on"}
    try:
        if pd.isna(value):
            return False
    except (TypeError, ValueError):
        # Non-scalar cell (e.g. a list): fall through to plain truthiness.
        pass
    return bool(value)


def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Submission ids of the rows whose ``select`` checkbox is ticked.

    The result gates the destructive admin actions, so only unambiguous
    ticks count (see :func:`_is_ticked`) and rows with a missing or empty
    ``submission_id`` are skipped rather than emitted as ``"nan"``.

    Args:
        table_df: The admin table as received from the UI, or ``None``.

    Returns:
        The ticked rows' ids as strings, in table order. Empty when the
        frame is ``None``, has no rows, or lacks either required column.
    """
    if (
        table_df is None
        or len(table_df) == 0
        or ADMIN_SELECT_COL not in table_df.columns
        or "submission_id" not in table_df.columns
    ):
        return []
    mask = table_df[ADMIN_SELECT_COL].apply(_is_ticked).astype(bool)
    ticked_ids = table_df.loc[mask, "submission_id"].dropna()
    return [str(sid) for sid in ticked_ids.tolist() if sid]
def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Markdown count line for the admin table's current selection.

    Args:
        table_df: The admin table as received from the UI, or ``None``.

    Returns:
        ``"**N** row(s) selected."`` when at least one row is ticked,
        otherwise an italic "no rows selected" notice.
    """
    count = len(_selected_ids(table_df))
    if count:
        return f"**{count}** row(s) selected."
    return "_No rows selected._"
def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
    gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
]:
    """Enable the admin controls only for a logged-in member of the admin set.

    The admin table is reloaded from the Hub on every call, so the tab shows
    live rows rather than whatever existed when the process booted; a failed
    reload raises a ``gr.Warning`` and yields an empty table. Everyone who
    is not an admin (including logged-out visitors) gets a read-only table
    with every control disabled. The armed-by-confirmation buttons always
    come back disarmed and the confirm boxes / phrase box come back cleared;
    they only arm once the confirmation is given again.

    Args:
        profile: The visitor's OAuth profile, or ``None`` if logged out.

    Returns:
        Twelve values in the order of the return annotation: the table
        update, eleven control updates, and the status line.
    """
    table, load_error = _safe_load_admin()
    if load_error:
        gr.Warning(f"Admin table unavailable: {load_error}")

    allowed = is_admin(profile)
    if profile is None:
        status_line = (
            "Log in with an admin account to enable the controls below."
        )
    elif not allowed:
        status_line = (
            f"Signed in as `{profile.username}`, which is not in the admin "
            "set. Controls are disabled."
        )
    else:
        status_line = (
            f"Signed in as `{profile.username}`. Admin controls enabled."
        )

    # Positional order must match the outputs list of the event wiring.
    return (
        gr.Dataframe(value=table, interactive=allowed),
        gr.Radio(interactive=allowed),
        gr.Button(interactive=allowed),
        gr.Button(interactive=allowed),
        # Confirm checkbox reset, followed by two buttons left disarmed.
        gr.Checkbox(interactive=allowed, value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
        # Second confirm checkbox reset and its disarmed button.
        gr.Checkbox(interactive=allowed, value=False),
        gr.Button(interactive=False),
        # Phrase box cleared and its disarmed button.
        gr.Textbox(interactive=allowed, value=""),
        gr.Button(interactive=False),
        status_line,
    )
def _arm_delete(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Arm both destructive buttons once an admin ticks the confirm box.

    Plain delete and stop-and-delete share one confirm checkbox, so both
    buttons get the same ``interactive`` state. ``is_admin`` is only
    consulted when the box is actually ticked.

    Args:
        confirm: Current state of the shared confirm checkbox.
        profile: The visitor's OAuth profile, or ``None`` if logged out.

    Returns:
        Two ``gr.Button`` updates with identical ``interactive`` flags.
    """
    is_armed = bool(confirm) and is_admin(profile)
    return tuple(gr.Button(interactive=is_armed) for _ in range(2))
def _refresh_admin_table() -> pd.DataFrame:
    """Admin Refresh handler: reload the admin table, warn on failure.

    Goes through the no-crash :func:`_safe_load_admin`, so a Hub read
    failure shows up as a ``gr.Warning`` toast plus an empty table instead
    of an uncaught exception.

    Returns:
        The freshly loaded admin frame (empty on failure).
    """
    table, failure = _safe_load_admin()
    if failure:
        gr.Warning(f"Admin table unavailable: {failure}")
    return table
def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry previously ticked rows over onto a freshly loaded admin frame.

    The ``select`` column of ``fresh`` is overwritten in place so that it is
    ``True`` exactly for rows whose ``submission_id`` (compared as a string)
    is in ``selected``. Ids that are no longer present simply drop out. When
    nothing was selected, or either column is missing, ``fresh`` is returned
    untouched.

    Args:
        fresh: The newly loaded admin frame; mutated in place.
        selected: Submission ids that were ticked before the reload.

    Returns:
        The same ``fresh`` object.
    """
    if not selected:
        return fresh
    columns = fresh.columns
    if ADMIN_SELECT_COL in columns and "submission_id" in columns:
        ids_as_text = fresh["submission_id"].astype(str)
        fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh
def _auto_refresh_admin_table(current_df: pd.DataFrame | None) -> pd.DataFrame:
    """Timer-tick handler: reload the admin table, preserving ticked rows.

    Keeps the admin table current on a timer so newly submitted rows show
    up without a manual Refresh. It never toasts. On a Hub read failure the
    frame currently on screen is returned unchanged, so a transient blip
    neither blanks the table nor drops the user's selection; the empty
    fallback frame is used only when there is no current frame at all.

    Args:
        current_df: The admin table currently shown in the UI, or ``None``.

    Returns:
        The refreshed frame with prior ticks re-applied, or the current
        frame on failure.
    """
    fresh_df, failure = _safe_load_admin()
    if not failure:
        previously_ticked = set(_selected_ids(current_df))
        return _reapply_selection(fresh_df, previously_ticked)
    if current_df is None:
        return fresh_df
    return current_df
def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote ticked rows, then refresh admin, leaderboard, and gallery.

    Re-checks :func:`admin.is_admin` server-side so a tampered client that
    re-enables the button still can't write.

    Args:
        table_df: The admin table as received from the UI.
        method: The chosen ``validation_method``.
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: If the caller is not an admin, nothing is ticked, no
            method is picked, or ``promote_rows`` rejects the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(ids, method)
    except (LookupError, ValueError) as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The write already succeeded; a failed re-read must not pass off empty
    # tables as the real state, so surface it like the refresh handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote ticked rows, then refresh admin, leaderboard, and gallery.

    Args:
        table_df: The admin table as received from the UI.
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: If the caller is not an admin, nothing is ticked, or
            ``demote_rows`` rejects the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The write already succeeded; a failed re-read must not pass off empty
    # tables as the real state, so surface it like the refresh handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.

    Resets the confirm checkbox and re-disables both destructive buttons on
    the way out so the next deletion needs a fresh, deliberate confirm.

    Args:
        table_df: The admin table as received from the UI.
        confirm: State of the delete confirmation checkbox.
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_reset,
        button_disarm, button_disarm)``.

    Raises:
        gr.Error: If the caller is not an admin, the confirm box is not
            ticked, nothing is selected, or ``delete_rows`` rejects the
            request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(ids)
    except ValueError as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(f"Deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The delete already happened; a failed re-read must not pass off empty
    # tables as the real state, so surface it like the refresh handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop running eval job(s) for ticked rows, delete them, then disarm.

    Same gating + disarm contract as :func:`_admin_delete`; the only
    difference is that it calls :func:`admin.stop_and_delete_rows`, which is
    documented to best-effort cancel the submissions' in-flight jobs before
    deleting. Intended for pending rows whose eval is still running.

    Args:
        table_df: The admin table as received from the UI.
        confirm: State of the delete confirmation checkbox.
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_reset,
        button_disarm, button_disarm)``.

    Raises:
        gr.Error: If the caller is not an admin, the confirm box is not
            ticked, nothing is selected, or ``stop_and_delete_rows``
            rejects the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The delete already happened; a failed re-read must not pass off empty
    # tables as the real state, so surface it like the refresh handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
# Exact phrase an admin must type to arm the board-wide rescore. A
# free-text match (not a checkbox) is the deliberate "are you sure"
# friction: it can't be tripped by a stray click and forces the admin
# to consciously type the words before the heavy, score-invalidating
# action arms.
# The comparison is case-sensitive and made against the typed text after
# stripping surrounding whitespace (see _arm_rescore_all and
# _admin_rescore_all).
RESCORE_ALL_PHRASE = "RESCORE ALL"
def _arm_rescore_selected(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-selected button once an admin ticks its confirm box.

    ``is_admin`` is only consulted when the box is actually ticked.

    Args:
        confirm: Current state of the rescore confirm checkbox.
        profile: The visitor's OAuth profile, or ``None`` if logged out.

    Returns:
        A ``gr.Button`` update carrying the new ``interactive`` flag.
    """
    armed = is_admin(profile) if confirm else False
    return gr.Button(interactive=armed)
def _arm_rescore_all(
    phrase: str | None, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-all button only on an exact phrase match by an admin.

    The typed text is stripped of surrounding whitespace and compared
    case-sensitively with ``RESCORE_ALL_PHRASE``; ``is_admin`` is only
    consulted when the phrase matches.

    Args:
        phrase: Current content of the confirmation textbox (may be
            ``None``).
        profile: The visitor's OAuth profile, or ``None`` if logged out.

    Returns:
        A ``gr.Button`` update carrying the new ``interactive`` flag.
    """
    typed = phrase if phrase else ""
    if typed.strip() != RESCORE_ALL_PHRASE:
        return gr.Button(interactive=False)
    return gr.Button(interactive=is_admin(profile))
def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
    """Build the toast text that summarises a rescore dispatch.

    Args:
        dispatched: Number of submissions sent for re-evaluation.
        skipped: Ids that were skipped; only their count is reported.

    Returns:
        One sentence about the dispatched rows, followed by a "Skipped N"
        sentence when ``skipped`` is non-empty.
    """
    sentences = [
        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
        "re-evaluating in the background. The leaderboard repopulates as "
        "each finishes."
    ]
    if skipped:
        sentences.append(
            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
            "rows can't be rescored)."
        )
    return "".join(sentences)
def _admin_rescore_selected(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
]:
    """Re-evaluate the ticked rows, refresh the views, then disarm.

    Same gating contract as the destructive handlers: server-side
    ``is_admin`` re-check, an explicit confirm tick, and a non-empty
    selection. Resets the confirm box + disarms the button on the way out
    so the next rescore needs a fresh, deliberate confirm.

    Args:
        table_df: The admin table as received from the UI.
        confirm: State of the rescore confirmation checkbox.
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_reset,
        button_disarm)``.

    Raises:
        gr.Error: If the caller is not an admin, the confirm box is not
            ticked, nothing is selected, or ``rescore_rows`` rejects the
            request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable rescore.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        dispatched, skipped = rescore_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The dispatch already happened; a failed re-read must not pass off
    # empty tables as the real state, so surface it like the refresh
    # handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
    )
def _admin_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
]:
    """Re-evaluate every rescoreable row, refresh the views, then disarm.

    The heavy, board-wide action: re-checks ``is_admin`` and the exact
    confirmation phrase server-side (so a tampered client that re-enables
    the button still can't fire), clears the phrase box, and disarms the
    button afterwards.

    Args:
        phrase: Content of the confirmation textbox (may be ``None``).
        profile: The caller's OAuth profile, or ``None`` if logged out.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, phrase_reset,
        button_disarm)``.

    Raises:
        gr.Error: If the caller is not an admin, the phrase does not match
            ``RESCORE_ALL_PHRASE``, or ``rescore_all`` rejects the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
        raise gr.Error(
            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
        )
    try:
        dispatched, skipped = rescore_all()
    except ValueError as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, split_error = _safe_load_split()
    admin_df, admin_error = _safe_load_admin()
    # The dispatch already happened; a failed re-read must not pass off
    # empty tables as the real state, so surface it like the refresh
    # handlers do.
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Textbox(value=""),
        gr.Button(interactive=False),
    )
@lru_cache(maxsize=128)
def _download_report_html(submission_id: str) -> bytes:
    """Download ``reports/<id>.html`` and return its bytes; raise on failure.

    Only this raising helper is memoized. ``lru_cache`` stores return
    values, not exceptions, so a failed download is retried on the next
    request instead of being remembered.
    """
    local_path = hf_hub_download(
        repo_id=HF_SUBMISSIONS_REPO,
        filename=f"reports/{submission_id}.html",
        repo_type="dataset",
    )
    return Path(local_path).read_bytes()


def _fetch_report_html(submission_id: str) -> bytes | None:
    """Pull ``reports/<id>.html`` off the submissions dataset.

    Successful fetches are cached in-process so repeat clicks on the same
    row don't hit the Hub. Failures are *not* cached: previously the
    ``None`` result was memoized too, so one transient Hub error, or a
    click on a report that had not been uploaded yet, kept returning 404
    until the entry was evicted or the process restarted.

    Args:
        submission_id: Id of the submission whose report is wanted.

    Returns:
        The report's raw bytes, or ``None`` on any failure so the caller
        can serve a clean 404 rather than leaking a stack trace.
    """
    try:
        return _download_report_html(submission_id)
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id, type(e).__name__, e,
        )
        return None


# Keep the lru_cache management API reachable under the original name for
# any caller that invalidates or inspects the report cache.
_fetch_report_html.cache_clear = _download_report_html.cache_clear  # type: ignore[attr-defined]
_fetch_report_html.cache_info = _download_report_html.cache_info  # type: ignore[attr-defined]
def serve_report(submission_id: str) -> Response:
    """Proxy a per-submission HTML report through the Space.

    Per the module notes, the Hub serves dataset HTML under ``/resolve/``
    as ``text/plain``, so a direct dataset link shows source instead of a
    rendered page. This handler re-serves the same bytes from the Space
    with an HTML content type.

    Args:
        submission_id: Id of the submission whose report is requested.

    Returns:
        The report as ``text/html; charset=utf-8``, or a small 404 HTML
        page when the report could not be fetched.
    """
    report_bytes = _fetch_report_html(submission_id)
    if report_bytes is not None:
        return Response(
            content=report_bytes,
            media_type="text/html; charset=utf-8",
        )
    return HTMLResponse(
        content="<h1>Report not found</h1>",
        status_code=404,
    )
def _fetch_gt_render(fixture: str) -> bytes | None:
    """Pull a fixture's ground-truth turntable WebP from the GT dataset.

    The file lives at ``<fixture>/renders/rotating.webp`` inside
    ``HF_DATA_GT_REPO``. GT renders belong to the data revision rather than
    to any submission, so they are read straight from the GT repo. The
    result is deliberately not memoized in-process (a data revision bump can
    add or update renders); ``hf_hub_download`` provides the on-disk cache.
    The original notes state the GT repo is private and needs the Space
    token's read scope.

    Args:
        fixture: Fixture name, used as the top-level directory in the repo.

    Returns:
        The WebP bytes, or ``None`` on any failure (logged as a warning) so
        the caller can answer with a 404.
    """
    try:
        cached_path = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/renders/rotating.webp",
            repo_type="dataset",
        )
        return Path(cached_path).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture, type(exc).__name__, exc,
        )
        return None
# Long-lived immutable caching: a (submission, fixture) render never
# changes (fixed camera + lighting; re-renders would be a new artifact),
# so the browser/CDN can keep it forever. This is what makes fixture
# swaps and repeat visits free: only the ~33 on-screen turntables are
# fetched on first paint, and everything after that is a cache hit.
# The value is sent as the ``Cache-Control`` header by the GT proxy
# responses (serve_gt_render, serve_gt_file); max-age is 31536000 seconds
# (365 days).
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"
def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of a submission's plain turntable render.

    Delegates to :func:`render_public_url` for the ``rotating.webp`` asset
    of the given (submission, fixture) pair. According to the original
    notes this is a public render-bucket URL that the browser fetches
    directly, and a missing upload degrades client-side via the image's
    error handler.

    Args:
        submission_id: Id of the submission.
        fixture: Fixture name.

    Returns:
        Whatever :func:`render_public_url` returns (a URL or ``None``).
    """
    turntable_url = render_public_url(submission_id, fixture, "rotating.webp")
    return turntable_url
def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve the URL of an editing fixture's edit-diff turntable.

    Delegates to :func:`render_public_url` for the ``edit_diff.webp`` asset
    of the given (submission, fixture) pair. No fallback to the plain
    turntable is attempted here; per the original notes a miss simply 404s
    and the gallery shows its placeholder cell.

    Args:
        submission_id: Id of the submission.
        fixture: Fixture name.

    Returns:
        Whatever :func:`render_public_url` returns (a URL or ``None``).
    """
    diff_url = render_public_url(submission_id, fixture, "edit_diff.webp")
    return diff_url
def _gt_proxy_url(fixture: str) -> str | None:
    """Build the Space-relative proxy URL for a fixture's GT WebP.

    GT renders are proxied through the Space rather than linked directly;
    the original notes give the reason that the GT dataset is private and
    only the Space holds the read token.

    Args:
        fixture: Fixture name.

    Returns:
        ``/gt-render/<fixture>.webp``.
    """
    return "/gt-render/{}.webp".format(fixture)
def serve_gt_render(fixture: str) -> Response:
    """Stream a fixture's ground-truth render WebP with long-lived caching.

    Path-traversal-guarded (``..`` rejected), matching the sibling
    ``serve_gt_file`` / ``serve_task_input`` routes: ``fixture`` comes straight
    from the request URL and is interpolated into the Hub filename, so it is
    rejected before any fetch is attempted.

    Args:
        fixture: Fixture name taken from ``/gt-render/<fixture>.webp``.

    Returns:
        A 200 ``image/webp`` response carrying ``RENDER_CACHE_CONTROL``, or an
        empty 404 when the name is rejected or the render cannot be fetched.
    """
    if ".." in fixture:
        return Response(status_code=404)
    webp = _fetch_gt_render(fixture)
    if webp is None:
        return Response(status_code=404)
    return Response(
        content=webp,
        media_type="image/webp",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
    """Read one ground-truth asset, ``<fixture>/<relpath>``, from the GT dataset.

    Backs the hosted report's ground-truth column: the per-view PNGs
    (``renders/<view>.png``) and ``ground_truth.pdf``. The GT dataset is
    **private**, so the Space (which holds the read token) proxies these
    rather than linking them directly. Disk caching is left to
    ``hf_hub_download`` (per revision).

    Args:
        fixture: Fixture directory inside the GT dataset.
        relpath: Path of the asset relative to that directory.

    Returns:
        The file's bytes, or ``None`` on any failure (logged as a warning);
        the report then hides the broken tile via the browser's normal
        missing-image handling.
    """
    try:
        cached_file = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        payload = Path(cached_file).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT file %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
    return payload
def serve_gt_file(fixture: str, relpath: str) -> Response:
    """Stream a GT asset (view PNG / PDF) with long-lived immutable caching.

    Path-traversal-guarded: any ``..`` in either component is answered with a
    404 before the Hub is touched. The hosted report references
    ``/gt/<fixture>/<relpath>`` and the browser fetches it lazily. The bytes
    are a property of the data revision (not of any submission), so the same
    immutable ``Cache-Control`` as the render/input proxies is applied.

    Args:
        fixture: Fixture directory taken from the request path.
        relpath: Asset path below the fixture; may contain slashes.

    Returns:
        A 200 response whose media type is guessed from ``relpath``
        (``application/octet-stream`` when unknown), or an empty 404.
    """
    if any(".." in component for component in (fixture, relpath)):
        return Response(status_code=404)
    payload = _fetch_gt_file(fixture, relpath)
    if payload is None:
        return Response(status_code=404)
    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _gallery_iframe_html() -> str:
    """Render the gallery and wrap it in a self-contained ``srcdoc`` iframe.

    Loads the live rows and renders the gallery page. The turntable URLs come
    from the resolvers passed in: ``_render_proxy_url`` and
    ``_render_diff_proxy_url`` for candidate renders (public render bucket)
    and ``_gt_proxy_url`` for ground truth (the ``/gt-render`` Space proxy).
    The images themselves are lazy-loaded by the browser. The whole document
    is HTML-escaped into the iframe's ``srcdoc`` attribute so it gets its own
    style context (no Gradio CSS collision).

    Returns:
        The ``<iframe>`` markup. A ``LeaderboardDataError`` while loading rows
        is logged and degrades to an empty gallery rather than crashing the
        tab.
    """
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")
        rows = []
    page = render_gallery_page(
        rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
    )
    srcdoc = html.escape(page, quote=True)
    iframe_attrs = (
        f'srcdoc="{srcdoc}"',
        'style="width:100%; height:90vh; border:0; display:block;"',
        'title="CADGenBench gallery"',
    )
    return "<iframe " + " ".join(iframe_attrs) + "></iframe>"
def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Read one fixture input asset, ``<fixture>/<relpath>``, from the inputs repo.

    Backs the Task-browser tab's drawings / starting-shape renders. The inputs
    dataset is private, so the Space (which holds the read token) proxies
    these rather than linking them directly. Deliberately not memoized:
    inputs can be added/updated on a data revision bump, and
    ``hf_hub_download`` already does per-revision disk caching.

    Args:
        fixture: Fixture directory inside the inputs dataset.
        relpath: Path of the asset relative to that directory.

    Returns:
        The file's bytes, or ``None`` on any failure (logged as a warning);
        the page then hides the broken tile.
    """
    try:
        cached_file = hf_hub_download(
            repo_id=HF_DATA_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        payload = Path(cached_file).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture, relpath, type(exc).__name__, exc,
        )
        return None
    return payload
def _task_input_url(fixture: str, relpath: str) -> str:
"""Resolver returning the Space proxy URL for a task input asset.
Returns the route string without fetching bytes (the browser
lazy-fetches only the on-screen task's images). An absolute path
resolves against the Space origin even inside the iframe ``srcdoc``.
"""
return f"/task-input/{fixture}/{relpath}"
def serve_task_input(fixture: str, relpath: str) -> Response:
    """Stream a fixture input asset with long-lived immutable caching.

    Path-traversal-guarded: any ``..`` in either component is answered with a
    404 before the Hub is touched. The task browser references
    ``/task-input/<fixture>/<relpath>`` and the browser fetches it lazily.
    The dataset bytes are re-streamed (the Space holds the read token) with
    the same immutable ``Cache-Control`` as the render proxies, so the
    CDN/browser cache them hard.

    Args:
        fixture: Fixture directory taken from the request path.
        relpath: Asset path below the fixture; may contain slashes.

    Returns:
        A 200 response whose media type is guessed from ``relpath``
        (``application/octet-stream`` when unknown), or an empty 404.
    """
    if any(".." in component for component in (fixture, relpath)):
        return Response(status_code=404)
    payload = _fetch_task_input(fixture, relpath)
    if payload is None:
        return Response(status_code=404)
    guessed_type, _encoding = mimetypes.guess_type(relpath)
    return Response(
        content=payload,
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )
def _tasks_iframe_html() -> str:
    """Render the Task browser and wrap it in a self-contained ``srcdoc`` iframe.

    Snapshots only the ``<fixture>/description.yaml`` files from the inputs
    dataset (lightweight: the drawings/renders load lazily through the
    ``/task-input`` proxy), shapes them into task cards, and HTML-escapes the
    rendered page into the iframe's ``srcdoc`` so it keeps its own style
    context (no Gradio CSS collision).

    Returns:
        The ``<iframe>`` markup. Any exception while snapshotting or parsing
        is logged and degrades to an empty browser rather than crashing the
        tab.
    """
    try:
        snapshot_dir = snapshot_download(
            repo_id=HF_DATA_REPO,
            repo_type="dataset",
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(snapshot_dir))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")
        tasks = []
    page = render_tasks_page(tasks, _task_input_url)
    srcdoc = html.escape(page, quote=True)
    iframe_attrs = (
        f'srcdoc="{srcdoc}"',
        'style="width:100%; height:90vh; border:0; display:block;"',
        'title="CADGenBench tasks"',
    )
    return "<iframe " + " ".join(iframe_attrs) + "></iframe>"
# Top-level Gradio UI: a title banner followed by one tab per surface
# (Leaderboard, Detailed View, Tasks, Submit, About, Admin).
with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
gr.Markdown(
"# CADGenBench Leaderboard\n"
"_Benchmarking AI-driven CAD generation._"
)
with gr.Tab("Leaderboard"):
# Visual-first leaderboard. The bespoke surface (sticky GT row,
# fixture picker, turntable grid, compare modal) is a
# self-contained HTML doc inlined into an iframe `srcdoc` so it
# keeps its own style context. Thumbnails are lazy-loaded from the
# URLs the gallery resolvers return: candidate turntables straight
# from the public render bucket, GT turntables from the cached
# `/gt-render` proxy route (requires the Space to be public).
# Built at boot, rebuilt on page load, and refreshed after admin
# actions.
gallery_html = gr.HTML(value=_gallery_iframe_html())
gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
# Manual rebuild of the gallery iframe from the current rows.
gallery_refresh_btn.click(
fn=_gallery_iframe_html, outputs=gallery_html,
)
with gr.Tab("Detailed View"):
# Load both tiers once at boot. `_safe_load_split` keeps a Hub
# read failure from crashing the Space: on failure the frames
# come up empty and `initial_error` carries the message the
# banner renders.
initial_validated, initial_unvalidated, initial_error = _safe_load_split()
# Loud, persistent banner shown only when the live results
# can't be read from the Hub (e.g. an under-scoped Space
# HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
# leaderboard never falls back to stale/bundled data, so this
# banner is the signal that empty tables are a read failure,
# not a genuinely empty leaderboard.
data_error_banner = gr.Markdown(
value=_data_error_banner_md(initial_error),
visible=initial_error is not None,
)
# Collapsed accordions above the tables. Validation guidelines
# gives the short two-tier story + link to the full policy
# doc; Citation carries the verbatim BibTeX entry. Both start
# closed so the leaderboard itself stays above the fold.
with gr.Accordion("Validation guidelines", open=False):
gr.Markdown(VALIDATION_GUIDELINES_MD)
with gr.Accordion("Citation", open=False):
# language=None -> plain monospaced render (gr.Code doesn't
# ship a BibTeX highlighter); show_line_numbers off because
# the entry is meant to be copy-pasted, not annotated.
gr.Code(
value=CITATION_BIBTEX,
language=None,
show_line_numbers=False,
)
# Two stacked tables, split by `validation_status`. Validated
# on top so the curated results are above the fold; unvalidated
# below carries every other row (auto-published, awaiting
# methodology review). See decisions/validation-policy.md.
# Initial values come from the boot-time `_safe_load_split`
# above (empty + banner on a Hub read failure).
# The two tables differ only in value, datatype list and label;
# search and hidden columns are shared.
validated_view = Leaderboard(
value=initial_validated,
datatype=VALIDATED_LEADERBOARD_DATATYPES,
search_columns=["submission_name", "submitter_name"],
hide_columns=LEADERBOARD_HIDE_COLUMNS,
label="Validated Leaderboard",
interactive=False,
)
unvalidated_view = Leaderboard(
value=initial_unvalidated,
datatype=LEADERBOARD_DATATYPES,
search_columns=["submission_name", "submitter_name"],
hide_columns=LEADERBOARD_HIDE_COLUMNS,
label="Unvalidated Leaderboard",
interactive=False,
)
with gr.Row():
refresh_btn = gr.Button("Refresh", size="sm")
# One file, both tables, `validation_status` discriminator
# column. Fresh CSV is generated on every click so the
# download reflects the latest data, not a stale snapshot
# captured at boot.
download_btn = gr.DownloadButton(
label="Download CSV", size="sm",
)
# Manual refresh updates both tables and the data-error banner together.
refresh_btn.click(
fn=_refresh_leaderboard_with_toast,
outputs=[validated_view, unvalidated_view, data_error_banner],
)
# The handler's result is routed back to the download button itself.
download_btn.click(fn=build_combined_csv, outputs=download_btn)
# No inline row-click detail panel: the submission_name cell is a
# deep-link that opens the self-contained per-submission report in
# a new tab (see `_submission_name_md` in leaderboard.py). Now that
# the Space is public, HF's edge serves `/reports/<id>.html` to
# browser users, so we link to it directly instead of inlining the
# (tens-to-hundreds-of-MB) report through the Gradio event payload.
with gr.Tab("Tasks"):
# Read-only task browser: mirrors the per-submission report's
# summary-table -> detail-card navigation (j/k, Esc) but shows
# only the prompt + input (drawing / starting shape), no scores
# or ground truth. Self-contained HTML inlined into an iframe
# `srcdoc` like the gallery; input images lazy-load from the
# `/task-input` proxy. Built at boot, rebuilt on page load.
tasks_html = gr.HTML(value=_tasks_iframe_html())
tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
# Manual rebuild of the task-browser iframe.
tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)
with gr.Tab("Submit"):
# Submission instructions. This is an f-string: `{HF_DATA_REPO}` is
# interpolated, and the doubled braces around the JSON example render as
# literal `{` / `}`.
gr.Markdown(
f"""
**Submission format.** A single zip with:
- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
samples where your system produced a candidate. Missing `output.step`
scores zero for that sample;
- a top-level `meta.json`:
```json
{{
"submitter_name": "your name or team",
"submission_name": "MyAgent v2.3 (or whatever describes your system)",
"agent_url": "https://github.com/... (optional)",
"notes": "free text, optional, max 500 chars, single line, plain text",
"agree_to_publish": true
}}
```
**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.
"""
)
# OAuth gate. The user must log in via the HF button before
# the Submit button becomes interactive; the row gets the
# canonical `hf_username` from `gr.OAuthProfile.username`
# (not a free-text claim in meta.json). README front-matter
# already carries `hf_oauth: true` so HF's OAuth integration
# is wired up at the Space level.
login_btn = gr.LoginButton()
zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
# Starts disabled; the `blocks.load` handler below flips it
# to interactive when an OAuthProfile is present.
submit_btn = gr.Button("Submit", variant="primary", interactive=False)
# Persistent status panel. handle_submit is a generator that
# streams stage updates (validating -> uploading/queuing ->
# queued) and any rejection reason here, so the outcome
# survives instead of vanishing with a transient toast. The
# handler also reads `gr.OAuthProfile` implicitly via its
# parameter type annotation (Gradio's dependency-injection
# convention).
submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
# Only the uploaded zip is wired as an explicit input; the status panel
# is the sole output.
submit_btn.click(
fn=handle_submit,
inputs=[zip_in],
outputs=[submit_status],
)
with gr.Tab("About"):
# Static tab: renders the ABOUT_MD markdown constant, no handlers.
gr.Markdown(ABOUT_MD)
with gr.Tab("Admin"):
# Maintainer-only controls. The tab is visible to everyone (a
# hint the path exists); the table + buttons are gated to OAuth
# users in the CADGENBENCH_ADMINS set via the `blocks.load`
# handler below + a server-side re-check in every handler. See
# decisions/validation-policy.md.
gr.Markdown(
"## Admin\n"
"Tick rows in the **select** column, then promote them into the "
"**Validated** tier (recording an evidence type), demote them back "
"to **Unvalidated**, delete them, or rescore them against the "
"current ground truth. Actions apply to every ticked row at once. "
"Limited to maintainers in the admin set; everyone else sees the "
"tab with the controls disabled."
)
admin_login_btn = gr.LoginButton()
admin_status = gr.Markdown(
"Log in with an admin account to enable the controls below."
)
# Only the leading `select` column is editable; the rest is
# read-only context. Click-to-tick drives every action below.
# `_safe_load_admin` keeps a Hub read failure from crashing the
# Space at boot (the admin table loads at construction time).
initial_admin_table, _ = _safe_load_admin()
admin_table = gr.Dataframe(
value=initial_admin_table,
# One datatype per column; the leading "bool" is the select column.
# NOTE(review): nine entries here - confirm this stays in sync with
# len(ADMIN_COLUMNS).
datatype=[
"bool", "str", "str", "str", "str", "str", "str", "number",
"str",
],
# Every column except index 0 is locked against edits.
static_columns=list(range(1, len(ADMIN_COLUMNS))),
interactive=False,
label="Submissions (tick select to choose rows)",
wrap=True,
)
admin_selection_md = gr.Markdown("_No rows selected._")
admin_method_radio = gr.Radio(
choices=list(VALID_METHODS),
value="manual",
label="validation_method (applied to all rows on promote)",
interactive=False,
)
with gr.Row():
promote_btn = gr.Button(
"Mark validated", variant="primary", interactive=False,
)
demote_btn = gr.Button("Mark unvalidated", interactive=False)
with gr.Accordion("Danger zone: delete", open=False):
gr.Markdown(
"Permanently deletes the ticked rows **and** their uploaded "
"zip + report files from the submissions dataset. This cannot "
"be undone (only a manual revert of the dataset commit).\n\n"
"**Stop & delete** additionally cancels any still-running "
"evaluation job(s) for the ticked rows before deleting — use "
"it for pending submissions whose GPU eval is in flight."
)
delete_confirm = gr.Checkbox(
label=(
"I understand this permanently deletes the selected "
"submissions and their files."
),
value=False,
interactive=False,
)
with gr.Row():
delete_btn = gr.Button(
"Delete selected", variant="stop", interactive=False,
)
stop_delete_btn = gr.Button(
"Stop & delete selected", variant="stop",
interactive=False,
)
with gr.Accordion("Danger zone: rescore", open=False):
gr.Markdown(
"Re-evaluates submissions against the **current** "
"ground truth + data: each row flips back to pending, the "
"gallery renders and the per-submission report HTML are "
"regenerated, and the score is recomputed. Use after a "
"ground-truth swap or a metric change that invalidates the "
"existing scores.\n\n"
"Rescoring is **re-runnable**: if a row's eval fails, mark it "
"and rescore again (or rescore all) — each run is "
"independent and converges.\n\n"
"- **Rescore selected** re-evaluates the ticked rows.\n"
f"- **Rescore all** re-evaluates every submission that has a "
f"stored zip and isn't already pending — type "
f"`{RESCORE_ALL_PHRASE}` to arm it."
)
rescore_confirm = gr.Checkbox(
label=(
"I understand this flips the selected rows to pending and "
"recomputes their scores."
),
value=False,
interactive=False,
)
rescore_selected_btn = gr.Button(
"Rescore selected", variant="stop", interactive=False,
)
rescore_all_phrase = gr.Textbox(
label=(
f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
f"rescore"
),
placeholder=RESCORE_ALL_PHRASE,
interactive=False,
)
rescore_all_btn = gr.Button(
"Rescore ALL submissions", variant="stop", interactive=False,
)
admin_refresh_btn = gr.Button("Refresh", size="sm")
# Event wiring. Every mutating action (promote, demote, delete, rescore)
# writes back to the admin table, both leaderboard tables and the gallery
# iframe. The delete / rescore handlers additionally write to their own
# confirm controls and buttons (presumably to disarm them after the
# action - confirm in the handlers).
admin_table.change(
fn=_admin_selection_status,
inputs=admin_table,
outputs=admin_selection_md,
)
promote_btn.click(
fn=_admin_promote,
inputs=[admin_table, admin_method_radio],
outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
)
demote_btn.click(
fn=_admin_demote,
inputs=[admin_table],
outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
)
# Ticking the confirmation checkbox drives both delete buttons.
delete_confirm.change(
fn=_arm_delete,
inputs=[delete_confirm],
outputs=[delete_btn, stop_delete_btn],
)
delete_btn.click(
fn=_admin_delete,
inputs=[admin_table, delete_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
delete_confirm, delete_btn, stop_delete_btn,
],
)
stop_delete_btn.click(
fn=_admin_stop_delete,
inputs=[admin_table, delete_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
delete_confirm, delete_btn, stop_delete_btn,
],
)
# "Rescore selected" is driven by its checkbox; "Rescore ALL" is driven
# by the typed phrase textbox instead.
rescore_confirm.change(
fn=_arm_rescore_selected,
inputs=[rescore_confirm],
outputs=[rescore_selected_btn],
)
rescore_selected_btn.click(
fn=_admin_rescore_selected,
inputs=[admin_table, rescore_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
rescore_confirm, rescore_selected_btn,
],
)
rescore_all_phrase.change(
fn=_arm_rescore_all,
inputs=[rescore_all_phrase],
outputs=[rescore_all_btn],
)
rescore_all_btn.click(
fn=_admin_rescore_all,
inputs=[rescore_all_phrase],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
rescore_all_phrase, rescore_all_btn,
],
)
admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
# Keep the admin table on the same 10s cadence as the leaderboard
# so a row that lands (or a pending row that completes) after the
# tab loaded shows up without a manual Refresh. Selection is
# preserved across ticks so an in-progress set of checkboxes
# survives the reload.
admin_auto_refresh_timer = gr.Timer(10)
# The current table is passed in as well as written back, so the handler
# can see the existing ticks.
admin_auto_refresh_timer.tick(
fn=_auto_refresh_admin_table,
inputs=admin_table,
outputs=admin_table,
)
# gradio_leaderboard.Leaderboard handles its own update path
# cleanly; bind a Timer to push fresh dataframes every 10 seconds.
# Single tick runs `_auto_refresh_leaderboard` once and pushes the
# two halves into the validated / unvalidated widgets plus the
# data-unavailable banner. The handler swallows a Hub read failure
# into empty frames + a loud warning toast so a degraded read never
# crashes the tick loop or silently blanks the tables.
auto_refresh_timer = gr.Timer(10)
auto_refresh_timer.tick(
fn=_auto_refresh_leaderboard,
outputs=[validated_view, unvalidated_view, data_error_banner],
)
# On page load, read the visitor's OAuth profile (None if not
# logged in) and flip the Submit button's interactivity. Runs once
# per page load; LoginButton clicks also re-trigger this through
# Gradio's auth-event plumbing.
blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
# Also rebuild the gallery and task-browser iframes on every page load, so
# a visitor gets freshly rendered documents rather than the ones built when
# the components were constructed.
blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
# Same per-load OAuth read, gating the Admin tab's controls on
# membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
# visitors get the tab with everything disabled.
# NOTE(review): the handler presumably returns one update per component in
# this list, in this order - keep the two in sync when adding a control.
blocks.load(
fn=_gate_admin_controls,
outputs=[
admin_table,
admin_method_radio,
promote_btn,
demote_btn,
delete_confirm,
delete_btn,
stop_delete_btn,
rescore_confirm,
rescore_selected_btn,
rescore_all_phrase,
rescore_all_btn,
admin_status,
],
)
# Gradio is mounted under a FastAPI parent so the custom routes live at the
# same origin as the UI. Direct routes on `app` get checked before the Gradio
# sub-app, so every route below is registered before the mount and is never
# shadowed by the catch-all sub-app.
app = FastAPI()
for _route_path, _route_handler in (
    # Per-submission report pages.
    ("/reports/{submission_id}.html", serve_report),
    # Cached GT render the gallery's lazy-loaded turntables point at.
    # Candidate renders are served directly from the public render bucket
    # (URLs come from the gallery resolvers), so only the private GT render
    # still needs a token-holding Space proxy route.
    ("/gt-render/{fixture}.webp", serve_gt_render),
    # Ground-truth assets the hosted report links lazily (per-view PNGs +
    # PDF). GT is private, so this token-holding proxy streams them; the
    # `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
    ("/gt/{fixture}/{relpath:path}", serve_gt_file),
    # Task-browser input assets (drawings + starting-shape renders); `:path`
    # again lets `relpath` carry a slash.
    ("/task-input/{fixture}/{relpath:path}", serve_task_input),
):
    app.add_api_route(_route_path, _route_handler, methods=["GET"])
app = gr.mount_gradio_app(app, blocks, path="/")
if __name__ == "__main__":
    # Script entry point: serve the combined FastAPI + Gradio app with
    # uvicorn. Bind address and port come from GRADIO_SERVER_NAME /
    # GRADIO_SERVER_PORT, defaulting to all interfaces on port 7860.
    uvicorn.run(
        app,
        host=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
    )
|