Spaces:
Running
Clean up unused code, files, and assets
Browse files
Removed unused Python code:
- svg_to_data_uri() from leaderboard_transformer.py
- create_svg_html() from ui_components.py
- format_llm_base_with_html() from ui_components.py
Consolidated duplicate code:
- Renamed get_company_from_model_name to get_company_from_model in leaderboard_transformer.py
- Removed duplicate COMPANY_LOGO_MAP and get_company_from_model from ui_components.py
- ui_components.py and visualizations.py now import from leaderboard_transformer.py
Removed unused files:
- submission.py (submission functionality not exposed in app)
- submission_utils.py (only used by submission.py)
- generate_mock_jsonl.py (development utility)
- tests/integration/test_submission.py (tests for removed submission code)
Removed unused config:
- CONTACT_DATASET and SUBMISSION_DATASET from config.py
Removed unused CSS:
- Submission page CSS section from content.py
Removed 35 unused asset files:
- Various legend SVGs (api-*, c-*, os-*, custom-*, standard-*, equivalent-*)
- Category icons (code-execution, data-analysis, etc.)
- Star SVGs (three-point, four-point, five-point)
- Old logo files (openhands-logo.svg, logo-alibaba.svg, logo-huggingface.svg)
- Unused ellipse colors and other SVGs
- assets/api-custom.svg +0 -3
- assets/api-equivalent.svg +0 -3
- assets/api-legend.svg +0 -3
- assets/api-standard.svg +0 -3
- assets/c-custom.svg +0 -3
- assets/c-equivalent.svg +0 -3
- assets/c-legend.svg +0 -3
- assets/c-standard.svg +0 -3
- assets/code-execution.svg +0 -265
- assets/custom-legend.svg +0 -3
- assets/data-analysis.svg +0 -265
- assets/ellipse-coral.svg +0 -3
- assets/ellipse-white.svg +0 -3
- assets/end-to-end-discovery.svg +0 -265
- assets/equivalent-legend.svg +0 -3
- assets/five-point-star.svg +0 -3
- assets/four-point-star.svg +0 -3
- assets/just-icon.svg +0 -3
- assets/literature-understanding.svg +0 -265
- assets/logo-alibaba.svg +0 -1
- assets/logo-huggingface.svg +0 -1
- assets/openhands-logo.svg +0 -1
- assets/openhands_mark_color_forwhite.png +0 -0
- assets/os-custom.svg +0 -3
- assets/os-equivalent.svg +0 -3
- assets/os-legend.svg +0 -3
- assets/os-ow-custom.svg +0 -3
- assets/os-ow-equivalent.svg +0 -3
- assets/os-ow-legend.svg +0 -3
- assets/os-ow-standard.svg +0 -3
- assets/os-standard.svg +0 -3
- assets/overall.svg +0 -261
- assets/standard-legend.svg +0 -3
- assets/three-point-star.svg +0 -3
- assets/up-arrow.svg +0 -3
- config.py +0 -6
- content.py +0 -122
- generate_mock_jsonl.py +0 -161
- leaderboard_transformer.py +2 -12
- submission.py +0 -474
- submission_utils.py +0 -88
- tests/integration/test_submission.py +0 -108
- ui_components.py +2 -90
- visualizations.py +3 -2
|
Binary file (17.6 kB)
|
|
|
|
@@ -5,16 +5,10 @@ CONFIG_NAME = os.getenv("HF_CONFIG", "1.0.0-dev1") # This corresponds to 'config
|
|
| 5 |
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
|
| 6 |
|
| 7 |
# OpenHands Index datasets
|
| 8 |
-
CONTACT_DATASET = f"OpenHands/openhands-index-contact-info"
|
| 9 |
-
|
| 10 |
if IS_INTERNAL:
|
| 11 |
-
# datasets backing the internal leaderboard
|
| 12 |
-
SUBMISSION_DATASET = f"OpenHands/openhands-index-internal-submissions"
|
| 13 |
RESULTS_DATASET = f"OpenHands/openhands-index-internal-results"
|
| 14 |
LEADERBOARD_PATH = f"OpenHands/openhands-index-internal-leaderboard"
|
| 15 |
else:
|
| 16 |
-
# datasets backing the public leaderboard
|
| 17 |
-
SUBMISSION_DATASET = f"OpenHands/openhands-index-submissions"
|
| 18 |
RESULTS_DATASET = f"OpenHands/openhands-index-results"
|
| 19 |
LEADERBOARD_PATH = f"OpenHands/openhands-index"
|
| 20 |
|
|
|
|
| 5 |
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
|
| 6 |
|
| 7 |
# OpenHands Index datasets
|
|
|
|
|
|
|
| 8 |
if IS_INTERNAL:
|
|
|
|
|
|
|
| 9 |
RESULTS_DATASET = f"OpenHands/openhands-index-internal-results"
|
| 10 |
LEADERBOARD_PATH = f"OpenHands/openhands-index-internal-leaderboard"
|
| 11 |
else:
|
|
|
|
|
|
|
| 12 |
RESULTS_DATASET = f"OpenHands/openhands-index-results"
|
| 13 |
LEADERBOARD_PATH = f"OpenHands/openhands-index"
|
| 14 |
|
|
@@ -631,129 +631,7 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
|
|
| 631 |
margin: 20px 0;
|
| 632 |
max-width: 800px;
|
| 633 |
}
|
| 634 |
-
/*------ Submission Page CSS ------*/
|
| 635 |
-
#submission-modal .modal-container,
|
| 636 |
-
#success-modal .modal-container {
|
| 637 |
-
height: auto;
|
| 638 |
-
max-width: 600px;
|
| 639 |
-
}
|
| 640 |
-
|
| 641 |
-
#submission-modal-content,
|
| 642 |
-
#success-modal .submission-modal-content {
|
| 643 |
-
padding: 20px;
|
| 644 |
-
background-color: inherit;
|
| 645 |
-
border-radius: 8px;
|
| 646 |
-
text-align: center;
|
| 647 |
-
}
|
| 648 |
-
|
| 649 |
-
#submission-modal-content p,
|
| 650 |
-
#success-modal .submission-modal-content p {
|
| 651 |
-
font-size: 16px;
|
| 652 |
-
}
|
| 653 |
|
| 654 |
-
#legal-modal-content {
|
| 655 |
-
padding: 30px;
|
| 656 |
-
background-color: inherit;
|
| 657 |
-
border-radius: 8px;
|
| 658 |
-
text-align: left;
|
| 659 |
-
font-size: 14px;
|
| 660 |
-
}
|
| 661 |
-
|
| 662 |
-
#legal-modal-content h2 {
|
| 663 |
-
text-align: center;
|
| 664 |
-
}
|
| 665 |
-
#legal-modal-content button {
|
| 666 |
-
width: fit-content;
|
| 667 |
-
}
|
| 668 |
-
.spinner-container {
|
| 669 |
-
display: flex;
|
| 670 |
-
flex-direction: column;
|
| 671 |
-
align-items: center;
|
| 672 |
-
justify-content: center;
|
| 673 |
-
padding: 30px;
|
| 674 |
-
}
|
| 675 |
-
|
| 676 |
-
.spinner {
|
| 677 |
-
width: 50px;
|
| 678 |
-
height: 50px;
|
| 679 |
-
border: 5px solid #dee2e6;
|
| 680 |
-
border-top: 5px solid #007bff;
|
| 681 |
-
border-radius: 50%;
|
| 682 |
-
animation: spin 1s linear infinite;
|
| 683 |
-
margin-bottom: 20px;
|
| 684 |
-
}
|
| 685 |
-
|
| 686 |
-
@keyframes spin {
|
| 687 |
-
0% { transform: rotate(0deg); }
|
| 688 |
-
100% { transform: rotate(360deg); }
|
| 689 |
-
}
|
| 690 |
-
|
| 691 |
-
#submission-page-container {
|
| 692 |
-
max-width: 800px;
|
| 693 |
-
margin: 0 auto;
|
| 694 |
-
}
|
| 695 |
-
|
| 696 |
-
#submission-file-label {
|
| 697 |
-
padding: 10px;
|
| 698 |
-
}
|
| 699 |
-
|
| 700 |
-
#submission-button {
|
| 701 |
-
max-width: fit-content;
|
| 702 |
-
font-size: 14px;
|
| 703 |
-
}
|
| 704 |
-
|
| 705 |
-
.custom-form-group {
|
| 706 |
-
border: 1px solid #000 !important;
|
| 707 |
-
border-radius: 4px !important;
|
| 708 |
-
padding: 24px !important;
|
| 709 |
-
overflow: visible !important;
|
| 710 |
-
}
|
| 711 |
-
|
| 712 |
-
#openness-label-html,
|
| 713 |
-
#agent-tooling-label-html,
|
| 714 |
-
#agent-info-label-html,
|
| 715 |
-
#submitter-info-label-html,
|
| 716 |
-
#username-label-html,
|
| 717 |
-
#email-label-html,
|
| 718 |
-
#role-label-html {
|
| 719 |
-
padding-left: 12px;
|
| 720 |
-
}
|
| 721 |
-
|
| 722 |
-
.form-label {
|
| 723 |
-
margin: 4px 0px 0px 6px;
|
| 724 |
-
}
|
| 725 |
-
|
| 726 |
-
.form-label-fieldset {
|
| 727 |
-
padding-top: 10px !important;
|
| 728 |
-
}
|
| 729 |
-
|
| 730 |
-
#agent-tooling-label-html {
|
| 731 |
-
padding-top: 6px;
|
| 732 |
-
}
|
| 733 |
-
|
| 734 |
-
.custom-form-group,
|
| 735 |
-
.styler {
|
| 736 |
-
background: none;
|
| 737 |
-
}
|
| 738 |
-
|
| 739 |
-
#feedback-button {
|
| 740 |
-
display: inline-block;
|
| 741 |
-
background-color: var(--color-primary-link);
|
| 742 |
-
color: white;
|
| 743 |
-
border: none;
|
| 744 |
-
border-radius: 4px;
|
| 745 |
-
padding: 15px 20px;
|
| 746 |
-
font-size: 16px;
|
| 747 |
-
cursor: pointer;
|
| 748 |
-
transition: all 0.3s ease;
|
| 749 |
-
text-decoration: none;
|
| 750 |
-
}
|
| 751 |
-
|
| 752 |
-
#feedback-button:hover {
|
| 753 |
-
background-color: var(--color-button-hover);
|
| 754 |
-
transform: translateY(-2px);
|
| 755 |
-
box-shadow: 0 6px 12px rgba(0,0,0,0.3);
|
| 756 |
-
}
|
| 757 |
.dark #main-header h2 {
|
| 758 |
color: var(--color-primary-accent);
|
| 759 |
}
|
|
|
|
| 631 |
margin: 20px 0;
|
| 632 |
max-width: 800px;
|
| 633 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
.dark #main-header h2 {
|
| 636 |
color: var(--color-primary-accent);
|
| 637 |
}
|
|
@@ -1,161 +0,0 @@
|
|
| 1 |
-
"""Generate mock results data in JSONL format for OpenHands Index."""
|
| 2 |
-
import json
|
| 3 |
-
import os
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from datetime import datetime
|
| 6 |
-
|
| 7 |
-
# Define the 5 benchmarks
|
| 8 |
-
BENCHMARKS = {
|
| 9 |
-
"swe-bench": {
|
| 10 |
-
"tags": ["swe-bench"],
|
| 11 |
-
"metric": "resolve_rate",
|
| 12 |
-
"metric_display": "Resolve Rate (%)"
|
| 13 |
-
},
|
| 14 |
-
"swe-bench-multimodal": {
|
| 15 |
-
"tags": ["swe-bench-multimodal"],
|
| 16 |
-
"metric": "resolve_rate",
|
| 17 |
-
"metric_display": "Resolve Rate (%)"
|
| 18 |
-
},
|
| 19 |
-
"swt-bench": {
|
| 20 |
-
"tags": ["swt-bench"],
|
| 21 |
-
"metric": "success_rate",
|
| 22 |
-
"metric_display": "Success Rate (%)"
|
| 23 |
-
},
|
| 24 |
-
"commit0": {
|
| 25 |
-
"tags": ["commit0"],
|
| 26 |
-
"metric": "test_pass_rate",
|
| 27 |
-
"metric_display": "Test Pass Rate (%)"
|
| 28 |
-
},
|
| 29 |
-
"gaia": {
|
| 30 |
-
"tags": ["gaia"],
|
| 31 |
-
"metric": "accuracy",
|
| 32 |
-
"metric_display": "Accuracy (%)"
|
| 33 |
-
}
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
# Mock agents with realistic scores
|
| 37 |
-
MOCK_AGENTS = [
|
| 38 |
-
{
|
| 39 |
-
"agent_name": "1.0.2",
|
| 40 |
-
"llm_base": "claude-3-5-sonnet-20241022",
|
| 41 |
-
"openness": "closed",
|
| 42 |
-
"scores": {
|
| 43 |
-
"swe-bench": 48.3,
|
| 44 |
-
"swe-bench-multimodal": 42.1,
|
| 45 |
-
"swt-bench": 65.4,
|
| 46 |
-
"commit0": 71.2,
|
| 47 |
-
"gaia": 58.7
|
| 48 |
-
}
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"agent_name": "1.0.1",
|
| 52 |
-
"llm_base": "gpt-4o-2024-11-20",
|
| 53 |
-
"openness": "closed",
|
| 54 |
-
"scores": {
|
| 55 |
-
"swe-bench": 45.1,
|
| 56 |
-
"swe-bench-multimodal": 39.5,
|
| 57 |
-
"swt-bench": 62.3,
|
| 58 |
-
"commit0": 68.9,
|
| 59 |
-
"gaia": 55.2
|
| 60 |
-
}
|
| 61 |
-
},
|
| 62 |
-
{
|
| 63 |
-
"agent_name": "1.0.0",
|
| 64 |
-
"llm_base": "gpt-4-turbo-2024-04-09",
|
| 65 |
-
"openness": "closed",
|
| 66 |
-
"scores": {
|
| 67 |
-
"swe-bench": 38.7,
|
| 68 |
-
"swe-bench-multimodal": 34.2,
|
| 69 |
-
"swt-bench": 54.1,
|
| 70 |
-
"commit0": 61.5,
|
| 71 |
-
"gaia": 48.3
|
| 72 |
-
}
|
| 73 |
-
},
|
| 74 |
-
{
|
| 75 |
-
"agent_name": "0.9.5",
|
| 76 |
-
"llm_base": "gpt-4o-mini-2024-07-18",
|
| 77 |
-
"openness": "closed",
|
| 78 |
-
"scores": {
|
| 79 |
-
"swe-bench": 32.5,
|
| 80 |
-
"swe-bench-multimodal": 28.9,
|
| 81 |
-
"swt-bench": 47.8,
|
| 82 |
-
"commit0": 55.3,
|
| 83 |
-
"gaia": 42.1
|
| 84 |
-
}
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"agent_name": "0.9.0",
|
| 88 |
-
"llm_base": "claude-3-opus-20240229",
|
| 89 |
-
"openness": "closed",
|
| 90 |
-
"scores": {
|
| 91 |
-
"swe-bench": 29.8,
|
| 92 |
-
"swe-bench-multimodal": 25.7,
|
| 93 |
-
"swt-bench": 44.2,
|
| 94 |
-
"commit0": 52.1,
|
| 95 |
-
"gaia": 39.4
|
| 96 |
-
}
|
| 97 |
-
},
|
| 98 |
-
]
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def generate_mock_data():
|
| 102 |
-
"""Generate mock JSONL files for all benchmarks."""
|
| 103 |
-
output_dir = Path("mock_results/1.0.0-dev1")
|
| 104 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 105 |
-
|
| 106 |
-
# Create agenteval.json config
|
| 107 |
-
config = {
|
| 108 |
-
"suite_config": {
|
| 109 |
-
"name": "openhands-index",
|
| 110 |
-
"version": "1.0.0-dev1",
|
| 111 |
-
"splits": []
|
| 112 |
-
}
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
# Generate data for each benchmark
|
| 116 |
-
for benchmark_name, benchmark_info in BENCHMARKS.items():
|
| 117 |
-
print(f"Generating mock data for {benchmark_name}...")
|
| 118 |
-
|
| 119 |
-
# Add to config
|
| 120 |
-
config["suite_config"]["splits"].append({
|
| 121 |
-
"name": benchmark_name,
|
| 122 |
-
"tasks": [{
|
| 123 |
-
"name": benchmark_name,
|
| 124 |
-
"tags": benchmark_info["tags"]
|
| 125 |
-
}]
|
| 126 |
-
})
|
| 127 |
-
|
| 128 |
-
# Generate JSONL file
|
| 129 |
-
jsonl_path = output_dir / f"{benchmark_name}.jsonl"
|
| 130 |
-
with open(jsonl_path, 'w') as f:
|
| 131 |
-
for agent in MOCK_AGENTS:
|
| 132 |
-
record = {
|
| 133 |
-
"agent_name": agent["agent_name"],
|
| 134 |
-
"llm_base": agent["llm_base"],
|
| 135 |
-
"openness": agent["openness"],
|
| 136 |
-
"score": agent["scores"][benchmark_name],
|
| 137 |
-
"metric": benchmark_info["metric"],
|
| 138 |
-
"submission_time": datetime.now().isoformat(),
|
| 139 |
-
"tags": benchmark_info["tags"],
|
| 140 |
-
# Additional metadata
|
| 141 |
-
"cost_per_instance": round(0.1 + agent["scores"][benchmark_name] * 0.01, 4),
|
| 142 |
-
"average_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
|
| 143 |
-
}
|
| 144 |
-
f.write(json.dumps(record) + '\n')
|
| 145 |
-
|
| 146 |
-
print(f" Created {jsonl_path}")
|
| 147 |
-
|
| 148 |
-
# Write config file
|
| 149 |
-
config_path = output_dir / "agenteval.json"
|
| 150 |
-
with open(config_path, 'w') as f:
|
| 151 |
-
json.dump(config, f, indent=2)
|
| 152 |
-
print(f"\nCreated config: {config_path}")
|
| 153 |
-
|
| 154 |
-
print("\n✓ Mock data generation complete!")
|
| 155 |
-
print(f" Location: {output_dir}")
|
| 156 |
-
print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
|
| 157 |
-
print(f" Agents: {len(MOCK_AGENTS)}")
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
if __name__ == "__main__":
|
| 161 |
-
generate_mock_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -128,7 +128,7 @@ def add_branding_to_figure(fig: go.Figure) -> go.Figure:
|
|
| 128 |
return fig
|
| 129 |
|
| 130 |
|
| 131 |
-
def get_company_from_model_name(model_name: str) -> dict:
|
| 132 |
"""
|
| 133 |
Gets the company info (logo path and name) from a model name.
|
| 134 |
Returns default unknown logo if no match found.
|
|
@@ -696,7 +696,7 @@ def _plot_scatter_plotly(
|
|
| 696 |
|
| 697 |
for _, row in data_plot.iterrows():
|
| 698 |
model_name = row.get('Language Model', '')
|
| 699 |
-
company_info = get_company_from_model_name(model_name)
|
| 700 |
logo_path = company_info['path']
|
| 701 |
|
| 702 |
# Read the SVG file and encode as base64 data URI
|
|
@@ -994,16 +994,6 @@ def get_pareto_df(data, cost_col=None, score_col=None):
|
|
| 994 |
return pd.DataFrame(pareto_points)
|
| 995 |
|
| 996 |
|
| 997 |
-
def svg_to_data_uri(path: str) -> str:
|
| 998 |
-
"""Reads an SVG file and encodes it as a Data URI for Plotly."""
|
| 999 |
-
try:
|
| 1000 |
-
with open(path, "rb") as f:
|
| 1001 |
-
encoded_string = base64.b64encode(f.read()).decode()
|
| 1002 |
-
return f"data:image/svg+xml;base64,{encoded_string}"
|
| 1003 |
-
except FileNotFoundError:
|
| 1004 |
-
logger.warning(f"SVG file not found at: {path}")
|
| 1005 |
-
return None
|
| 1006 |
-
|
| 1007 |
def clean_llm_base_list(model_list):
|
| 1008 |
"""
|
| 1009 |
Cleans a list of model strings by keeping only the text after the last '/'.
|
|
|
|
| 128 |
return fig
|
| 129 |
|
| 130 |
|
| 131 |
+
def get_company_from_model(model_name: str) -> dict:
|
| 132 |
"""
|
| 133 |
Gets the company info (logo path and name) from a model name.
|
| 134 |
Returns default unknown logo if no match found.
|
|
|
|
| 696 |
|
| 697 |
for _, row in data_plot.iterrows():
|
| 698 |
model_name = row.get('Language Model', '')
|
| 699 |
+
company_info = get_company_from_model(model_name)
|
| 700 |
logo_path = company_info['path']
|
| 701 |
|
| 702 |
# Read the SVG file and encode as base64 data URI
|
|
|
|
| 994 |
return pd.DataFrame(pareto_points)
|
| 995 |
|
| 996 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
def clean_llm_base_list(model_list):
|
| 998 |
"""
|
| 999 |
Cleans a list of model strings by keeping only the text after the last '/'.
|
|
@@ -1,474 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import typing
|
| 3 |
-
|
| 4 |
-
import matplotlib
|
| 5 |
-
matplotlib.use('Agg')
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import shutil
|
| 9 |
-
import tarfile
|
| 10 |
-
from datetime import datetime, timedelta, timezone
|
| 11 |
-
from email.utils import parseaddr
|
| 12 |
-
|
| 13 |
-
import gradio as gr
|
| 14 |
-
import requests
|
| 15 |
-
from huggingface_hub import HfApi
|
| 16 |
-
|
| 17 |
-
import aliases
|
| 18 |
-
from submission_utils import (
|
| 19 |
-
SUBMISSION_METADATA_FILENAME,
|
| 20 |
-
SubmissionMetadata,
|
| 21 |
-
sanitize_path_component,
|
| 22 |
-
_validate_path_component
|
| 23 |
-
)
|
| 24 |
-
from config import (
|
| 25 |
-
CONFIG_NAME,
|
| 26 |
-
CONTACT_DATASET,
|
| 27 |
-
EXTRACTED_DATA_DIR,
|
| 28 |
-
RESULTS_DATASET,
|
| 29 |
-
SUBMISSION_DATASET,
|
| 30 |
-
)
|
| 31 |
-
from content import (
|
| 32 |
-
CITATION_BUTTON_LABEL,
|
| 33 |
-
CITATION_BUTTON_TEXT,
|
| 34 |
-
LEGAL_DISCLAIMER_TEXT,
|
| 35 |
-
SUBMISSION_CONFIRMATION,
|
| 36 |
-
format_error,
|
| 37 |
-
format_log,
|
| 38 |
-
format_warning,
|
| 39 |
-
)
|
| 40 |
-
from ui_components import build_openness_tooltip_content, build_tooling_tooltip_content
|
| 41 |
-
|
| 42 |
-
# Simple stubs for dataset functionality (not using HF datasets)
|
| 43 |
-
class DatasetDict(dict):
|
| 44 |
-
"""Simple stub for datasets.DatasetDict."""
|
| 45 |
-
pass
|
| 46 |
-
|
| 47 |
-
class Dataset:
|
| 48 |
-
"""Simple stub for datasets.Dataset."""
|
| 49 |
-
@staticmethod
|
| 50 |
-
def from_list(data):
|
| 51 |
-
return Dataset()
|
| 52 |
-
|
| 53 |
-
def load_dataset(*args, **kwargs):
|
| 54 |
-
"""Simple stub for datasets.load_dataset."""
|
| 55 |
-
return DatasetDict()
|
| 56 |
-
|
| 57 |
-
class EmptyDatasetError(Exception):
|
| 58 |
-
"""Simple stub for datasets exception."""
|
| 59 |
-
pass
|
| 60 |
-
|
| 61 |
-
class DataFilesNotFoundError(Exception):
|
| 62 |
-
"""Simple stub for datasets exception."""
|
| 63 |
-
pass
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
logger = logging.getLogger(__name__)
|
| 67 |
-
logger.setLevel(logging.DEBUG)
|
| 68 |
-
|
| 69 |
-
api = HfApi()
|
| 70 |
-
MAX_UPLOAD_BYTES = 5e9
|
| 71 |
-
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
| 72 |
-
|
| 73 |
-
# --- Submission Logic (largely unchanged from original, ensure LeaderboardSubmission and other deps are fine) ---
|
| 74 |
-
def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
|
| 75 |
-
try:
|
| 76 |
-
return load_dataset(*args, **kwargs)
|
| 77 |
-
except EmptyDatasetError:
|
| 78 |
-
return DatasetDict()
|
| 79 |
-
except ValueError: # Handles cases where dataset is empty or ill-formed
|
| 80 |
-
return DatasetDict()
|
| 81 |
-
except DataFilesNotFoundError:
|
| 82 |
-
return DatasetDict()
|
| 83 |
-
|
| 84 |
-
def upload_submission(
|
| 85 |
-
folder_path: str,
|
| 86 |
-
split: str,
|
| 87 |
-
submission_name: str,
|
| 88 |
-
hf_username: str,
|
| 89 |
-
) -> str:
|
| 90 |
-
total = 0
|
| 91 |
-
for root, _, files in os.walk(folder_path):
|
| 92 |
-
for f_ul in files:
|
| 93 |
-
total += os.path.getsize(os.path.join(root, f_ul))
|
| 94 |
-
if total > MAX_UPLOAD_BYTES:
|
| 95 |
-
raise ValueError(
|
| 96 |
-
f"Upload too large: exceeds {MAX_UPLOAD_BYTES // 1000000} MB limit."
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
# This is a copy of agenteval.upload.upload_folder_to_hf so we can use other api params.
|
| 100 |
-
# TODO in agenteval: When you mildly wrap another library call, always pass *args, **kwargs.
|
| 101 |
-
_validate_path_component(CONFIG_NAME, "config_name")
|
| 102 |
-
_validate_path_component(split, "split")
|
| 103 |
-
_validate_path_component(submission_name, "submission_name")
|
| 104 |
-
dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
|
| 105 |
-
logger.info(f"Uploading dataset {dataset_url}")
|
| 106 |
-
api.upload_folder(
|
| 107 |
-
folder_path=folder_path,
|
| 108 |
-
path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
|
| 109 |
-
repo_id=SUBMISSION_DATASET,
|
| 110 |
-
repo_type="dataset",
|
| 111 |
-
# Reminder: This may be going into a public dataset.
|
| 112 |
-
# Don't put private information in commit message such as email.
|
| 113 |
-
commit_message=f'Submission from hf user "{hf_username}" to "{dataset_url}"',
|
| 114 |
-
)
|
| 115 |
-
return dataset_url
|
| 116 |
-
|
| 117 |
-
def show_loading_spinner():
|
| 118 |
-
return gr.update(visible=True)
|
| 119 |
-
|
| 120 |
-
def add_new_eval(
|
| 121 |
-
val_or_test: str,
|
| 122 |
-
agent_name: str | None,
|
| 123 |
-
agent_description: str,
|
| 124 |
-
agent_url: str,
|
| 125 |
-
openness: str | None,
|
| 126 |
-
degree_of_control: str | None,
|
| 127 |
-
path_to_file: typing.IO | None,
|
| 128 |
-
username: str,
|
| 129 |
-
role: str,
|
| 130 |
-
email: str,
|
| 131 |
-
email_opt_in: bool,
|
| 132 |
-
profile: gr.OAuthProfile,
|
| 133 |
-
):
|
| 134 |
-
if not agent_name:
|
| 135 |
-
return (
|
| 136 |
-
format_warning("Please provide an agent name."), # error_message
|
| 137 |
-
gr.update(visible=True), # error_modal
|
| 138 |
-
gr.update(visible=False), # success_modal
|
| 139 |
-
gr.update(visible=False) # loading_modal
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
-
if path_to_file is None:
|
| 143 |
-
return (
|
| 144 |
-
format_warning("Please attach a .tar.gz file."), # error_message
|
| 145 |
-
gr.update(visible=True), # error_modal
|
| 146 |
-
gr.update(visible=False), # success_modal
|
| 147 |
-
gr.update(visible=False) # loading_modal
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
logger.info(f"agent {agent_name}: Checking submission")
|
| 151 |
-
|
| 152 |
-
# Load current eval_results for submission checks
|
| 153 |
-
# This is a bit redundant if display part reloads it, but submission needs its own consistent view
|
| 154 |
-
current_eval_results_for_submission = try_load_dataset_submission(
|
| 155 |
-
RESULTS_DATASET,
|
| 156 |
-
CONFIG_NAME,
|
| 157 |
-
download_mode="force_redownload", # Or a less aggressive mode
|
| 158 |
-
verification_mode=VerificationMode.NO_CHECKS,
|
| 159 |
-
)
|
| 160 |
-
|
| 161 |
-
submission_time = datetime.now(timezone.utc)
|
| 162 |
-
if not username or username.strip() == "":
|
| 163 |
-
username = profile.username # Default to HF username
|
| 164 |
-
|
| 165 |
-
logger.debug(f"agent {agent_name}: User account age check {profile.username}")
|
| 166 |
-
try:
|
| 167 |
-
# Account age check disabled for launch.
|
| 168 |
-
# https://github.com/allenai/astabench-issues/issues/419
|
| 169 |
-
# if _is_hf_acct_too_new(submission_time, profile.username):
|
| 170 |
-
# return (
|
| 171 |
-
# format_error("This account is not authorized to submit here (account too new)."), # error_message
|
| 172 |
-
# gr.update(visible=True), # error_modal
|
| 173 |
-
# gr.update(visible=False), # success_modal
|
| 174 |
-
# gr.update(visible=False) # loading_modal
|
| 175 |
-
# )
|
| 176 |
-
pass
|
| 177 |
-
except Exception as e:
|
| 178 |
-
logger.warning(f"Error checking user account age: {e}")
|
| 179 |
-
return (
|
| 180 |
-
format_error("Could not verify account age. Please try again later."), # error_message
|
| 181 |
-
gr.update(visible=True), # error_modal
|
| 182 |
-
gr.update(visible=False), # success_modal
|
| 183 |
-
gr.update(visible=False) # loading_modal
|
| 184 |
-
)
|
| 185 |
-
|
| 186 |
-
logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
|
| 187 |
-
contact_infos = try_load_dataset_submission(
|
| 188 |
-
CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
|
| 189 |
-
verification_mode=VerificationMode.NO_CHECKS
|
| 190 |
-
)
|
| 191 |
-
if _is_last_submission_too_recent(
|
| 192 |
-
contact_rows=contact_infos.get(val_or_test, []),
|
| 193 |
-
username=profile.username,
|
| 194 |
-
submission_time=submission_time,
|
| 195 |
-
):
|
| 196 |
-
logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
|
| 197 |
-
return (
|
| 198 |
-
format_error("You already submitted once in the last 24h for this split; please try again later."), # error_message
|
| 199 |
-
gr.update(visible=True), # error_modal
|
| 200 |
-
gr.update(visible=False), # success_modal
|
| 201 |
-
gr.update(visible=False) # loading_modal
|
| 202 |
-
)
|
| 203 |
-
|
| 204 |
-
logger.debug(f"agent {agent_name}: Email validation {email}")
|
| 205 |
-
_, parsed_mail = parseaddr(email)
|
| 206 |
-
if "@" not in parsed_mail:
|
| 207 |
-
return (
|
| 208 |
-
format_warning("Please provide a valid email address."), # error_message
|
| 209 |
-
gr.update(visible=True), # error_modal
|
| 210 |
-
gr.update(visible=False), # success_modal
|
| 211 |
-
gr.update(visible=False) # loading_modal
|
| 212 |
-
)
|
| 213 |
-
|
| 214 |
-
logger.debug(f"agent {agent_name}: Duplicate submission check")
|
| 215 |
-
if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
|
| 216 |
-
existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
|
| 217 |
-
for sub_item in existing_submissions:
|
| 218 |
-
if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
|
| 219 |
-
sub_item.get("username", "").lower() == username.lower()):
|
| 220 |
-
return (
|
| 221 |
-
format_warning("This agent name by this user has already been submitted to this split."), # error_message
|
| 222 |
-
gr.update(visible=True), # error_modal
|
| 223 |
-
gr.update(visible=False), # success_modal
|
| 224 |
-
gr.update(visible=False) # loading_modal
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
safe_username = sanitize_path_component(username)
|
| 228 |
-
safe_agent_name = sanitize_path_component(agent_name)
|
| 229 |
-
extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
|
| 230 |
-
|
| 231 |
-
logger.debug(f"agent {agent_name}: File extraction to {extracted_dir}")
|
| 232 |
-
try:
|
| 233 |
-
if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
|
| 234 |
-
os.makedirs(extracted_dir, exist_ok=True)
|
| 235 |
-
with tarfile.open(path_to_file.name, "r:gz") as tar:
|
| 236 |
-
members_extracted = 0
|
| 237 |
-
for member in tar.getmembers():
|
| 238 |
-
if not member.isreg(): continue
|
| 239 |
-
fname = os.path.basename(member.name)
|
| 240 |
-
if not fname or fname.startswith("."): continue
|
| 241 |
-
fobj = tar.extractfile(member)
|
| 242 |
-
if not fobj: continue
|
| 243 |
-
with open(os.path.join(extracted_dir, fname), "wb") as out:
|
| 244 |
-
out.write(fobj.read())
|
| 245 |
-
members_extracted +=1
|
| 246 |
-
if members_extracted == 0:
|
| 247 |
-
return (
|
| 248 |
-
format_error("Submission tarball is empty or contains no valid files."), # error_message
|
| 249 |
-
gr.update(visible=True), # error_modal
|
| 250 |
-
gr.update(visible=False), # success_modal
|
| 251 |
-
gr.update(visible=False) # loading_modal
|
| 252 |
-
)
|
| 253 |
-
except Exception as e:
|
| 254 |
-
return (
|
| 255 |
-
format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz."), # error_message
|
| 256 |
-
gr.update(visible=True), # error_modal
|
| 257 |
-
gr.update(visible=False), # success_modal
|
| 258 |
-
gr.update(visible=False) # loading_modal
|
| 259 |
-
)
|
| 260 |
-
|
| 261 |
-
submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
|
| 262 |
-
|
| 263 |
-
logger.debug(f"agent {agent_name}: Generate submission.json")
|
| 264 |
-
subm_meta = SubmissionMetadata(
|
| 265 |
-
agent_name=agent_name,
|
| 266 |
-
agent_description=agent_description,
|
| 267 |
-
agent_url=agent_url,
|
| 268 |
-
openness=openness,
|
| 269 |
-
tool_usage=degree_of_control,
|
| 270 |
-
username=username,
|
| 271 |
-
submit_time=submission_time,
|
| 272 |
-
)
|
| 273 |
-
with open(os.path.join(extracted_dir, SUBMISSION_METADATA_FILENAME), "w", encoding="utf-8") as fp:
|
| 274 |
-
fp.write(subm_meta.model_dump_json(indent=2))
|
| 275 |
-
|
| 276 |
-
logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
|
| 277 |
-
try:
|
| 278 |
-
dataset_url = upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
|
| 279 |
-
except ValueError as e:
|
| 280 |
-
return (
|
| 281 |
-
format_error(str(e)), # error_message
|
| 282 |
-
gr.update(visible=True), # error_modal
|
| 283 |
-
gr.update(visible=False), # success_modal
|
| 284 |
-
gr.update(visible=False) # loading_modal
|
| 285 |
-
)
|
| 286 |
-
except Exception as e:
|
| 287 |
-
return (
|
| 288 |
-
format_error(f"Failed to upload raw submission: {e}"), # error_message
|
| 289 |
-
gr.update(visible=True), # error_modal
|
| 290 |
-
gr.update(visible=False), # success_modal
|
| 291 |
-
gr.update(visible=False) # loading_modal
|
| 292 |
-
)
|
| 293 |
-
|
| 294 |
-
logger.info(f"agent {agent_name}: Save contact information")
|
| 295 |
-
contact_info = subm_meta.model_dump()
|
| 296 |
-
contact_info["username_auth"] = profile.username
|
| 297 |
-
contact_info["email"] = email
|
| 298 |
-
contact_info["email_opt_in"] = email_opt_in
|
| 299 |
-
contact_info["role"] = role
|
| 300 |
-
contact_info["dataset_url"] = dataset_url
|
| 301 |
-
|
| 302 |
-
logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
|
| 303 |
-
if val_or_test in contact_infos:
|
| 304 |
-
contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
|
| 305 |
-
else:
|
| 306 |
-
contact_infos[val_or_test] = Dataset.from_list([contact_info])
|
| 307 |
-
|
| 308 |
-
try:
|
| 309 |
-
contact_infos.push_to_hub(
|
| 310 |
-
repo_id=CONTACT_DATASET,
|
| 311 |
-
config_name=CONFIG_NAME,
|
| 312 |
-
commit_message=f'Submission from hf user "{profile.username}" to "{dataset_url}"',
|
| 313 |
-
)
|
| 314 |
-
except Exception as e:
|
| 315 |
-
return (
|
| 316 |
-
format_error(f"Submission recorded, but contact info failed to save: {e}"), # error_message
|
| 317 |
-
gr.update(visible=True), # error_modal
|
| 318 |
-
gr.update(visible=False), # success_modal
|
| 319 |
-
gr.update(visible=False) # loading_modal
|
| 320 |
-
)
|
| 321 |
-
|
| 322 |
-
logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
|
| 323 |
-
return (
|
| 324 |
-
"", # message
|
| 325 |
-
gr.update(visible=False), # error_modal
|
| 326 |
-
gr.update(visible=True), # success_modal
|
| 327 |
-
gr.update(visible=False) # loading_modal
|
| 328 |
-
)
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
def _is_hf_acct_too_new(submission_time: datetime, username: str):
|
| 332 |
-
user_data_resp = requests.get(f"https://huggingface.co/api/users/{username}/overview")
|
| 333 |
-
user_data_resp.raise_for_status()
|
| 334 |
-
creation_date_str = user_data_resp.json()["createdAt"]
|
| 335 |
-
created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
|
| 336 |
-
return submission_time - created_at < timedelta(days=60)
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
def _is_last_submission_too_recent(contact_rows, username, submission_time):
    """Report whether ``username`` already submitted within the last 24 hours.

    Args:
        contact_rows: Iterable of mappings that provide the keys
            ``"username_auth"`` and ``"submit_time"``.
        username: Authenticated username whose earlier submissions are checked.
        submission_time: Time of the current submission; it must support
            subtraction with the stored ``submit_time`` values.

    Returns:
        True if the user's most recent earlier submission is less than one day
        before ``submission_time``; False otherwise, including when the user
        has no earlier submissions.
    """
    # Only the newest submission matters, so take the maximum instead of
    # sorting every matching row.
    latest_submission = max(
        (row["submit_time"] for row in contact_rows if row["username_auth"] == username),
        default=None,
    )
    if latest_submission is None:
        # Always return a real bool (the previous version leaked an empty list here).
        return False
    return submission_time - latest_submission < timedelta(days=1)
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
# Label markup for the "Agent Openness" radio group on the submission form.
# The tooltip part is generated once, at import time, by
# build_openness_tooltip_content().
openness_label_html = f"""<div>
<b>Agent Openness</b>
{build_openness_tooltip_content()}
</div>"""


# Label markup for the "Agent Tooling" radio group; the tooltip part comes from
# build_tooling_tooltip_content(), also evaluated at import time.
agent_tooling_label_html = f"""<div>
<b>Agent Tooling</b>
{build_tooling_tooltip_content()}
</div>"""


# Introductory heading and instructions rendered at the top of the submission
# page (passed to gr.HTML in build_page).
heading_html = """
<h2>🚀 Submit an agent for evaluation</h2>
<p>Submit your agent to OpenHands Index for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>
<h3>How to run an evaluation</h3>
<p>Please follow the steps in our <a href="https://github.com/allenai/asta-bench?tab=readme-ov-file#usage" target="_blank">README</a>. You’ll upload your run file at the end of this form.</p>
"""
|
| 364 |
-
|
| 365 |
-
# --- Submission Accordion ---
|
| 366 |
-
def build_page():
    """Build the agent submission page and wire up its events.

    Layout: a heading, a login button, a "Submitter Information" group, an
    "Agent Information" group (including the run-file upload), a submit button,
    and four modals (loading, error, success, legal disclaimer).

    Event flow: clicking the submit button only shows the disclaimer modal.
    Clicking the agree button hides the disclaimer and shows the loading modal,
    then calls ``add_new_eval`` with the form values; its four return values
    update the error message and the error, success and loading modals.

    The function returns nothing; it is called for its side effect of creating
    Gradio components in the enclosing Blocks context.

    NOTE(review): the nesting of the ``with`` blocks was reconstructed from a
    copy that had lost its indentation - confirm against the repository.
    """
    with gr.Column(elem_id="submission-page-container"):
        gr.HTML(heading_html)
        gr.LoginButton()
        with gr.Group(elem_classes="custom-form-group"):
            gr.HTML(value="""<h2>Submitter Information</h2>""", elem_id="submitter-info-label-html")
            gr.HTML(value="""<h3>Username</h3>""", elem_classes="form-label")
            username_tb = gr.Textbox(label="This will show on the leaderboard. By default, we’ll use your Hugging Face username; but you can enter your organization name instead (e.g., university, company, or lab).")
            gr.HTML(value="""<h3>Role</h3>""", elem_classes="form-label")
            role = gr.Dropdown(label="Please select the role that most closely matches your current position. Helps us improve OpenHands Index for different user types. Not displayed on the leaderboard.",
                               interactive=True,
                               choices=[
                                   "Undergraduate Student",
                                   "Masters Student",
                                   "PhD Student",
                                   "Postdoctoral Researcher",
                                   "Academic Faculty (e.g., Professor, Lecturer)",
                                   "Industry Researcher (e.g., Research Scientist, Applied Scientist)",
                                   "Engineer or Developer (e.g., Software or ML Engineer)",
                                   "Data Scientist or Analyst",
                                   "Product or Program Manager",
                                   "Startup Founder or Independent Researcher",
                                   "Other"
                               ])
            gr.HTML(value="""<h3>Contact email</h3>""", elem_classes="form-label")
            mail_tb = gr.Textbox(label="We'll only use your email to communicate about your submission.")
            mail_opt_in = gr.Checkbox(label="I’m open to being contacted by email for user research studies or feedback opportunities.")
        with gr.Group(elem_classes="custom-form-group"):
            gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
            gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
            # Choices are (display label, submitted value) pairs; "validation" is preselected.
            level_of_test_radio = gr.Radio(choices=[
                ("Test set", "test"),
                ("Validation set", "validation"),
            ], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
            gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
            agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
            gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
            agent_desc_tb = gr.Textbox(label="Briefly describe your agent’s approach, core strategies, or what makes it distinct. This description may appear on the leaderboard.")
            gr.HTML(value="""<h3>URL</h3>""", elem_classes="form-label")
            agent_url_tb = gr.Textbox(label="Link to more information about your agent (e.g. GitHub repo, blog post, or website). This optional link may be shown on the leaderboard to let others explore your agent in more depth.")
            gr.HTML(value=openness_label_html, elem_classes="form-label")
            # value=None leaves the radio groups unselected until the user picks an option.
            openness_radio = gr.Radio([aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS, aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS, aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE, aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY], elem_classes="form-label-fieldset", value=None, label="This affects how your submission is categorized on the leaderboard. Choose based on the availability of your code, model weights, or APIs.")
            gr.HTML(value=agent_tooling_label_html, elem_classes="form-label")
            degree_of_control_radio = gr.Radio([aliases.CANONICAL_TOOL_USAGE_STANDARD, aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE, aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM], elem_classes="form-label-fieldset",value=None, label="Choose based on the tools and the execution environment your agent used during evaluation.")
            gr.HTML(value="""<h3>Submission file</h3>""", elem_classes="form-label")
            gr.HTML("<div id='submission-file-label'>Upload your run as a .tar.gz archive prepared using the steps in the <a href='https://github.com/allenai/asta-bench?tab=readme-ov-file#submitting-to-the-leaderboard' target='_blank'>README</a> (“Submitting to the Leaderboard”).</div>")
            file_upload_comp = gr.File(
                show_label=False,
                file_types=[".gz", ".tar.gz"],
            )
        submit_eval_button = gr.Button("Submit Evaluation", elem_id="submission-button")
        # Modals for loading spinner, success and error messages
        # NOTE(review): three modals share elem_id="submission-modal" and three
        # columns share elem_id="submission-modal-content" - confirm the
        # duplicate ids are intentional (e.g. for shared CSS).
        with Modal(visible=False, elem_id="submission-modal") as loading_modal:
            with gr.Column(elem_id="submission-modal-content"):
                gr.HTML('<div class="spinner-container"><div class="spinner"></div><p>Processing your submission...</p></div>')

        with Modal(visible=False, elem_id="submission-modal") as error_modal:
            with gr.Column(elem_id="submission-modal-content"):
                gr.Markdown("## ⚠️ Error")
                # Filled with the first value returned by add_new_eval.
                error_message = gr.Markdown()

        with Modal(visible=False, elem_id="success-modal") as success_modal:
            with gr.Column(elem_id="submission-modal-content"):
                gr.Markdown(SUBMISSION_CONFIRMATION)
        with Modal(visible=False, elem_id="submission-modal") as disclaimer_modal:
            with gr.Column(elem_id="legal-modal-content"):
                gr.HTML(LEGAL_DISCLAIMER_TEXT)
                with gr.Row():
                    agree_button = gr.Button("I agree to the terms and conditions above", variant="primary")

        def accept_and_load():
            """Hide the disclaimer modal and show the loading modal."""
            return [
                gr.update(visible=False),  # Hide disclaimer_modal
                gr.update(visible=True)  # Show loading_modal
            ]

        def show_disclaimer():
            """Make the disclaimer modal visible."""
            return gr.update(visible=True)

        # Step 1: the submit button only opens the legal disclaimer.
        submit_eval_button.click(
            fn=show_disclaimer,
            inputs=None,
            outputs=[disclaimer_modal]
        )

        # Step 2: agreeing swaps the disclaimer for the spinner, then runs the
        # actual submission. The inputs are listed in the positional order in
        # which add_new_eval receives them.
        # NOTE(review): add_new_eval also takes a `profile` argument that is not
        # listed here - presumably injected by Gradio's OAuth support; confirm.
        agree_button.click(
            fn=accept_and_load,
            inputs=None,
            outputs=[disclaimer_modal, loading_modal],
        ).then(
            fn=add_new_eval,
            inputs=[
                level_of_test_radio,
                agent_name_tb,
                agent_desc_tb,
                agent_url_tb,
                openness_radio,
                degree_of_control_radio,
                file_upload_comp,
                username_tb,
                role,
                mail_tb,
                mail_opt_in
            ],
            outputs=[error_message, error_modal, success_modal, loading_modal],
        )
        # hiding this for now till we have the real paper data
        # with gr.Accordion("📙 Citation", open=False):
        # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,88 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Utilities for submission handling, replacing agent-eval dependencies.
|
| 3 |
-
"""
|
| 4 |
-
import re
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Optional
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
# Constants
|
| 10 |
-
SUBMISSION_METADATA_FILENAME = "metadata.json"
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# Simple SubmissionMetadata class
|
| 14 |
-
class SubmissionMetadata:
    """Lightweight holder for the descriptive fields of a submission.

    Every field is optional and falls back to an empty string. Keyword
    arguments with any other name are silently ignored.
    """

    # Stored field names, in the order in which to_dict() emits them.
    _FIELD_NAMES = (
        "agent_name",
        "llm_base",
        "openness",
        "tool_usage",
        "submitter_name",
        "submitter_email",
    )

    def __init__(self, **kwargs):
        """Populate each known field from ``kwargs``, defaulting to ``""``."""
        for field_name in self._FIELD_NAMES:
            setattr(self, field_name, kwargs.get(field_name, ""))

    def to_dict(self):
        """Return the fields as a plain ``dict`` keyed by field name."""
        return {field_name: getattr(self, field_name) for field_name in self._FIELD_NAMES}
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# Path validation functions
|
| 36 |
-
def _validate_path_component(component: str, allow_underscores: bool = True) -> None:
    """
    Validate a single path component.

    A valid component is non-empty, is not "." or "..", and consists solely of
    ASCII letters, digits, hyphens, dots and (optionally) underscores.

    Args:
        component: The path component to validate
        allow_underscores: Whether to allow underscores in the component

    Raises:
        ValueError: If the component is invalid
    """
    if not component:
        raise ValueError("Path component cannot be empty")

    if component in (".", ".."):
        raise ValueError(f"Path component cannot be '{component}'")

    # Check for invalid characters. fullmatch is required here: with re.match,
    # a trailing "$" also matches just before a final newline, so "name\n"
    # would have been accepted.
    pattern = r'[a-zA-Z0-9_\-\.]+' if allow_underscores else r'[a-zA-Z0-9\-\.]+'
    if not re.fullmatch(pattern, component):
        raise ValueError(
            f"Path component '{component}' contains invalid characters. "
            f"Only alphanumeric, hyphens, dots{', and underscores' if allow_underscores else ''} are allowed."
        )
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def sanitize_path_component(component: str, replacement: str = "_") -> str:
    """
    Sanitize a path component by replacing invalid characters.

    Characters other than ASCII letters, digits, underscores, hyphens and dots
    are replaced, leading/trailing dots and hyphens are removed, and runs of
    the replacement are collapsed to a single occurrence.

    Args:
        component: The path component to sanitize
        replacement: The string to use for replacing invalid characters; an
            empty string simply drops them

    Returns:
        Sanitized path component, or "unnamed" if nothing usable remains
    """
    if not component:
        return "unnamed"

    # Replace any non-alphanumeric, non-hyphen, non-dot, non-underscore with replacement
    sanitized = re.sub(r'[^a-zA-Z0-9_\-\.]', replacement, component)

    # Remove leading/trailing dots or hyphens (this also rules out "." and "..")
    sanitized = sanitized.strip('.-')

    # Collapse multiple replacements into one. Skipped for an empty replacement
    # (the pattern "+" would not compile); the group makes a multi-character
    # replacement repeat as a whole rather than only its last character.
    if replacement:
        sanitized = re.sub(f'(?:{re.escape(replacement)})+', replacement, sanitized)

    return sanitized or "unnamed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,108 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
|
| 5 |
-
import gradio
|
| 6 |
-
import pytest
|
| 7 |
-
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
-
|
| 9 |
-
from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
|
| 10 |
-
from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
|
| 11 |
-
from submission import add_new_eval
|
| 12 |
-
from submission_utils import SubmissionMetadata
|
| 13 |
-
|
| 14 |
-
_hf = HfApi()
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
class TestSubmission:
    """Integration test for ``submission.add_new_eval``.

    The test submits a real archive and then inspects the contact and
    submission datasets through the Hugging Face Hub API, so it needs network
    access and suitable credentials.

    NOTE(review): indentation was reconstructed from a flattened copy - confirm
    which statements belong inside the ``with`` blocks.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Check that the process was configured for a CI run before each test."""
        # These need to be set before imports are evaluated so all we can do here
        # is check that they have been set correctly.
        assert IS_INTERNAL == True
        assert CONFIG_NAME == "continuous-integration"

    def test_add_new_eval(self, mocker):
        """Submit a test archive and verify the resulting Hub records.

        Steps: call ``add_new_eval``, check the returned modal updates, locate
        the new contact row by its unique description, confirm that a commit in
        the submission dataset mentions the dataset URL, confirm the uploaded
        files exist, and compare ``submission.json`` with the contact row.

        NOTE(review): ``mocker`` is presumably the pytest-mock fixture - confirm
        the plugin is a test dependency.
        """
        # Bypass some checks so that the test can cover later parts of the code.
        mocker.patch("submission._is_hf_acct_too_new", return_value=False)
        mocker.patch("submission._is_last_submission_too_recent", return_value=False)

        # We use this to find records corresponding to this test.
        agent_description = f"CI run at {datetime.now().isoformat()}"
        print(f"Using unique agent description: {agent_description}")

        print("Submitting test submission...")
        with open(os.path.join(os.path.dirname(__file__), "test-submission.tar.gz"), "rb") as f:
            result = add_new_eval(
                val_or_test="test",
                agent_name="TestSubmissionIntegration",
                agent_description=agent_description,
                agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py",
                openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY,
                degree_of_control=CANONICAL_TOOL_USAGE_STANDARD,
                path_to_file=f,
                username="test_user",
                role="Other",
                email="jasond+asta_testing@allenai.org",
                email_opt_in=True,
                profile=gradio.OAuthProfile({
                    "name": "Test User",
                    "preferred_username": "test_user",
                    "profile": "test_user_profile",
                    "picture": "https://placecats.com/150/150",
                }),
            )

        # add_new_eval returns (message, error_modal, success_modal, loading_modal).
        message, error_modal, success_modal, loading_modal = result
        assert message == ""  # Success
        assert error_modal == {'__type__': 'update', 'visible': False}
        assert success_modal == {'__type__': 'update', 'visible': True}
        assert loading_modal == {'__type__': 'update', 'visible': False}

        print("Looking up contact record...")
        # NOTE(review): load_dataset and VerificationMode are not among the
        # imports visible at the top of this file - confirm they are imported
        # (they look like names from the `datasets` package).
        contacts = load_dataset(path=CONTACT_DATASET,
                                name=CONFIG_NAME,
                                download_mode="force_redownload",
                                verification_mode=VerificationMode.NO_CHECKS)
        # There should have been a new entry due to this test with our unique description.
        # next() without a default raises StopIteration if no row matches.
        found_contact = next(row for row in contacts['test'] if row['agent_description'] == agent_description)
        assert found_contact

        # This contains an attribute that should lead us to files in the submissions dataset.
        dataset_url = found_contact['dataset_url']
        print(f"Found dataset URL: {dataset_url}")
        assert dataset_url.startswith(
            "hf://datasets/allenai/asta-bench-internal-submissions/continuous-integration/test/")

        print("Checking submission dataset...")
        # Commit message itself should link this and the contact record together unambiguously.
        recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET)
        assert any(dataset_url in c.title for c in recent_commits)

        print("Checking that files are present...")
        # Strip the repository prefix to get the path inside the dataset repo.
        rel_path = dataset_url[len("hf://datasets/allenai/asta-bench-internal-submissions/"):]
        ds_info = _hf.dataset_info(SUBMISSION_DATASET)
        # These are the files in our test-submission.tar.gz
        assert any(f"{rel_path}/eval_config.json" == f.rfilename for f in ds_info.siblings)
        assert any(f"{rel_path}/task_sqa_solver_openscilm.eval" == f.rfilename for f in ds_info.siblings)
        # This is the generated metadata put into the dataset itself.
        assert any(f"{rel_path}/submission.json" == f.rfilename for f in ds_info.siblings)

        print("Checking contact record against submission.json...")
        # Checks on contact record which is stored in a private dataset.
        local_path = hf_hub_download(repo_type="dataset",
                                     repo_id=SUBMISSION_DATASET,
                                     filename=f"{rel_path}/submission.json")
        with open(local_path) as f:
            contact_from_json = json.load(f)
        # Assert that all keys and values in submission.json are present in the contact record
        for key, value_from_json in contact_from_json.items():
            value_from_dataset = found_contact[key]
            if isinstance(value_from_dataset, datetime):
                # Normalise to the "...Z" spelling so it can be compared with the JSON string.
                value_from_dataset = found_contact[key].isoformat().replace('+00:00', 'Z')
            assert value_from_dataset == value_from_json
        # submission.json should not contain sensitive PII, specifically, email.
        assert 'email' in found_contact
        assert 'email' not in contact_from_json
        # submission.json is defined by a specific data model.
        # NOTE(review): model_validate looks like a pydantic classmethod - confirm
        # that the SubmissionMetadata imported from submission_utils provides it.
        SubmissionMetadata.model_validate(contact_from_json)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -20,6 +20,8 @@ from leaderboard_transformer import (
|
|
| 20 |
format_runtime_column,
|
| 21 |
get_pareto_df,
|
| 22 |
clean_llm_base_list,
|
|
|
|
|
|
|
| 23 |
)
|
| 24 |
from config import (
|
| 25 |
CONFIG_NAME,
|
|
@@ -39,56 +41,6 @@ from content import (
|
|
| 39 |
api = HfApi()
|
| 40 |
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
| 41 |
|
| 42 |
-
# Company logo mapping - maps model name patterns to company logo files
# Keys are lowercase substrings looked for in the lowercased model name; each
# value carries the logo asset path and the company's display name.
# get_company_from_model() iterates this dict in insertion order and returns
# the first pattern that matches, so entry order decides ties.
COMPANY_LOGO_MAP = {
    "anthropic": {"path": "assets/logo-anthropic.svg", "name": "Anthropic"},
    "claude": {"path": "assets/logo-anthropic.svg", "name": "Anthropic"},
    "openai": {"path": "assets/logo-openai.svg", "name": "OpenAI"},
    "gpt": {"path": "assets/logo-openai.svg", "name": "OpenAI"},
    "o1": {"path": "assets/logo-openai.svg", "name": "OpenAI"},
    "o3": {"path": "assets/logo-openai.svg", "name": "OpenAI"},
    "google": {"path": "assets/logo-google.svg", "name": "Google"},
    "gemini": {"path": "assets/logo-google.svg", "name": "Google"},
    "gemma": {"path": "assets/logo-google.svg", "name": "Google"},
    "meta": {"path": "assets/logo-meta.svg", "name": "Meta"},
    "llama": {"path": "assets/logo-meta.svg", "name": "Meta"},
    "mistral": {"path": "assets/logo-mistral.svg", "name": "Mistral"},
    "mixtral": {"path": "assets/logo-mistral.svg", "name": "Mistral"},
    "codestral": {"path": "assets/logo-mistral.svg", "name": "Mistral"},
    "deepseek": {"path": "assets/logo-deepseek.svg", "name": "DeepSeek"},
    "xai": {"path": "assets/logo-xai.svg", "name": "xAI"},
    "grok": {"path": "assets/logo-xai.svg", "name": "xAI"},
    "cohere": {"path": "assets/logo-cohere.svg", "name": "Cohere"},
    "command": {"path": "assets/logo-cohere.svg", "name": "Cohere"},
    "qwen": {"path": "assets/logo-qwen.svg", "name": "Qwen"},
    "alibaba": {"path": "assets/logo-qwen.svg", "name": "Qwen"},
    "kimi": {"path": "assets/logo-moonshot.svg", "name": "Moonshot"},
    "moonshot": {"path": "assets/logo-moonshot.svg", "name": "Moonshot"},
    "minimax": {"path": "assets/logo-minimax.svg", "name": "MiniMax"},
}
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def get_company_from_model(model_name: str) -> dict:
    """
    Look up the company info (logo path and display name) for a model.

    ``model_name`` may be a single name or a list of names; for a list only
    the first entry is considered. The name is lowercased and checked against
    each pattern in COMPANY_LOGO_MAP, in the map's order; the first pattern
    contained in the name wins. A missing name or no match yields the
    "Unknown" placeholder entry.
    """
    def unknown_company() -> dict:
        # Build a fresh dict on every call so callers never share one instance.
        return {"path": "assets/logo-unknown.svg", "name": "Unknown"}

    if not model_name:
        return unknown_company()

    # Handle list of models - use the first one
    primary = model_name[0] if isinstance(model_name, list) else model_name
    haystack = str(primary).lower()

    # First pattern (in map order) that occurs in the name, if any
    matched_info = next(
        (info for needle, info in COMPANY_LOGO_MAP.items() if needle in haystack),
        None,
    )
    if matched_info is None:
        return unknown_company()
    return matched_info
|
| 91 |
-
|
| 92 |
|
| 93 |
def get_company_logo_html(model_name: str) -> str:
|
| 94 |
"""
|
|
@@ -135,26 +87,6 @@ def get_svg_as_data_uri(path: str) -> str:
|
|
| 135 |
print(f"Warning: SVG file not found at {path}")
|
| 136 |
return ""
|
| 137 |
|
| 138 |
-
def create_svg_html(value, svg_map):
    """
    Generates the absolute simplest HTML for an icon, without any extra text.
    This version is compatible with gr.DataFrame.

    Args:
        value: Cell value used as the key into ``svg_map`` and as the icon's
            ``alt``/``title`` text.
        svg_map: Mapping from value to either an SVG path string or a dict
            with a ``"path"`` entry.

    Returns:
        An ``<img>`` tag embedding the SVG as a data URI, or ``""`` when the
        value is missing, not in ``svg_map``, or the SVG could not be loaded.
    """
    import html  # local import: keeps this block self-contained

    if pd.isna(value) or value not in svg_map:
        return ""

    path_info = svg_map[value]
    # Handle both old string format and new object format
    path = path_info["path"] if isinstance(path_info, dict) else path_info

    src = get_svg_as_data_uri(path)
    if not src:
        return ""

    # Escape the label so quotes or angle brackets in the value cannot break
    # out of the alt/title attributes.
    label = html.escape(str(value), quote=True)
    # Generate the HTML for the single icon, with NO text.
    return f'<img src="{src}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{label}" title="{label}">'
|
| 158 |
|
| 159 |
def build_openness_tooltip_content() -> str:
|
| 160 |
"""
|
|
@@ -1355,23 +1287,3 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: boo
|
|
| 1355 |
|
| 1356 |
# Return the entire navigation bar as one single Gradio HTML component
|
| 1357 |
return gr.HTML(full_html)
|
| 1358 |
-
|
| 1359 |
-
def format_llm_base_with_html(value):
    """
    Formats the 'Models Used' cell value.

    If the value is a list with more than 1 element, it returns an
    HTML <span> showing the first element plus a count of the others, with the
    full list (one entry per line) in a ``data-tooltip`` attribute.
    If it's a single-element list, it returns just that element.
    Otherwise, it returns the original value.

    Args:
        value: The cell value; typically a list of model names.

    Returns:
        An HTML string, the single list element, or ``value`` unchanged.
    """
    import html  # local import: keeps this block self-contained

    if isinstance(value, list):
        if len(value) > 1:
            # Join the list items with a newline character for a clean tooltip.
            # Escape the text so a quote or "<" in a model name cannot terminate
            # the attribute or inject markup.
            tooltip_text = html.escape("\n".join(map(str, value)), quote=True)
            first_label = html.escape(str(value[0]), quote=True)
            # Return an HTML span carrying the tooltip text in data-tooltip
            return f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{first_label} (+ {len(value) - 1}) ⓘ</span>'
        if len(value) == 1:
            # If only one item, just return that item
            return value[0]
    # Return the value as-is if it's not a list or is an empty list
    return value
|
|
|
|
| 20 |
format_runtime_column,
|
| 21 |
get_pareto_df,
|
| 22 |
clean_llm_base_list,
|
| 23 |
+
get_company_from_model,
|
| 24 |
+
COMPANY_LOGO_MAP,
|
| 25 |
)
|
| 26 |
from config import (
|
| 27 |
CONFIG_NAME,
|
|
|
|
| 41 |
api = HfApi()
|
| 42 |
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def get_company_logo_html(model_name: str) -> str:
|
| 46 |
"""
|
|
|
|
| 87 |
print(f"Warning: SVG file not found at {path}")
|
| 88 |
return ""
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
def build_openness_tooltip_content() -> str:
|
| 92 |
"""
|
|
|
|
| 1287 |
|
| 1288 |
# Return the entire navigation bar as one single Gradio HTML component
|
| 1289 |
return gr.HTML(full_html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -9,8 +9,9 @@ import os
|
|
| 9 |
import base64
|
| 10 |
import aliases
|
| 11 |
|
| 12 |
-
# Import company logo mapping from
|
| 13 |
-
from
|
|
|
|
| 14 |
|
| 15 |
# Standard layout configuration matching existing charts
|
| 16 |
# Colors aligned with OpenHands brand
|
|
|
|
| 9 |
import base64
|
| 10 |
import aliases
|
| 11 |
|
| 12 |
+
# Import company logo mapping from leaderboard_transformer
|
| 13 |
+
from leaderboard_transformer import get_company_from_model
|
| 14 |
+
from ui_components import get_svg_as_data_uri
|
| 15 |
|
| 16 |
# Standard layout configuration matching existing charts
|
| 17 |
# Colors aligned with OpenHands brand
|