justinlangsethgenesis committed
Commit 42b6362 · 1 parent: 767cd4a

Add temporary DABstep leaderboard mirror

README.md CHANGED
@@ -1,14 +1,29 @@
 ---
-title: DABstep Temp
-emoji: 👁
-colorFrom: yellow
-colorTo: gray
+title: DABstep Leaderboard — Temporary Mirror
+emoji: 🕺
+colorFrom: purple
+colorTo: blue
 sdk: gradio
 sdk_version: 6.1.0
 app_file: app.py
 pinned: false
 license: cc-by-4.0
-short_description: Temporary Leaderboard for DABstep benchmark
+short_description: Temporary mirror of DABstep benchmark leaderboard
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# 🕺 DABstep Leaderboard Temporary Mirror
+
+This is a **temporary mirror** of the [DABstep Benchmark Leaderboard](https://huggingface.co/spaces/adyen/DABstep) while the official Adyen leaderboard is experiencing issues.
+
+## ⚠️ Important Notes
+
+- Submissions here **will NOT sync** to the official Adyen leaderboard
+- Once the official leaderboard is restored, please re-submit there
+- For questions: [support@genesiscomputing.ai](mailto:support@genesiscomputing.ai)
+- Hosted by [Genesis Computing](https://genesiscomputing.ai)
+
+## About DABstep
+
+DABstep (the Data Agent Benchmark for Multi-step Reasoning) evaluates AI agents on complex data analysis tasks requiring multi-step reasoning over structured datasets.
+
+See the [HF Discussion](https://huggingface.co/spaces/adyen/DABstep/discussions/17) about the official leaderboard status.
app.py ADDED
@@ -0,0 +1,219 @@
+"""
+DABstep Leaderboard - Genesis Edition
+A self-contained leaderboard for the DABstep benchmark.
+
+Original source: https://huggingface.co/spaces/adyen/DABstep
+"""
+import os
+
+import gradio as gr
+
+from dabstep_benchmark.content import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    INTRODUCTION_TEXT,
+    SUBMISSION_TEXT,
+    TEMPORARY_NOTICE,
+    TITLE,
+    VALIDATION_GUIDELINES,
+)
+from dabstep_benchmark.leaderboard import (
+    generate_leaderboard_df,
+    process_submission,
+    refresh,
+)
+
+
+def download_leaderboard(lb_type: str) -> str:
+    """Write the requested leaderboard to CSV and return the file path."""
+    validated_lb, unvalidated_lb = generate_leaderboard_df()
+
+    if lb_type == "validated":
+        df_to_download = validated_lb
+    else:
+        df_to_download = unvalidated_lb
+
+    os.makedirs("data", exist_ok=True)
+    path = f"data/{lb_type}_leaderboard.csv"
+
+    if os.path.exists(path):
+        os.remove(path)
+
+    df_to_download.to_csv(path, index=False)
+    return path
+
+
+# Custom CSS for better styling
+CUSTOM_CSS = """
+.markdown-text {
+    font-size: 16px;
+}
+.gradio-container {
+    max-width: 1200px !important;
+}
+#citation-button {
+    font-family: monospace;
+    font-size: 12px;
+}
+.notice-box {
+    background: transparent !important;
+    border: none !important;
+    padding: 0 !important;
+    margin: 8px 0 16px 0 !important;
+}
+.notice-box p {
+    margin: 0 !important;
+    padding: 12px 16px !important;
+    background: #2a2a2a !important;
+    border: 1px solid #444 !important;
+    border-radius: 6px !important;
+    color: #ccc !important;
+    font-size: 14px !important;
+    line-height: 1.5 !important;
+}
+.notice-box a {
+    color: #6cb6ff !important;
+}
+"""
+
+if __name__ == "__main__":
+    # Ensure data directories exist
+    os.makedirs("data/task_scores", exist_ok=True)
+    os.makedirs("data/submissions", exist_ok=True)
+
+    # Load data once at startup (cached for subsequent calls)
+    validated_lb, unvalidated_lb = generate_leaderboard_df()
+
+    # Build the Gradio app
+    demo = gr.Blocks(title="DABstep Leaderboard - Temporary Mirror")
+
+    with demo:
+        gr.HTML(f"<style>{CUSTOM_CSS}</style>")
+        gr.Markdown(TITLE)
+        gr.Markdown(TEMPORARY_NOTICE, elem_classes="notice-box")
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+        with gr.Tab("📊 Leaderboard"):
+            with gr.Tab("Validated"):
+                verified_table = gr.Dataframe(
+                    value=validated_lb,
+                    datatype=["markdown", "str", "str", "str", "markdown", "str", "str"],
+                    interactive=False,
+                    column_widths=["20%"],
+                    wrap=True,
+                )
+                verified_download = gr.DownloadButton(
+                    label="📥 Download Leaderboard CSV",
+                    elem_id="download-verified-lb",
+                )
+
+            with gr.Tab("Unvalidated"):
+                unverified_table = gr.Dataframe(
+                    value=unvalidated_lb,
+                    datatype=["markdown", "str", "str", "str", "markdown", "str", "str"],
+                    interactive=False,
+                    column_widths=["20%"],
+                    wrap=True,
+                )
+                unverified_download = gr.DownloadButton(
+                    label="📥 Download Full Leaderboard CSV",
+                    elem_id="download-unverified-lb",
+                )
+
+            # Refresh button
+            refresh_button = gr.Button("🔄 Refresh Leaderboard", variant="secondary")
+
+            def do_refresh():
+                """Clear cache and reload leaderboard data."""
+                return refresh(only_leaderboard=True)
+
+            refresh_button.click(
+                fn=do_refresh,
+                inputs=None,
+                outputs=[verified_table, unverified_table],
+            )
+
+            # Download handlers (a hidden textbox passes the leaderboard type)
+            verified_download.click(
+                download_leaderboard,
+                inputs=[gr.Textbox(value="validated", visible=False)],
+                outputs=[verified_download]
+            )
+            unverified_download.click(
+                download_leaderboard,
+                inputs=[gr.Textbox(value="unvalidated", visible=False)],
+                outputs=[unverified_download]
+            )
+
+
+            gr.Markdown(VALIDATION_GUIDELINES, elem_classes="markdown-text")
+
+        with gr.Tab("📤 Submit"):
+            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    split = gr.Radio(["all"], value="all", label="Split", visible=False)
+                    agent_name_textbox = gr.Textbox(
+                        label="Agent Name",
+                        placeholder="e.g., MyDataAgent-v1"
+                    )
+                    model_family_textbox = gr.Textbox(
+                        label="Model Family",
+                        placeholder="e.g., GPT-4, Claude, Llama"
+                    )
+                    repo_url_textbox = gr.Textbox(
+                        label="Repository URL (optional)",
+                        placeholder="https://github.com/..."
+                    )
+
+                with gr.Column():
+                    organisation = gr.Textbox(
+                        label="Organization",
+                        placeholder="e.g., Genesis Computing"
+                    )
+                    mail = gr.Textbox(
+                        label="Contact Email",
+                        placeholder="your@email.com"
+                    )
+                    file_output = gr.File(
+                        label="Upload Submission (.jsonl)",
+                        file_types=[".jsonl", ".json"]
+                    )
+
+            with gr.Row():
+                submit_button = gr.Button("🚀 Submit Answers", variant="primary")
+
+            submission_result = gr.Markdown()
+
+            submit_button.click(
+                process_submission,
+                inputs=[
+                    split,
+                    agent_name_textbox,
+                    model_family_textbox,
+                    repo_url_textbox,
+                    file_output,
+                    organisation,
+                    mail
+                ],
+                outputs=submission_result,
+            )
+
+        with gr.Tab("📚 Citation"):
+            with gr.Accordion("📙 How to Cite", open=True):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    lines=len(CITATION_BUTTON_TEXT.split("\n")),
+                    elem_id="citation-button",
+                )
+
+    # Launch the app
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )
+
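Note: the download wiring above passes the leaderboard type to a single handler through an invisible `gr.Textbox` used as a constant-valued input. A stripped-down sketch of the same pattern (names here are illustrative, not part of this commit):

```python
import gradio as gr

def make_file(kind: str) -> str:
    """Write a small file and return its path so the button serves it."""
    path = f"{kind}.txt"
    with open(path, "w") as f:
        f.write(f"{kind} contents\n")
    return path

with gr.Blocks() as demo:
    btn = gr.DownloadButton("Download validated file")
    # Clicking calls make_file("validated"); returning the path updates
    # the DownloadButton so the browser downloads the generated file.
    btn.click(make_file, inputs=[gr.Textbox(value="validated", visible=False)], outputs=[btn])

if __name__ == "__main__":
    demo.launch()
```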
dabstep_benchmark/__init__.py ADDED
@@ -0,0 +1,2 @@
+# DABstep Benchmark Package
+
dabstep_benchmark/content.py ADDED
@@ -0,0 +1,72 @@
+"""
+DABstep Benchmark Content
+Text content for the leaderboard UI.
+"""
+
+TITLE = """# 🕺 DABstep Leaderboard — Temporary Mirror"""
+
+TEMPORARY_NOTICE = """
+⚠️ **Temporary mirror** — The [official Adyen leaderboard](https://huggingface.co/spaces/adyen/DABstep) is currently down. Submissions are accepted here for testing, but will likely need to be re-submitted to Adyen once it's back online. Hosted by [Genesis Computing](https://genesiscomputing.ai) • [support@genesiscomputing.ai](mailto:support@genesiscomputing.ai)
+"""
+
+INTRODUCTION_TEXT = """
+The [Data Agent Benchmark for Multi-step Reasoning (DABstep)](https://huggingface.co/blog/dabstep) measures and pushes the state of the art in data analysis by LLMs.
+
+The benchmark is composed of ~450 data analysis questions centered around documents that agents must understand and cross-reference to answer correctly.
+
+### Resources
+- 📊 [Original Dataset](https://huggingface.co/datasets/adyen/DABstep)
+- 📄 [Adyen Technical Report](https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep)
+- 📝 [Hugging Face Blog Post](https://huggingface.co/blog/dabstep)
+- 🔗 [Colab Notebook Baseline](https://colab.research.google.com/drive/1pXi5ffBFNJQ5nn1111SnIfjfKCOlunxu)
+- 💬 [HF Discussion: Leaderboard is down](https://huggingface.co/spaces/adyen/DABstep/discussions/17)
+- 🌐 [Genesis Computing](https://genesiscomputing.ai) — Mirror host
+"""
+
+SUBMISSION_TEXT = """
+## Submission Format
+
+Submit a JSON Lines (.jsonl) file with the following format:
+
+```json
+{"task_id": "1", "agent_answer": "Your answer", "reasoning_trace": "Optional: how your model reached this answer"}
+{"task_id": "2", "agent_answer": "Another answer", "reasoning_trace": "Optional trace"}
+```
+
+**Required fields:**
+- `task_id`: The task identifier (string)
+- `agent_answer`: Your agent's answer (string)
+
+**Optional fields:**
+- `reasoning_trace`: The reasoning steps (string)
+
+Scores are expressed as the percentage of correct answers. Evaluation uses quasi-exact match
+between your answer and the ground truth (with normalization for numbers, lists, etc.).
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
+CITATION_BUTTON_TEXT = r"""@misc{DABstep,
+      title={DABstep: Data Agent Benchmark for Multi-step Reasoning},
+      author={Alex Egg and Martin Iglesias Goyanes and Friso Kingma and Andreu Mora and Leandro von Werra and Thomas Wolf},
+      year={2025},
+      eprint={2506.23719},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2506.23719}
+}"""
+
+VALIDATION_GUIDELINES = """
+## About This Leaderboard
+
+This is an independent instance of the DABstep leaderboard. Submissions are scored automatically
+against the ground truth answers using the official DABstep scorer.
+
+**Scoring:**
+- Easy Level: Tasks 1-72 (basic data analysis)
+- Hard Level: Tasks 73-450 (complex multi-step reasoning)
+
+**Note:** This leaderboard stores submissions locally. For official benchmark results,
+please submit to the [official DABstep leaderboard](https://huggingface.co/spaces/adyen/DABstep).
+"""
+
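For reference, a minimal sketch of producing a submission file in the format documented by `SUBMISSION_TEXT` above; the answer function is a hypothetical placeholder for your agent:

```python
import json

def my_agent_answer(task_id: int) -> str:
    # Hypothetical placeholder: substitute your agent's real answer here.
    return "42"

with open("submission.jsonl", "w") as f:
    for task_id in range(1, 451):  # DABstep tasks 1-450
        record = {
            "task_id": str(task_id),                   # required, string
            "agent_answer": my_agent_answer(task_id),  # required, string
            "reasoning_trace": "optional trace",       # optional
        }
        f.write(json.dumps(record) + "\n")
```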
dabstep_benchmark/evaluation/__init__.py ADDED
@@ -0,0 +1,2 @@
+# DABstep Evaluation Package
+
dabstep_benchmark/evaluation/scorer.py ADDED
@@ -0,0 +1,149 @@
+"""
+DABstep Benchmark Scorer
+Original source: https://huggingface.co/spaces/adyen/DABstep/blob/main/dabstep_benchmark/evaluation/scorer.py
+"""
+from __future__ import annotations
+
+import math
+import re
+from difflib import SequenceMatcher
+
+
+def is_numeric_with_commas(value: str) -> bool:
+    """
+    True for strings that are either:
+      - numbers using comma thousands-separators (at least one comma),
+        with optional dot-decimal, e.g. "1,000" or "12,345.67"
+    OR
+      - pure decimals (no separators) with a decimal point or comma,
+        e.g. "0.99" or "0,99"
+    Plain ints without commas (e.g. "64") are rejected.
+    """
+    v = value.strip()
+    pattern = r'''
+        ^\$?                              # optional dollar sign
+        (?:                               # two alternate groups:
+            \d{1,3}(?:,\d{3})+(?:\.\d+)?  # 1) at least one comma-group + optional .decimal
+          | \d+[.,]\d+                    # 2) or plain decimal with . or ,
+        )
+        $                                 # end of string
+    '''
+    return bool(re.match(pattern, v, re.VERBOSE))
+
+
+def question_scorer(input1: str, input2: str) -> bool:
+    """Score a single question answer against the ground truth."""
+    # Remove leading/trailing whitespace and convert to lowercase
+    input1 = input1.strip().lower()
+    input2 = input2.strip().lower()
+
+    # Check if inputs are numeric with commas
+    if is_numeric_with_commas(input1) or is_numeric_with_commas(input2):
+        num1 = extract_numeric(input1)
+        num2 = extract_numeric(input2)
+        return compare_numeric(num1, num2) if num1 is not None and num2 is not None else False
+
+    # Check for list match
+    if ';' in input1 or ';' in input2 or ',' in input1 or ',' in input2:
+        return compare_lists(input1, input2)
+
+    # Extract numeric values if present
+    num1 = extract_numeric(input1)
+    num2 = extract_numeric(input2)
+
+    # If both inputs have numeric values, compare them
+    if num1 is not None and num2 is not None:
+        return compare_numeric(num1, num2)
+
+    # Check for string match or subset
+    return compare_strings(input1, input2)
+
+
+def extract_numeric(value: str) -> float | None:
+    """Extract the first numeric value from a string."""
+    # Remove commas and currency symbols from the value string
+    value = value.replace(',', '').replace('$', '')
+
+    # Extract the first occurrence of a numeric value
+    match = re.search(r'(\d*\.\d+|\d+\.?\d*)%?', value)
+    if match:
+        num_str = match.group(1)
+        try:
+            return float(num_str)
+        except ValueError:
+            return None
+    return None
+
+
+def compare_numeric(num1: float, num2: float) -> bool:
+    """Compare two numeric values with tolerance."""
+    # Check for exact equality first
+    if num1 == num2:
+        return True
+
+    # For percentages and small numbers, use a more lenient comparison
+    if num1 < 1 and num2 < 1:
+        return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
+
+    # For larger numbers, round both to the smaller number of decimal places first
+    dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
+    dec_places2 = len(str(num2).split('.')[-1]) if '.' in str(num2) else 0
+    round_to = min(dec_places1, dec_places2)
+    rounded1 = round(num1, round_to)
+    rounded2 = round(num2, round_to)
+
+    if rounded1 == rounded2:
+        return True
+
+    return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
+
+
+def compare_strings(str1: str, str2: str) -> bool:
+    """Compare two strings for similarity."""
+    # Remove all whitespace and punctuation
+    clean1 = re.sub(r'[^\w]', '', str1)
+    clean2 = re.sub(r'[^\w]', '', str2)
+
+    if clean1 == clean2:
+        return True
+
+    words1 = re.findall(r'\b\w+\b', str1.lower())
+    words2 = re.findall(r'\b\w+\b', str2.lower())
+
+    # If either answer is a single word (and neither is empty), accept a word-set subset match
+    if (len(words1) == 1 or len(words2) == 1) and words1 and words2:
+        return set(words1).issubset(set(words2)) or set(words2).issubset(set(words1))
+
+    # Use similarity score
+    similarity = SequenceMatcher(None, str1, str2).ratio()
+    return similarity > 0.95
+
+
+def compare_lists(list1: str, list2: str) -> bool:
+    """Compare two list-formatted strings."""
+    # Normalize list representations by removing brackets
+    list1 = re.sub(r'^\[|\]$', '', list1.strip())
+    list2 = re.sub(r'^\[|\]$', '', list2.strip())
+
+    # Split the lists and remove whitespace
+    items1 = [item.strip() for item in re.split(r'[,;]', list1) if item.strip()]
+    items2 = [item.strip() for item in re.split(r'[,;]', list2) if item.strip()]
+
+    # Sort the items to handle different order
+    items1.sort()
+    items2.sort()
+
+    # Check if the lists are identical
+    if items1 == items2:
+        return True
+
+    # If lists are not identical, compare each item
+    if len(items1) != len(items2):
+        return False
+
+    for item1, item2 in zip(items1, items2):
+        if not question_scorer(item1, item2):
+            return False
+
+    return True
+
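To illustrate the matching cascade in `question_scorer` above (numeric comparison with tolerance, then list comparison on `,`/`;`, then fuzzy string matching), a short usage sketch; the expected values follow from the code in this file:

```python
from dabstep_benchmark.evaluation.scorer import question_scorer

# Numeric: commas and '$' are stripped; values below 1 compare with 1e-4 tolerance.
print(question_scorer("$1,234.50", "1234.5"))   # True
print(question_scorer("0.12341", "0.12339"))    # True (difference is within 1e-4)

# Lists: brackets removed, split on ',' or ';', order-insensitive.
print(question_scorer("[beta, alpha]", "alpha; beta"))  # True

# Strings: exact match after stripping punctuation, else SequenceMatcher ratio > 0.95.
print(question_scorer("NexPay!", "nexpay"))     # True
```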
dabstep_benchmark/leaderboard.py ADDED
@@ -0,0 +1,506 @@
+"""
+DABstep Leaderboard Logic
+Handles submission processing, scoring, and leaderboard generation.
+"""
+from __future__ import annotations
+
+import datetime
+import json
+import os
+import re
+from email.utils import parseaddr
+from pathlib import Path
+
+import gradio as gr
+import pandas as pd
+
+from dabstep_benchmark.utils import (
+    evaluate,
+    format_error,
+    format_log,
+    format_warning,
+    is_valid_https_url,
+)
+
+# Paths
+DATA_DIR = Path("data")
+SUBMISSIONS_DIR = DATA_DIR / "submissions"
+TASK_SCORES_DIR = DATA_DIR / "task_scores"
+METADATA_FILE = DATA_DIR / "metadata.jsonl"  # Small file with just submission metadata
+SCORES_SUMMARY_FILE = DATA_DIR / "scores_summary.jsonl"  # Pre-aggregated scores (128 KB)
+GROUND_TRUTH_FILE = Path("ground_truth.jsonl")
+
+# In-memory cache
+GROUND_TRUTH_DF = None
+SUBMISSIONS_DF = None
+TASK_SCORES_DF = None
+METADATA_DF = None
+SCORES_SUMMARY_DF = None
+LEADERBOARD_CACHE = None  # Cached (validated_df, unvalidated_df)
+
+
+def ensure_directories():
+    """Ensure data directories exist."""
+    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
+    TASK_SCORES_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def load_ground_truth() -> pd.DataFrame:
+    """Load ground truth answers from JSONL file or HF Secret.
+
+    For HuggingFace Spaces deployment:
+    1. Set the GROUND_TRUTH_DATA secret in Space Settings
+    2. The secret should contain the full JSONL content (one JSON object per line)
+    3. Delete the ground_truth.jsonl file from the repo to hide answers
+
+    For local development:
+    - Just use the ground_truth.jsonl file
+    """
+    global GROUND_TRUTH_DF
+
+    if GROUND_TRUTH_DF is not None:
+        return GROUND_TRUTH_DF
+
+    records = []
+
+    # Try loading from HF Secret first (for production)
+    ground_truth_data = os.environ.get("GROUND_TRUTH_DATA")
+    if ground_truth_data:
+        print("Loading ground truth from HF Secret...")
+        for line in ground_truth_data.strip().split("\n"):
+            if line.strip():
+                record = json.loads(line)
+                task_id = str(record["task_id"])
+                level = "easy" if int(task_id) <= 72 else "hard"
+                records.append({
+                    "task_id": task_id,
+                    "answer": str(record["agent_answer"]),
+                    "level": level
+                })
+    else:
+        # Fall back to file for local development
+        if not GROUND_TRUTH_FILE.exists():
+            raise FileNotFoundError(
+                f"Ground truth not found. Either set GROUND_TRUTH_DATA env var "
+                f"or provide {GROUND_TRUTH_FILE}"
+            )
+
+        print("Loading ground truth from file...")
+        with open(GROUND_TRUTH_FILE) as f:
+            for line in f:
+                record = json.loads(line)
+                task_id = str(record["task_id"])
+                level = "easy" if int(task_id) <= 72 else "hard"
+                records.append({
+                    "task_id": task_id,
+                    "answer": str(record["agent_answer"]),
+                    "level": level
+                })
+
+    GROUND_TRUTH_DF = pd.DataFrame(records)
+    print(f"Loaded {len(GROUND_TRUTH_DF)} ground truth answers")
+    return GROUND_TRUTH_DF
+
+
+def load_submissions() -> pd.DataFrame:
+    """Load all submissions from the submissions directory."""
+    global SUBMISSIONS_DF
+
+    submissions = []
+    for file_path in SUBMISSIONS_DIR.glob("*.jsonl"):
+        try:
+            df = pd.read_json(file_path, lines=True, dtype=str)
+            submissions.append(df)
+        except Exception as e:
+            print(f"Error loading {file_path}: {e}")
+
+    if submissions:
+        SUBMISSIONS_DF = pd.concat(submissions, ignore_index=True)
+    else:
+        SUBMISSIONS_DF = pd.DataFrame(columns=[
+            "submission_id", "task_id", "agent_answer", "agent_name",
+            "model_family", "organisation", "repo_url", "date"
+        ])
+
+    return SUBMISSIONS_DF
+
+
+def load_task_scores() -> pd.DataFrame:
+    """Load all task scores from the task_scores directory."""
+    global TASK_SCORES_DF
+
+    scores = []
+    for file_path in TASK_SCORES_DIR.glob("*.jsonl"):
+        try:
+            with open(file_path) as f:
+                for line in f:
+                    scores.append(json.loads(line))
+        except Exception as e:
+            print(f"Error loading {file_path}: {e}")
+
+    if scores:
+        TASK_SCORES_DF = pd.DataFrame(scores)
+    else:
+        TASK_SCORES_DF = pd.DataFrame(columns=[
+            "submission_id", "task_id", "score", "level", "agent_answer"
+        ])
+
+    return TASK_SCORES_DF
+
+
+def load_metadata() -> pd.DataFrame:
+    """Load submission metadata from the small metadata file."""
+    global METADATA_DF
+
+    # Return cached data if available
+    if METADATA_DF is not None:
+        return METADATA_DF
+
+    if not METADATA_FILE.exists():
+        print(f"No metadata file found at {METADATA_FILE}")
+        METADATA_DF = pd.DataFrame(columns=[
+            "submission_id", "agent_name", "model_family", "organisation",
+            "repo_url", "date", "validated"
+        ])
+        return METADATA_DF
+
+    try:
+        METADATA_DF = pd.read_json(METADATA_FILE, lines=True, dtype=str)
+        # Convert validated to boolean
+        if "validated" in METADATA_DF.columns:
+            METADATA_DF["validated"] = METADATA_DF["validated"].apply(
+                lambda x: str(x).lower() == "true" if pd.notna(x) else False
+            )
+        print(f"Loaded metadata for {len(METADATA_DF)} submissions")
+    except Exception as e:
+        print(f"Error loading metadata: {e}")
+        METADATA_DF = pd.DataFrame(columns=[
+            "submission_id", "agent_name", "model_family", "organisation",
+            "repo_url", "date", "validated"
+        ])
+
+    return METADATA_DF
+
+
+def save_metadata(submission_id: str, agent_name: str, organisation: str,
+                  model_family: str, repo_url: str, date: str, validated: bool = False):
+    """Append a new submission's metadata to the metadata file."""
+    metadata = {
+        "submission_id": submission_id,
+        "agent_name": agent_name,
+        "organisation": organisation,
+        "model_family": model_family,
+        "repo_url": repo_url,
+        "date": date,
+        "validated": validated
+    }
+
+    with open(METADATA_FILE, "a") as f:
+        f.write(json.dumps(metadata) + "\n")
+    print(f"Saved metadata for {submission_id}")
+
+
+def load_scores_summary() -> pd.DataFrame:
+    """Load pre-aggregated scores summary (128 KB instead of 514 MB)."""
+    global SCORES_SUMMARY_DF
+
+    # Return cached data if available
+    if SCORES_SUMMARY_DF is not None:
+        return SCORES_SUMMARY_DF
+
+    if not SCORES_SUMMARY_FILE.exists():
+        print(f"No scores summary file at {SCORES_SUMMARY_FILE}")
+        SCORES_SUMMARY_DF = pd.DataFrame(columns=[
+            "submission_id", "easy_accuracy", "hard_accuracy"
+        ])
+        return SCORES_SUMMARY_DF
+
+    try:
+        SCORES_SUMMARY_DF = pd.read_json(SCORES_SUMMARY_FILE, lines=True)
+        print(f"Loaded scores summary for {len(SCORES_SUMMARY_DF)} submissions")
+    except Exception as e:
+        print(f"Error loading scores summary: {e}")
+        SCORES_SUMMARY_DF = pd.DataFrame(columns=[
+            "submission_id", "easy_accuracy", "hard_accuracy"
+        ])
+
+    return SCORES_SUMMARY_DF
+
+
+def save_scores_summary(submission_id: str, easy_accuracy: float, hard_accuracy: float):
+    """Append a new submission's aggregated scores to the summary file."""
+    entry = {
+        "submission_id": submission_id,
+        "easy_accuracy": round(easy_accuracy, 2),
+        "hard_accuracy": round(hard_accuracy, 2)
+    }
+    with open(SCORES_SUMMARY_FILE, "a") as f:
+        f.write(json.dumps(entry) + "\n")
+    print(f"Saved scores summary for {submission_id}")
+
+
+def refresh(only_leaderboard: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Refresh data and regenerate leaderboard."""
+    global GROUND_TRUTH_DF, SUBMISSIONS_DF, TASK_SCORES_DF, METADATA_DF, SCORES_SUMMARY_DF, LEADERBOARD_CACHE
+
+    ensure_directories()
+
+    if not only_leaderboard:
+        GROUND_TRUTH_DF = None
+        load_ground_truth()
+
+    SUBMISSIONS_DF = None
+    TASK_SCORES_DF = None
+    METADATA_DF = None
+    SCORES_SUMMARY_DF = None
+    LEADERBOARD_CACHE = None
+
+    return generate_leaderboard_df()
+
+
+def validate_submission(submission_df: pd.DataFrame) -> str | None:
+    """Validate a submission DataFrame."""
+    mandatory_columns = ["agent_answer", "task_id"]
+    expected_columns = [*mandatory_columns, "reasoning_trace"]
+
+    # Check for missing mandatory columns
+    missing_columns = [col for col in mandatory_columns if col not in submission_df.columns]
+    if missing_columns:
+        return format_error(f"Missing mandatory columns: {', '.join(missing_columns)}")
+
+    # Check for unexpected columns
+    unexpected_columns = [col for col in submission_df.columns if col not in expected_columns]
+    if unexpected_columns:
+        return format_error(f"Unexpected columns: {', '.join(unexpected_columns)}")
+
+    # Check for NaN values in any column
+    if submission_df.isnull().values.any():
+        return format_error("Submission contains NaN values. Please ensure no missing data.")
+
+    # Check if all columns are of string type
+    non_string_columns = [col for col in submission_df.columns if submission_df[col].dtype != 'object']
+    if non_string_columns:
+        return format_error(f"Columns with non-string data type: {', '.join(non_string_columns)}")
+
+    return None
+
+
+def process_submission(
+    split: str,
+    agent_name: str,
+    model_family: str,
+    repo_url: str,
+    path_to_file: str,
+    organisation: str,
+    mail: str,
+) -> str:
+    """Process a new submission."""
+    # Validate inputs
+    if agent_name == "":
+        return format_warning("Please provide an agent name")
+    if organisation == "":
+        return format_warning("Please provide an organisation")
+    if mail == "":
+        return format_warning("Please provide an email")
+    if model_family == "":
+        return format_warning("Please provide a model family")
+
+    allowed_pattern = re.compile(r'^[a-zA-Z0-9 _.-]+$')
+    if not allowed_pattern.match(agent_name):
+        return format_warning(
+            "Agent name can only contain alphanumeric characters, spaces, periods (.), dashes (-), and underscores (_)"
+        )
+    if not allowed_pattern.match(organisation):
+        return format_warning(
+            "Organisation can only contain alphanumeric characters, spaces, periods (.), dashes (-), and underscores (_)"
+        )
+
+    # Basic email validation
+    _, parsed_mail = parseaddr(mail)
+    if "@" not in parsed_mail:
+        return format_warning("Please provide a valid email address.")
+
+    if repo_url != "" and not is_valid_https_url(repo_url):
+        return format_warning("If you provide a URL it must be a valid one. You can also leave it empty")
+
+    # Validate submission file
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+
+    submission_path = path_to_file.name
+    try:
+        submission_df = pd.read_json(submission_path, lines=True, dtype=str)
+        validation_error = validate_submission(submission_df)
+        if validation_error:
+            return validation_error
+    except Exception as exc:
+        return format_error(f"Submission file is incorrectly formatted. Please fix it and resubmit your file. {exc!s}")
+
+    submission_id = f"{organisation}-{agent_name}"
+    print(f"Processing submission_id={submission_id}...")
+    gr.Info(f"Processing submission of {agent_name}...")
+
+    # Reload data
+    ensure_directories()
+    load_ground_truth()
+    submissions_df = load_submissions()
+
+    # Check if already submitted
+    if len(submissions_df) > 0 and submission_id in submissions_df['submission_id'].values:
+        return format_warning(f"This organisation/agent pair ({submission_id}) has already been submitted.")
+
+    # Add metadata to submission
+    submission_df["submission_id"] = submission_id
+    submission_df["agent_name"] = agent_name
+    submission_df["model_family"] = model_family
+    submission_df["organisation"] = organisation
+    submission_df["repo_url"] = repo_url
+    submission_df["date"] = datetime.date.today().strftime("%d-%m-%Y")
+
+    if "reasoning_trace" not in submission_df.columns:
+        submission_df["reasoning_trace"] = ""
+
+    # Evaluate submission
+    try:
+        task_scores = evaluate(
+            agent_answers=submission_df,
+            tasks_with_gt=GROUND_TRUTH_DF,
+            submission_id=submission_id
+        )
+    except KeyError as exc:
+        return format_error(str(exc))
+
+    # Save submission
+    filename_id = f"v1__{organisation}-{agent_name}__{datetime.datetime.today().strftime('%d-%m-%Y')}"
+    submission_file = SUBMISSIONS_DIR / f"{filename_id}.jsonl"
+    submission_df.to_json(submission_file, orient="records", lines=True)
+    print(f"Saved submission to {submission_file}")
+
+    # Save scores
+    scores_file = TASK_SCORES_DIR / f"{filename_id}.jsonl"
+    with open(scores_file, "w") as f:
+        for score in task_scores:
+            f.write(json.dumps(score) + "\n")
+    print(f"Saved task scores to {scores_file}")
+
+    # Calculate aggregated scores for the summary
+    easy_scores = [s["score"] for s in task_scores if s["level"] == "easy"]
+    hard_scores = [s["score"] for s in task_scores if s["level"] == "hard"]
+    easy_accuracy = (sum(easy_scores) / len(easy_scores) * 100) if easy_scores else 0
+    hard_accuracy = (sum(hard_scores) / len(hard_scores) * 100) if hard_scores else 0
+
+    # Save scores summary (for fast leaderboard rendering)
+    save_scores_summary(
+        submission_id=submission_id,
+        easy_accuracy=easy_accuracy,
+        hard_accuracy=hard_accuracy
+    )
+
+    # Save metadata (small file for fast leaderboard rendering)
+    date_str = datetime.datetime.today().strftime('%d-%m-%Y')
+    save_metadata(
+        submission_id=submission_id,
+        agent_name=agent_name,
+        organisation=organisation,
+        model_family=model_family,
+        repo_url=repo_url,
+        date=date_str,
+        validated=False
+    )
+
+    return format_log(f"""
+    Agent {agent_name} submitted by {organisation} successfully!
+    Please refresh the leaderboard to see your score.
+    """)
+
+
+def generate_leaderboard_df() -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Generate the leaderboard DataFrames.
+
+    Uses pre-aggregated files for fast loading:
+    - metadata.jsonl (284 KB) - submission metadata
+    - scores_summary.jsonl (128 KB) - pre-aggregated scores
+
+    Total: ~400 KB instead of 3.1 GB
+    """
+    global LEADERBOARD_CACHE
+
+    # Return cached result if available
+    if LEADERBOARD_CACHE is not None:
+        return LEADERBOARD_CACHE
+
+    # Load pre-aggregated scores (128 KB instead of 514 MB)
+    scores_summary_df = load_scores_summary()
+
+    # Load metadata (284 KB instead of 2.6 GB)
+    metadata_df = load_metadata()
+
+    if len(metadata_df) == 0 or len(scores_summary_df) == 0:
+        empty_df = pd.DataFrame(columns=[
+            "Agent", "Easy Level Accuracy (%)", "Hard Level Accuracy (%)",
+            "Organization", "Repo URL", "Model Family", "Date"
+        ])
+        return empty_df, empty_df
+
+    # Join metadata with pre-aggregated scores
+    leaderboard_df = pd.merge(metadata_df, scores_summary_df, on="submission_id", how="inner")
+
+    # Rename columns (scores_summary already has percentage values)
+    col_map = {
+        "agent_name": "Agent",
+        "easy_accuracy": "Easy Level Accuracy (%)",
+        "hard_accuracy": "Hard Level Accuracy (%)",
+        "organisation": "Organization",
+        "repo_url": "Repo URL",
+        "model_family": "Model Family",
+        "date": "Date",
+    }
+
+    leaderboard_df.rename(columns=col_map, inplace=True)
+
+    # Format columns (keep 'validated' for later splitting)
+    available_cols = [col for col in col_map.values() if col in leaderboard_df.columns]
+    keep_cols = available_cols.copy()
+    if "validated" in leaderboard_df.columns:
+        keep_cols.append("validated")
+    leaderboard_df = leaderboard_df[keep_cols]
+
+    # Make repo URL clickable
+    if "Repo URL" in leaderboard_df.columns:
+        leaderboard_df["Repo URL"] = leaderboard_df["Repo URL"].apply(
+            lambda x: f"[Link]({x})" if x != "" else ""
+        )
+
+    # Make agent name bold
+    if "Agent" in leaderboard_df.columns:
+        leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
+
+    # Sort by best score
+    sort_cols = []
+    if "Hard Level Accuracy (%)" in leaderboard_df.columns:
+        sort_cols.append("Hard Level Accuracy (%)")
+    if "Easy Level Accuracy (%)" in leaderboard_df.columns:
+        sort_cols.append("Easy Level Accuracy (%)")
+
+    if sort_cols:
+        leaderboard_df.sort_values(by=sort_cols, ascending=[False] * len(sort_cols), inplace=True)
+
+    # Split into validated and unvalidated based on the 'validated' field
+    display_cols = [c for c in leaderboard_df.columns if c != "validated"]
+
+    if "validated" in leaderboard_df.columns:
+        # Convert validated field to boolean (handles string "true"/"false" and bool)
+        leaderboard_df["_is_validated"] = leaderboard_df["validated"].apply(
+            lambda x: str(x).lower() in ("true", "1", "yes") if pd.notna(x) else False
+        )
+        validated_lb = leaderboard_df[leaderboard_df["_is_validated"]][display_cols].copy()
+        unvalidated_lb = leaderboard_df[~leaderboard_df["_is_validated"]][display_cols].copy()
+    else:
+        # No validated field - all go to unvalidated
+        validated_lb = pd.DataFrame(columns=display_cols)
+        unvalidated_lb = leaderboard_df[display_cols].copy()
+
+    # Cache the result
+    LEADERBOARD_CACHE = (validated_lb, unvalidated_lb)
+    return validated_lb, unvalidated_lb
+
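For local testing without the `GROUND_TRUTH_DATA` secret, `load_ground_truth()` falls back to `ground_truth.jsonl`. A sketch of writing a stub file in the expected shape (the answers here are dummies, not real DABstep labels):

```python
import json

# Stub labels for smoke-testing only; the real ground truth comes from the
# GROUND_TRUTH_DATA Space secret or the official dataset.
with open("ground_truth.jsonl", "w") as f:
    for task_id in range(1, 451):
        # load_ground_truth() reads the answer from the "agent_answer" key
        # and derives the level from the task id (<= 72 is "easy").
        f.write(json.dumps({"task_id": str(task_id), "agent_answer": "42"}) + "\n")
```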
dabstep_benchmark/utils.py ADDED
@@ -0,0 +1,89 @@
+"""
+DABstep Benchmark Utilities
+Adapted from: https://huggingface.co/spaces/adyen/DABstep/blob/main/dabstep_benchmark/utils.py
+"""
+from __future__ import annotations
+
+import re
+
+import pandas as pd
+
+from dabstep_benchmark.evaluation.scorer import question_scorer
+
+
+def format_error(msg: str) -> str:
+    """Format an error message in red."""
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+def format_warning(msg: str) -> str:
+    """Format a warning message in orange."""
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+def format_log(msg: str) -> str:
+    """Format a log message in green."""
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+def model_hyperlink(link: str, model_name: str) -> str:
+    """Create a hyperlink for a model."""
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def is_valid_https_url(url: str) -> bool:
+    """Validate that a URL is a valid HTTPS URL."""
+    pattern = re.compile(
+        r'^https://'                                          # URL must start with 'https://'
+        r'(?!10(?:\.\d{1,3}){3})'                             # Exclude private IP 10.x.x.x
+        r'(?!127(?:\.\d{1,3}){3})'                            # Exclude loopback IP 127.x.x.x
+        r'(?!169\.254(?:\.\d{1,3}){2})'                       # Exclude link-local IP 169.254.x.x
+        r'(?!192\.168(?:\.\d{1,3}){2})'                       # Exclude private IP 192.168.x.x
+        r'(?!172\.(?:1[6-9]|2[0-9]|3[0-1])(?:\.\d{1,3}){2})'  # Exclude private IP 172.16.x.x - 172.31.x.x
+        r'(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})'               # Match domain name
+        r'(?::\d{2,5})?'                                      # Optional port
+        r'(?:/[^\s]*)?$',                                     # Optional path
+        re.IGNORECASE
+    )
+    return re.match(pattern, url) is not None
+
+
+def evaluate(
+    agent_answers: pd.DataFrame,
+    tasks_with_gt: pd.DataFrame,
+    submission_id: str = ""
+) -> list[dict]:
+    """
+    Evaluate agent answers against ground truth.
+
+    Args:
+        agent_answers: DataFrame with columns 'task_id' and 'agent_answer'
+        tasks_with_gt: DataFrame with columns 'task_id', 'answer', and 'level'
+        submission_id: Identifier for the submission
+
+    Returns:
+        List of score dictionaries for each task
+    """
+    task_scores = []
+
+    for _, row in tasks_with_gt.iterrows():
+        correct_answer = row["answer"]
+        level = str(row["level"])
+        task_id = str(row["task_id"])
+
+        if task_id not in agent_answers["task_id"].values:
+            raise KeyError(f"Task ID: {task_id} not found. Are you sure you submitted the correct file?")
+
+        agent_answer = agent_answers.loc[agent_answers.task_id == task_id, "agent_answer"].values[0]
+        score = question_scorer(agent_answer, correct_answer)
+
+        task_scores.append({
+            "submission_id": submission_id,
+            "task_id": task_id,
+            "score": score,
+            "level": level,
+            "agent_answer": agent_answer,
+        })
+
+    return task_scores
+
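A minimal sketch of exercising `evaluate` and `is_valid_https_url` directly, assuming DataFrames with the columns documented in the docstring above:

```python
import pandas as pd

from dabstep_benchmark.utils import evaluate, is_valid_https_url

answers = pd.DataFrame({"task_id": ["1", "2"], "agent_answer": ["42", "alpha; beta"]})
truth = pd.DataFrame({
    "task_id": ["1", "2"],
    "answer": ["42", "beta, alpha"],
    "level": ["easy", "easy"],
})

scores = evaluate(agent_answers=answers, tasks_with_gt=truth, submission_id="demo-agent")
print(sum(s["score"] for s in scores) / len(scores))  # 1.0: both answers match

# The HTTPS validator rejects plain http:// and private/loopback hosts.
print(is_valid_https_url("https://github.com/example/repo"))  # True
print(is_valid_https_url("http://github.com/example"))        # False
```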
data/metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/scores_summary.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio>=4.0.0
+pandas>=2.0.0
+numpy>=1.24.0
+