Ray0202
committed on
Commit
·
949c705
1
Parent(s):
9abc796
update 02.17.2026
Browse files
- app.py +16 -3
- src/about.py +34 -2
app.py
CHANGED
|
@@ -173,17 +173,30 @@ with demo:
|
|
| 173 |
# Temporarily disabled for performance debugging.
|
| 174 |
with gr.TabItem("📤 Submit Results", elem_id="tab-submit", id=2):
|
| 175 |
gr.Markdown(
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
elem_classes="markdown-text",
|
| 178 |
)
|
| 179 |
gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
|
| 180 |
-
submission_file = gr.File(
|
|
|
|
|
|
|
|
|
|
| 181 |
submit_button = gr.Button("Submit for Review")
|
| 182 |
submission_status = gr.Markdown()
|
| 183 |
-
|
| 184 |
|
| 185 |
with gr.TabItem("📝 About", elem_id="tab-about", id=3):
|
| 186 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
|
|
|
| 187 |
|
| 188 |
# Citation section hidden for now.
|
| 189 |
# with gr.Row():
|
|
|
|
| 173 |
# Temporarily disabled for performance debugging.
|
| 174 |
with gr.TabItem("📤 Submit Results", elem_id="tab-submit", id=2):
|
| 175 |
gr.Markdown(
|
| 176 |
+
(
|
| 177 |
+
"Upload submission files for manual review.\n\n"
|
| 178 |
+
"Required files:\n"
|
| 179 |
+
"1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
|
| 180 |
+
"2. `results_on_test_dataset.json`: per-example test outputs with at least "
|
| 181 |
+
"`id`, `tier`, `source_dataset`, `label`, and `output` "
|
| 182 |
+
"(required when the sample contains forecasting).\n\n"
|
| 183 |
+
"Please also include model architecture code and LLM/system details for verification."
|
| 184 |
+
),
|
| 185 |
elem_classes="markdown-text",
|
| 186 |
)
|
| 187 |
gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
|
| 188 |
+
submission_file = gr.File(
|
| 189 |
+
label="Submission package (.zip or .rar)",
|
| 190 |
+
file_types=[".zip", ".rar"],
|
| 191 |
+
)
|
| 192 |
submit_button = gr.Button("Submit for Review")
|
| 193 |
submission_status = gr.Markdown()
|
| 194 |
+
submit_button.click(save_submission, [submission_file], submission_status)
|
| 195 |
|
| 196 |
with gr.TabItem("📝 About", elem_id="tab-about", id=3):
|
| 197 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 198 |
+
gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
|
| 199 |
+
gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")
|
| 200 |
|
| 201 |
# Citation section hidden for now.
|
| 202 |
# with gr.Row():
|
src/about.py
CHANGED
|
@@ -7,6 +7,8 @@ executed here, and no LLM APIs are called.
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
LLM_BENCHMARKS_TEXT = """
|
|
|
|
|
|
|
| 10 |
## What this leaderboard shows
|
| 11 |
|
| 12 |
- One row per evaluated agent configuration
|
|
@@ -32,8 +34,29 @@ from dataset-level results using question/series counts. Missing values are igno
|
|
| 32 |
|
| 33 |
## Submission workflow
|
| 34 |
|
| 35 |
-
Uploads are stored locally for manual review.
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
## Data access
|
| 39 |
|
|
@@ -49,4 +72,13 @@ EVALUATION_QUEUE_TEXT = ""
|
|
| 49 |
|
| 50 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 51 |
CITATION_BUTTON_TEXT = r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
"""
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
LLM_BENCHMARKS_TEXT = """
|
| 10 |
+
The paper describing this benchmark is *TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks* (https://arxiv.org/abs/2602.13272). We also maintain a public leaderboard and welcome submissions from state-of-the-art models: https://huggingface.co/spaces/Melady/TemporalBench_Leaderboard
|
| 11 |
+
|
| 12 |
## What this leaderboard shows
|
| 13 |
|
| 14 |
- One row per evaluated agent configuration
|
|
|
|
| 34 |
|
| 35 |
## Submission workflow
|
| 36 |
|
| 37 |
+
Uploads are stored locally for manual review.
|
| 38 |
+
|
| 39 |
+
For a valid submission, please provide **two files**:
|
| 40 |
+
|
| 41 |
+
1. `results_on_dev_dataset.json`
|
| 42 |
+
- This follows the leaderboard metrics format.
|
| 43 |
+
- It should include task-level metrics only (e.g., T1-T4 and forecasting metrics).
|
| 44 |
+
|
| 45 |
+
2. `results_on_test_dataset.json`
|
| 46 |
+
- This should include per-example outputs on the test split.
|
| 47 |
+
- For each example, include at least:
|
| 48 |
+
- `id`
|
| 49 |
+
- `tier`
|
| 50 |
+
- `source_dataset`
|
| 51 |
+
- `label`
|
| 52 |
+
- `output` (required when the example contains a forecasting task)
|
| 53 |
+
|
| 54 |
+
We also strongly encourage including model and system metadata, such as:
|
| 55 |
+
- model architecture code
|
| 56 |
+
- LLM(s) used
|
| 57 |
+
- key implementation details needed for result verification
|
| 58 |
+
|
| 59 |
+
Approved submissions should then be merged into the main results file to appear on the leaderboard.
|
| 60 |
|
| 61 |
## Data access
|
| 62 |
|
|
|
|
| 72 |
|
| 73 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 74 |
CITATION_BUTTON_TEXT = r"""
|
| 75 |
+
@misc{weng2026temporalbenchbenchmarkevaluatingllmbased,
|
| 76 |
+
title={TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks},
|
| 77 |
+
author={Muyan Weng and Defu Cao and Wei Yang and Yashaswi Sharma and Yan Liu},
|
| 78 |
+
year={2026},
|
| 79 |
+
eprint={2602.13272},
|
| 80 |
+
archivePrefix={arXiv},
|
| 81 |
+
primaryClass={cs.AI},
|
| 82 |
+
url={https://arxiv.org/abs/2602.13272},
|
| 83 |
+
}
|
| 84 |
"""
|