Ray0202 commited on
Commit
949c705
·
1 Parent(s): 9abc796

update 02.17.2026

Browse files
Files changed (2) hide show
  1. app.py +16 -3
  2. src/about.py +34 -2
app.py CHANGED
@@ -173,17 +173,30 @@ with demo:
173
  # Temporarily disabled for performance debugging.
174
  with gr.TabItem("📤 Submit Results", elem_id="tab-submit", id=2):
175
  gr.Markdown(
176
- "Upload a results file for manual review. Approved results will be merged into the main dataset.",
 
 
 
 
 
 
 
 
177
  elem_classes="markdown-text",
178
  )
179
  gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
180
- submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
 
 
 
181
  submit_button = gr.Button("Submit for Review")
182
  submission_status = gr.Markdown()
183
- submission_status.value = "Submission is temporarily disabled for performance debugging."
184
 
185
  with gr.TabItem("📝 About", elem_id="tab-about", id=3):
186
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
187
 
188
  # Citation section hidden for now.
189
  # with gr.Row():
 
173
  # Temporarily disabled for performance debugging.
174
  with gr.TabItem("📤 Submit Results", elem_id="tab-submit", id=2):
175
  gr.Markdown(
176
+ (
177
+ "Upload submission files for manual review.\n\n"
178
+ "Required files:\n"
179
+ "1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
180
+ "2. `results_on_test_dataset.json`: per-example test outputs with at least "
181
+ "`id`, `tier`, `source_dataset`, `label`, and `output` "
182
+ "(the `output` field is required when the example contains a forecasting task).\n\n"
183
+ "Please also include model architecture code and LLM/system details for verification."
184
+ ),
185
  elem_classes="markdown-text",
186
  )
187
  gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
188
+ submission_file = gr.File(
189
+ label="Submission package (.zip or .rar)",
190
+ file_types=[".zip", ".rar"],
191
+ )
192
  submit_button = gr.Button("Submit for Review")
193
  submission_status = gr.Markdown()
194
+ submit_button.click(save_submission, [submission_file], submission_status)
195
 
196
  with gr.TabItem("📝 About", elem_id="tab-about", id=3):
197
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
198
+ gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
199
+ gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")
200
 
201
  # Citation section hidden for now.
202
  # with gr.Row():
src/about.py CHANGED
@@ -7,6 +7,8 @@ executed here, and no LLM APIs are called.
7
  """
8
 
9
  LLM_BENCHMARKS_TEXT = """
 
 
10
  ## What this leaderboard shows
11
 
12
  - One row per evaluated agent configuration
@@ -32,8 +34,29 @@ from dataset-level results using question/series counts. Missing values are igno
32
 
33
  ## Submission workflow
34
 
35
- Uploads are stored locally for manual review. Approved results should be merged into
36
- the main results file to appear on the leaderboard.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  ## Data access
39
 
@@ -49,4 +72,13 @@ EVALUATION_QUEUE_TEXT = ""
49
 
50
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
51
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
 
 
 
52
  """
 
7
  """
8
 
9
  LLM_BENCHMARKS_TEXT = """
10
+ The paper describing this benchmark is *TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks* (https://arxiv.org/abs/2602.13272). We also maintain a public leaderboard and welcome submissions of results from state-of-the-art models: https://huggingface.co/spaces/Melady/TemporalBench_Leaderboard
11
+
12
  ## What this leaderboard shows
13
 
14
  - One row per evaluated agent configuration
 
34
 
35
  ## Submission workflow
36
 
37
+ Uploads are stored locally for manual review.
38
+
39
+ For a valid submission, please provide **two files**:
40
+
41
+ 1. `results_on_dev_dataset.json`
42
+ - This follows the leaderboard metrics format.
43
+ - It should include task-level metrics only (e.g., T1-T4 and forecasting metrics).
44
+
45
+ 2. `results_on_test_dataset.json`
46
+ - This should include per-example outputs on the test split.
47
+ - For each example, include at least:
48
+ - `id`
49
+ - `tier`
50
+ - `source_dataset`
51
+ - `label`
52
+ - `output` (required when the example contains a forecasting task)
53
+
54
+ We also strongly encourage including model and system metadata, such as:
55
+ - model architecture code
56
+ - LLM(s) used
57
+ - key implementation details needed for result verification
58
+
59
+ Approved submissions should then be merged into the main results file to appear on the leaderboard.
60
 
61
  ## Data access
62
 
 
72
 
73
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
74
  CITATION_BUTTON_TEXT = r"""
75
+ @misc{weng2026temporalbenchbenchmarkevaluatingllmbased,
76
+ title={TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks},
77
+ author={Muyan Weng and Defu Cao and Wei Yang and Yashaswi Sharma and Yan Liu},
78
+ year={2026},
79
+ eprint={2602.13272},
80
+ archivePrefix={arXiv},
81
+ primaryClass={cs.AI},
82
+ url={https://arxiv.org/abs/2602.13272},
83
+ }
84
  """