Ray0202 committed on
Commit
4527eaf
·
1 Parent(s): 9d8edae

Update space

Browse files
Files changed (4) hide show
  1. README.md +1 -0
  2. app.py +47 -6
  3. src/about.py +5 -0
  4. src/envs.py +3 -0
README.md CHANGED
@@ -22,6 +22,7 @@ It does not execute agents, call LLM APIs, or accept API keys.
22
 
23
  - Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
24
  Default is `data/results.json`.
 
25
  - Update descriptive text in `src/about.py`.
26
 
27
  ## Results File Format
 
22
 
23
  - Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
24
  Default is `data/results.json`.
25
+ - Submissions are stored in `data/submissions/` for manual review (override with `TEMPORALBENCH_SUBMISSIONS_PATH`).
26
  - Update descriptive text in `src/about.py`.
27
 
28
  ## Results File Format
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from typing import Optional
2
 
3
  import gradio as gr
@@ -12,8 +15,13 @@ from src.about import (
12
  TITLE,
13
  )
14
  from src.display.css_html_js import custom_css
15
- from src.envs import RESULTS_PATH
16
- from src.leaderboard.load_results import ResultsValidationError, build_dataframe, load_records
 
 
 
 
 
17
  from src.leaderboard.schema import SCHEMA
18
 
19
 
@@ -100,6 +108,32 @@ def compare_entries(entry_a: str, entry_b: str) -> pd.DataFrame:
100
  return pd.DataFrame.from_records(rows)
101
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  demo = gr.Blocks(css=custom_css)
104
  with demo:
105
  gr.HTML(TITLE)
@@ -129,7 +163,17 @@ with demo:
129
  entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
130
  entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
131
 
132
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
 
 
 
 
 
 
 
 
 
 
133
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
134
 
135
  with gr.Row():
@@ -142,7 +186,4 @@ with demo:
142
  show_copy_button=True,
143
  )
144
 
145
- scheduler = BackgroundScheduler()
146
- scheduler.add_job(restart_space, "interval", seconds=1800)
147
- scheduler.start()
148
  demo.queue(default_concurrency_limit=40).launch()
 
1
+ from datetime import datetime, timezone
2
+ import json
3
+ import os
4
  from typing import Optional
5
 
6
  import gradio as gr
 
15
  TITLE,
16
  )
17
  from src.display.css_html_js import custom_css
18
+ from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
19
+ from src.leaderboard.load_results import (
20
+ ResultsValidationError,
21
+ build_dataframe,
22
+ load_records,
23
+ validate_records,
24
+ )
25
  from src.leaderboard.schema import SCHEMA
26
 
27
 
 
108
  return pd.DataFrame.from_records(rows)
109
 
110
 
111
def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and store it for manual review.

    Args:
        uploaded_file: Value from a ``gr.File`` component — either a tempfile
            wrapper exposing ``.name`` or a plain path string (or ``None`` when
            nothing was uploaded).

    Returns:
        A Markdown status message: an error description when validation fails,
        otherwise a confirmation including the saved path.
    """
    if uploaded_file is None:
        return "Please upload a results file."

    # gr.File may yield an object with a .name path attribute or a bare path.
    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)

    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(SUBMISSIONS_PATH, f"submission_{timestamp}.json")
    # Second-resolution timestamps can collide under concurrent submissions;
    # add a numeric suffix instead of silently overwriting an earlier upload.
    suffix = 1
    while os.path.exists(out_path):
        out_path = os.path.join(SUBMISSIONS_PATH, f"submission_{timestamp}_{suffix}.json")
        suffix += 1

    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    # Explicit encoding + ensure_ascii=False keeps non-ASCII model/agent names
    # readable in the stored submission file.
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)

    return f"Submission received for review. Saved to `{out_path}`."
135
+
136
+
137
  demo = gr.Blocks(css=custom_css)
138
  with demo:
139
  gr.HTML(TITLE)
 
163
  entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
164
  entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
165
 
166
+ with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
167
+ gr.Markdown(
168
+ "Upload a results file for manual review. Approved results will be merged into the main dataset.",
169
+ elem_classes="markdown-text",
170
+ )
171
+ submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
172
+ submit_button = gr.Button("Submit for Review")
173
+ submission_status = gr.Markdown()
174
+ submit_button.click(save_submission, [submission_file], submission_status)
175
+
176
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
177
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
178
 
179
  with gr.Row():
 
186
  show_copy_button=True,
187
  )
188
 
 
 
 
189
  demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -20,6 +20,11 @@ Results are loaded from a local JSON or CSV file. Each record must include:
20
  - Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
21
  - Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
22
  - Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
 
 
 
 
 
23
  """
24
 
25
  EVALUATION_QUEUE_TEXT = ""
 
20
  - Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
21
  - Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
22
  - Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
23
+
24
+ ## Submission workflow
25
+
26
+ Uploads are stored locally for manual review. Approved results should be merged into
27
+ the main results file to appear on the leaderboard.
28
  """
29
 
30
  EVALUATION_QUEUE_TEXT = ""
src/envs.py CHANGED
@@ -2,3 +2,6 @@ import os
2
 
3
  # Local results file (JSON or CSV). Override with TEMPORALBENCH_RESULTS_PATH.
4
  RESULTS_PATH = os.environ.get("TEMPORALBENCH_RESULTS_PATH", "data/results.json")
 
 
 
 
2
 
3
# Path of the local results file (JSON or CSV); override via env var.
RESULTS_PATH = os.getenv("TEMPORALBENCH_RESULTS_PATH", "data/results.json")

# Directory where uploaded submissions await manual review; override via env var.
SUBMISSIONS_PATH = os.getenv("TEMPORALBENCH_SUBMISSIONS_PATH", "data/submissions")