Ray0202 committed on
Commit ·
4527eaf
1
Parent(s): 9d8edae
Update space
Browse files- README.md +1 -0
- app.py +47 -6
- src/about.py +5 -0
- src/envs.py +3 -0
README.md
CHANGED
|
@@ -22,6 +22,7 @@ It does not execute agents, call LLM APIs, or accept API keys.
|
|
| 22 |
|
| 23 |
- Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
|
| 24 |
Default is `data/results.json`.
|
|
|
|
| 25 |
- Update descriptive text in `src/about.py`.
|
| 26 |
|
| 27 |
## Results File Format
|
|
|
|
| 22 |
|
| 23 |
- Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
|
| 24 |
Default is `data/results.json`.
|
| 25 |
+
- Submissions are stored in `data/submissions/` for manual review (override with `TEMPORALBENCH_SUBMISSIONS_PATH`).
|
| 26 |
- Update descriptive text in `src/about.py`.
|
| 27 |
|
| 28 |
## Results File Format
|
app.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
import gradio as gr
|
|
@@ -12,8 +15,13 @@ from src.about import (
|
|
| 12 |
TITLE,
|
| 13 |
)
|
| 14 |
from src.display.css_html_js import custom_css
|
| 15 |
-
from src.envs import RESULTS_PATH
|
| 16 |
-
from src.leaderboard.load_results import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from src.leaderboard.schema import SCHEMA
|
| 18 |
|
| 19 |
|
|
@@ -100,6 +108,32 @@ def compare_entries(entry_a: str, entry_b: str) -> pd.DataFrame:
|
|
| 100 |
return pd.DataFrame.from_records(rows)
|
| 101 |
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
demo = gr.Blocks(css=custom_css)
|
| 104 |
with demo:
|
| 105 |
gr.HTML(TITLE)
|
|
@@ -129,7 +163,17 @@ with demo:
|
|
| 129 |
entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 130 |
entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 131 |
|
| 132 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 134 |
|
| 135 |
with gr.Row():
|
|
@@ -142,7 +186,4 @@ with demo:
|
|
| 142 |
show_copy_button=True,
|
| 143 |
)
|
| 144 |
|
| 145 |
-
scheduler = BackgroundScheduler()
|
| 146 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 147 |
-
scheduler.start()
|
| 148 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
+
from datetime import datetime, timezone
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import gradio as gr
|
|
|
|
| 15 |
TITLE,
|
| 16 |
)
|
| 17 |
from src.display.css_html_js import custom_css
|
| 18 |
+
from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
|
| 19 |
+
from src.leaderboard.load_results import (
|
| 20 |
+
ResultsValidationError,
|
| 21 |
+
build_dataframe,
|
| 22 |
+
load_records,
|
| 23 |
+
validate_records,
|
| 24 |
+
)
|
| 25 |
from src.leaderboard.schema import SCHEMA
|
| 26 |
|
| 27 |
|
|
|
|
| 108 |
return pd.DataFrame.from_records(rows)
|
| 109 |
|
| 110 |
|
| 111 |
+
def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and store it for manual review.

    Args:
        uploaded_file: Gradio file object (exposes the temp path via ``.name``)
            or a plain path string; ``None`` when no file was selected.

    Returns:
        A Markdown status message: success (with the saved path) or a
        validation/read error description.
    """
    if uploaded_file is None:
        return "Please upload a results file."

    # Gradio may hand us a tempfile wrapper or a bare path string.
    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)

    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"
    except (ValueError, OSError) as exc:
        # A malformed upload (bad JSON/CSV) or an unreadable file previously
        # propagated out of the handler; surface it as a status message instead.
        return f"**Validation error:** could not read file ({exc})"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")
    # Microsecond-resolution filename prevents two submissions arriving in the
    # same second from overwriting each other.
    out_path = os.path.join(
        SUBMISSIONS_PATH, f"submission_{now.strftime('%Y%m%dT%H%M%S%fZ')}.json"
    )
    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)

    return f"Submission received for review. Saved to `{out_path}`."
|
| 135 |
+
|
| 136 |
+
|
| 137 |
demo = gr.Blocks(css=custom_css)
|
| 138 |
with demo:
|
| 139 |
gr.HTML(TITLE)
|
|
|
|
| 163 |
entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 164 |
entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 165 |
|
| 166 |
+
with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
|
| 167 |
+
gr.Markdown(
|
| 168 |
+
"Upload a results file for manual review. Approved results will be merged into the main dataset.",
|
| 169 |
+
elem_classes="markdown-text",
|
| 170 |
+
)
|
| 171 |
+
submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
|
| 172 |
+
submit_button = gr.Button("Submit for Review")
|
| 173 |
+
submission_status = gr.Markdown()
|
| 174 |
+
submit_button.click(save_submission, [submission_file], submission_status)
|
| 175 |
+
|
| 176 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
| 177 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 178 |
|
| 179 |
with gr.Row():
|
|
|
|
| 186 |
show_copy_button=True,
|
| 187 |
)
|
| 188 |
|
|
|
|
|
|
|
|
|
|
| 189 |
demo.queue(default_concurrency_limit=40).launch()
|
src/about.py
CHANGED
|
@@ -20,6 +20,11 @@ Results are loaded from a local JSON or CSV file. Each record must include:
|
|
| 20 |
- Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
|
| 21 |
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
|
| 22 |
- Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
|
| 25 |
EVALUATION_QUEUE_TEXT = ""
|
|
|
|
| 20 |
- Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
|
| 21 |
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
|
| 22 |
- Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
|
| 23 |
+
|
| 24 |
+
## Submission workflow
|
| 25 |
+
|
| 26 |
+
Uploads are stored locally for manual review. Approved results should be merged into
|
| 27 |
+
the main results file to appear on the leaderboard.
|
| 28 |
"""
|
| 29 |
|
| 30 |
EVALUATION_QUEUE_TEXT = ""
|
src/envs.py
CHANGED
|
@@ -2,3 +2,6 @@ import os
|
|
| 2 |
|
| 3 |
# Local results file (JSON or CSV). Override with TEMPORALBENCH_RESULTS_PATH.
|
| 4 |
RESULTS_PATH = os.environ.get("TEMPORALBENCH_RESULTS_PATH", "data/results.json")
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# Path of the local results file (JSON or CSV) the leaderboard reads from.
# Overridable through the TEMPORALBENCH_RESULTS_PATH environment variable.
RESULTS_PATH = os.getenv("TEMPORALBENCH_RESULTS_PATH", "data/results.json")

# Directory where uploaded submissions are kept pending manual review.
# Overridable through the TEMPORALBENCH_SUBMISSIONS_PATH environment variable.
SUBMISSIONS_PATH = os.getenv("TEMPORALBENCH_SUBMISSIONS_PATH", "data/submissions")
|