Ray0202 committed on
Commit ·
4527eaf
1
Parent(s): 9d8edae
Update space
Browse files- README.md +1 -0
- app.py +47 -6
- src/about.py +5 -0
- src/envs.py +3 -0
README.md
CHANGED
|
@@ -22,6 +22,7 @@ It does not execute agents, call LLM APIs, or accept API keys.
|
|
| 22 |
|
| 23 |
- Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
|
| 24 |
Default is `data/results.json`.
|
|
|
|
| 25 |
- Update descriptive text in `src/about.py`.
|
| 26 |
|
| 27 |
## Results File Format
|
|
|
|
| 22 |
|
| 23 |
- Set the local results file path via `TEMPORALBENCH_RESULTS_PATH`.
|
| 24 |
Default is `data/results.json`.
|
| 25 |
+
- Submissions are stored in `data/submissions/` for manual review (override with `TEMPORALBENCH_SUBMISSIONS_PATH`).
|
| 26 |
- Update descriptive text in `src/about.py`.
|
| 27 |
|
| 28 |
## Results File Format
|
app.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
import gradio as gr
|
|
@@ -12,8 +15,13 @@ from src.about import (
|
|
| 12 |
TITLE,
|
| 13 |
)
|
| 14 |
from src.display.css_html_js import custom_css
|
| 15 |
-
from src.envs import RESULTS_PATH
|
| 16 |
-
from src.leaderboard.load_results import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from src.leaderboard.schema import SCHEMA
|
| 18 |
|
| 19 |
|
|
@@ -100,6 +108,32 @@ def compare_entries(entry_a: str, entry_b: str) -> pd.DataFrame:
|
|
| 100 |
return pd.DataFrame.from_records(rows)
|
| 101 |
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
demo = gr.Blocks(css=custom_css)
|
| 104 |
with demo:
|
| 105 |
gr.HTML(TITLE)
|
|
@@ -129,7 +163,17 @@ with demo:
|
|
| 129 |
entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 130 |
entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 131 |
|
| 132 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 134 |
|
| 135 |
with gr.Row():
|
|
@@ -142,7 +186,4 @@ with demo:
|
|
| 142 |
show_copy_button=True,
|
| 143 |
)
|
| 144 |
|
| 145 |
-
scheduler = BackgroundScheduler()
|
| 146 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 147 |
-
scheduler.start()
|
| 148 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
+
from datetime import datetime, timezone
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import gradio as gr
|
|
|
|
| 15 |
TITLE,
|
| 16 |
)
|
| 17 |
from src.display.css_html_js import custom_css
|
| 18 |
+
from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
|
| 19 |
+
from src.leaderboard.load_results import (
|
| 20 |
+
ResultsValidationError,
|
| 21 |
+
build_dataframe,
|
| 22 |
+
load_records,
|
| 23 |
+
validate_records,
|
| 24 |
+
)
|
| 25 |
from src.leaderboard.schema import SCHEMA
|
| 26 |
|
| 27 |
|
|
|
|
| 108 |
return pd.DataFrame.from_records(rows)
|
| 109 |
|
| 110 |
|
| 111 |
+
def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and store it for manual review.

    Args:
        uploaded_file: Gradio file object (exposes the temp path via ``.name``)
            or a plain path string; ``None`` when no file was selected.

    Returns:
        A Markdown status message: success (with the saved path) or a
        validation/read error description.
    """
    if uploaded_file is None:
        return "Please upload a results file."

    # Gradio may hand us a tempfile wrapper or a bare path string.
    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)

    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"
    except (ValueError, OSError) as exc:
        # A malformed upload (bad JSON/CSV) or an unreadable file previously
        # propagated out of the handler; surface it as a status message instead.
        return f"**Validation error:** could not read file ({exc})"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")
    # Microsecond-resolution filename prevents two submissions arriving in the
    # same second from overwriting each other.
    out_path = os.path.join(
        SUBMISSIONS_PATH, f"submission_{now.strftime('%Y%m%dT%H%M%S%fZ')}.json"
    )
    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)

    return f"Submission received for review. Saved to `{out_path}`."
|
| 135 |
+
|
| 136 |
+
|
| 137 |
demo = gr.Blocks(css=custom_css)
|
| 138 |
with demo:
|
| 139 |
gr.HTML(TITLE)
|
|
|
|
| 163 |
entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 164 |
entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 165 |
|
| 166 |
+
with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
|
| 167 |
+
gr.Markdown(
|
| 168 |
+
"Upload a results file for manual review. Approved results will be merged into the main dataset.",
|
| 169 |
+
elem_classes="markdown-text",
|
| 170 |
+
)
|
| 171 |
+
submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
|
| 172 |
+
submit_button = gr.Button("Submit for Review")
|
| 173 |
+
submission_status = gr.Markdown()
|
| 174 |
+
submit_button.click(save_submission, [submission_file], submission_status)
|
| 175 |
+
|
| 176 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
| 177 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 178 |
|
| 179 |
with gr.Row():
|
|
|
|
| 186 |
show_copy_button=True,
|
| 187 |
)
|
| 188 |
|
|
|
|
|
|
|
|
|
|
| 189 |
demo.queue(default_concurrency_limit=40).launch()
|
src/about.py
CHANGED
|
@@ -20,6 +20,11 @@ Results are loaded from a local JSON or CSV file. Each record must include:
|
|
| 20 |
- Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
|
| 21 |
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
|
| 22 |
- Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
|
| 25 |
EVALUATION_QUEUE_TEXT = ""
|
|
|
|
| 20 |
- Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
|
| 21 |
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
|
| 22 |
- Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
|
| 23 |
+
|
| 24 |
+
## Submission workflow
|
| 25 |
+
|
| 26 |
+
Uploads are stored locally for manual review. Approved results should be merged into
|
| 27 |
+
the main results file to appear on the leaderboard.
|
| 28 |
"""
|
| 29 |
|
| 30 |
EVALUATION_QUEUE_TEXT = ""
|
src/envs.py
CHANGED
|
@@ -2,3 +2,6 @@ import os
|
|
| 2 |
|
| 3 |
# Local results file (JSON or CSV). Override with TEMPORALBENCH_RESULTS_PATH.
|
| 4 |
RESULTS_PATH = os.environ.get("TEMPORALBENCH_RESULTS_PATH", "data/results.json")
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# Path of the local results file (JSON or CSV) the leaderboard reads from.
# Overridable through the TEMPORALBENCH_RESULTS_PATH environment variable.
RESULTS_PATH = os.getenv("TEMPORALBENCH_RESULTS_PATH", "data/results.json")

# Directory where uploaded submissions are kept pending manual review.
# Overridable through the TEMPORALBENCH_SUBMISSIONS_PATH environment variable.
SUBMISSIONS_PATH = os.getenv("TEMPORALBENCH_SUBMISSIONS_PATH", "data/submissions")
|