Spaces:

DeepSynthesisTeam
/

deepsynth-leaderboard

Sleeping

App Files Files Community

debjitpaul committed on Apr 19

Commit

369761f

1 Parent(s): a999c3e

Updated app.py

Browse files

Files changed (1) hide show

app.py +7 -6

app.py CHANGED Viewed

@@ -24,7 +24,8 @@ import pandas as pd
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 TITLE = "🐙 DeepSynth Leaderboard"
 SUBTITLE = (
-    "Benchmark for Deep Information Synthesis — 120 expert-curated tasks "
     "across 7 domains and 67 countries. ICLR 2026."
 )
 REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
@@ -73,8 +74,8 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
             "Overall F1": scores.get("overall", {}).get("f1"),
             "LLM Judge": scores.get("overall", {}).get("llm_judge"),
         }
-        for domain in DOMAINS:
-            row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
         row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
         row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
@@ -202,13 +203,13 @@ a run trace for spot-check verification.
                 gr.Markdown("""
 ```bibtex
 @inproceedings{deepsynth2026,
-  title     = {DeepSynth: A Benchmark for Deep Information Synthesis},
-  author    = {DeepSynth Team},
   booktitle = {International Conference on Learning Representations (ICLR)},
   year      = {2026}
 }
 ```
-                """)
     return app

 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 TITLE = "🐙 DeepSynth Leaderboard"
 SUBTITLE = (
+    "Large language model (LLM)-based agents are increasingly used to solve complex tasks involving tool use, such as web browsing, code execution, and data analysis. However, current evaluation benchmarks do not adequately assess their ability to solve real-world tasks that require synthesizing information from multiple sources and inferring insights beyond simple fact retrieval. "
+    "We introduce DEEPSYNTH, a novel benchmark of 120 tasks across 7 domains and 67 countries, designed to evaluate agents on realistic, time-consuming problems that combine information gathering, synthesis, and structured reasoning. "
     "across 7 domains and 67 countries. ICLR 2026."
 )
 REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
             "Overall F1": scores.get("overall", {}).get("f1"),
             "LLM Judge": scores.get("overall", {}).get("llm_judge"),
         }
+        #for domain in DOMAINS:
+        #    row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
         row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
         row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
                 gr.Markdown("""
 ```bibtex
 @inproceedings{deepsynth2026,
+  title     = {A Benchmark for Deep Information Synthesis},
+  author    = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},
   booktitle = {International Conference on Learning Representations (ICLR)},
   year      = {2026}
 }
 ```
+""")
     return app