debjitpaul committed on
Commit
369761f
·
1 Parent(s): a999c3e

Updated app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -24,7 +24,8 @@ import pandas as pd
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
25
  TITLE = "๐Ÿ™ DeepSynth Leaderboard"
26
  SUBTITLE = (
27
- "Benchmark for Deep Information Synthesis — 120 expert-curated tasks "
 
28
  "across 7 domains and 67 countries. ICLR 2026."
29
  )
30
  REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
@@ -73,8 +74,8 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
73
  "Overall F1": scores.get("overall", {}).get("f1"),
74
  "LLM Judge": scores.get("overall", {}).get("llm_judge"),
75
  }
76
- for domain in DOMAINS:
77
- row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
78
 
79
  row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
80
  row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
@@ -202,13 +203,13 @@ a run trace for spot-check verification.
202
  gr.Markdown("""
203
  ```bibtex
204
  @inproceedings{deepsynth2026,
205
- title = {DeepSynth: A Benchmark for Deep Information Synthesis},
206
- author = {DeepSynth Team},
207
  booktitle = {International Conference on Learning Representations (ICLR)},
208
  year = {2026}
209
  }
210
  ```
211
- """)
212
 
213
  return app
214
 
 
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
25
  TITLE = "๐Ÿ™ DeepSynth Leaderboard"
26
  SUBTITLE = (
27
+ "Large language model (LLM)-based agents are increasingly used to solve complex tasks involving tool use, such as web browsing, code execution, and data analysis. However, current evaluation benchmarks do not adequately assess their ability to solve real-world tasks that require synthesizing information from multiple sources and inferring insights beyond simple fact retrieval."
28
+ "We introduce DEEPSYNTH, a novel benchmark of 120 tasks across 7 domains and 67 countries, designed to evaluate agents on realistic, time-consuming problems that combine information gathering, synthesis, and structured reasoning. "
29
  "across 7 domains and 67 countries. ICLR 2026."
30
  )
31
  REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
 
74
  "Overall F1": scores.get("overall", {}).get("f1"),
75
  "LLM Judge": scores.get("overall", {}).get("llm_judge"),
76
  }
77
+ #for domain in DOMAINS:
78
+ # row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
79
 
80
  row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
81
  row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
 
203
  gr.Markdown("""
204
  ```bibtex
205
  @inproceedings{deepsynth2026,
206
+ title = {A Benchmark for Deep Information Synthesis},
207
+ author = {{Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos}}},
208
  booktitle = {International Conference on Learning Representations (ICLR)},
209
  year = {2026}
210
  }
211
  ```
212
+ """)
213
 
214
  return app
215