debjitpaul committed on
Commit ·
369761f
1
Parent(s): a999c3e
Updated app.py
Browse files
app.py
CHANGED
|
@@ -24,7 +24,8 @@ import pandas as pd
|
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 25 |
TITLE = "๐ DeepSynth Leaderboard"
|
| 26 |
SUBTITLE = (
|
| 27 |
-
"
|
|
|
|
| 28 |
"across 7 domains and 67 countries. ICLR 2026."
|
| 29 |
)
|
| 30 |
REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
|
|
@@ -73,8 +74,8 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
|
|
| 73 |
"Overall F1": scores.get("overall", {}).get("f1"),
|
| 74 |
"LLM Judge": scores.get("overall", {}).get("llm_judge"),
|
| 75 |
}
|
| 76 |
-
for domain in DOMAINS:
|
| 77 |
-
|
| 78 |
|
| 79 |
row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
|
| 80 |
row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
|
|
@@ -202,13 +203,13 @@ a run trace for spot-check verification.
|
|
| 202 |
gr.Markdown("""
|
| 203 |
```bibtex
|
| 204 |
@inproceedings{deepsynth2026,
|
| 205 |
-
title = {
|
| 206 |
-
author = {
|
| 207 |
booktitle = {International Conference on Learning Representations (ICLR)},
|
| 208 |
year = {2026}
|
| 209 |
}
|
| 210 |
```
|
| 211 |
-
|
| 212 |
|
| 213 |
return app
|
| 214 |
|
|
|
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 25 |
TITLE = "๐ DeepSynth Leaderboard"
|
| 26 |
SUBTITLE = (
|
| 27 |
+
"Large language model (LLM)-based agents are increasingly used to solve complex tasks involving tool use, such as web browsing, code execution, and data analysis. However, current evaluation benchmarks do not adequately assess their ability to solve real-world tasks that require synthesizing information from multiple sources and inferring insights beyond simple fact retrieval. "
|
| 28 |
+
"We introduce DEEPSYNTH, a novel benchmark of 120 tasks across 7 domains and 67 countries, designed to evaluate agents on realistic, time-consuming problems that combine information gathering, synthesis, and structured reasoning. "
|
| 29 |
"across 7 domains and 67 countries. ICLR 2026."
|
| 30 |
)
|
| 31 |
REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
|
|
|
|
| 74 |
"Overall F1": scores.get("overall", {}).get("f1"),
|
| 75 |
"LLM Judge": scores.get("overall", {}).get("llm_judge"),
|
| 76 |
}
|
| 77 |
+
#for domain in DOMAINS:
|
| 78 |
+
# row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
|
| 79 |
|
| 80 |
row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
|
| 81 |
row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
|
|
|
|
| 203 |
gr.Markdown("""
|
| 204 |
```bibtex
|
| 205 |
@inproceedings{deepsynth2026,
|
| 206 |
+
title = {A Benchmark for Deep Information Synthesis},
|
| 207 |
+
author = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},
|
| 208 |
booktitle = {International Conference on Learning Representations (ICLR)},
|
| 209 |
year = {2026}
|
| 210 |
}
|
| 211 |
```
|
| 212 |
+
""")
|
| 213 |
|
| 214 |
return app
|
| 215 |
|