Spaces:
Build error
Build error
博闻 committed on
Commit ·
13f06e2
1
Parent(s): 6a12be7
add emoji
Browse files
- app.py +2 -1
- chinatravel/ui/content.py +6 -6
app.py
CHANGED
|
@@ -14,8 +14,9 @@ with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
|
|
| 14 |
gr.Markdown(content.INTRO_MARKDOWN)
|
| 15 |
gr.Markdown(content.SUBMISSION_GUIDE)
|
| 16 |
|
| 17 |
-
gr.Markdown("### Leaderboard")
|
| 18 |
gr.Markdown("Methods marked with \* leverage Oracle DSL or an Oracle Verifier.")
|
|
|
|
| 19 |
if SPLITS_LIST:
|
| 20 |
with gr.Tabs():
|
| 21 |
for split in SPLITS_LIST:
|
|
|
|
| 14 |
gr.Markdown(content.INTRO_MARKDOWN)
|
| 15 |
gr.Markdown(content.SUBMISSION_GUIDE)
|
| 16 |
|
| 17 |
+
gr.Markdown("### 🏆 Leaderboard")
|
| 18 |
gr.Markdown("Methods marked with \* leverage Oracle DSL or an Oracle Verifier.")
|
| 19 |
+
gr.Markdown("✨ Methods marked with * leverage Oracle DSL or an Oracle Verifier.")
|
| 20 |
if SPLITS_LIST:
|
| 21 |
with gr.Tabs():
|
| 22 |
for split in SPLITS_LIST:
|
chinatravel/ui/content.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
| 1 |
TITLE_HTML = """
|
| 2 |
-
<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">ChinaTravel Benchmark Evaluation</h1>
|
| 3 |
"""
|
| 4 |
|
| 5 |
INTRO_MARKDOWN = """
|
| 6 |
-
ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
|
| 7 |
"""
|
| 8 |
|
| 9 |
SUBMISSION_GUIDE = """
|
| 10 |
-
**How to submit**
|
| 11 |
- Pick a split. The split determines which query UIDs are expected.
|
| 12 |
- Upload a `.zip` that contains JSON files named by query UIDs.
|
| 13 |
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
|
| 14 |
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
|
| 15 |
|
| 16 |
-
**Output**
|
| 17 |
- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
|
| 18 |
- A detailed JSON report is produced for download after evaluation.
|
| 19 |
|
| 20 |
-
**Contact**
|
| 21 |
- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](221900200@smail.nju.edu.cn), [Xiao-Wen Yang](yangxw@lamda.nju.edu.cn)
|
| 22 |
"""
|
| 23 |
|
| 24 |
-
CONTACT = "Contact: zbw@smail.nju.edu.cn, shaojj@lamda.nju.edu.cn"
|
|
|
|
| 1 |
TITLE_HTML = """
|
| 2 |
+
<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Evaluation</h1>
|
| 3 |
"""
|
| 4 |
|
| 5 |
INTRO_MARKDOWN = """
|
| 6 |
+
✈️ ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
|
| 7 |
"""
|
| 8 |
|
| 9 |
SUBMISSION_GUIDE = """
|
| 10 |
+
📥 **How to submit**
|
| 11 |
- Pick a split. The split determines which query UIDs are expected.
|
| 12 |
- Upload a `.zip` that contains JSON files named by query UIDs.
|
| 13 |
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
|
| 14 |
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
|
| 15 |
|
| 16 |
+
📊 **Output**
|
| 17 |
- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
|
| 18 |
- A detailed JSON report is produced for download after evaluation.
|
| 19 |
|
| 20 |
+
📨 **Contact**
|
| 21 |
- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](221900200@smail.nju.edu.cn), [Xiao-Wen Yang](yangxw@lamda.nju.edu.cn)
|
| 22 |
"""
|
| 23 |
|
| 24 |
+
CONTACT = "Contact: ✉️ zbw@smail.nju.edu.cn, ✉️ shaojj@lamda.nju.edu.cn"
|