Spaces:
Build error
Build error
博闻 committed on
Commit ·
3a4034e
1
Parent(s): 992e8c1
add relevant info
Browse files
- app.py +3 -2
- chinatravel/ui/content.py +19 -6
- easy_submission_example.zip +3 -0
- leaderboard_data/easy.csv +13 -13
- leaderboard_data/human.csv +11 -11
- leaderboard_data/human1000.csv +4 -4
- leaderboard_data/lb_all/easy.csv +0 -23
- leaderboard_data/lb_all/human.csv +0 -21
- leaderboard_data/lb_all/human1000.csv +0 -7
app.py
CHANGED
|
@@ -13,13 +13,14 @@ with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
|
|
| 13 |
gr.HTML(content.TITLE_HTML)
|
| 14 |
gr.Markdown(content.INTRO_MARKDOWN)
|
| 15 |
gr.Markdown(content.SUBMISSION_GUIDE)
|
|
|
|
| 16 |
|
| 17 |
-
gr.Markdown("### 🏆 Leaderboard")
|
| 18 |
-
gr.Markdown("- Methods marked with * leverage Oracle DSL or an Oracle Verifier.")
|
| 19 |
if SPLITS_LIST:
|
| 20 |
with gr.Tabs():
|
| 21 |
for split in SPLITS_LIST:
|
| 22 |
with gr.Tab(split):
|
|
|
|
|
|
|
| 23 |
gr.Dataframe(
|
| 24 |
value=leaderboard_frames.get(split),
|
| 25 |
interactive=False,
|
|
|
|
| 13 |
gr.HTML(content.TITLE_HTML)
|
| 14 |
gr.Markdown(content.INTRO_MARKDOWN)
|
| 15 |
gr.Markdown(content.SUBMISSION_GUIDE)
|
| 16 |
+
gr.Markdown(content.LEADERBOARD_INTRO)
|
| 17 |
|
|
|
|
|
|
|
| 18 |
if SPLITS_LIST:
|
| 19 |
with gr.Tabs():
|
| 20 |
for split in SPLITS_LIST:
|
| 21 |
with gr.Tab(split):
|
| 22 |
+
hf_url = f"https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel/viewer/default/{split}"
|
| 23 |
+
gr.Markdown(f"📂 [Hugging Face Dataset Viewer]({hf_url})")
|
| 24 |
gr.Dataframe(
|
| 25 |
value=leaderboard_frames.get(split),
|
| 26 |
interactive=False,
|
chinatravel/ui/content.py
CHANGED
|
@@ -1,24 +1,37 @@
|
|
| 1 |
TITLE_HTML = """
|
| 2 |
-
<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark
|
| 3 |
"""
|
| 4 |
|
| 5 |
INTRO_MARKDOWN = """
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
SUBMISSION_GUIDE = """
|
| 10 |
📥 **How to submit**
|
| 11 |
- Pick a split. The split determines which query UIDs are expected.
|
| 12 |
- Upload a `.zip` that contains JSON files named by query UIDs.
|
| 13 |
-
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
|
|
|
|
| 14 |
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
|
| 15 |
|
| 16 |
📊 **Output**
|
| 17 |
-
- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
|
| 18 |
- A detailed JSON report is produced for download after evaluation.
|
| 19 |
|
| 20 |
📨 **Contact**
|
| 21 |
-
- If you are interested in showing your results on our leaderboard or have any questions, please contact
|
| 22 |
"""
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
TITLE_HTML = """
|
| 2 |
+
<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Leaderboard</h1>
|
| 3 |
"""
|
| 4 |
|
| 5 |
INTRO_MARKDOWN = """
|
| 6 |
+
<div style=\"max-width: 450px; margin: 0 auto; padding: 16px 18px; border: 1px solid #e5e7eb; border-radius: 12px; background: linear-gradient(90deg, #f8fafc 0%, #eef2ff 100%); box-shadow: 0 6px 18px rgba(0,0,0,0.04);\">
|
| 7 |
+
<div style=\"display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;\">
|
| 8 |
+
<a href="https://arxiv.org/abs/2412.13682" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📑 Paper</a>
|
| 9 |
+
<a href="https://github.com/LAMDASZ-ML/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🛠️ GitHub</a>
|
| 10 |
+
<a href="https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📂 Dataset</a>
|
| 11 |
+
<a href="https://www.lamda.nju.edu.cn/shaojj/chinatravel/" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🌆 Project</a>
|
| 12 |
+
</div>
|
| 13 |
+
</div>
|
| 14 |
+
|
| 15 |
+
ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents.
|
| 16 |
"""
|
| 17 |
|
| 18 |
SUBMISSION_GUIDE = """
|
| 19 |
📥 **How to submit**
|
| 20 |
- Pick a split. The split determines which query UIDs are expected.
|
| 21 |
- Upload a `.zip` that contains JSON files named by query UIDs.
|
| 22 |
+
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/chinatravel/evaluation/output_schema.json).
|
| 23 |
+
- Example archive: [easy_submission_example.zip](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/easy_submission_example.zip)
|
| 24 |
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
|
| 25 |
|
| 26 |
📊 **Output**
|
| 27 |
+
- We compute $DR$ (schema pass rate), $EPR_{micro}$/$EPR_{macro}$ (commonsense), $LPR_{micro}$/$LPR_{macro}$/$C\text{-}LPR$ (logic), and $FPR$ (all-pass rate).
|
| 28 |
- A detailed JSON report is produced for download after evaluation.
|
| 29 |
|
| 30 |
📨 **Contact**
|
| 31 |
+
- If you are interested in showing your results on our leaderboard or have any questions, please contact shaojj@lamda.nju.edu.cn, zbw@smail.nju.edu.cn, yangxw@lamda.nju.edu.cn
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
LEADERBOARD_INTRO = """
|
| 35 |
+
🏆 **Leaderboard**
|
| 36 |
+
- Methods marked with * leverage Oracle DSL or an Oracle Verifier.
|
| 37 |
+
"""
|
easy_submission_example.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df16f032bb130907eb07bed0a1d19cfd35233b294215b12dd08881b471873b24
|
| 3 |
+
size 2844
|
leaderboard_data/easy.csv
CHANGED
|
@@ -1,23 +1,23 @@
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
|
| 3 |
-
NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
|
| 4 |
-
NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
|
| 5 |
-
NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
|
| 6 |
-
NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
|
| 7 |
-
NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
|
| 8 |
-
NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
|
| 9 |
-
NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
|
| 10 |
NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
|
| 11 |
NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
|
| 12 |
NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
|
| 13 |
-
NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
|
| 14 |
-
NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
|
| 15 |
-
NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
|
| 16 |
NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
|
| 17 |
NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
|
| 18 |
NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
|
| 19 |
-
NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
|
| 20 |
-
NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
|
| 21 |
NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
|
| 22 |
NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
|
| 23 |
NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
|
|
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,Act,DeepSeek-V3,70.4,49.9 / 0,64.6 / 30.6,0,0
|
| 3 |
+
NJU,Act,GPT-4o,97.5,70.8 / 0,86.8 / 68.6,0,0
|
| 4 |
+
NJU,ReAct (zero-shot),DeepSeek-V3,43.3,40.8 / 0,41.9 / 19.6,0,0
|
| 5 |
+
NJU,ReAct (zero-shot),GPT-4o,95.4,48.2 / 0,71.3 / 33.0,0,0
|
| 6 |
+
NJU,ReAct (one-shot),DeepSeek-V3,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
|
| 7 |
+
NJU,ReAct (one-shot),GPT-4o,94.2,68.1 / 0,89.4 / 70.6,0,0
|
| 8 |
+
NJU,NeSy Planning,DeepSeek-V3,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
|
| 9 |
+
NJU,NeSy Planning,GPT-4o,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
|
| 10 |
NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
|
| 11 |
NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
|
| 12 |
NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
|
| 13 |
+
NJU,TTG (oracle),DeepSeek-V3,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
|
| 14 |
+
NJU,LLM-Modulo*,DeepSeek-V3,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
|
| 15 |
+
NJU,LLM-Modulo*,GPT-4o,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
|
| 16 |
NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
|
| 17 |
NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
|
| 18 |
NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
|
| 19 |
+
NJU,NeSy Planning*,DeepSeek-V3,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
|
| 20 |
+
NJU,NeSy Planning*,GPT-4o,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
|
| 21 |
NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
|
| 22 |
NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
|
| 23 |
NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
|
leaderboard_data/human.csv
CHANGED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
|
| 3 |
-
NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
|
| 4 |
-
NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
|
| 5 |
-
NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
|
| 6 |
-
NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
|
| 7 |
-
NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
|
| 8 |
NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
|
| 9 |
NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
|
| 10 |
NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
|
| 11 |
-
NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
|
| 12 |
-
NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
|
| 13 |
-
NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
|
| 14 |
NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
|
| 15 |
NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
|
| 16 |
NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
|
| 17 |
-
NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
|
| 18 |
-
NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
|
| 19 |
NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
|
| 20 |
NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
|
| 21 |
NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
|
|
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,ReAct (zero-shot),DeepSeek-V3,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
|
| 3 |
+
NJU,ReAct (zero-shot),GPT-4o,96.1,50.5 / 0,72.4 / 32.5,0,0
|
| 4 |
+
NJU,ReAct (one-shot),DeepSeek-V3,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
|
| 5 |
+
NJU,ReAct (one-shot),GPT-4o,69.5,46.3 / 0,63.6 / 46.8,0,0
|
| 6 |
+
NJU,NeSy Planning,DeepSeek-V3,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
|
| 7 |
+
NJU,NeSy Planning,GPT-4o,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
|
| 8 |
NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
|
| 9 |
NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
|
| 10 |
NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
|
| 11 |
+
NJU,TTG (oracle),DeepSeek-V3,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
|
| 12 |
+
NJU,LLM-Modulo*,DeepSeek-V3,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
|
| 13 |
+
NJU,LLM-Modulo*,GPT-4o,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
|
| 14 |
NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
|
| 15 |
NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
|
| 16 |
NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
|
| 17 |
+
NJU,NeSy Planning*,DeepSeek-V3,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
|
| 18 |
+
NJU,NeSy Planning*,GPT-4o,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
|
| 19 |
NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
|
| 20 |
NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
|
| 21 |
NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
|
leaderboard_data/human1000.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
|
| 3 |
-
NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
|
| 4 |
NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
|
| 5 |
-
NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
|
| 6 |
-
NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
|
| 7 |
NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
|
|
|
|
| 1 |
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,NeSy Planning,DeepSeek-V3,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
|
| 3 |
+
NJU,NeSy Planning,GPT-4o,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
|
| 4 |
NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
|
| 5 |
+
NJU,NeSy Planning*,DeepSeek-V3,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
|
| 6 |
+
NJU,NeSy Planning*,GPT-4o,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
|
| 7 |
NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
|
leaderboard_data/lb_all/easy.csv
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
|
| 3 |
-
NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
|
| 4 |
-
NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
|
| 5 |
-
NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
|
| 6 |
-
NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
|
| 7 |
-
NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
|
| 8 |
-
NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
|
| 9 |
-
NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
|
| 10 |
-
NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
|
| 11 |
-
NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
|
| 12 |
-
NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
|
| 13 |
-
NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
|
| 14 |
-
NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
|
| 15 |
-
NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
|
| 16 |
-
NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
|
| 17 |
-
NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
|
| 18 |
-
NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
|
| 19 |
-
NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
|
| 20 |
-
NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
|
| 21 |
-
NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
|
| 22 |
-
NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
|
| 23 |
-
NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_data/lb_all/human.csv
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
|
| 3 |
-
NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
|
| 4 |
-
NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
|
| 5 |
-
NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
|
| 6 |
-
NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
|
| 7 |
-
NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
|
| 8 |
-
NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
|
| 9 |
-
NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
|
| 10 |
-
NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
|
| 11 |
-
NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
|
| 12 |
-
NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
|
| 13 |
-
NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
|
| 14 |
-
NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
|
| 15 |
-
NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
|
| 16 |
-
NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
|
| 17 |
-
NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
|
| 18 |
-
NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
|
| 19 |
-
NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
|
| 20 |
-
NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
|
| 21 |
-
NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_data/lb_all/human1000.csv
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
-
NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
|
| 3 |
-
NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
|
| 4 |
-
NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
|
| 5 |
-
NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
|
| 6 |
-
NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
|
| 7 |
-
NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|