博闻 committed on
Commit
3a4034e
·
1 Parent(s): 992e8c1

add relevant info

Browse files
app.py CHANGED
@@ -13,13 +13,14 @@ with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
13
  gr.HTML(content.TITLE_HTML)
14
  gr.Markdown(content.INTRO_MARKDOWN)
15
  gr.Markdown(content.SUBMISSION_GUIDE)
 
16
 
17
- gr.Markdown("### 🏆 Leaderboard")
18
- gr.Markdown("- Methods marked with * leverage Oracle DSL or an Oracle Verifier.")
19
  if SPLITS_LIST:
20
  with gr.Tabs():
21
  for split in SPLITS_LIST:
22
  with gr.Tab(split):
 
 
23
  gr.Dataframe(
24
  value=leaderboard_frames.get(split),
25
  interactive=False,
 
13
  gr.HTML(content.TITLE_HTML)
14
  gr.Markdown(content.INTRO_MARKDOWN)
15
  gr.Markdown(content.SUBMISSION_GUIDE)
16
+ gr.Markdown(content.LEADERBOARD_INTRO)
17
 
 
 
18
  if SPLITS_LIST:
19
  with gr.Tabs():
20
  for split in SPLITS_LIST:
21
  with gr.Tab(split):
22
+ hf_url = f"https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel/viewer/default/{split}"
23
+ gr.Markdown(f"📂 [Hugging Face Dataset Viewer]({hf_url})")
24
  gr.Dataframe(
25
  value=leaderboard_frames.get(split),
26
  interactive=False,
chinatravel/ui/content.py CHANGED
@@ -1,24 +1,37 @@
1
  TITLE_HTML = """
2
- <h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Evaluation</h1>
3
  """
4
 
5
  INTRO_MARKDOWN = """
6
- ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
 
 
 
 
 
 
 
 
 
7
  """
8
 
9
  SUBMISSION_GUIDE = """
10
  📥 **How to submit**
11
  - Pick a split. The split determines which query UIDs are expected.
12
  - Upload a `.zip` that contains JSON files named by query UIDs.
13
- - Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
 
14
  - You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
15
 
16
  📊 **Output**
17
- - We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
18
  - A detailed JSON report is produced for download after evaluation.
19
 
20
  📨 **Contact**
21
- - If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](221900200@smail.nju.edu.cn), [Xiao-Wen Yang](yangxw@lamda.nju.edu.cn)
22
  """
23
 
24
- CONTACT = "Contact: ✉️ zbw@smail.nju.edu.cn, ✉️ shaojj@lamda.nju.edu.cn"
 
 
 
 
1
  TITLE_HTML = """
2
+ <h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Leaderboard</h1>
3
  """
4
 
5
  INTRO_MARKDOWN = """
6
+ <div style=\"max-width: 450px; margin: 0 auto; padding: 16px 18px; border: 1px solid #e5e7eb; border-radius: 12px; background: linear-gradient(90deg, #f8fafc 0%, #eef2ff 100%); box-shadow: 0 6px 18px rgba(0,0,0,0.04);\">
7
+ <div style=\"display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;\">
8
+ <a href="https://arxiv.org/abs/2412.13682" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📑 Paper</a>
9
+ <a href="https://github.com/LAMDASZ-ML/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🛠️ GitHub</a>
10
+ <a href="https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📂 Dataset</a>
11
+ <a href="https://www.lamda.nju.edu.cn/shaojj/chinatravel/" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🌆 Project</a>
12
+ </div>
13
+ </div>
14
+
15
+ ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents.
16
  """
17
 
18
  SUBMISSION_GUIDE = """
19
  📥 **How to submit**
20
  - Pick a split. The split determines which query UIDs are expected.
21
  - Upload a `.zip` that contains JSON files named by query UIDs.
22
+ - Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/chinatravel/evaluation/output_schema.json).
23
+ - Example archive: [easy_submission_example.zip](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/easy_submission_example.zip)
24
  - You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
25
 
26
  📊 **Output**
27
+ - We compute $DR$ (schema pass rate), $EPR_micro/EPR_macro$ (commonsense), $LPR_micro/LPR_macro/C-LPR$ (logic), and $FPR$ (all-pass rate).
28
  - A detailed JSON report is produced for download after evaluation.
29
 
30
  📨 **Contact**
31
+ - If you are interested in showing your results on our leaderboard or have any questions, please contact shaojj@lamda.nju.edu.cn, zbw@smail.nju.edu.cn, yangxw@lamda.nju.edu.cn
32
  """
33
 
34
+ LEADERBOARD_INTRO = """
35
+ 🏆 **Leaderboard**
36
+ - Methods marked with * leverage Oracle DSL or an Oracle Verifier.
37
+ """
easy_submission_example.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df16f032bb130907eb07bed0a1d19cfd35233b294215b12dd08881b471873b24
3
+ size 2844
leaderboard_data/easy.csv CHANGED
@@ -1,23 +1,23 @@
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
3
- NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
4
- NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
5
- NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
6
- NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
7
- NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
8
- NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
9
- NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
10
  NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
11
  NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
12
  NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
13
- NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
14
- NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
15
- NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
16
  NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
17
  NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
18
  NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
19
- NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
20
- NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
21
  NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
22
  NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
23
  NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
 
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,Act,DeepSeek-V3,70.4,49.9 / 0,64.6 / 30.6,0,0
3
+ NJU,Act,GPT-4o,97.5,70.8 / 0,86.8 / 68.6,0,0
4
+ NJU,ReAct (zero-shot),DeepSeek-V3,43.3,40.8 / 0,41.9 / 19.6,0,0
5
+ NJU,ReAct (zero-shot),GPT-4o,95.4,48.2 / 0,71.3 / 33.0,0,0
6
+ NJU,ReAct (one-shot),DeepSeek-V3,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
7
+ NJU,ReAct (one-shot),GPT-4o,94.2,68.1 / 0,89.4 / 70.6,0,0
8
+ NJU,NeSy Planning,DeepSeek-V3,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
9
+ NJU,NeSy Planning,GPT-4o,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
10
  NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
11
  NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
12
  NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
13
+ NJU,TTG (oracle),DeepSeek-V3,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
14
+ NJU,LLM-Modulo*,DeepSeek-V3,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
15
+ NJU,LLM-Modulo*,GPT-4o,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
16
  NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
17
  NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
18
  NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
19
+ NJU,NeSy Planning*,DeepSeek-V3,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
20
+ NJU,NeSy Planning*,GPT-4o,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
21
  NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
22
  NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
23
  NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
leaderboard_data/human.csv CHANGED
@@ -1,21 +1,21 @@
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
3
- NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
4
- NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
5
- NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
6
- NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
7
- NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
8
  NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
9
  NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
10
  NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
11
- NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
12
- NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
13
- NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
14
  NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
15
  NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
16
  NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
17
- NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
18
- NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
19
  NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
20
  NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
21
  NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
 
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,ReAct (zero-shot),DeepSeek-V3,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
3
+ NJU,ReAct (zero-shot),GPT-4o,96.1,50.5 / 0,72.4 / 32.5,0,0
4
+ NJU,ReAct (one-shot),DeepSeek-V3,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
5
+ NJU,ReAct (one-shot),GPT-4o,69.5,46.3 / 0,63.6 / 46.8,0,0
6
+ NJU,NeSy Planning,DeepSeek-V3,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
7
+ NJU,NeSy Planning,GPT-4o,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
8
  NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
9
  NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
10
  NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
11
+ NJU,TTG (oracle),DeepSeek-V3,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
12
+ NJU,LLM-Modulo*,DeepSeek-V3,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
13
+ NJU,LLM-Modulo*,GPT-4o,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
14
  NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
15
  NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
16
  NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
17
+ NJU,NeSy Planning*,DeepSeek-V3,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
18
+ NJU,NeSy Planning*,GPT-4o,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
19
  NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
20
  NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
21
  NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
leaderboard_data/human1000.csv CHANGED
@@ -1,7 +1,7 @@
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
3
- NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
4
  NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
5
- NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
6
- NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
7
  NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
 
1
  Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,NeSy Planning,DeepSeek-V3,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
3
+ NJU,NeSy Planning,GPT-4o,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
4
  NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
5
+ NJU,NeSy Planning*,DeepSeek-V3,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
6
+ NJU,NeSy Planning*,GPT-4o,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
7
  NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
leaderboard_data/lb_all/easy.csv DELETED
@@ -1,23 +0,0 @@
1
- Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
3
- NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
4
- NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
5
- NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
6
- NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
7
- NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
8
- NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
9
- NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
10
- NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
11
- NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
12
- NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
13
- NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
14
- NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
15
- NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
16
- NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
17
- NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
18
- NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
19
- NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
20
- NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
21
- NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
22
- NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
23
- NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_data/lb_all/human.csv DELETED
@@ -1,21 +0,0 @@
1
- Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
3
- NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
4
- NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
5
- NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
6
- NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
7
- NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
8
- NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
9
- NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
10
- NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
11
- NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
12
- NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
13
- NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
14
- NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
15
- NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
16
- NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
17
- NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
18
- NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
19
- NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
20
- NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
21
- NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_data/lb_all/human1000.csv DELETED
@@ -1,7 +0,0 @@
1
- Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
- NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
3
- NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
4
- NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
5
- NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
6
- NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
7
- NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8