Spaces:

LAMDA-NeSy
/

ChinaTravel

Build error

App Files Files Community

博闻 committed on Feb 4

Commit

3a4034e

1 Parent(s): 992e8c1

add relevant info

Browse files

Files changed (9) hide show

app.py +3 -2
chinatravel/ui/content.py +19 -6
easy_submission_example.zip +3 -0
leaderboard_data/easy.csv +13 -13
leaderboard_data/human.csv +11 -11
leaderboard_data/human1000.csv +4 -4
leaderboard_data/lb_all/easy.csv +0 -23
leaderboard_data/lb_all/human.csv +0 -21
leaderboard_data/lb_all/human1000.csv +0 -7

app.py CHANGED Viewed

@@ -13,13 +13,14 @@ with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
     gr.HTML(content.TITLE_HTML)
     gr.Markdown(content.INTRO_MARKDOWN)
     gr.Markdown(content.SUBMISSION_GUIDE)
-    gr.Markdown("### 🏆 Leaderboard")
-    gr.Markdown("- Methods marked with * leverage Oracle DSL or an Oracle Verifier.")
     if SPLITS_LIST:
         with gr.Tabs():
             for split in SPLITS_LIST:
                 with gr.Tab(split):
                     gr.Dataframe(
                         value=leaderboard_frames.get(split),
                         interactive=False,

     gr.HTML(content.TITLE_HTML)
     gr.Markdown(content.INTRO_MARKDOWN)
     gr.Markdown(content.SUBMISSION_GUIDE)
+    gr.Markdown(content.LEADERBOARD_INTRO)
     if SPLITS_LIST:
         with gr.Tabs():
             for split in SPLITS_LIST:
                 with gr.Tab(split):
+                    hf_url = f"https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel/viewer/default/{split}"
+                    gr.Markdown(f"📂 [Hugging Face Dataset Viewer]({hf_url})")
                     gr.Dataframe(
                         value=leaderboard_frames.get(split),
                         interactive=False,

chinatravel/ui/content.py CHANGED Viewed

@@ -1,24 +1,37 @@
 TITLE_HTML = """
-<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Evaluation</h1>
 """
 INTRO_MARKDOWN = """
-ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
 """
 SUBMISSION_GUIDE = """
 📥 **How to submit**
 - Pick a split. The split determines which query UIDs are expected.
 - Upload a `.zip` that contains JSON files named by query UIDs.
-- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
 - You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
 📊 **Output**
-- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
 - A detailed JSON report is produced for download after evaluation.
 📨 **Contact**
-- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](221900200@smail.nju.edu.cn), [Xiao-Wen Yang](yangxw@lamda.nju.edu.cn)
 """
-CONTACT = "Contact: ✉️ zbw@smail.nju.edu.cn, ✉️ shaojj@lamda.nju.edu.cn"

 TITLE_HTML = """
+<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">🧭 ChinaTravel Benchmark Leaderboard</h1>
 """
 INTRO_MARKDOWN = """
+<div style=\"max-width: 450px; margin: 0 auto; padding: 16px 18px; border: 1px solid #e5e7eb; border-radius: 12px; background: linear-gradient(90deg, #f8fafc 0%, #eef2ff 100%); box-shadow: 0 6px 18px rgba(0,0,0,0.04);\">
+	<div style=\"display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;\">
+		<a href="https://arxiv.org/abs/2412.13682" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📑 Paper</a>
+		<a href="https://github.com/LAMDASZ-ML/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🛠️ GitHub</a>
+		<a href="https://huggingface.co/datasets/LAMDA-NeSy/ChinaTravel" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">📂 Dataset</a>
+		<a href="https://www.lamda.nju.edu.cn/shaojj/chinatravel/" target="_blank" style="text-decoration:none; padding: 6px 12px; border-radius: 999px; border: 1px solid #cbd5e1; background: #ffffffcc; font-weight: 600; color: #0f172a;">🌆 Project</a>
+	</div>
+</div>
+ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents.
 """
 SUBMISSION_GUIDE = """
 📥 **How to submit**
 - Pick a split. The split determines which query UIDs are expected.
 - Upload a `.zip` that contains JSON files named by query UIDs.
+- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/chinatravel/evaluation/output_schema.json).
+- Example archive: [easy_submission_example.zip](https://huggingface.co/spaces/LAMDA-NeSy/ChinaTravel/blob/main/easy_submission_example.zip)
 - You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.
 📊 **Output**
+- We compute $DR$ (schema pass rate), $EPR_{micro}/EPR_{macro}$ (commonsense), $LPR_{micro}/LPR_{macro}/C-LPR$ (logic), and $FPR$ (all-pass rate).
 - A detailed JSON report is produced for download after evaluation.
 📨 **Contact**
+- If you are interested in showing your results on our leaderboard or have any questions, please contact shaojj@lamda.nju.edu.cn, zbw@smail.nju.edu.cn, yangxw@lamda.nju.edu.cn
 """
+LEADERBOARD_INTRO = """
+🏆 **Leaderboard**
+- Methods marked with * leverage Oracle DSL or an Oracle Verifier.
+"""

easy_submission_example.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df16f032bb130907eb07bed0a1d19cfd35233b294215b12dd08881b471873b24
+size 2844

leaderboard_data/easy.csv CHANGED Viewed

@@ -1,23 +1,23 @@
 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
-NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
-NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
-NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
-NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
-NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
-NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
-NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
 NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
 NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
 NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
-NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
-NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
-NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
 NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
 NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
 NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
-NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
-NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
 NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
 NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
 NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3

 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,Act,DeepSeek-V3,70.4,49.9 / 0,64.6 / 30.6,0,0
+NJU,Act,GPT-4o,97.5,70.8 / 0,86.8 / 68.6,0,0
+NJU,ReAct (zero-shot),DeepSeek-V3,43.3,40.8 / 0,41.9 / 19.6,0,0
+NJU,ReAct (zero-shot),GPT-4o,95.4,48.2 / 0,71.3 / 33.0,0,0
+NJU,ReAct (one-shot),DeepSeek-V3,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
+NJU,ReAct (one-shot),GPT-4o,94.2,68.1 / 0,89.4 / 70.6,0,0
+NJU,NeSy Planning,DeepSeek-V3,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
+NJU,NeSy Planning,GPT-4o,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
 NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
 NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
 NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
+NJU,TTG (oracle),DeepSeek-V3,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
+NJU,LLM-Modulo*,DeepSeek-V3,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
+NJU,LLM-Modulo*,GPT-4o,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
 NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
 NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
 NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
+NJU,NeSy Planning*,DeepSeek-V3,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
+NJU,NeSy Planning*,GPT-4o,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
 NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
 NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
 NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3

leaderboard_data/human.csv CHANGED Viewed

@@ -1,21 +1,21 @@
 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
-NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
-NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
-NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
-NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
-NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
 NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
 NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
 NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
-NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
-NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
-NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
 NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
 NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
 NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
-NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
-NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
 NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
 NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
 NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4

 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,ReAct (zero-shot),DeepSeek-V3,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
+NJU,ReAct (zero-shot),GPT-4o,96.1,50.5 / 0,72.4 / 32.5,0,0
+NJU,ReAct (one-shot),DeepSeek-V3,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
+NJU,ReAct (one-shot),GPT-4o,69.5,46.3 / 0,63.6 / 46.8,0,0
+NJU,NeSy Planning,DeepSeek-V3,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
+NJU,NeSy Planning,GPT-4o,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
 NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
 NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
 NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
+NJU,TTG (oracle),DeepSeek-V3,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
+NJU,LLM-Modulo*,DeepSeek-V3,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
+NJU,LLM-Modulo*,GPT-4o,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
 NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
 NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
 NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
+NJU,NeSy Planning*,DeepSeek-V3,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
+NJU,NeSy Planning*,GPT-4o,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
 NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
 NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
 NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4

leaderboard_data/human1000.csv CHANGED Viewed

@@ -1,7 +1,7 @@
 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
-NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
 NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
-NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
-NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
 NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8

 Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,NeSy Planning,DeepSeek-V3,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
+NJU,NeSy Planning,GPT-4o,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
 NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
+NJU,NeSy Planning*,DeepSeek-V3,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
+NJU,NeSy Planning*,GPT-4o,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
 NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8

leaderboard_data/lb_all/easy.csv DELETED Viewed

@@ -1,23 +0,0 @@
-Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
-NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
-NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
-NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
-NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
-NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
-NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
-NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
-NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
-NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
-NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
-NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
-NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
-NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
-NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
-NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
-NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
-NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
-NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
-NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
-NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
-NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3

leaderboard_data/lb_all/human.csv DELETED Viewed

@@ -1,21 +0,0 @@
-Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
-NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
-NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
-NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
-NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
-NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
-NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
-NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
-NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
-NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
-NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
-NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
-NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
-NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
-NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
-NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
-NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
-NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
-NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
-NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4

leaderboard_data/lb_all/human1000.csv DELETED Viewed

@@ -1,7 +0,0 @@
-Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
-NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
-NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
-NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
-NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
-NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
-NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8