# NOTE(review): the three lines below were Hugging Face file-view page residue
# (uploader, commit message, commit hash) accidentally captured into the source
# file; preserved here as comments so the module parses.
#   FredericFan's picture
#   Add GitHub and arXiv links to leaderboard page
#   ca6af8e
from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """One leaderboard column: a benchmark, the metric key to read, and its display name."""
    benchmark: str  # internal benchmark identifier (e.g. "knowledge", "exec_success_at_3")
    metric: str     # name of the score field to look up — "score" for every task in this file
    col_name: str   # human-readable column header shown in the leaderboard table
# QA Benchmark Tasks
class QATasks(Enum):
    """Question-answering columns of the leaderboard (MD-KnowledgeEval + LAMMPS-SyntaxEval).

    Each member wraps a Task; all read the "score" metric field.
    """
    overall_avg = Task("overall_avg", "score", "Overall Avg")       # average over the QA benchmarks
    knowledge = Task("knowledge", "score", "MD-KnowledgeEval")      # MD theory knowledge QA
    syntax = Task("syntax", "score", "LAMMPS-SyntaxEval")           # LAMMPS command/syntax QA
# Code Generation Benchmark Tasks
class CodeGenTasks(Enum):
    """Code-generation columns of the leaderboard (LAMMPS-CodeGenEval).

    Each member wraps a Task; all read the "score" metric field.
    """
    exec_success_at_3 = Task("exec_success_at_3", "score", "Exec-Success@3 (%)")  # % of tasks with >=1 of 3 candidates executable
    code_score_human = Task("code_score_human", "score", "Code-Score-Human")      # expert rating in [0, 10]
# Number of few-shot examples used during evaluation; 0 means zero-shot.
# Presumably consumed by the evaluation/submission pipeline — not visible in this file.
NUM_FEWSHOT = 0

# Page title rendered at the top of the leaderboard (raw HTML passed to Gradio).
TITLE = """<h1 align="center" id="space-title">🧪 MD-EvalBench Leaderboard</h1>"""
# Markdown shown under the title: benchmark overview, dataset summary, and repo link.
INTRODUCTION_TEXT = """
**MD-EvalBench** is the first comprehensive benchmark for evaluating Large Language Models in the Molecular Dynamics (MD) domain,
proposed in the paper [*"MDAgent2: Large Language Model for Code Generation and Knowledge Q&A in Molecular Dynamics"*](https://arxiv.org/abs/2601.02075).
The benchmark consists of three evaluation datasets:
- **MD-KnowledgeEval** (336 questions): Theoretical knowledge assessment covering interatomic potentials, integration algorithms, equilibrium conditions, and statistical ensembles.
- **LAMMPS-SyntaxEval** (368 questions): Command and syntax understanding assessment for LAMMPS scripting.
- **LAMMPS-CodeGenEval** (566 tasks): Automatic code generation quality assessment for executable LAMMPS scripts.
Models are evaluated on both **Question Answering** (knowledge + syntax) and **Code Generation** (execution success + human scoring) capabilities.
To access the evaluation datasets, code, and submission guidelines, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
"""
# Markdown for the "About"/benchmark-details tab: evaluation protocol, scoring
# dimensions, generation settings, dataset statistics, and reproducibility links.
LLM_BENCHMARKS_TEXT = """
## Evaluation Protocol
All experiments are repeated three times and the average results are reported.
### QA Evaluation (MD-KnowledgeEval + LAMMPS-SyntaxEval)
- Four question types: single-choice, multiple-choice, fill-in-the-blank, and short-answer
- Three difficulty levels: Easy, Medium, Hard
- Score: accuracy percentage (0-100)
### Code Generation Evaluation (LAMMPS-CodeGenEval)
- **Exec-Success@3**: Proportion of tasks for which at least one of 3 generated candidates can be successfully executed in LAMMPS
- **Code-Score-Human**: Subjective rating in [0, 10] by domain experts based on readability, robustness, and physical correctness
### Evaluation Dimensions for LAMMPS Code
1. Syntax Correctness
2. Logical Consistency
3. Parameter Rationality
4. Core Logic Accuracy
5. Logical Completeness
6. Code Completeness
7. Result Validity
8. Physical Soundness
### Generation Settings
- **Direct Prompting**: Single prompt without tool integration or execution feedback
- **MDAgent**: Multi-agent framework with generate-evaluate-rewrite loop (prior work)
- **MDAgent2-RUNTIME**: Deployable multi-agent system integrating code generation, execution, evaluation, and self-correction
## Dataset Statistics
| Dataset | Samples |
|---------|---------|
| MD-KnowledgeEval | 336 |
| LAMMPS-SyntaxEval | 368 |
| LAMMPS-CodeGenEval | 566 |
## Reproducibility
Models are evaluated using the MD-EvalBench benchmark suite.
For evaluation data, code, and detailed methodology, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
For the full paper, see [arXiv:2601.02075](https://arxiv.org/abs/2601.02075).
"""
# Markdown for the model-submission tab: requirements a model must meet and
# what happens after submission.
EVALUATION_QUEUE_TEXT = """
## Submit your model for evaluation
### Requirements
1. Your model must be publicly available on the Hugging Face Hub
2. Model should be compatible with AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your-model-name", revision=revision)
model = AutoModel.from_pretrained("your-model-name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your-model-name", revision=revision)
```
3. Convert weights to [safetensors](https://huggingface.co/docs/safetensors/index) format
4. Ensure your model has an open license
5. Fill up your model card with training details
### Evaluation Process
Submitted models will be evaluated on all three MD-EvalBench datasets:
- MD-KnowledgeEval (knowledge QA)
- LAMMPS-SyntaxEval (syntax QA)
- LAMMPS-CodeGenEval (code generation)
"""
# Label shown on the leaderboard's citation widget.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry copied by the citation widget. Raw string so the LaTeX escape
# (Q\&A) survives verbatim.
# NOTE(review): author "A, Hubao" looks like a possible surname/given-name
# split error — verify against the published paper before changing.
CITATION_BUTTON_TEXT = r"""@article{shi2026mdagent2,
title={MDAgent2: Large Language Model for Code Generation and Knowledge Q\&A in Molecular Dynamics},
author={Shi, Zhuofan and A, Hubao and Shao, Yufei and Dai, Mengyan and Yu, Yadong and Xiang, Pan and Huang, Dongliang and An, Hongxu and Xin, Chunxiao and Shen, Haiyang and Wang, Zhenyu and Na, Yunshan and Ma, Yun and Huang, Gang and Jing, Xiang},
journal={Science China Information Sciences},
year={2026}
}"""