Commit ·
ca6af8e
1
Parent(s): c96266d
Add GitHub and arXiv links to leaderboard page
Browse files
- Add styled button bar (GitHub + arXiv) under the title banner
- Link to paper in introduction text
- Direct users to GitHub repo for evaluation data and code
- Add arXiv reference in About tab reproducibility section
- app.py +12 -0
- src/about.py +6 -2
- src/display/css_html_js.py +43 -0
app.py
CHANGED
|
@@ -167,6 +167,18 @@ demo = gr.Blocks(css=custom_css)
|
|
| 167 |
with demo:
|
| 168 |
gr.HTML(TITLE)
|
| 169 |
gr.HTML('<p id="space-subtitle">The First Comprehensive Benchmark for LLMs in Molecular Dynamics</p>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
gr.HTML(build_metric_cards())
|
| 171 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 172 |
|
|
|
|
| 167 |
with demo:
|
| 168 |
gr.HTML(TITLE)
|
| 169 |
gr.HTML('<p id="space-subtitle">The First Comprehensive Benchmark for LLMs in Molecular Dynamics</p>')
|
| 170 |
+
gr.HTML("""
|
| 171 |
+
<div class="link-bar">
|
| 172 |
+
<a class="link-github" href="https://github.com/FredericVAN/PKU_MDAgent2" target="_blank">
|
| 173 |
+
<svg viewBox="0 0 16 16"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
|
| 174 |
+
GitHub
|
| 175 |
+
</a>
|
| 176 |
+
<a class="link-arxiv" href="https://arxiv.org/abs/2601.02075" target="_blank">
|
| 177 |
+
<svg viewBox="0 0 16 16"><path d="M2 1h12a1 1 0 011 1v12a1 1 0 01-1 1H2a1 1 0 01-1-1V2a1 1 0 011-1zm1.5 2v10h2V9.5L7.5 12h1.2L6.5 9l2-2.5H7.3L5.5 9V3h-2zm5 0v10h2V3h-2zm3.5 0v2h1V3h-1z"/></svg>
|
| 178 |
+
arXiv Paper
|
| 179 |
+
</a>
|
| 180 |
+
</div>
|
| 181 |
+
""")
|
| 182 |
gr.HTML(build_metric_cards())
|
| 183 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 184 |
|
src/about.py
CHANGED
|
@@ -29,7 +29,7 @@ TITLE = """<h1 align="center" id="space-title">🧪 MD-EvalBench Leaderboard</h1
|
|
| 29 |
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
**MD-EvalBench** is the first comprehensive benchmark for evaluating Large Language Models in the Molecular Dynamics (MD) domain,
|
| 32 |
-
proposed in the paper *"MDAgent2: Large Language Model for Code Generation and Knowledge Q&A in Molecular Dynamics"*.
|
| 33 |
|
| 34 |
The benchmark consists of three evaluation datasets:
|
| 35 |
- **MD-KnowledgeEval** (336 questions): Theoretical knowledge assessment covering interatomic potentials, integration algorithms, equilibrium conditions, and statistical ensembles.
|
|
@@ -37,6 +37,8 @@ The benchmark consists of three evaluation datasets:
|
|
| 37 |
- **LAMMPS-CodeGenEval** (566 tasks): Automatic code generation quality assessment for executable LAMMPS scripts.
|
| 38 |
|
| 39 |
Models are evaluated on both **Question Answering** (knowledge + syntax) and **Code Generation** (execution success + human scoring) capabilities.
|
|
|
|
|
|
|
| 40 |
"""
|
| 41 |
|
| 42 |
LLM_BENCHMARKS_TEXT = """
|
|
@@ -78,7 +80,9 @@ All experiments are repeated three times and the average results are reported.
|
|
| 78 |
|
| 79 |
## Reproducibility
|
| 80 |
|
| 81 |
-
Models are evaluated using the MD-EvalBench benchmark suite.
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
EVALUATION_QUEUE_TEXT = """
|
|
|
|
| 29 |
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
**MD-EvalBench** is the first comprehensive benchmark for evaluating Large Language Models in the Molecular Dynamics (MD) domain,
|
| 32 |
+
proposed in the paper [*"MDAgent2: Large Language Model for Code Generation and Knowledge Q&A in Molecular Dynamics"*](https://arxiv.org/abs/2601.02075).
|
| 33 |
|
| 34 |
The benchmark consists of three evaluation datasets:
|
| 35 |
- **MD-KnowledgeEval** (336 questions): Theoretical knowledge assessment covering interatomic potentials, integration algorithms, equilibrium conditions, and statistical ensembles.
|
|
|
|
| 37 |
- **LAMMPS-CodeGenEval** (566 tasks): Automatic code generation quality assessment for executable LAMMPS scripts.
|
| 38 |
|
| 39 |
Models are evaluated on both **Question Answering** (knowledge + syntax) and **Code Generation** (execution success + human scoring) capabilities.
|
| 40 |
+
|
| 41 |
+
To access the evaluation datasets, code, and submission guidelines, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
|
| 42 |
"""
|
| 43 |
|
| 44 |
LLM_BENCHMARKS_TEXT = """
|
|
|
|
| 80 |
|
| 81 |
## Reproducibility
|
| 82 |
|
| 83 |
+
Models are evaluated using the MD-EvalBench benchmark suite.
|
| 84 |
+
For evaluation data, code, and detailed methodology, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
|
| 85 |
+
For the full paper, see [arXiv:2601.02075](https://arxiv.org/abs/2601.02075).
|
| 86 |
"""
|
| 87 |
|
| 88 |
EVALUATION_QUEUE_TEXT = """
|
src/display/css_html_js.py
CHANGED
|
@@ -27,6 +27,49 @@ custom_css = """
|
|
| 27 |
font-weight: 400;
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
/* ===== Tabs ===== */
|
| 31 |
.tab-buttons button {
|
| 32 |
font-size: 17px !important;
|
|
|
|
| 27 |
font-weight: 400;
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/* ===== Link Buttons Bar ===== */
|
| 31 |
+
.link-bar {
|
| 32 |
+
display: flex;
|
| 33 |
+
justify-content: center;
|
| 34 |
+
gap: 12px;
|
| 35 |
+
margin: 4px 0 18px 0;
|
| 36 |
+
flex-wrap: wrap;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
.link-bar a {
|
| 40 |
+
display: inline-flex;
|
| 41 |
+
align-items: center;
|
| 42 |
+
gap: 6px;
|
| 43 |
+
padding: 7px 18px;
|
| 44 |
+
border-radius: 8px;
|
| 45 |
+
font-size: 14px;
|
| 46 |
+
font-weight: 600;
|
| 47 |
+
text-decoration: none;
|
| 48 |
+
transition: transform 0.12s, box-shadow 0.15s;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.link-bar a:hover {
|
| 52 |
+
transform: translateY(-1px);
|
| 53 |
+
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.12);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.link-bar a.link-github {
|
| 57 |
+
background: #24292f;
|
| 58 |
+
color: #fff;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.link-bar a.link-arxiv {
|
| 62 |
+
background: #b31b1b;
|
| 63 |
+
color: #fff;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
.link-bar a svg {
|
| 67 |
+
width: 16px;
|
| 68 |
+
height: 16px;
|
| 69 |
+
fill: currentColor;
|
| 70 |
+
flex-shrink: 0;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
/* ===== Tabs ===== */
|
| 74 |
.tab-buttons button {
|
| 75 |
font-size: 17px !important;
|