Commit ·
ca6af8e
1
Parent(s): c96266d
Add GitHub and arXiv links to leaderboard page
Browse files
- Add styled button bar (GitHub + arXiv) under the title banner
- Link to paper in introduction text
- Direct users to GitHub repo for evaluation data and code
- Add arXiv reference in About tab reproducibility section
- app.py +12 -0
- src/about.py +6 -2
- src/display/css_html_js.py +43 -0
app.py
CHANGED
|
@@ -167,6 +167,18 @@ demo = gr.Blocks(css=custom_css)
|
|
| 167 |
with demo:
|
| 168 |
gr.HTML(TITLE)
|
| 169 |
gr.HTML('<p id="space-subtitle">The First Comprehensive Benchmark for LLMs in Molecular Dynamics</p>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
gr.HTML(build_metric_cards())
|
| 171 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 172 |
|
|
|
|
| 167 |
with demo:
|
| 168 |
gr.HTML(TITLE)
|
| 169 |
gr.HTML('<p id="space-subtitle">The First Comprehensive Benchmark for LLMs in Molecular Dynamics</p>')
|
| 170 |
+
gr.HTML("""
|
| 171 |
+
<div class="link-bar">
|
| 172 |
+
<a class="link-github" href="https://github.com/FredericVAN/PKU_MDAgent2" target="_blank">
|
| 173 |
+
<svg viewBox="0 0 16 16"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
|
| 174 |
+
GitHub
|
| 175 |
+
</a>
|
| 176 |
+
<a class="link-arxiv" href="https://arxiv.org/abs/2601.02075" target="_blank">
|
| 177 |
+
<svg viewBox="0 0 16 16"><path d="M2 1h12a1 1 0 011 1v12a1 1 0 01-1 1H2a1 1 0 01-1-1V2a1 1 0 011-1zm1.5 2v10h2V9.5L7.5 12h1.2L6.5 9l2-2.5H7.3L5.5 9V3h-2zm5 0v10h2V3h-2zm3.5 0v2h1V3h-1z"/></svg>
|
| 178 |
+
arXiv Paper
|
| 179 |
+
</a>
|
| 180 |
+
</div>
|
| 181 |
+
""")
|
| 182 |
gr.HTML(build_metric_cards())
|
| 183 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 184 |
|
src/about.py
CHANGED
|
@@ -29,7 +29,7 @@ TITLE = """<h1 align="center" id="space-title">🧪 MD-EvalBench Leaderboard</h1
|
|
| 29 |
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
**MD-EvalBench** is the first comprehensive benchmark for evaluating Large Language Models in the Molecular Dynamics (MD) domain,
|
| 32 |
-
proposed in the paper *"MDAgent2: Large Language Model for Code Generation and Knowledge Q&A in Molecular Dynamics"*.
|
| 33 |
|
| 34 |
The benchmark consists of three evaluation datasets:
|
| 35 |
- **MD-KnowledgeEval** (336 questions): Theoretical knowledge assessment covering interatomic potentials, integration algorithms, equilibrium conditions, and statistical ensembles.
|
|
@@ -37,6 +37,8 @@ The benchmark consists of three evaluation datasets:
|
|
| 37 |
- **LAMMPS-CodeGenEval** (566 tasks): Automatic code generation quality assessment for executable LAMMPS scripts.
|
| 38 |
|
| 39 |
Models are evaluated on both **Question Answering** (knowledge + syntax) and **Code Generation** (execution success + human scoring) capabilities.
|
|
|
|
|
|
|
| 40 |
"""
|
| 41 |
|
| 42 |
LLM_BENCHMARKS_TEXT = """
|
|
@@ -78,7 +80,9 @@ All experiments are repeated three times and the average results are reported.
|
|
| 78 |
|
| 79 |
## Reproducibility
|
| 80 |
|
| 81 |
-
Models are evaluated using the MD-EvalBench benchmark suite.
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
EVALUATION_QUEUE_TEXT = """
|
|
|
|
| 29 |
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
**MD-EvalBench** is the first comprehensive benchmark for evaluating Large Language Models in the Molecular Dynamics (MD) domain,
|
| 32 |
+
proposed in the paper [*"MDAgent2: Large Language Model for Code Generation and Knowledge Q&A in Molecular Dynamics"*](https://arxiv.org/abs/2601.02075).
|
| 33 |
|
| 34 |
The benchmark consists of three evaluation datasets:
|
| 35 |
- **MD-KnowledgeEval** (336 questions): Theoretical knowledge assessment covering interatomic potentials, integration algorithms, equilibrium conditions, and statistical ensembles.
|
|
|
|
| 37 |
- **LAMMPS-CodeGenEval** (566 tasks): Automatic code generation quality assessment for executable LAMMPS scripts.
|
| 38 |
|
| 39 |
Models are evaluated on both **Question Answering** (knowledge + syntax) and **Code Generation** (execution success + human scoring) capabilities.
|
| 40 |
+
|
| 41 |
+
To access the evaluation datasets, code, and submission guidelines, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
|
| 42 |
"""
|
| 43 |
|
| 44 |
LLM_BENCHMARKS_TEXT = """
|
|
|
|
| 80 |
|
| 81 |
## Reproducibility
|
| 82 |
|
| 83 |
+
Models are evaluated using the MD-EvalBench benchmark suite.
|
| 84 |
+
For evaluation data, code, and detailed methodology, please visit our [GitHub repository](https://github.com/FredericVAN/PKU_MDAgent2).
|
| 85 |
+
For the full paper, see [arXiv:2601.02075](https://arxiv.org/abs/2601.02075).
|
| 86 |
"""
|
| 87 |
|
| 88 |
EVALUATION_QUEUE_TEXT = """
|
src/display/css_html_js.py
CHANGED
|
@@ -27,6 +27,49 @@ custom_css = """
|
|
| 27 |
font-weight: 400;
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
/* ===== Tabs ===== */
|
| 31 |
.tab-buttons button {
|
| 32 |
font-size: 17px !important;
|
|
|
|
| 27 |
font-weight: 400;
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/* ===== Link Buttons Bar ===== */
|
| 31 |
+
.link-bar {
|
| 32 |
+
display: flex;
|
| 33 |
+
justify-content: center;
|
| 34 |
+
gap: 12px;
|
| 35 |
+
margin: 4px 0 18px 0;
|
| 36 |
+
flex-wrap: wrap;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
.link-bar a {
|
| 40 |
+
display: inline-flex;
|
| 41 |
+
align-items: center;
|
| 42 |
+
gap: 6px;
|
| 43 |
+
padding: 7px 18px;
|
| 44 |
+
border-radius: 8px;
|
| 45 |
+
font-size: 14px;
|
| 46 |
+
font-weight: 600;
|
| 47 |
+
text-decoration: none;
|
| 48 |
+
transition: transform 0.12s, box-shadow 0.15s;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.link-bar a:hover {
|
| 52 |
+
transform: translateY(-1px);
|
| 53 |
+
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.12);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.link-bar a.link-github {
|
| 57 |
+
background: #24292f;
|
| 58 |
+
color: #fff;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.link-bar a.link-arxiv {
|
| 62 |
+
background: #b31b1b;
|
| 63 |
+
color: #fff;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
.link-bar a svg {
|
| 67 |
+
width: 16px;
|
| 68 |
+
height: 16px;
|
| 69 |
+
fill: currentColor;
|
| 70 |
+
flex-shrink: 0;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
/* ===== Tabs ===== */
|
| 74 |
.tab-buttons button {
|
| 75 |
font-size: 17px !important;
|