Update README.md
Browse files
README.md
CHANGED
|
@@ -47,6 +47,8 @@ license: other
|
|
| 47 |
publisher = {Hugging Face},
|
| 48 |
howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
|
| 49 |
}
|
|
|
|
|
|
|
| 50 |
@software{eval-harness,
|
| 51 |
author = {Gao, Leo and
|
| 52 |
Tow, Jonathan and
|
|
@@ -73,6 +75,8 @@ license: other
|
|
| 73 |
doi = {10.5281/zenodo.5371628},
|
| 74 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
| 75 |
}
|
|
|
|
|
|
|
| 76 |
@misc{clark2018think,
|
| 77 |
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
| 78 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
|
@@ -81,6 +85,8 @@ license: other
|
|
| 81 |
archivePrefix={arXiv},
|
| 82 |
primaryClass={cs.AI}
|
| 83 |
}
|
|
|
|
|
|
|
| 84 |
@misc{zellers2019hellaswag,
|
| 85 |
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
| 86 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
|
@@ -89,6 +95,8 @@ license: other
|
|
| 89 |
archivePrefix={arXiv},
|
| 90 |
primaryClass={cs.CL}
|
| 91 |
}
|
|
|
|
|
|
|
| 92 |
@misc{hendrycks2021measuring,
|
| 93 |
title={Measuring Massive Multitask Language Understanding},
|
| 94 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
|
@@ -97,7 +105,8 @@ license: other
|
|
| 97 |
archivePrefix={arXiv},
|
| 98 |
primaryClass={cs.CY}
|
| 99 |
}
|
| 100 |
-
|
|
|
|
| 101 |
@misc{lin2022truthfulqa,
|
| 102 |
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
| 103 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
|
|
|
| 47 |
publisher = {Hugging Face},
|
| 48 |
howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
|
| 49 |
}
|
| 50 |
+
```
|
| 51 |
+
```
|
| 52 |
@software{eval-harness,
|
| 53 |
author = {Gao, Leo and
|
| 54 |
Tow, Jonathan and
|
|
|
|
| 75 |
doi = {10.5281/zenodo.5371628},
|
| 76 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
| 77 |
}
|
| 78 |
+
```
|
| 79 |
+
```
|
| 80 |
@misc{clark2018think,
|
| 81 |
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
| 82 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
|
|
|
| 85 |
archivePrefix={arXiv},
|
| 86 |
primaryClass={cs.AI}
|
| 87 |
}
|
| 88 |
+
```
|
| 89 |
+
```
|
| 90 |
@misc{zellers2019hellaswag,
|
| 91 |
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
| 92 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
|
|
|
| 95 |
archivePrefix={arXiv},
|
| 96 |
primaryClass={cs.CL}
|
| 97 |
}
|
| 98 |
+
```
|
| 99 |
+
```
|
| 100 |
@misc{hendrycks2021measuring,
|
| 101 |
title={Measuring Massive Multitask Language Understanding},
|
| 102 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
|
|
|
| 105 |
archivePrefix={arXiv},
|
| 106 |
primaryClass={cs.CY}
|
| 107 |
}
|
| 108 |
+
```
|
| 109 |
+
```
|
| 110 |
@misc{lin2022truthfulqa,
|
| 111 |
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
| 112 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|