Spaces:
Running
Running
wenhuchen
committed on
Commit
·
236a68e
1
Parent(s):
4abf394
update leaderboard
Browse files
utils.py
CHANGED
|
@@ -14,9 +14,10 @@ MODEL_INFO = [
|
|
| 14 |
"TheoremQA",
|
| 15 |
"MATH",
|
| 16 |
"GSM",
|
|
|
|
| 17 |
]
|
| 18 |
|
| 19 |
-
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number']
|
| 20 |
|
| 21 |
SUBMISSION_NAME = "science_leaderboard_submission"
|
| 22 |
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
|
|
@@ -64,13 +65,11 @@ GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
|
|
| 64 |
"""
|
| 65 |
|
| 66 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 67 |
-
CITATION_BUTTON_TEXT = r"""@
|
| 68 |
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
| 69 |
author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
number={4},
|
| 73 |
-
pages={0--6}
|
| 74 |
}
|
| 75 |
@article{cobbe2021training,
|
| 76 |
title={Training verifiers to solve math word problems},
|
|
@@ -111,7 +110,7 @@ def get_df():
|
|
| 111 |
repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
|
| 112 |
repo.git_pull()
|
| 113 |
df = pd.read_csv(CSV_DIR)
|
| 114 |
-
df['Avg'] = df[['TheoremQA', 'MATH', 'GSM']].mean(axis=1).round(1)
|
| 115 |
df = df.sort_values(by=['Avg'], ascending=False)
|
| 116 |
return df[COLUMN_NAMES]
|
| 117 |
|
|
@@ -122,7 +121,7 @@ def add_new_eval(
|
|
| 122 |
return "Error! Empty file!"
|
| 123 |
|
| 124 |
upload_data=json.loads(input_file)
|
| 125 |
-
data_row = [upload_data['ModelName'], upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM']]
|
| 126 |
|
| 127 |
submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
|
| 128 |
submission_repo.git_pull()
|
|
|
|
| 14 |
"TheoremQA",
|
| 15 |
"MATH",
|
| 16 |
"GSM",
|
| 17 |
+
"GPQA",
|
| 18 |
]
|
| 19 |
|
| 20 |
+
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number']
|
| 21 |
|
| 22 |
SUBMISSION_NAME = "science_leaderboard_submission"
|
| 23 |
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
|
|
|
|
| 65 |
"""
|
| 66 |
|
| 67 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 68 |
+
CITATION_BUTTON_TEXT = r"""@inproceedings{hendrycks2021measuring,
|
| 69 |
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
| 70 |
author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
|
| 71 |
+
booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
|
| 72 |
+
year={2021}
|
|
|
|
|
|
|
| 73 |
}
|
| 74 |
@article{cobbe2021training,
|
| 75 |
title={Training verifiers to solve math word problems},
|
|
|
|
| 110 |
repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
|
| 111 |
repo.git_pull()
|
| 112 |
df = pd.read_csv(CSV_DIR)
|
| 113 |
+
df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA']].mean(axis=1).round(1)
|
| 114 |
df = df.sort_values(by=['Avg'], ascending=False)
|
| 115 |
return df[COLUMN_NAMES]
|
| 116 |
|
|
|
|
| 121 |
return "Error! Empty file!"
|
| 122 |
|
| 123 |
upload_data=json.loads(input_file)
|
| 124 |
+
data_row = [upload_data['ModelName'], upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'], upload_data['GPQA']]
|
| 125 |
|
| 126 |
submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
|
| 127 |
submission_repo.git_pull()
|