Spaces:
Runtime error
Runtime error
chore: clean up requests-related codes
Browse files
- src/leaderboard/read_evals.py +2 -24
- tests/src/leaderboard/test_read_evals.py +5 -15
- tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json +0 -6
- tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json +0 -50
src/leaderboard/read_evals.py
CHANGED
|
@@ -91,21 +91,6 @@ class FullEvalResult:
|
|
| 91 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
| 92 |
return [v for v in results.values()]
|
| 93 |
|
| 94 |
-
def update_with_request_file(self, request_path):
|
| 95 |
-
"""
|
| 96 |
-
Update the request file
|
| 97 |
-
"""
|
| 98 |
-
request_file = get_request_file_for_model(
|
| 99 |
-
request_path, self.retrieval_model, self.reranking_model
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
try:
|
| 103 |
-
with open(request_file, "r") as f:
|
| 104 |
-
request = json.load(f)
|
| 105 |
-
self.date = request.get("submitted_time", "")
|
| 106 |
-
except Exception:
|
| 107 |
-
print(f"Failed to find request file for {self.retrieval_model}, {self.reranking_model}: {request_path}")
|
| 108 |
-
|
| 109 |
|
| 110 |
def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
|
| 111 |
"""
|
|
@@ -130,7 +115,7 @@ def get_request_file_for_model(requests_path, retrieval_model_name, reranking_mo
|
|
| 130 |
return request_file
|
| 131 |
|
| 132 |
|
| 133 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
|
| 134 |
"""
|
| 135 |
Load the evaluation results from a json file
|
| 136 |
"""
|
|
@@ -151,14 +136,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
|
|
| 151 |
for model_result_filepath in model_result_filepaths:
|
| 152 |
# create evaluation results
|
| 153 |
eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
|
| 154 |
-
|
| 155 |
-
eval_result.update_with_request_file(requests_path)
|
| 156 |
-
latest_date_str = eval_result.date.replace(":", "-")
|
| 157 |
-
model_result_date_str = model_result_filepath.split('/')[-1
|
| 158 |
-
].removeprefix("results_").removesuffix(".json")
|
| 159 |
-
if latest_date_str != model_result_date_str:
|
| 160 |
-
print(f'file skipped: {model_result_filepath}')
|
| 161 |
-
continue
|
| 162 |
print(f'file loaded: {model_result_filepath}')
|
| 163 |
eval_name = eval_result.eval_name
|
| 164 |
eval_results[eval_name] = eval_result
|
|
|
|
| 91 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
| 92 |
return [v for v in results.values()]
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
|
| 96 |
"""
|
|
|
|
| 115 |
return request_file
|
| 116 |
|
| 117 |
|
| 118 |
+
def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
|
| 119 |
"""
|
| 120 |
Load the evaluation results from a json file
|
| 121 |
"""
|
|
|
|
| 136 |
for model_result_filepath in model_result_filepaths:
|
| 137 |
# create evaluation results
|
| 138 |
eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
|
| 139 |
+
model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_").removesuffix(".json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
print(f'file loaded: {model_result_filepath}')
|
| 141 |
eval_name = eval_result.eval_name
|
| 142 |
eval_results[eval_name] = eval_result
|
tests/src/leaderboard/test_read_evals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
-
from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_request_file_for_model
|
| 4 |
|
| 5 |
cur_fp = Path(__file__)
|
| 6 |
|
|
@@ -19,26 +19,16 @@ def test_to_dict():
|
|
| 19 |
result_dict = result_list[0]
|
| 20 |
assert result_dict["Retrieval Model"] == "bge-m3"
|
| 21 |
assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
|
| 22 |
-
assert result_dict["
|
| 23 |
-
assert result_dict["
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def test_get_request_file_for_model():
|
| 27 |
-
requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
|
| 28 |
-
request_file = get_request_file_for_model(requests_path, "bge-m3", "bge-reranker-v2-m3")
|
| 29 |
-
# only load the latest finished results
|
| 30 |
-
assert Path(request_file).name.removeprefix("eval_request_").removesuffix(".json") == "2023-11-21T18-10-08"
|
| 31 |
|
| 32 |
|
| 33 |
def test_get_raw_eval_results():
|
| 34 |
-
requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
|
| 35 |
results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
|
| 36 |
-
results = get_raw_eval_results(results_path, requests_path)
|
| 37 |
# only load the latest results
|
| 38 |
assert len(results) == 2
|
| 39 |
-
assert results[0].date == "2023-12-21T18:10:08"
|
| 40 |
assert results[0].eval_name == "bge-m3_NoReranker"
|
| 41 |
-
assert len(results[0].results) ==
|
| 42 |
assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
|
| 43 |
-
assert results[1].date == "2023-11-21T18:10:08"
|
| 44 |
assert len(results[1].results) == 6
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
+
from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results
|
| 4 |
|
| 5 |
cur_fp = Path(__file__)
|
| 6 |
|
|
|
|
| 19 |
result_dict = result_list[0]
|
| 20 |
assert result_dict["Retrieval Model"] == "bge-m3"
|
| 21 |
assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
|
| 22 |
+
assert result_dict["wiki_en"] is not None
|
| 23 |
+
assert result_dict["wiki_zh"] is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def test_get_raw_eval_results():
|
|
|
|
| 27 |
results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
|
| 28 |
+
results = get_raw_eval_results(results_path)
|
| 29 |
# only load the latest results
|
| 30 |
assert len(results) == 2
|
|
|
|
| 31 |
assert results[0].eval_name == "bge-m3_NoReranker"
|
| 32 |
+
assert len(results[0].results) == 6
|
| 33 |
assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
|
|
|
|
| 34 |
assert len(results[1].results) == 6
|
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"retrieval_model": "BAAI/bge-m3",
|
| 3 |
-
"reranking_model": "NoReranker",
|
| 4 |
-
"status": "FINISHED",
|
| 5 |
-
"submitted_time": "2023-11-21T18:10:08"
|
| 6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"retrieval_model": "BAAI/bge-m3",
|
| 3 |
-
"reranking_model": "NoReranker",
|
| 4 |
-
"status": "FINISHED",
|
| 5 |
-
"submitted_time": "2023-12-21T18:10:08"
|
| 6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"retrieval_model": "BAAI/bge-m3",
|
| 3 |
-
"reranking_model": "BAAI/bge-reranker-v2-m3",
|
| 4 |
-
"status": "FINISHED",
|
| 5 |
-
"submitted_time": "2023-11-21T18:10:08"
|
| 6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"retrieval_model": "BAAI/bge-m3",
|
| 3 |
-
"reranking_model": "BAAI/bge-reranker-v2-m3",
|
| 4 |
-
"status": "RUNNING",
|
| 5 |
-
"submitted_time": "2023-12-21T18:10:08"
|
| 6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"config": {
|
| 4 |
-
"retrieval_model": "bge-m3",
|
| 5 |
-
"reranking_model": "NoReranker",
|
| 6 |
-
"task": "long_doc",
|
| 7 |
-
"metric": "ndcg_at_1"
|
| 8 |
-
},
|
| 9 |
-
"results": [
|
| 10 |
-
{
|
| 11 |
-
"domain": "law",
|
| 12 |
-
"lang": "en",
|
| 13 |
-
"dataset": "lex_files_500K-600K",
|
| 14 |
-
"value": 0.45723
|
| 15 |
-
}
|
| 16 |
-
]
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"config": {
|
| 20 |
-
"retrieval_model": "bge-m3",
|
| 21 |
-
"reranking_model": "NoReranker",
|
| 22 |
-
"task": "qa",
|
| 23 |
-
"metric": "ndcg_at_1"
|
| 24 |
-
},
|
| 25 |
-
"results": [
|
| 26 |
-
{
|
| 27 |
-
"domain": "wiki",
|
| 28 |
-
"lang": "en",
|
| 29 |
-
"dataset": "unknown",
|
| 30 |
-
"value": 0.39083
|
| 31 |
-
}
|
| 32 |
-
]
|
| 33 |
-
},
|
| 34 |
-
{
|
| 35 |
-
"config": {
|
| 36 |
-
"retrieval_model": "bge-m3",
|
| 37 |
-
"reranking_model": "NoReranker",
|
| 38 |
-
"task": "qa",
|
| 39 |
-
"metric": "ndcg_at_1"
|
| 40 |
-
},
|
| 41 |
-
"results": [
|
| 42 |
-
{
|
| 43 |
-
"domain": "wiki",
|
| 44 |
-
"lang": "zh",
|
| 45 |
-
"dataset": "unknown",
|
| 46 |
-
"value": 0.78358
|
| 47 |
-
}
|
| 48 |
-
]
|
| 49 |
-
}
|
| 50 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|