refactor: move the data model
- src/read_evals.py +3 -123
- src/utils.py +2 -1
- tests/src/test_read_evals.py +2 -1
src/read_evals.py CHANGED

@@ -1,18 +1,13 @@
-import json
 import os.path
-from collections import defaultdict
-from dataclasses import dataclass
 from typing import List
 
 import pandas as pd
 
-from src.benchmarks import
+from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
 from src.display.utils import COLS_QA, COLS_LONG_DOC
-from src.display.column_names import COL_NAME_AVG,
-    COL_NAME_RETRIEVAL_MODEL_LINK, COL_NAME_RERANKING_MODEL_LINK, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
-    COL_NAME_IS_ANONYMOUS
+from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
 
-from src.
+from src.models import FullEvalResult
 
 pd.options.mode.copy_on_write = True
 
@@ -24,121 +19,6 @@ def calculate_mean(row):
     return row.mean()
 
 
-@dataclass
-class EvalResult:
-    """
-    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
-    domains, languages, and datasets
-    """
-    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
-    retrieval_model: str
-    reranking_model: str
-    results: list  # results on all the benchmarks stored as dict
-    task: str
-    metric: str
-    timestamp: str = ""  # submission timestamp
-    revision: str = ""
-    is_anonymous: bool = False
-
-
-@dataclass
-class FullEvalResult:
-    """
-    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
-    """
-    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
-    retrieval_model: str
-    reranking_model: str
-    retrieval_model_link: str
-    reranking_model_link: str
-    results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
-    timestamp: str = ""
-    revision: str = ""
-    is_anonymous: bool = False
-
-    @classmethod
-    def init_from_json_file(cls, json_filepath):
-        """
-        Initiate from the result json file for a single model.
-        The json file will be written only when the status is FINISHED.
-        """
-        with open(json_filepath) as fp:
-            model_data = json.load(fp)
-
-        # store all the results for different metrics and tasks
-        result_list = []
-        retrieval_model_link = ""
-        reranking_model_link = ""
-        revision = ""
-        for item in model_data:
-            config = item.get("config", {})
-            # eval results for different metrics
-            results = item.get("results", [])
-            retrieval_model_link = config["retrieval_model_link"]
-            if config["reranking_model_link"] is None:
-                reranking_model_link = ""
-            else:
-                reranking_model_link = config["reranking_model_link"]
-            eval_result = EvalResult(
-                eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
-                retrieval_model=config["retrieval_model"],
-                reranking_model=config["reranking_model"],
-                results=results,
-                task=config["task"],
-                metric=config["metric"],
-                timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
-                revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
-                is_anonymous=config.get("is_anonymous", False)
-            )
-            result_list.append(eval_result)
-        return cls(
-            eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
-            retrieval_model=result_list[0].retrieval_model,
-            reranking_model=result_list[0].reranking_model,
-            retrieval_model_link=retrieval_model_link,
-            reranking_model_link=reranking_model_link,
-            results=result_list,
-            timestamp=result_list[0].timestamp,
-            revision=result_list[0].revision,
-            is_anonymous=result_list[0].is_anonymous
-        )
-
-    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
-        """
-        Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
-        """
-        results = defaultdict(dict)
-        for eval_result in self.results:
-            if eval_result.metric != metric:
-                continue
-            if eval_result.task != task:
-                continue
-            results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
-            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
-                make_clickable_model(self.retrieval_model, self.retrieval_model_link))
-            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
-                make_clickable_model(self.reranking_model, self.reranking_model_link))
-            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
-            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
-            results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
-            results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
-            results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous
-
-            # print(f'result loaded: {eval_result.eval_name}')
-            for result in eval_result.results:
-                # add result for each domain, language, and dataset
-                domain = result["domain"]
-                lang = result["lang"]
-                dataset = result["dataset"]
-                value = result["value"] * 100
-                if dataset == 'default':
-                    benchmark_name = f"{domain}_{lang}"
-                else:
-                    benchmark_name = f"{domain}_{lang}_{dataset}"
-                results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
-        return [v for v in results.values()]
-
-
 def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
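
The imports above point at a new src/models.py, which is not itself shown on this page (only changed files are diffed). A minimal, abridged sketch of what it presumably contains, assuming the two dataclasses moved over verbatim — docstrings are shortened and method bodies elided here; the full code is in the removed block above:

# Hypothetical sketch of src/models.py, assuming a verbatim move.
import json                          # used by the elided method bodies
from collections import defaultdict  # used by the elided method bodies
from dataclasses import dataclass
from typing import List


@dataclass
class EvalResult:
    """Result of one retrieval/reranking model pair for one task and metric."""
    eval_name: str  # [retrieval_model]_[reranking_model]_[metric]
    retrieval_model: str
    reranking_model: str
    results: list   # per-benchmark results stored as dicts
    task: str
    metric: str
    timestamp: str = ""  # submission timestamp
    revision: str = ""
    is_anonymous: bool = False


@dataclass
class FullEvalResult:
    """All EvalResults of one retrieval/reranking model pair across tasks."""
    eval_name: str  # [retrieval_model]_[reranking_model]
    retrieval_model: str
    reranking_model: str
    retrieval_model_link: str
    reranking_model_link: str
    results: List[EvalResult]
    timestamp: str = ""
    revision: str = ""
    is_anonymous: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        ...  # body moved unchanged; see the removed block above

    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
        ...  # body moved unchanged; see the removed block above
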
src/utils.py CHANGED

@@ -12,7 +12,8 @@ from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC,
 from src.display.column_names import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
     COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
-from src.read_evals import
+from src.read_evals import get_leaderboard_df, calculate_mean
+from src.models import FullEvalResult
 
 import re
 
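
src/utils.py now pulls calculate_mean from src.read_evals alongside get_leaderboard_df. For context, a hypothetical illustration of how that helper is typically applied — the dataframe and benchmark column names here are made up:

# Hypothetical usage of calculate_mean; data is illustrative.
import pandas as pd

def calculate_mean(row):  # as defined in src/read_evals.py
    return row.mean()

df = pd.DataFrame({"wiki_en": [85.0, 80.0], "news_zh": [75.0, 70.0]})
df["Average"] = df.apply(calculate_mean, axis=1)  # row-wise mean over benchmark columns
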
tests/src/test_read_evals.py CHANGED

@@ -1,6 +1,7 @@
 from pathlib import Path
 
-from src.read_evals import
+from src.read_evals import get_raw_eval_results, get_leaderboard_df
+from src.models import FullEvalResult
 
 cur_fp = Path(__file__)
 
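
Taken together, loading stays in src.read_evals while the data model lives in src.models. A hypothetical usage sketch of the refactored layout — the results path below is illustrative:

# Hypothetical usage after the refactor; "search_results/" is illustrative.
from typing import List

from src.models import FullEvalResult
from src.read_evals import get_raw_eval_results

results: List[FullEvalResult] = get_raw_eval_results("search_results/")
for res in results:
    # one row-dict per eval, compatible with the leaderboard dataframe UI
    rows = res.to_dict(task='qa', metric='ndcg_at_3')
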