import ast
import os
from typing import Any, List, Optional

from .benchmark import Benchmark
from .measures import ems, f1_score, acc_score
from ..core.logging import logger
from ..utils.utils import download_file

NQ_FILES_MAP = {"train": "nq-train.qa.csv", "dev": "nq-dev.qa.csv", "test": "nq-test.qa.csv"}
VALID_RAW_NQ_FILES = [file for file in NQ_FILES_MAP.values() if file is not None]
# Reverse lookup: raw file name -> split type ("train" / "dev" / "test").
NQ_TYPE_BY_FILE = {file_name: typ for typ, file_name in NQ_FILES_MAP.items()}


def download_raw_nq_data(name: str, save_folder: str):
    assert name in VALID_RAW_NQ_FILES, \
        f"'{name}' is an invalid NQ file name. Available file names: {VALID_RAW_NQ_FILES}"
    typ = NQ_TYPE_BY_FILE[name]
    url = f"https://dl.fbaipublicfiles.com/dpr/data/retriever/{name}"
    logger.info(f"Downloading NQ {typ} data from: {url}")
    download_file(url=url, save_file=os.path.join(save_folder, name))


def load_tsv_data(file_path: str) -> List[dict]:
    base_name = os.path.basename(file_path)
    assert base_name in NQ_TYPE_BY_FILE, \
        f"'{base_name}' is an invalid NQ file name. Available file names: {VALID_RAW_NQ_FILES}"
    typ = NQ_TYPE_BY_FILE[base_name]
    data = []
    with open(file_path, encoding="utf-8") as f:
        # Despite the .csv extension, the DPR files are tab-separated: each line
        # holds a question and a Python-list literal of answers, e.g.
        #   who got the first nobel prize in physics\t['Wilhelm Conrad Röntgen']
        for i, line in enumerate(f):
            question, answers = line.strip().split("\t", 1)
            # literal_eval parses the answer-list literal safely, unlike eval().
            answers = ast.literal_eval(answers)
            data.append({"id": f"{typ}-{i+1}", "question": question, "answers": answers})
    return data


class NQ(Benchmark):
    """Benchmark class for evaluating question answering on the Natural Questions dataset.

    Natural Questions (NQ) is a dataset for open-domain question answering,
    containing real questions from Google Search and answers from Wikipedia.
    This class handles loading the dataset, evaluating answers, and computing
    the exact match, F1 score, and accuracy metrics.

    Each NQ example has the following structure:
    {
        "id": str,
        "question": str,
        "answers": List[str]
    }

    A minimal usage sketch appears under the ``__main__`` guard at the end of
    this module.
    """
    def __init__(self, path: Optional[str] = None, mode: str = "all", **kwargs):
        path = os.path.expanduser(path or "~/.evoagentx/data/nq")
        super().__init__(name=type(self).__name__, path=path, mode=mode, **kwargs)

    def _load_data_from_file(self, file_name: str):
        if file_name is None:
            return None
        file_path = os.path.join(self.path, file_name)
        # Download the raw split on first use.
        if not os.path.exists(file_path):
            download_raw_nq_data(name=file_name, save_folder=self.path)
        logger.info(f"Loading NQ data from {file_path} ...")
        return load_tsv_data(file_path=file_path)

    def _load_data(self):
        if self.mode in ("train", "all"):
            self._train_data = self._load_data_from_file(file_name=NQ_FILES_MAP["train"])
        if self.mode in ("dev", "all"):
            self._dev_data = self._load_data_from_file(file_name=NQ_FILES_MAP["dev"])
        if self.mode in ("test", "all"):
            self._test_data = self._load_data_from_file(file_name=NQ_FILES_MAP["test"])

    def _get_label(self, example: Any) -> Any:
        return example["answers"]

    def _get_id(self, example: Any) -> Any:
        return example["id"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        # `label` is the list of gold answers for a single example.
        em = ems(prediction=prediction, ground_truths=label)
        # Best token-level F1 across all gold answers.
        f1 = max(f1_score(prediction=prediction, ground_truth=one_answer) for one_answer in label)
        acc = acc_score(prediction=prediction, ground_truths=label)
        return {"f1": f1, "em": em, "acc": acc}