iLOVE2D's picture
Upload 2846 files
5374a2d verified
import ast
import os
from typing import Any, List

from .benchmark import Benchmark
from .measures import ems, f1_score, acc_score
from ..core.logging import logger
from ..utils.utils import download_file
# Dataset split name -> raw NQ file name used for that split.
NQ_FILES_MAP = {
    "train": "nq-train.qa.csv",
    "dev": "nq-dev.qa.csv",
    "test": "nq-test.qa.csv",
}
# Every file name present in NQ_FILES_MAP; splits mapped to None are left out.
VALID_RAW_NQ_FILES = [file_name for file_name in NQ_FILES_MAP.values() if file_name is not None]
def download_raw_nq_data(name: str, save_folder: str):
    """Download one raw NQ file into ``save_folder``.

    Args:
        name: File name to fetch; must be one of ``VALID_RAW_NQ_FILES``.
        save_folder: Directory the file is saved under (joined with ``name``
            to form the destination handed to ``download_file``).

    Raises:
        AssertionError: If ``name`` is not listed in ``VALID_RAW_NQ_FILES``.
    """
    assert name in VALID_RAW_NQ_FILES, f"'{name}' is an invalid nq file name. Available file names: {VALID_RAW_NQ_FILES}"

    # Invert split -> file name so the log line can mention which split this is.
    split_by_file = dict(zip(NQ_FILES_MAP.values(), NQ_FILES_MAP.keys()))
    split_name = split_by_file[name]

    source_url = f"https://dl.fbaipublicfiles.com/dpr/data/retriever/{name}"
    logger.info(f"Downloading NQ {split_name} data from: {source_url}")
    download_file(url=source_url, save_file=os.path.join(save_folder, name))
def load_tsv_data(file_path: str) -> List[dict]:
    """Parse a raw NQ question/answer file into a list of example dicts.

    Each non-blank line must contain exactly two tab-separated fields: the
    question text and a Python literal (expected to be a list of answer
    strings). The split name used in the generated ids is derived from the
    file's base name via ``NQ_FILES_MAP``.

    Args:
        file_path: Path to the file; its base name must be one of the file
            names in ``NQ_FILES_MAP``.

    Returns:
        A list of ``{"id", "question", "answers"}`` dicts, where ``id`` is
        ``"<split>-<1-based line number>"``.

    Raises:
        AssertionError: If the base name is not a known NQ file name.
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or its answers field is not a valid literal.
        SyntaxError: If the answers field cannot be parsed at all.
    """
    base_name = os.path.basename(file_path)
    file_type_map = {file_name: typ for typ, file_name in NQ_FILES_MAP.items()}
    assert base_name in file_type_map, f"'{base_name}' is an invalid nq file name. Available file names: {VALID_RAW_NQ_FILES}"
    typ = file_type_map[base_name]

    data = []
    with open(file_path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no example.
                # The id below still uses the physical line index, so ids of
                # the remaining rows are unaffected by skipping.
                continue
            question, answers = line.split("\t")
            # SECURITY: this text comes from a downloaded file. The previous
            # eval() would execute arbitrary expressions embedded in it;
            # literal_eval only accepts Python literals (lists, strings, ...).
            answers = ast.literal_eval(answers)
            data.append({"id": f"{typ}-{i+1}", "question": question, "answers": answers})
    return data
class NQ(Benchmark):
    """Question-answering benchmark built on the Natural Questions (NQ) dataset.

    NQ is an open-domain question answering dataset made of real questions
    from Google Search with answers drawn from Wikipedia. This class loads
    the requested splits (downloading missing files on demand) and scores
    predictions against the reference answers.

    Every example is a dict of the form:
        {
            "id": str,
            "question": str,
            "answers": List[str]
        }

    Predictions are scored with exact match, F1 and accuracy.
    """

    def __init__(self, path: str = None, mode: str = "all", **kwargs):
        """Set up the benchmark.

        Args:
            path: Directory holding the NQ files. A falsy value selects
                ``~/.evoagentx/data/nq``; ``~`` is expanded either way.
            mode: Which split(s) to load: "train", "dev", "test" or "all".
            **kwargs: Forwarded unchanged to the ``Benchmark`` constructor.
        """
        default_dir = "~/.evoagentx/data/nq"
        resolved_path = os.path.expanduser(path if path else default_dir)
        super().__init__(name=type(self).__name__, path=resolved_path, mode=mode, **kwargs)

    def _load_data_from_file(self, file_name: str):
        """Load one NQ file from ``self.path``, downloading it first if absent.

        Returns ``None`` when ``file_name`` is ``None``; otherwise the list of
        examples produced by ``load_tsv_data``.
        """
        if file_name is None:
            return None

        target = os.path.join(self.path, file_name)
        already_present = os.path.exists(target)
        if not already_present:
            download_raw_nq_data(name=file_name, save_folder=self.path)

        logger.info(f"loading NQ data from {target} ...")
        return load_tsv_data(file_path=target)

    def _load_data(self):
        """Populate the split attributes selected by ``self.mode``.

        A split is loaded when ``self.mode`` equals its name or is "all".
        """
        split_attributes = (
            ("train", "_train_data"),
            ("dev", "_dev_data"),
            ("test", "_test_data"),
        )
        for split, attribute in split_attributes:
            if self.mode in (split, "all"):
                loaded = self._load_data_from_file(file_name=NQ_FILES_MAP[split])
                setattr(self, attribute, loaded)

    def _get_label(self, example: Any) -> Any:
        """Return the reference answers stored under ``example["answers"]``."""
        return example["answers"]

    def _get_id(self, example: Any) -> Any:
        """Return the identifier stored under ``example["id"]``."""
        return example["id"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """Score a single prediction against its reference answers.

        Args:
            prediction: The predicted answer.
            label: Iterable of reference answers.

        Returns:
            A dict with keys "f1" (best ``f1_score`` over the individual
            references), "em" (``ems`` over all references) and "acc"
            (``acc_score`` over all references).
        """
        exact_match = ems(prediction=prediction, ground_truths=label)
        per_reference_f1 = (
            f1_score(prediction=prediction, ground_truth=reference) for reference in label
        )
        best_f1 = max(per_reference_f1)
        accuracy = acc_score(prediction=prediction, ground_truths=label)
        return {"f1": best_f1, "em": exact_match, "acc": accuracy}