"""Module to compute TREC evaluation scores.""" |
|
|
|
|
|
import datasets |
|
|
import pandas as pd |
|
|
from trectools import TrecEval, TrecQrel, TrecRun |
|
|
|
|
|
import evaluate |
|
|
|
|
|
|


_CITATION = """\
@inproceedings{palotti2019,
 author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
 title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
 series = {SIGIR'19},
 year = {2019},
 location = {Paris, France},
 publisher = {ACM}
}
"""

_DESCRIPTION = """\
The TREC Eval metric combines a number of information retrieval metrics such as \
precision and nDCG. It is used to score rankings of retrieved documents with reference values."""

_KWARGS_DESCRIPTION = """
Calculates TREC evaluation scores based on a run and qrel.
Args:
    predictions: list containing a single run.
    references: list containing a single qrel.
Returns:
    dict: TREC evaluation scores.
Examples:
    >>> trec = evaluate.load("trec_eval")
    >>> qrel = {
    ...     "query": [0],
    ...     "q0": ["0"],
    ...     "docid": ["doc_1"],
    ...     "rel": [2]
    ... }
    >>> run = {
    ...     "query": [0, 0],
    ...     "q0": ["q0", "q0"],
    ...     "docid": ["doc_2", "doc_1"],
    ...     "rank": [0, 1],
    ...     "score": [1.5, 1.2],
    ...     "system": ["test", "test"]
    ... }
    >>> results = trec.compute(references=[qrel], predictions=[run])
    >>> print(results["P@5"])
    0.2
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TRECEval(evaluate.Metric):
    """Compute TREC evaluation scores."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
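            # The feature schema mirrors the columns of a TREC run file
            # (query, q0, docid, rank, score, system) and of a TREC qrels file
            # (query, q0, docid, rel).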
            features=datasets.Features(
                {
                    "predictions": {
                        "query": datasets.Sequence(datasets.Value("int64")),
                        "q0": datasets.Sequence(datasets.Value("string")),
                        "docid": datasets.Sequence(datasets.Value("string")),
                        "rank": datasets.Sequence(datasets.Value("int64")),
                        "score": datasets.Sequence(datasets.Value("float")),
                        "system": datasets.Sequence(datasets.Value("string")),
                    },
                    "references": {
                        "query": datasets.Sequence(datasets.Value("int64")),
                        "q0": datasets.Sequence(datasets.Value("string")),
                        "docid": datasets.Sequence(datasets.Value("string")),
                        "rel": datasets.Sequence(datasets.Value("int64")),
                    },
                }
            ),
            homepage="https://github.com/joaopalotti/trectools",
        )

    def _compute(self, references, predictions):
        """Returns the TREC evaluation scores."""

        if len(predictions) > 1 or len(references) > 1:
            raise ValueError(
                f"You can only pass one prediction and reference per evaluation. You passed {len(predictions)} prediction(s) and {len(references)} reference(s)."
            )

        df_run = pd.DataFrame(predictions[0])
        df_qrel = pd.DataFrame(references[0])
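
        # Wrap the in-memory DataFrames in trectools objects; the filename
        # attributes are only placeholders, since nothing is read from disk.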
        trec_run = TrecRun()
        trec_run.filename = "placeholder.file"
        trec_run.run_data = df_run

        trec_qrel = TrecQrel()
        trec_qrel.filename = "placeholder.file"
        trec_qrel.qrels_data = df_qrel

        trec_eval = TrecEval(trec_run, trec_qrel)
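
        # Aggregate (non per-query) measures, following the naming of the
        # classic trec_eval summary output.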
        result = {}
        result["runid"] = trec_eval.run.get_runid()
        result["num_ret"] = trec_eval.get_retrieved_documents(per_query=False)
        result["num_rel"] = trec_eval.get_relevant_documents(per_query=False)
        result["num_rel_ret"] = trec_eval.get_relevant_retrieved_documents(per_query=False)
        result["num_q"] = len(trec_eval.run.topics())
        result["map"] = trec_eval.get_map(depth=10000, per_query=False, trec_eval=True)
        result["gm_map"] = trec_eval.get_geometric_map(depth=10000, trec_eval=True)
        result["bpref"] = trec_eval.get_bpref(depth=1000, per_query=False, trec_eval=True)
        result["Rprec"] = trec_eval.get_rprec(depth=1000, per_query=False, trec_eval=True)
        result["recip_rank"] = trec_eval.get_reciprocal_rank(depth=1000, per_query=False, trec_eval=True)
        for v in [5, 10, 15, 20, 30, 100, 200, 500, 1000]:
            result[f"P@{v}"] = trec_eval.get_precision(depth=v, per_query=False, trec_eval=True)
        for v in [5, 10, 15, 20, 30, 100, 200, 500, 1000]:
            result[f"NDCG@{v}"] = trec_eval.get_ndcg(depth=v, per_query=False, trec_eval=True)

        return result