DylanJHJ's picture
download
raw
2.41 kB
import csv
import json
import logging
import os
import ir_datasets
from collections import defaultdict, OrderedDict
from typing import Optional
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def load(
    dataset_name: str,
    query_fields: Optional[list] = None,
    doc_fields: Optional[list] = None,
    ignore_corpus: bool = False,
) -> tuple[
    Optional[dict[str, dict[str, str]]],
    dict[str, str],
    dict[str, dict[str, int]],
]:
    """Load queries, qrels and (optionally) the corpus of an ``ir_datasets`` dataset.

    Args:
        dataset_name: Identifier handed to ``ir_datasets.load``.
        query_fields: Attribute names read from every query record; their
            values are joined with a single space. Defaults to ``['text']``.
        doc_fields: Attribute names read from every document record; their
            values are joined with a single space. Defaults to ``['text']``.
        ignore_corpus: When true, the documents are not iterated and ``None``
            is returned in place of the corpus.

    Returns:
        A ``(corpus, queries, qrels)`` tuple where

        * ``corpus`` maps ``doc_id -> {"contents": joined_text}``, or is
          ``None`` when ``ignore_corpus`` is true;
        * ``queries`` maps ``query_id -> joined_text``;
        * ``qrels`` maps ``query_id -> {doc_id: relevance}``.
    """
    dataset = ir_datasets.load(dataset_name)

    query_fields = ['text'] if query_fields is None else query_fields
    doc_fields = ['text'] if doc_fields is None else doc_fields

    logger.info("Loading Queries...")
    queries = {}
    for query in dataset.queries_iter():
        queries[query.query_id] = " ".join(
            getattr(query, field) for field in query_fields
        )
    # Peek at the first value without copying the whole collection, and
    # skip the example entirely when nothing was loaded.
    if queries:
        logger.info("Query Example: %s", next(iter(queries.values())))

    logger.info("Loading Qrels...")
    qrels = {}
    for qrel in dataset.qrels_iter():
        qrels.setdefault(qrel.query_id, {})[qrel.doc_id] = qrel.relevance

    if ignore_corpus:
        return None, queries, qrels

    # TODO: revise this to fit all the document formats.
    # NOTE: do we want to mention the title here?
    logger.info("Loading Corpus...")
    corpus = {}
    for doc in dataset.docs_iter():
        corpus[doc.doc_id] = {
            "contents": " ".join(getattr(doc, field) for field in doc_fields)
        }
    # Same peek as above: the corpus can be large, so never materialise
    # all of its values just to log one example.
    if corpus:
        logger.info("Doc Example: %s", next(iter(corpus.values())))

    return corpus, queries, qrels
def load_run(path, topk=100):
    """Read a TREC-style run file into per-query ``{docid: score}`` mappings.

    Every non-blank line must hold six whitespace-separated columns, read as
    ``qid _ docid rank score _`` (the second and last columns are ignored).
    Blank lines are skipped.

    Args:
        path: Location of the run file.
        topk: Keep only entries whose rank is ``<= topk``. A falsy value
            (``None`` or ``0``) keeps every entry.

    Returns:
        An ``OrderedDict`` mapping each query id, in order of first
        appearance among the kept lines, to a dict of ``docid -> score``
        (float) whose insertion order follows ascending rank.

    Raises:
        ValueError: If a non-blank line does not have exactly six columns,
            or its rank/score cannot be parsed as a number.
    """
    logger.info("Loading Run File from %s", path)

    run_dict = defaultdict(list)
    with open(path, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                # Blank lines (e.g. a trailing newline at EOF) carry no entry.
                continue
            if len(columns) != 6:
                raise ValueError(
                    f"{path}:{lineno}: expected 6 whitespace-separated "
                    f"columns, got {len(columns)}"
                )
            qid, _, docid, rank, score, _ = columns
            rank = int(rank)
            # A falsy topk means "no limit"; no arbitrary cap is applied.
            if topk and rank > topk:
                continue
            run_dict[qid].append((docid, rank, float(score)))

    # Order each query's documents by ascending rank (stable for ties) and
    # return a plain mapping without defaultdict's auto-insert behaviour.
    sorted_run_dict = OrderedDict()
    for qid, entries in run_dict.items():
        entries.sort(key=lambda entry: entry[1])
        sorted_run_dict[qid] = {docid: score for docid, _, score in entries}
    return sorted_run_dict

Xet Storage Details

Size:
2.41 kB
·
Xet hash:
4b6c71590eb5ade05a58b0fd071465df05fb13e6cdb78db4652142c9db0886f7

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.