DylanJHJ's picture
download
raw
2.02 kB
import csv
import json
import logging
import os
from collections import defaultdict, OrderedDict
from typing import Optional
from crux.tools.neuclir.ir_utils import load_topic, get_qrel
from datasets import load_dataset
logger = logging.getLogger(__name__)
def load(
    dataset_name: str,
    query_fields: Optional[list] = None,
    doc_fields: Optional[list] = None,
    ignore_corpus: bool = False,
) -> tuple[Optional[dict[str, dict[str, str]]], dict[str, str], dict[str, dict[str, int]]]:
    """Load the topics, the qrels and (optionally) the document corpus.

    Args:
        dataset_name: Accepted for interface compatibility; not read by this
            function.
        query_fields: Accepted for interface compatibility; not read by this
            function.
        doc_fields: Accepted for interface compatibility; not read by this
            function.
        ignore_corpus: When true, the document file is not read and ``None``
            is returned in the corpus slot.

    Returns:
        A ``(corpus, queries, qrels)`` tuple. ``corpus`` maps each document
        ``id`` to ``{"contents": "<title> <text>"}``, or is ``None`` when
        ``ignore_corpus`` is set. ``queries`` and ``qrels`` are passed through
        exactly as returned by ``load_topic()`` and ``get_qrel()``.
    """
    queries = load_topic()
    # next(iter(...), None) peeks at a single entry without copying every
    # value into a list, and does not raise when the mapping is empty.
    logger.info("Query Example: %s", next(iter(queries.values()), None))

    qrels = get_qrel()
    logger.info("Qrel Example: %s", next(iter(qrels.values()), None))

    if ignore_corpus:
        return None, queries, qrels

    # TODO: generalize this so it handles every document format.
    #   CoveR's top1000: /home/dju/trec2026/data/neuclir/neuclir24-relevant-docs.jsonl
    #   All:             /home/dju/scratch/neuclir1/*.processed.jsonl.gz
    # NOTE(review): the path loaded below ends in ".jsonl.gz", while the
    # top1000 path noted above ends in ".jsonl" -- confirm which is intended.
    ds = load_dataset(
        'json',
        data_files='/home/dju/trec2026/data/neuclir/neuclir24-relevant-docs.jsonl.gz',
        num_proc=3,
        split='train',
    )

    # Title and text are joined with a single space into one "contents" field.
    corpus = {
        example["id"]: {"contents": example["title"] + " " + example["text"]}
        for example in ds
    }
    logger.info("Doc Example: %s", next(iter(corpus.values()), None))
    return corpus, queries, qrels
# [deprecated] will use the function above instead
def load_run(path, topk=100):
    """Read a six-column, whitespace-separated run file into a nested mapping.

    Each non-blank line is split into ``qid, _, docid, rank, score, _``; the
    second and sixth columns are ignored.

    Args:
        path: Path of the run file to read.
        topk: Keep only entries whose rank is ``<= topk``. A falsy value
            (``None`` or ``0``) falls back to a cap of 9999.

    Returns:
        An ``OrderedDict`` keyed by query id, in order of first appearance in
        the file. Each value is a ``{docid: score}`` dict whose entries are
        ordered by ascending rank.

    Raises:
        ValueError: If a non-blank line does not have exactly six fields, or
            its rank/score columns are not numeric.
    """
    max_rank = topk or 9999
    run_dict = defaultdict(list)

    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline):
                # nothing to parse, and unpacking it would raise.
                continue
            qid, _, docid, rank, score, _ = fields
            if int(rank) <= max_rank:
                run_dict[str(qid)].append((docid, float(rank), float(score)))

    # Order each query's documents by ascending rank and freeze the result
    # into a plain (non-default) ordered dictionary.
    sorted_run_dict = OrderedDict()
    for qid, docid_ranks in run_dict.items():
        by_rank = sorted(docid_ranks, key=lambda entry: entry[1])
        sorted_run_dict[qid] = {docid: score for docid, _, score in by_rank}
    return sorted_run_dict

Xet Storage Details

Size:
2.02 kB
·
Xet hash:
7f6f4b78fe066d6c51a90acb47914e544002dae5d48fedf0d80e307653dce94a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.