# evaluation/dataset_loader.py
import json
import csv
import os
class DatasetLoader:
    """
    Loads BEIR-format datasets (SciFact, NFCorpus, etc.)

    BEIR format:
        corpus.jsonl  -> {_id, title, text}
        queries.jsonl -> {_id, text}
        qrels/*.tsv   -> query_id, doc_id, relevance_score

    Relevance scales:
        SciFact  -> binary (0 or 1)
        NFCorpus -> graded (0, 1, 2, 3) -- we keep anything >= 1
    """

    def __init__(self, dataset_path: str):
        """
        Resolve the corpus, queries, and qrels file paths.

        Args:
            dataset_path: root directory of a BEIR-format dataset.

        Raises:
            FileNotFoundError: if neither qrels/test.tsv nor qrels/dev.tsv exists.
        """
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")
        # qrels path -- try test.tsv first, fallback to dev.tsv
        # NFCorpus ships with dev.tsv instead of test.tsv
        test_path = os.path.join(dataset_path, "qrels", "test.tsv")
        dev_path = os.path.join(dataset_path, "qrels", "dev.tsv")
        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {os.path.join(dataset_path, 'qrels')} -- "
                f"expected test.tsv or dev.tsv"
            )

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Returns:
            dict -> {doc_id: {"title": str, "text": str}}
        """
        corpus = {}
        with open(self.corpus_path, "r", encoding="utf-8") as f:
            for line in f:
                doc = json.loads(line.strip())
                doc_id = str(doc["_id"])
                # title/text may be absent in some corpora; default to ""
                corpus[doc_id] = {
                    "title": doc.get("title", ""),
                    "text": doc.get("text", ""),
                }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load test queries from queries.jsonl.

        Returns:
            dict -> {query_id: query_text}
        """
        queries = {}
        with open(self.queries_path, "r", encoding="utf-8") as f:
            for line in f:
                q = json.loads(line.strip())
                queries[str(q["_id"])] = q["text"]
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from qrels file.

        Handles both:
            SciFact  -> binary relevance (0 or 1)
            NFCorpus -> graded relevance (0, 1, 2, 3) -- keep score >= 1

        Returns:
            dict -> {query_id: {doc_id: relevance_score}}
        """
        qrels = {}
        with open(self.qrels_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t")
            # skip header: query-id corpus-id score
            # (default of None avoids StopIteration on an empty file)
            next(reader, None)
            for row in reader:
                if len(row) < 3:
                    continue
                query_id = str(row[0])
                doc_id = str(row[1])
                score = int(row[2])
                # skip completely irrelevant docs
                # this handles both binary (0/1) and graded (0/1/2/3)
                if score < 1:
                    continue
                qrels.setdefault(query_id, {})[doc_id] = score
        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels
if __name__ == "__main__":
    import sys

    # Usage: python -m evaluation.dataset_loader [dataset_path]
    # Falls back to the SciFact dataset when no path is supplied.
    dataset_dir = "data/scifact" if len(sys.argv) < 2 else sys.argv[1]
    loader = DatasetLoader(dataset_dir)

    docs = loader.load_corpus()
    query_map = loader.load_queries()
    judgments = loader.load_qrels()

    # Show one sample query together with its relevant documents.
    first_qid = next(iter(query_map))
    print(f"\nSample query [{first_qid}]: {query_map[first_qid]}")
    print(f"Relevant docs : {judgments.get(first_qid, {})}")