# Semantic_File / evaluation / dataset_loader.py
# Uploaded by JackSparrow89 ("Upload 65 files", commit bb04c5f, verified)
# evaluation/dataset_loader.py
import json
import csv
import os
class DatasetLoader:
    """
    Loads BEIR-format datasets (SciFact, NFCorpus, etc.)

    BEIR format:
        corpus.jsonl  — {_id, title, text}
        queries.jsonl — {_id, text}
        qrels/*.tsv   — query_id, doc_id, relevance_score

    Relevance scales:
        SciFact  — binary (0 or 1)
        NFCorpus — graded (0, 1, 2, 3) → we keep anything >= 1
    """

    def __init__(self, dataset_path: str):
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")
        # qrels path — try test.tsv first, fallback to dev.tsv
        # NFCorpus ships with dev.tsv instead of test.tsv
        test_path = os.path.join(dataset_path, "qrels", "test.tsv")
        dev_path = os.path.join(dataset_path, "qrels", "dev.tsv")
        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            # plain string: no placeholders, so no f-prefix needed (F541)
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {os.path.join(dataset_path, 'qrels')} — "
                f"expected test.tsv or dev.tsv"
            )

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Returns:
            dict — {doc_id: {"title": str, "text": str}}
        """
        corpus = {}
        with open(self.corpus_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # tolerate blank lines (e.g. trailing newline) —
                # json.loads("") would raise otherwise
                if not line:
                    continue
                doc = json.loads(line)
                doc_id = str(doc["_id"])
                corpus[doc_id] = {
                    "title": doc.get("title", ""),
                    "text": doc.get("text", ""),
                }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load test queries from queries.jsonl.

        Returns:
            dict — {query_id: query_text}
        """
        queries = {}
        with open(self.queries_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # tolerate blank lines, same as load_corpus
                if not line:
                    continue
                q = json.loads(line)
                queries[str(q["_id"])] = q["text"]
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from qrels file.

        Handles both:
            SciFact  — binary relevance (0 or 1)
            NFCorpus — graded relevance (0, 1, 2, 3) → keep score >= 1

        Returns:
            dict — {query_id: {doc_id: relevance_score}}
        """
        qrels = {}
        with open(self.qrels_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t")
            # skip header row (query-id corpus-id score);
            # default=None avoids StopIteration on an empty file
            next(reader, None)
            for row in reader:
                if len(row) < 3:
                    continue
                query_id = str(row[0])
                doc_id = str(row[1])
                score = int(row[2])
                # skip completely irrelevant docs
                # this handles both binary (0/1) and graded (0/1/2/3)
                if score < 1:
                    continue
                qrels.setdefault(query_id, {})[doc_id] = score
        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels
if __name__ == "__main__":
    import sys

    # Dataset directory comes from argv when given; default mirrors the
    # SciFact layout.
    # usage: python -m evaluation.dataset_loader data/nfcorpus
    dataset_dir = sys.argv[1] if len(sys.argv) > 1 else "data/scifact"

    loader = DatasetLoader(dataset_dir)
    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # Peek at the first query and the docs judged relevant for it.
    sample_qid = list(queries)[0]
    print(f"\nSample query [{sample_qid}]: {queries[sample_qid]}")
    print(f"Relevant docs : {qrels.get(sample_qid, {})}")