Spaces:
Sleeping
Sleeping
| # evaluation/dataset_loader.py | |
| import json | |
| import csv | |
| import os | |
class DatasetLoader:
    """
    Load BEIR-format retrieval datasets (SciFact, NFCorpus, etc.).

    Expected layout under ``dataset_path``:
        corpus.jsonl  -> one JSON object per line: {_id, title, text}
        queries.jsonl -> one JSON object per line: {_id, text}
        qrels/*.tsv   -> tab-separated: query_id, doc_id, relevance_score

    Relevance scales:
        SciFact  -> binary (0 or 1)
        NFCorpus -> graded (0, 1, 2, 3); anything >= 1 is kept
    """

    def __init__(self, dataset_path: str):
        """
        Resolve the corpus, queries and qrels paths for a dataset directory.

        Args:
            dataset_path: directory containing corpus.jsonl, queries.jsonl
                and a qrels/ sub-directory.

        Raises:
            FileNotFoundError: if neither qrels/test.tsv nor qrels/dev.tsv
                exists. The corpus and queries files are not checked here;
                they are opened lazily by the load_* methods.
        """
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")

        # qrels path: try test.tsv first, fall back to dev.tsv
        # (NFCorpus ships with dev.tsv instead of test.tsv).
        qrels_dir = os.path.join(dataset_path, "qrels")
        test_path = os.path.join(qrels_dir, "test.tsv")
        dev_path = os.path.join(qrels_dir, "dev.tsv")

        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {qrels_dir} — "
                f"expected test.tsv or dev.tsv"
            )

    @staticmethod
    def _iter_jsonl(path: str):
        """Yield one parsed JSON object per non-blank line of a JSONL file."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at EOF) are not
                # records; json.loads("") would raise on them.
                if not line:
                    continue
                yield json.loads(line)

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Document ids are coerced to str; a missing "title" or "text"
        field defaults to the empty string.

        Returns:
            dict -> {doc_id: {"title": str, "text": str}}
        """
        corpus = {}
        for doc in self._iter_jsonl(self.corpus_path):
            corpus[str(doc["_id"])] = {
                "title": doc.get("title", ""),
                "text": doc.get("text", ""),
            }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load queries from queries.jsonl.

        Query ids are coerced to str.

        Returns:
            dict -> {query_id: query_text}
        """
        queries = {
            str(q["_id"]): q["text"]
            for q in self._iter_jsonl(self.queries_path)
        }
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from the resolved qrels file.

        The first row is always treated as the header
        (query-id, corpus-id, score) and skipped. Rows with fewer than
        three columns are ignored, and judgments with score < 1 are
        dropped, which covers both binary (0/1) and graded (0/1/2/3)
        relevance scales.

        Returns:
            dict -> {query_id: {doc_id: relevance_score}}

        Raises:
            ValueError: if a score column is not an integer.
        """
        qrels = {}
        # newline="" lets the csv module do its own line-ending handling,
        # as the csv documentation requires.
        with open(self.qrels_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t")
            # Skip the header; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            for row in reader:
                if len(row) < 3:
                    continue
                query_id, doc_id = row[0], row[1]
                score = int(row[2])
                # Skip completely irrelevant docs.
                if score < 1:
                    continue
                qrels.setdefault(query_id, {})[doc_id] = score
        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels
if __name__ == "__main__":
    import sys

    # Pass the dataset path as an argument, or default to SciFact.
    # usage: python -m evaluation.dataset_loader data/nfcorpus
    path = sys.argv[1] if len(sys.argv) > 1 else "data/scifact"

    loader = DatasetLoader(path)
    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # Show a sample: the first query in file order and its relevant docs.
    # next(iter(...), None) avoids building a list of all ids and does not
    # raise when the queries file is empty.
    sample_qid = next(iter(queries), None)
    if sample_qid is None:
        print("\nNo queries loaded; nothing to sample")
    else:
        print(f"\nSample query [{sample_qid}]: {queries[sample_qid]}")
        print(f"Relevant docs : {qrels.get(sample_qid, {})}")