File size: 4,328 Bytes
bb04c5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# evaluation/dataset_loader.py

import json
import csv
import os


class DatasetLoader:
    """
    Load BEIR-format datasets (SciFact, NFCorpus, etc.).

    BEIR format:
        corpus.jsonl  — {_id, title, text}
        queries.jsonl — {_id, text}
        qrels/*.tsv   — query_id, doc_id, relevance_score

    Relevance scales:
        SciFact  — binary (0 or 1)
        NFCorpus — graded (0, 1, 2, 3) → we keep anything >= 1
    """

    def __init__(self, dataset_path: str):
        """
        Resolve the corpus, queries and qrels paths under *dataset_path*.

        Args:
            dataset_path: directory holding corpus.jsonl, queries.jsonl and
                a qrels/ sub-directory.

        Raises:
            FileNotFoundError: if neither qrels/test.tsv nor qrels/dev.tsv
                exists.
        """
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")

        # qrels path: try test.tsv first, fall back to dev.tsv
        # (NFCorpus ships with dev.tsv instead of test.tsv)
        qrels_dir = os.path.join(dataset_path, "qrels")
        test_path = os.path.join(qrels_dir, "test.tsv")
        dev_path = os.path.join(qrels_dir, "dev.tsv")

        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {qrels_dir} — "
                f"expected test.tsv or dev.tsv"
            )

    @staticmethod
    def _iter_jsonl(path: str):
        """Yield one parsed JSON object per non-blank line of *path*."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # blank lines (e.g. a trailing newline at EOF) are not records
                if line:
                    yield json.loads(line)

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Blank lines are ignored; a missing "title" or "text" field defaults
        to the empty string. Document ids are normalised to str.

        Returns:
            dict — {doc_id: {"title": str, "text": str}}
        """
        corpus = {
            str(doc["_id"]): {
                "title": doc.get("title", ""),
                "text": doc.get("text", ""),
            }
            for doc in self._iter_jsonl(self.corpus_path)
        }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load queries from queries.jsonl.

        Blank lines are ignored. Query ids are normalised to str.

        Returns:
            dict — {query_id: query_text}
        """
        queries = {
            str(q["_id"]): q["text"]
            for q in self._iter_jsonl(self.queries_path)
        }
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from the qrels file.

        Handles both:
            binary relevance (0 or 1)
            graded relevance (0, 1, 2, 3) → keep score >= 1

        Rows with fewer than three columns are skipped. The first
        three-column row is treated as a header (query-id, corpus-id, score)
        only when its score column is not an integer, so a headerless file
        does not lose its first judgment and an empty file yields {}.

        Returns:
            dict — {query_id: {doc_id: relevance_score}}

        Raises:
            ValueError: if a row other than the header has a non-integer
                score.
        """
        qrels = {}

        # newline="" lets the csv module do its own line-ending handling
        with open(self.qrels_path, "r", encoding="utf-8", newline="") as f:
            header_candidate = True

            for row in csv.reader(f, delimiter="\t"):
                if len(row) < 3:
                    continue

                is_first, header_candidate = header_candidate, False

                try:
                    score = int(row[2])
                except ValueError:
                    if is_first:
                        continue  # header row: query-id  corpus-id  score
                    raise

                # skip completely irrelevant docs
                # this handles both binary (0/1) and graded (0/1/2/3)
                if score < 1:
                    continue

                qrels.setdefault(str(row[0]), {})[str(row[1])] = score

        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels


if __name__ == "__main__":
    # Smoke-test entry point: load a dataset and print one sample query.
    import sys

    # pass dataset path as argument or default to scifact
    # usage: python -m evaluation.dataset_loader data/nfcorpus
    path = sys.argv[1] if len(sys.argv) > 1 else "data/scifact"
    loader = DatasetLoader(path)

    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # show a sample; next(iter(...)) avoids building a list of every id and
    # lets an empty queries file be reported instead of raising IndexError
    sample_qid = next(iter(queries), None)
    if sample_qid is None:
        print("\nNo queries loaded; nothing to sample")
    else:
        print(f"\nSample query  [{sample_qid}]: {queries[sample_qid]}")
        print(f"Relevant docs : {qrels.get(sample_qid, {})}")