File size: 680 Bytes
09daf0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from utils.preprocess import tokenize

def process_sample(sample):
    """Build a QA model input from one sample and locate the answer span.

    Tokenizes the question, context, and answer text, concatenates
    ``question + ["[SEP]"] + context`` into one token sequence, and finds
    the first occurrence of the answer tokens inside the context.

    Args:
        sample: Mapping with string fields ``"context"``, ``"question"``,
            and ``"answer_text"``. (Assumed — schema inferred from the
            keys read here; confirm against the caller.)

    Returns:
        A dict with ``"tokens"`` (the combined token list) and
        ``"start"``/``"end"`` (inclusive indices of the answer span within
        ``tokens``), or ``None`` when the answer cannot be located.
    """
    context_tokens = tokenize(sample["context"])
    question_tokens = tokenize(sample["question"])
    answer_tokens = tokenize(sample["answer_text"])

    # Guard: an empty answer would trivially "match" at position 0
    # (empty slice == empty list), yielding a bogus span with end < start.
    if not answer_tokens:
        return None

    # 🔥 Combine question + context
    tokens = question_tokens + ["[SEP]"] + context_tokens

    # Context token i lives at index (len(question_tokens) + 1 + i) in
    # the combined sequence — the +1 accounts for the [SEP] token.
    offset = len(question_tokens) + 1

    start = -1
    # Only positions where a full answer-length window fits can match.
    for i in range(len(context_tokens) - len(answer_tokens) + 1):
        if context_tokens[i:i + len(answer_tokens)] == answer_tokens:
            start = i + offset
            break

    if start == -1:
        # Answer tokens never appear contiguously in the context.
        return None

    end = start + len(answer_tokens) - 1  # inclusive end index

    return {
        "tokens": tokens,
        "start": start,
        "end": end
    }