from utils.preprocess import tokenize


def process_sample(sample):
    """Build a QA training example with an answer span located in context.

    Tokenizes the question, context, and answer text; concatenates the
    question and context with a "[SEP]" marker; then finds the first
    occurrence of the answer tokens inside the context tokens.

    Args:
        sample: Mapping with "context", "question", and "answer_text" keys,
            each a string accepted by ``tokenize``.

    Returns:
        Dict with:
            "tokens": question tokens + ["[SEP]"] + context tokens,
            "start", "end": inclusive indices of the answer span within
            "tokens";
        or ``None`` when the answer tokens cannot be found in the context
        (including an empty answer).
    """
    context_tokens = tokenize(sample["context"])
    question_tokens = tokenize(sample["question"])
    answer_tokens = tokenize(sample["answer_text"])

    # Combined input sequence: question, separator, context.
    tokens = question_tokens + ["[SEP]"] + context_tokens

    # BUG FIX: in the original, an empty answer matched trivially at i=0
    # (empty slice == empty list), yielding an invalid span with end < start.
    n = len(answer_tokens)
    if n == 0:
        return None

    # Index of context_tokens[0] within `tokens` (question plus one "[SEP]").
    offset = len(question_tokens) + 1

    # First occurrence of the answer inside the context; slices that would
    # run past the end can never match, so stop the scan early.
    for i in range(len(context_tokens) - n + 1):
        if context_tokens[i:i + n] == answer_tokens:
            start = offset + i
            return {"tokens": tokens, "start": start, "end": start + n - 1}

    # Answer not present in the context.
    return None