from utils.preprocess import tokenize


def process_sample(sample):
    """Tokenize a QA sample and locate the answer span in the combined sequence.

    Returns None when the answer tokens cannot be found verbatim in the context.
    """
    context_tokens = tokenize(sample["context"])
    question_tokens = tokenize(sample["question"])
    answer_tokens = tokenize(sample["answer_text"])

    # Combine question + context, separated by a [SEP] token.
    tokens = question_tokens + ["[SEP]"] + context_tokens

    # Find the first occurrence of the answer tokens in the context, then
    # shift the index past the question and the [SEP] token so that start/end
    # point into the combined sequence.
    offset = len(question_tokens) + 1
    start = -1
    for i in range(len(context_tokens) - len(answer_tokens) + 1):
        if context_tokens[i:i + len(answer_tokens)] == answer_tokens:
            start = i + offset
            break
    if start == -1:
        return None  # answer does not appear verbatim in the context

    end = start + len(answer_tokens) - 1  # inclusive end index
    return {
        "tokens": tokens,
        "start": start,
        "end": end,
    }
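A minimal usage sketch, assuming tokenize splits on whitespace and the sample dict uses SQuAD-style field names (the example data and schema here are illustrative assumptions, not from the source):

# Hypothetical SQuAD-style sample (field names as consumed by process_sample).
sample = {
    "context": "The Eiffel Tower was completed in 1889 .",
    "question": "When was the Eiffel Tower completed ?",
    "answer_text": "1889",
}

features = process_sample(sample)
if features is not None:
    s, e = features["start"], features["end"]
    # start/end index into the combined question + [SEP] + context sequence,
    # so slicing with the inclusive end recovers the answer tokens.
    print(features["tokens"][s:e + 1])  # ["1889"] with a whitespace tokenizer

Note that because end is inclusive, the slice upper bound must be e + 1; a model head trained on these labels needs to follow the same convention.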