SQuAD / utils /squad_preprocess.py
tnp554's picture
feat: deploy SQuAD backend with all AI models
09daf0b
from utils.preprocess import tokenize
def process_sample(sample, *, tokenizer=None):
    """Convert one QA sample into a token sequence plus answer span indices.

    The question and context are tokenized and joined as
    ``question + ["[SEP]"] + context``. The answer's tokens are then searched
    for inside the context tokens, and the first match is reported as
    positions within the combined sequence.

    Args:
        sample: Mapping with the keys ``"context"``, ``"question"`` and
            ``"answer_text"``; each value is handed to the tokenizer.
        tokenizer: Optional callable turning one of those values into a list
            of tokens. Defaults to the module-level ``tokenize``.

    Returns:
        A dict with ``"tokens"`` (the combined list) and ``"start"`` /
        ``"end"`` (inclusive indices of the answer inside ``"tokens"``), or
        ``None`` when the answer has no tokens or does not occur in the
        context as a contiguous run of tokens.
    """
    if tokenizer is None:
        # Resolved at call time so a caller-supplied tokenizer never needs
        # the module-level default to exist.
        tokenizer = tokenize

    context_tokens = tokenizer(sample["context"])
    question_tokens = tokenizer(sample["question"])
    answer_tokens = tokenizer(sample["answer_text"])

    answer_len = len(answer_tokens)
    if answer_len == 0:
        # An empty answer would "match" the empty slice at position 0 and
        # produce a span whose end precedes its start.
        return None

    # Combined model input: question, separator, then context.
    tokens = question_tokens + ["[SEP]"] + context_tokens

    # Context positions shift right by the question length plus one slot
    # for the "[SEP]" separator.
    offset = len(question_tokens) + 1

    # Only windows that can hold the whole answer are worth comparing.
    last_window = len(context_tokens) - answer_len
    for i in range(last_window + 1):
        if context_tokens[i:i + answer_len] == answer_tokens:
            start = i + offset
            break
    else:
        # Answer tokens never appear contiguously in the context.
        return None

    return {
        "tokens": tokens,
        "start": start,
        "end": start + answer_len - 1,
    }