File size: 5,674 Bytes
a00ac7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import collections
import numpy as np
import string
import logging
import json
import os
import sys
import evaluate
from dataclasses import dataclass, field
from typing import Optional
from transformers import (
AutoModelForQuestionAnswering,
AutoTokenizer,
EvalPrediction,
TrainingArguments,
DefaultDataCollator,
)
from utils_qa import load_dataset
from utils_qa import postprocess_qa_predictions
from trainer_qa import QuestionAnsweringTrainer
# Path to the raw SQuAD-style training data and the pretrained
# checkpoint to fine-tune.
dataset_path = 'data/train.json'
model_checkpoint = 'xlm-roberta-base'
if __name__ == '__main__':
    # Load the raw dataset which contains context, question and answers
    # (load_dataset is the project-local helper from utils_qa, not
    # datasets.load_dataset).
    raw_dataset = load_dataset(dataset_path)
    # Load the pretrained tokenizer and model from huggingface.co
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
# Validation preprocessing
def preprocess_function(examples):
'''
help to create a tokenized dataset which should finally be used to train the my question answering model
'''
examples['question'] = [q.lstrip() for q in examples['question']]
tokenized_examples = tokenizer(
examples['question'],
examples['context'],
truncation = "only_second",
max_length = tokenizer.model_max_length,
return_offsets_mapping = True,
padding = "max_length",
)
# The offset mappings will give us a map from token to character position in the original context. This will
# help us compute the start_positions and end_positions.
offset_mapping = tokenized_examples.pop("offset_mapping")
assert(len(offset_mapping) == len(tokenized_examples['input_ids']))
# Let's label those examples!
tokenized_examples["start_positions"] = []
tokenized_examples["end_positions"] = []
for i, offset in enumerate(offset_mapping):
input_ids = tokenized_examples["input_ids"][i]
cls_index = input_ids.index(tokenizer.cls_token_id)
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
answers = examples['answers'][i]
# If no answers are given, set the cls_index as answer.
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
if len(answers) == 0:
continue
# Find the start and end of the context
context_start = sequence_ids.index(1)
context_end = sequence_ids[context_start:].index(None) + context_start - 1
start_char = answers[0]["answer_start"]
end_char = start_char + len(answers[0]["text"])
# If the answer is not fully inside the context, label it (0, 0)
if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
continue
# Otherwise it's the start and end token positions
token_start_index = context_start
token_end_index = context_end
while token_start_index < len(offset) and offset[token_start_index][0] <= start_char: token_start_index += 1
while token_end_index >= 0 and offset[token_end_index][1] >= end_char: token_end_index -= 1
tokenized_examples["start_positions"][-1] = token_start_index - 1
tokenized_examples["end_positions"][-1] = token_end_index + 1
return tokenized_examples
# Create train features from raw dataset
tokenized_dataset = raw_dataset.map(preprocess_function, batched = True, remove_columns = ['title', 'context', 'question'])
# Post-processing:
def post_processing_function(features, tokenizer, predictions, stage = "eval"):
# Post-processing: we match the start logits and end logits to answers in the original context.
predictions = postprocess_qa_predictions(
features = features,
tokenizer = tokenizer,
predictions = predictions
)
formatted_predictions = [
{"id": k,
"prediction_text": v,
"no_answer_probability": 0.0
} for k, v in predictions.items()
]
references = [{"id": ft["id"], "answers": ft["answers"]} for ft in features]
return EvalPrediction(predictions = formatted_predictions, label_ids = references)
    # SQuAD v2 metric: exact match / F1 plus no-answer statistics.
    metric = evaluate.load("squad_v2")
    def compute_metrics(p: EvalPrediction):
        # p.predictions / p.label_ids are the formatted outputs of
        # post_processing_function above.
        return metric.compute(predictions = p.predictions,
                              references = p.label_ids)
data_collator = DefaultDataCollator()
training_args = TrainingArguments(
output_dir = "./results",
evaluation_strategy = 'steps',
learning_rate = 2e-5,
per_device_train_batch_size = 16,
per_device_eval_batch_size = 16,
save_total_limit = 1,
save_steps = 1000,
eval_steps = 1000,
num_train_epochs = 10,
weight_decay = 0.01,
)
trainer = QuestionAnsweringTrainer(
model=model,
args=training_args,
train_dataset = tokenized_dataset["train"],
eval_dataset = tokenized_dataset["valid"],
tokenizer = tokenizer,
data_collator = data_collator,
post_process_function=post_processing_function,
compute_metrics = compute_metrics,
)
trainer.train()
|