|
|
import collections |
|
|
import numpy as np |
|
|
import string |
|
|
|
|
|
import logging |
|
|
import json |
|
|
import os |
|
|
import sys |
|
|
import evaluate |
|
|
|
|
|
from dataclasses import dataclass, field |
|
|
from typing import Optional |
|
|
|
|
|
from transformers import ( |
|
|
AutoModelForQuestionAnswering, |
|
|
AutoTokenizer, |
|
|
EvalPrediction, |
|
|
TrainingArguments, |
|
|
DefaultDataCollator, |
|
|
) |
|
|
|
|
|
from utils_qa import load_dataset |
|
|
from utils_qa import postprocess_qa_predictions |
|
|
|
|
|
from trainer_qa import QuestionAnsweringTrainer |
|
|
|
|
|
# Path to the local training file consumed by utils_qa.load_dataset.
dataset_path = 'data/train.json'

# Hub checkpoint shared by the tokenizer and the span-extraction model.
model_checkpoint = 'xlm-roberta-base'
|
|
|
|
|
if __name__ == '__main__':

    # Load the raw QA dataset from disk (project helper from utils_qa);
    # presumably a DatasetDict with 'train'/'valid' splits -- see the
    # trainer construction at the bottom of the file.
    raw_dataset = load_dataset(dataset_path)

    # Tokenizer and question-answering model come from the same checkpoint
    # so vocabulary and model embeddings stay consistent.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
|
|
|
|
|
|
|
|
def preprocess_function(examples): |
|
|
''' |
|
|
help to create a tokenized dataset which should finally be used to train the my question answering model |
|
|
''' |
|
|
examples['question'] = [q.lstrip() for q in examples['question']] |
|
|
tokenized_examples = tokenizer( |
|
|
examples['question'], |
|
|
examples['context'], |
|
|
truncation = "only_second", |
|
|
max_length = tokenizer.model_max_length, |
|
|
return_offsets_mapping = True, |
|
|
padding = "max_length", |
|
|
) |
|
|
|
|
|
|
|
|
offset_mapping = tokenized_examples.pop("offset_mapping") |
|
|
|
|
|
assert(len(offset_mapping) == len(tokenized_examples['input_ids'])) |
|
|
|
|
|
|
|
|
tokenized_examples["start_positions"] = [] |
|
|
tokenized_examples["end_positions"] = [] |
|
|
|
|
|
for i, offset in enumerate(offset_mapping): |
|
|
input_ids = tokenized_examples["input_ids"][i] |
|
|
cls_index = input_ids.index(tokenizer.cls_token_id) |
|
|
|
|
|
|
|
|
sequence_ids = tokenized_examples.sequence_ids(i) |
|
|
answers = examples['answers'][i] |
|
|
|
|
|
|
|
|
|
|
|
tokenized_examples["start_positions"].append(cls_index) |
|
|
tokenized_examples["end_positions"].append(cls_index) |
|
|
|
|
|
if len(answers) == 0: |
|
|
continue |
|
|
|
|
|
|
|
|
context_start = sequence_ids.index(1) |
|
|
context_end = sequence_ids[context_start:].index(None) + context_start - 1 |
|
|
|
|
|
start_char = answers[0]["answer_start"] |
|
|
end_char = start_char + len(answers[0]["text"]) |
|
|
|
|
|
|
|
|
if offset[context_start][0] > end_char or offset[context_end][1] < start_char: |
|
|
continue |
|
|
|
|
|
|
|
|
token_start_index = context_start |
|
|
token_end_index = context_end |
|
|
|
|
|
while token_start_index < len(offset) and offset[token_start_index][0] <= start_char: token_start_index += 1 |
|
|
while token_end_index >= 0 and offset[token_end_index][1] >= end_char: token_end_index -= 1 |
|
|
|
|
|
tokenized_examples["start_positions"][-1] = token_start_index - 1 |
|
|
tokenized_examples["end_positions"][-1] = token_end_index + 1 |
|
|
|
|
|
return tokenized_examples |
|
|
|
|
|
|
|
|
tokenized_dataset = raw_dataset.map(preprocess_function, batched = True, remove_columns = ['title', 'context', 'question']) |
|
|
|
|
|
|
|
|
def post_processing_function(features, tokenizer, predictions, stage = "eval"): |
|
|
|
|
|
predictions = postprocess_qa_predictions( |
|
|
features = features, |
|
|
tokenizer = tokenizer, |
|
|
predictions = predictions |
|
|
) |
|
|
formatted_predictions = [ |
|
|
{"id": k, |
|
|
"prediction_text": v, |
|
|
"no_answer_probability": 0.0 |
|
|
} for k, v in predictions.items() |
|
|
] |
|
|
references = [{"id": ft["id"], "answers": ft["answers"]} for ft in features] |
|
|
|
|
|
return EvalPrediction(predictions = formatted_predictions, label_ids = references) |
|
|
|
|
|
metric = evaluate.load("squad_v2") |
|
|
|
|
|
def compute_metrics(p: EvalPrediction): |
|
|
return metric.compute(predictions = p.predictions, |
|
|
references = p.label_ids) |
|
|
|
|
|
    # No dynamic padding needed: preprocessing already pads to max_length,
    # so the default (pass-through) collator suffices.
    data_collator = DefaultDataCollator()

    training_args = TrainingArguments(
        output_dir = "./results",
        # NOTE(review): 'evaluation_strategy' was renamed to 'eval_strategy'
        # in newer transformers releases -- confirm the pinned version
        # still accepts this keyword.
        evaluation_strategy = 'steps',
        learning_rate = 2e-5,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        save_total_limit = 1,   # keep only the most recent checkpoint
        save_steps = 1000,
        eval_steps = 1000,      # evaluate on the same cadence as saving
        num_train_epochs = 10,
        weight_decay = 0.01,
    )
|
|
    # QuestionAnsweringTrainer (project subclass from trainer_qa) threads
    # post_process_function between raw model output and compute_metrics.
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset = tokenized_dataset["train"],
        eval_dataset = tokenized_dataset["valid"],
        tokenizer = tokenizer,
        data_collator = data_collator,
        post_process_function=post_processing_function,
        compute_metrics = compute_metrics,
    )

    trainer.train()
|
|
|