File size: 5,674 Bytes
a00ac7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import collections
import numpy as np
import string

import logging
import json
import os
import sys
import evaluate

from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    EvalPrediction,
    TrainingArguments,
    DefaultDataCollator,
)

from utils_qa import load_dataset
from utils_qa import postprocess_qa_predictions

from trainer_qa import QuestionAnsweringTrainer

# Path to the local training data (JSON; schema consumed by utils_qa.load_dataset)
# and the Hugging Face checkpoint to fine-tune.
dataset_path = 'data/train.json'
model_checkpoint = 'xlm-roberta-base'

# Script entry point: fine-tune an extractive question-answering model.
if __name__ == '__main__':
    # Load the raw dataset which contains context, question and answers.
    # NOTE(review): `load_dataset` here is the project-local helper imported from
    # utils_qa, not the `datasets` library function of the same name — presumably
    # it returns a dataset with 'train' and 'valid' splits (see trainer below).
    raw_dataset = load_dataset(dataset_path)

    # Load the pretrained tokenizer and model from huggingface.co
    # (downloads on first run, then served from the local cache).
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model     = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    # Validation preprocessing
    def preprocess_function(examples):
        '''
        Tokenize batched (question, context) pairs and attach token-level answer
        labels, producing the features used to train the question answering model.

        Parameters
        ----------
        examples : dict of lists with keys 'question', 'context' and 'answers',
            where each entry of 'answers' is a list of dicts carrying
            'answer_start' (character offset) and 'text'.

        Returns
        -------
        The tokenizer's BatchEncoding extended with 'start_positions' and
        'end_positions' (token indices; both point at the CLS token when the
        example has no answer or the answer is not fully inside the context).
        '''
        # Leading whitespace on the question would only waste context budget.
        examples['question'] = [q.lstrip() for q in examples['question']]
        tokenized_examples = tokenizer(
            examples['question'],
            examples['context'],
            truncation = "only_second",          # never truncate the question
            max_length = tokenizer.model_max_length,
            return_offsets_mapping = True,
            padding = "max_length",
        )
        # The offset mappings give us a map from token to character position in the
        # original context; used below to compute start_positions/end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offset in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Sequence ids distinguish question (0) from context (1) tokens;
            # special/padding tokens map to None.
            sequence_ids = tokenized_examples.sequence_ids(i)
            answers = examples['answers'][i]

            # Default label: both positions on the CLS token ("no answer").
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)

            if len(answers) == 0:
                continue

            # Find the token span of the context within the sequence.
            context_start = sequence_ids.index(1)
            try:
                context_end = sequence_ids.index(None, context_start) - 1
            except ValueError:
                # No trailing special/padding token: context runs to the end.
                context_end = len(sequence_ids) - 1

            start_char = answers[0]["answer_start"]
            end_char   = start_char + len(answers[0]["text"])

            # BUG FIX: the original test compared the context start against
            # end_char and the context end against start_char, which only rejects
            # answers with NO overlap. A partially truncated answer slipped
            # through and received an end label outside the context. Require the
            # answer to lie fully inside the tokenized context, else keep (CLS, CLS).
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                continue

            # BUG FIX: bound both scans to the context window. The original start
            # scan ran up to len(offset); past the context, padding offsets are
            # (0, 0) and (0 <= start_char) always holds, so an answer starting in
            # the last context token walked off into the padding.
            token_start_index = context_start
            while token_start_index <= context_end and offset[token_start_index][0] <= start_char:
                token_start_index += 1
            token_end_index = context_end
            while token_end_index >= context_start and offset[token_end_index][1] >= end_char:
                token_end_index -= 1

            # Each scan overshoots by one token; step back to the answer span.
            tokenized_examples["start_positions"][-1] = token_start_index - 1
            tokenized_examples["end_positions"][-1]   = token_end_index + 1

        return  tokenized_examples
    
    # Create train features from the raw dataset. The original text columns are
    # dropped; 'answers' is kept because evaluation post-processing reads it.
    # NOTE(review): assumes raw_dataset exposes a batched .map with
    # remove_columns (datasets-style API) — confirm against utils_qa.load_dataset.
    tokenized_dataset = raw_dataset.map(preprocess_function, batched = True, remove_columns = ['title', 'context', 'question'])

    # Post-processing:
    def post_processing_function(features, tokenizer, predictions, stage = "eval"):
        '''
        Match the predicted start/end logits back to answer strings in the
        original contexts and package them, together with the gold answers,
        as an EvalPrediction for metric computation.
        '''
        # Decode logits into {example_id: answer_text} via the project helper.
        decoded = postprocess_qa_predictions(
            features = features,
            tokenizer = tokenizer,
            predictions = predictions
        )

        # SQuAD-v2 style prediction records (extractive model: the
        # no-answer probability is fixed at 0.0).
        formatted_predictions = []
        for example_id, answer_text in decoded.items():
            formatted_predictions.append(
                {
                    "id": example_id,
                    "prediction_text": answer_text,
                    "no_answer_probability": 0.0,
                }
            )

        # Gold answers, keyed by the same example ids.
        references = []
        for feature in features:
            references.append({"id": feature["id"], "answers": feature["answers"]})

        return  EvalPrediction(predictions = formatted_predictions, label_ids = references)

    # SQuAD v2: exact-match / F1 with support for unanswerable questions.
    metric = evaluate.load("squad_v2")

    def compute_metrics(p: EvalPrediction):
        '''Score formatted predictions against references with the SQuAD-v2 metric.'''
        return metric.compute(predictions = p.predictions, references = p.label_ids)

    # Features are already padded to max_length, so the default collator suffices.
    data_collator = DefaultDataCollator()

    # Standard fine-tuning hyperparameters; evaluate and checkpoint every
    # 1000 steps, keeping only the most recent checkpoint on disk.
    training_args = TrainingArguments(
        output_dir = "./results",
        evaluation_strategy = 'steps',
        learning_rate = 2e-5,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size  = 16,
        save_total_limit = 1,        # keep only the newest checkpoint
        save_steps = 1000, 
        eval_steps = 1000,
        num_train_epochs = 10,
        weight_decay = 0.01,
    )
    # QuestionAnsweringTrainer (project-local, trainer_qa) presumably extends
    # Trainer to run post_process_function on raw logits before compute_metrics
    # — TODO confirm against trainer_qa.
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset = tokenized_dataset["train"],
        eval_dataset  = tokenized_dataset["valid"],
        tokenizer = tokenizer,
        data_collator = data_collator,
        post_process_function=post_processing_function,
        compute_metrics = compute_metrics,
    )
    trainer.train()