Running this file starts training the model
run_qa.py
ADDED
import evaluate

from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DefaultDataCollator,
    EvalPrediction,
    TrainingArguments,
)

from utils_qa import load_dataset, postprocess_qa_predictions
from trainer_qa import QuestionAnsweringTrainer

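# utils_qa and trainer_qa are local modules, not part of transformers. From
# their use below, load_dataset(path) is assumed to return a DatasetDict with
# 'train' and 'valid' splits whose examples carry 'id', 'title', 'context',
# 'question' and 'answers' (a list of {'text', 'answer_start'} dicts), and
# postprocess_qa_predictions is assumed to map start/end logits back to a
# {example_id: answer_text} dict.
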
dataset_path = 'data/train.json'
model_checkpoint = 'xlm-roberta-base'

if __name__ == '__main__':
    # Load the raw dataset, which contains context, question and answers
    raw_dataset = load_dataset(dataset_path)

    # Load the pretrained tokenizer and model from huggingface.co
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    # Training preprocessing
    def preprocess_function(examples):
        '''
        Tokenize question/context pairs and compute the start/end token
        positions of the answers, producing the features used to train
        the question answering model.
        '''
        # Some questions have leading whitespace; strip it so truncation behaves.
        examples['question'] = [q.lstrip() for q in examples['question']]
        tokenized_examples = tokenizer(
            examples['question'],
            examples['context'],
            truncation = "only_second",
            max_length = tokenizer.model_max_length,
            return_offsets_mapping = True,
            padding = "max_length",
        )
        # The offset mappings give us a map from token to character position in the
        # original context. This helps us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        assert len(offset_mapping) == len(tokenized_examples['input_ids'])

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offset in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            answers = examples['answers'][i]

            # Default both positions to the cls_index; they stay there if no
            # answer is given or the answer is not fully inside the context.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)

            if len(answers) == 0:
                continue

            # Find the start and end of the context
            context_start = sequence_ids.index(1)
            context_end = sequence_ids[context_start:].index(None) + context_start - 1

            start_char = answers[0]["answer_start"]
            end_char = start_char + len(answers[0]["text"])

            # If the answer is not fully inside the context, keep the cls_index label
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                continue

            # Otherwise move in from the context boundaries to the start and
            # end token positions of the answer
            token_start_index = context_start
            while token_start_index <= context_end and offset[token_start_index][0] <= start_char:
                token_start_index += 1
            token_end_index = context_end
            while token_end_index >= context_start and offset[token_end_index][1] >= end_char:
                token_end_index -= 1

            tokenized_examples["start_positions"][-1] = token_start_index - 1
            tokenized_examples["end_positions"][-1] = token_end_index + 1

        return tokenized_examples

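    # A tiny worked illustration of the offset logic above, with hypothetical
    # values not taken from the dataset: for a context "The cat sat." the
    # offsets of the context tokens might be [(0, 3), (4, 7), (8, 11), (11, 12)].
    # An answer "cat" has start_char = 4 and end_char = 7, so both while loops
    # stop such that start and end land on the second context token, (4, 7).
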
    # Create train features from the raw dataset
    tokenized_dataset = raw_dataset.map(
        preprocess_function,
        batched = True,
        remove_columns = ['title', 'context', 'question'],
    )

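    # After the map, each split should hold input_ids, attention_mask,
    # start_positions and end_positions; leftover columns such as 'answers'
    # are expected to be dropped at training time by the Trainer
    # (remove_unused_columns defaults to True).
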
    # Post-processing: match the start and end logits to answer spans in the
    # original context.
    def post_processing_function(features, tokenizer, predictions, stage = "eval"):
        predictions = postprocess_qa_predictions(
            features = features,
            tokenizer = tokenizer,
            predictions = predictions,
        )
        # The squad_v2 metric expects a no_answer_probability for every prediction.
        formatted_predictions = [
            {
                "id": k,
                "prediction_text": v,
                "no_answer_probability": 0.0,
            }
            for k, v in predictions.items()
        ]
        references = [{"id": ft["id"], "answers": ft["answers"]} for ft in features]

        return EvalPrediction(predictions = formatted_predictions, label_ids = references)

    metric = evaluate.load("squad_v2")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions = p.predictions, references = p.label_ids)

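    # squad_v2 reports 'exact' and 'f1' overall, plus HasAns_*/NoAns_* breakdowns
    # for answerable and unanswerable questions.
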
    # Features are already padded to max_length, so the default collator
    # only needs to batch them into tensors.
    data_collator = DefaultDataCollator()

    training_args = TrainingArguments(
        output_dir = "./results",
        evaluation_strategy = 'steps',
        learning_rate = 2e-5,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        save_total_limit = 1,
        save_steps = 1000,
        eval_steps = 1000,
        num_train_epochs = 10,
        weight_decay = 0.01,
    )

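    # QuestionAnsweringTrainer comes from the local trainer_qa module; as in the
    # transformers QA examples, it is assumed to run post_process_function on the
    # raw start/end logits during evaluation before calling compute_metrics.
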
    trainer = QuestionAnsweringTrainer(
        model = model,
        args = training_args,
        train_dataset = tokenized_dataset["train"],
        eval_dataset = tokenized_dataset["valid"],
        tokenizer = tokenizer,
        data_collator = data_collator,
        post_process_function = post_processing_function,
        compute_metrics = compute_metrics,
    )

    trainer.train()
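
Assuming utils_qa.py, trainer_qa.py and data/train.json sit next to the script, training should start with:

    python run_qa.py

Checkpoints are written to ./results every 1000 steps, evaluation runs at the same cadence, and only the most recent checkpoint is kept (save_total_limit = 1).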