from datasets import load_dataset

# AdamW lives in torch.optim; the copy in transformers.optimization is deprecated
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)

# Load a 5,000-example slice of SQuAD and hold out 20% for evaluation
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)

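# Each record carries "question", "context", and "answers" fields; "answers"
# holds parallel "text" and "answer_start" (character offset) lists.
# Peeking at one example (purely illustrative):
print(squad["train"][0])
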
# A fast tokenizer is required for the offset mappings used below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",  # truncate only the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the token span that covers the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise find the start and end token positions of the answer
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

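# The Trainer below expects tokenized splits, so apply the preprocessing over
# the whole dataset and drop the raw columns the model cannot consume
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)
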
# BERT with a span-classification head (start/end logits) on top
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Optional: an explicit optimizer; Trainer would otherwise build an equivalent
# AdamW from the TrainingArguments below
optimizer = AdamW(model.parameters(), lr=2e-5)

# This is a data collator, not a loss function: it batches the tokenized
# features into tensors, while the QA loss is computed inside the model
# from the start/end positions
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="question-answering",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # requires `huggingface-cli login`; set to False to train locally
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),  # Trainer builds the LR scheduler itself
)

trainer.train()
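
# A minimal post-training sanity check with the pipeline API; the question and
# context here are made-up illustrations, not drawn from the dataset
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
result = qa_pipeline(
    question="What does BERT stand for?",
    context="BERT stands for Bidirectional Encoder Representations from Transformers.",
)
print(result)  # e.g. {'score': ..., 'start': ..., 'end': ..., 'answer': '...'}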