chiapudding
/

question-answering-2

Model card Files Files and versions

chiapudding committed on Mar 9, 2023

Commit

0c732eb

·

1 Parent(s): 8504c73

upload model

Files changed (1) hide show

qatransformer2.py +91 -0

qatransformer2.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator
+squad = load_dataset("squad", split="train[:5000]")
+squad = squad.train_test_split(test_size=0.2)
+# preprocess
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+def preprocess_function(examples):
+    questions = [q.strip() for q in examples["question"]]
+    inputs = tokenizer(
+        questions,
+        examples["context"],
+        max_length=384,
+        truncation="only_second",
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+    offset_mapping = inputs.pop("offset_mapping")
+    answers = examples["answers"]
+    start_positions = []
+    end_positions = []
+    for i, offset in enumerate(offset_mapping):
+        answer = answers[i]
+        start_char = answer["answer_start"][0]
+        end_char = answer["answer_start"][0] + len(answer["text"][0])
+        sequence_ids = inputs.sequence_ids(i)
+        # Find the start and end of the context
+        idx = 0
+        while sequence_ids[idx] != 1:
+            idx += 1
+        context_start = idx
+        while sequence_ids[idx] == 1:
+            idx += 1
+        context_end = idx - 1
+        # If the answer is not fully inside the context, label it (0, 0)
+        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+            start_positions.append(0)
+            end_positions.append(0)
+        else:
+            # Otherwise it's the start and end token positions
+            idx = context_start
+            while idx <= context_end and offset[idx][0] <= start_char:
+                idx += 1
+            start_positions.append(idx - 1)
+            idx = context_end
+            while idx >= context_start and offset[idx][1] >= end_char:
+                idx -= 1
+            end_positions.append(idx + 1)
+    inputs["start_positions"] = start_positions
+    inputs["end_positions"] = end_positions
+    return inputs
+# train
+train_dataset = squad["train"].map(preprocess_function, batched=True)
+eval_dataset = squad["test"].map(preprocess_function, batched=True)
+model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
+training_args = TrainingArguments(
+    output_dir="question-answering",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    push_to_hub=True,
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+trainer.train()
+#evaluation - todo