from datasets import load_dataset

# AdamW lives in torch.optim; the copy in transformers.optimization is deprecated
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)

# Load a 5,000-example slice of SQuAD and hold out 20% for evaluation
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)

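# Each record carries "question", "context", and "answers" fields; "answers"
# holds parallel "text" and "answer_start" (character offset) lists.
# Peeking at one example (purely illustrative):
print(squad["train"][0])
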
# A fast tokenizer is required for the offset mappings used below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",  # truncate only the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the token span that covers the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise find the start and end token positions of the answer
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

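# The Trainer below expects tokenized splits, so apply the preprocessing over
# the whole dataset and drop the raw columns the model cannot consume
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)
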
# BERT with a span-classification head (start/end logits) on top
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Optional: an explicit optimizer; Trainer would otherwise build an equivalent
# AdamW from the TrainingArguments below
optimizer = AdamW(model.parameters(), lr=2e-5)

# This is a data collator, not a loss function: it batches the tokenized
# features into tensors, while the QA loss is computed inside the model
# from the start/end positions
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="question-answering",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # requires `huggingface-cli login`; set to False to train locally
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),  # Trainer builds the LR scheduler itself
)

trainer.train()
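
# A minimal post-training sanity check with the pipeline API; the question and
# context here are made-up illustrations, not drawn from the dataset
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
result = qa_pipeline(
    question="What does BERT stand for?",
    context="BERT stands for Bidirectional Encoder Representations from Transformers.",
)
print(result)  # e.g. {'score': ..., 'start': ..., 'end': ..., 'answer': '...'}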