from datasets import load_dataset

raw_datasets = load_dataset("wiki_qa")

# Carve a validation split out of the original test split: 67% of the old
# test set stays as the new test set, the remaining 33% becomes validation.
dataset = raw_datasets["test"].train_test_split(train_size=0.67, seed=42)
raw_datasets["validation"] = dataset.pop("test")
raw_datasets["test"] = dataset["train"]
print(raw_datasets)

# Inspect the label distribution of each split.
raw_datasets.set_format("pandas")
print("\n\n\ntraining_labels:\n", raw_datasets["train"]["label"].value_counts(), "\n\n",
      "validation_labels:\n", raw_datasets["validation"]["label"].value_counts(), "\n\n",
      "testing_labels:\n", raw_datasets["test"]["label"].value_counts())
raw_datasets.reset_format()
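# wiki_qa is heavily skewed toward label 0 (irrelevant answers), as the
# value_counts output above shows. A hedged sketch for countering this with a
# class-weighted loss: subclass Trainer and override compute_loss. This is an
# addition, not part of the original script, and the 10x weight on the
# positive class is an illustrative assumption; derive real weights from the
# label counts. To use it, construct WeightedLossTrainer instead of Trainer
# further down.
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Upweight the rare positive class (assumed ~10x rarer here).
        weight = torch.tensor([1.0, 10.0], device=outputs.logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(outputs.logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss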
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2")

# Modify the configuration for sequence classification
config.num_labels = 2  # number of classes for the classification task
# GPT-2 ships without a pad token, so reuse the end-of-text token for padding
config.pad_token_id = tokenizer.eos_token_id

# Initialize the GPT-2 model for sequence classification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config)
tokenizer.pad_token = tokenizer.eos_token
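# Optional sketch (an addition, not in the original recipe): GPT-2 small has
# ~124M parameters, and full fine-tuning can be slow on modest hardware.
# Freezing the first 10 of the 12 transformer blocks trains only the top
# layers plus the fresh classification head.
for block in model.transformer.h[:10]:
    for param in block.parameters():
        param.requires_grad = False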
def tokenize_function(examples):
    # Encode each question together with its candidate answer as one
    # sequence. GPT2ForSequenceClassification only consumes input_ids and
    # attention_mask; separate answer_input_ids/answer_attention_mask keys
    # would be dropped by the Trainer, and the model would never see the
    # answer text.
    return tokenizer(
        examples["question"],
        examples["answer"],
        padding="max_length",
        truncation=True,
        max_length=800,
    )

# Tokenize the train, validation, and test datasets
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
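# Sanity check (an addition, not in the original): decode one tokenized
# example, skipping padding via the attention mask, to confirm the question
# and answer were joined into a single sequence.
ex = tokenized_datasets["train"][0]
real_ids = [t for t, m in zip(ex["input_ids"], ex["attention_mask"]) if m == 1]
print(tokenizer.decode(real_ids))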
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    fp16=True,  # mixed precision; requires a CUDA GPU
)
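# As written, evaluation reports only the loss. A minimal compute_metrics
# sketch in plain NumPy (an addition; no extra dependencies): to enable it,
# pass compute_metrics=compute_metrics to the Trainer below. Given the label
# imbalance, positive-class F1 is more informative than raw accuracy.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tp = int(((preds == 1) & (labels == 1)).sum())
    precision = tp / max(int((preds == 1).sum()), 1)
    recall = tp / max(int((labels == 1).sum()), 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-8)
    return {
        "accuracy": float((preds == labels).mean()),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }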
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Train the model
trainer.train()

# Evaluate on the test dataset
results = trainer.evaluate(tokenized_datasets["test"])
print(results)
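# Usage sketch (hypothetical example inputs, not from the original): score a
# single question/answer pair with the fine-tuned model.
import torch

question = "what is the capital of France"
answer = "Paris is the capital and most populous city of France."
enc = tokenizer(question, answer, return_tensors="pt", truncation=True, max_length=800)
enc = {k: v.to(model.device) for k, v in enc.items()}
with torch.no_grad():
    logits = model(**enc).logits
print("predicted label:", int(logits.argmax(dim=-1)))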