Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments | |
| from sklearn.model_selection import train_test_split | |
# Load query -> arguments pairs and tokenize both sides for seq2seq training.
data = pd.read_csv('data/train_data.csv')
# Drop rows with missing values: pandas yields float NaN for empty cells, and
# the tokenizer raises on non-string input.
data = data.dropna(subset=['query', 'arguments'])
queries = data['query'].tolist()
arguments = data['arguments'].tolist()

# 80/20 train/eval split, seeded for reproducibility.
train_queries, eval_queries, train_arguments, eval_arguments = train_test_split(
    queries, arguments, test_size=0.2, random_state=42
)

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

train_encodings = tokenizer(train_queries, truncation=True, padding=True)
eval_encodings = tokenizer(eval_queries, truncation=True, padding=True)

# Tokenize targets in target mode.
# NOTE(review): as_target_tokenizer() is deprecated in recent transformers
# releases in favor of tokenizer(..., text_target=...) — confirm the pinned
# version before migrating.
with tokenizer.as_target_tokenizer():
    train_labels = tokenizer(train_arguments, truncation=True, padding=True)
    eval_labels = tokenizer(eval_arguments, truncation=True, padding=True)


def _ignore_padding(label_batch, pad_id):
    """Replace pad-token ids in label sequences with -100 so the
    cross-entropy loss ignores padding positions during training."""
    return [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in label_batch
    ]


# Bug fix: with padding=True the label tensors contain pad_token_id, which
# would otherwise contribute to the loss; -100 is the ignore index used by
# transformers' loss computation.
train_labels['input_ids'] = _ignore_padding(train_labels['input_ids'], tokenizer.pad_token_id)
eval_labels['input_ids'] = _ignore_padding(eval_labels['input_ids'], tokenizer.pad_token_id)
class PlotDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with label ids.

    `encodings` and `labels` are dict-like tokenizer outputs; each must
    contain an 'input_ids' entry of equal-length per-example sequences.
    """

    def __init__(self, encodings, labels):
        # Tokenized source texts (input_ids, attention_mask, ...).
        self.encodings = encodings
        # Tokenized target texts; only 'input_ids' is consumed.
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        # Fix: use key access for consistency with __getitem__; the original
        # attribute access (self.encodings.input_ids) only works on
        # transformers' BatchEncoding, not on plain dicts.
        return len(self.encodings['input_ids'])
# Wrap the tokenized splits in Dataset objects for the Trainer.
train_dataset = PlotDataset(train_encodings, train_labels)
eval_dataset = PlotDataset(eval_encodings, eval_labels)

# Training configuration: small per-device batches, three epochs,
# generation-based evaluation at the end of every epoch, and at most
# two retained checkpoints.
seq2seq_config = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=100,
)
training_args = Seq2SeqTrainingArguments(**seq2seq_config)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune, then persist both model weights and tokenizer files together.
trainer.train()
output_name = "fine-tuned-bart-large"
trainer.save_model(output_name)
tokenizer.save_pretrained(output_name)
print("Model and tokenizer fine-tuned and saved as 'fine-tuned-bart-large'")