import torch
from transformers import (
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from split_data import make_train_data

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokeniser
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    # Add dropout for hidden and attention layers
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# Preprocessing function
def tokenize_func(data):
    return tokenizer(data["text"], truncation=True)


# Load and pre-process dataset
train_data, validation_data = make_train_data()
tokenized_train_data = train_data.map(tokenize_func, batched=True)
tokenized_validation_data = validation_data.map(tokenize_func, batched=True)

# Data collator (pads each batch dynamically to its longest sequence)
data_collator = DataCollatorWithPadding(tokenizer)

# Derive logging/eval/save intervals from the dataset size and the training
# batch size, so all three stay in step with per_device_train_batch_size
batch_size = 32
steps_per_epoch = len(tokenized_train_data) // batch_size
logging_steps = max(steps_per_epoch // 25, 1)

# Training arguments
training_args = TrainingArguments(
    output_dir='./finetuned',
    learning_rate=1.0e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=2,
    save_total_limit=2,
    # Weight decay
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=logging_steps,
    eval_strategy="steps",
    eval_steps=logging_steps,
    save_strategy="steps",
    save_steps=logging_steps,
)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
)

# Train
trainer.train()
trainer.save_model()
tokenizer.save_pretrained('./finetuned')
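
# ---------------------------------------------------------------------------
# Quick inference check (an illustrative addition, not part of the original
# script): loads the checkpoint saved above and classifies a sample sentence
# with the transformers pipeline API. The example text is arbitrary.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./finetuned",
    tokenizer="./finetuned",
    device=0 if torch.cuda.is_available() else -1,
)
# Expected output shape: [{'label': 'POSITIVE' or 'NEGATIVE', 'score': ...}]
print(classifier("A thoroughly enjoyable film."))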