# train.py
import torch  # not used directly below, but makes the PyTorch backend dependency explicit
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Step 1: Define our evaluation metrics
def compute_metrics(pred):
    """
    Calculate accuracy, precision, recall, and F1 score.

    Args:
        pred: an EvalPrediction from the Trainer, exposing .label_ids
            (true labels) and .predictions (raw logits)

    Returns:
        dict: containing all metrics
    """
    labels = pred.label_ids              # True labels
    preds = pred.predictions.argmax(-1)  # Predicted class = highest logit

    # Calculate precision, recall, and F1 for the positive class
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average="binary",
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
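

# A quick, self-contained sanity check for compute_metrics (a sketch, not part
# of the original training flow): the Trainer passes an EvalPrediction exposing
# .label_ids and .predictions, so any object with those attributes works here.
# Call it manually if you want to verify the metric logic before training.
def _demo_compute_metrics():
    import numpy as np
    from types import SimpleNamespace

    fake_pred = SimpleNamespace(
        label_ids=np.array([0, 1, 1, 0]),
        predictions=np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]]),
    )
    metrics = compute_metrics(fake_pred)
    assert metrics["accuracy"] == 1.0  # All four mock predictions are correct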


def train_model():
    # Step 2: Load the IMDB dataset: 25k train and 25k test movie reviews,
    # each labeled as negative (0) or positive (1)
    print("Loading dataset...")
    dataset = load_dataset("imdb")
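
    # Optional smoke-test sketch (an assumption, not in the original script):
    # subsample the splits to verify the pipeline end-to-end before committing
    # to a full 3-epoch fine-tune on all 25k training reviews.
    # dataset["train"] = dataset["train"].shuffle(seed=42).select(range(2000))
    # dataset["test"] = dataset["test"].shuffle(seed=42).select(range(500))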

    # Step 3: Initialize our model and tokenizer
    # We use DistilBERT as it's smaller and faster than BERT
    print("Loading tokenizer and model...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,  # Binary classification: positive or negative
    )
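
    # Optional (an assumption, not in the original): attach human-readable
    # label names so the pushed model's inference widget reports
    # "NEGATIVE"/"POSITIVE" instead of "LABEL_0"/"LABEL_1".
    # model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    # model.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}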

    # Step 4: Create tokenization function
    def tokenize_function(examples):
        """
        Tokenize a batch of input texts.

        Args:
            examples: batch of examples from the dataset

        Returns:
            tokenized examples (input_ids and attention_mask)
        """
        # Note: no static padding here; the DataCollatorWithPadding passed to
        # the Trainer pads each batch dynamically, which is faster than padding
        # every example to max_length up front.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,  # Maximum length of input text, in tokens
        )
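
    # For reference: a call like tokenize_function({"text": ["Great movie!"]})
    # returns a dict with "input_ids" and "attention_mask" lists, one entry
    # per input text.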

    # Step 5: Tokenize the dataset
    print("Tokenizing dataset...")
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        # Drop only the raw text column; removing all original columns would
        # also discard "label", which the Trainer needs to compute the loss.
        remove_columns=["text"],
    )

    # Step 6: Define training arguments
    print("Setting up training arguments...")
    # Note: recent transformers releases rename evaluation_strategy to
    # eval_strategy; if this constructor raises a TypeError, swap the keyword.
    training_args = TrainingArguments(
        output_dir="./results",                  # Directory for model checkpoints
        learning_rate=2e-5,                      # Learning rate
        per_device_train_batch_size=16,          # Batch size for training
        per_device_eval_batch_size=16,           # Batch size for evaluation
        num_train_epochs=3,                      # Number of training epochs
        weight_decay=0.01,                       # Weight decay for regularization
        evaluation_strategy="epoch",             # Evaluate after each epoch
        save_strategy="epoch",                   # Save a checkpoint after each epoch
        load_best_model_at_end=True,             # Reload the best checkpoint (by eval loss) at the end
        push_to_hub=True,                        # Push model to the Hugging Face Hub
        hub_model_id="shaheerawan3/Vibescribe",  # Replace with your own username/repo
    )
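
    # Note: push_to_hub=True requires Hugging Face authentication before
    # training starts, e.g. via `huggingface-cli login` or an HF_TOKEN
    # environment variable.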

    # Step 7: Initialize the trainer
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,                                # The model to train
        args=training_args,                         # Training arguments
        train_dataset=tokenized_datasets["train"],  # Training dataset
        eval_dataset=tokenized_datasets["test"],    # Evaluation dataset
        tokenizer=tokenizer,                        # Tokenizer, saved alongside the model
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # Dynamic per-batch padding
        compute_metrics=compute_metrics,            # Evaluation metrics
    )

    # Step 8: Train the model
    print("Starting training...")
    trainer.train()

    # Step 9: Push the trained model to the Hugging Face Hub
    print("Pushing model to Hugging Face Hub...")
    trainer.push_to_hub()


if __name__ == "__main__":
    train_model()
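

# Hedged usage sketch (assumes training and the Hub push both succeeded):
# load the pushed checkpoint for inference via the pipeline API.
#
#     from transformers import pipeline
#
#     classifier = pipeline("text-classification", model="shaheerawan3/Vibescribe")
#     print(classifier("This movie was an absolute delight!"))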