# train.py
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Step 1: Define our evaluation metrics
def compute_metrics(pred):
    """
    Calculate accuracy, precision, recall, and F1 score

    Args:
        pred: predictions from the model

    Returns:
        dict: containing all metrics
    """
    labels = pred.label_ids              # True labels
    preds = pred.predictions.argmax(-1)  # Model predictions

    # Calculate all metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


def train_model():
    # Step 2: Load the IMDB dataset
    # This dataset contains movie reviews labeled as positive or negative
    print("Loading dataset...")
    dataset = load_dataset("imdb")

    # Step 3: Initialize our model and tokenizer
    # We use DistilBERT as it's smaller and faster than BERT
    print("Loading tokenizer and model...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2  # Binary classification: positive or negative
    )

    # Step 4: Create tokenization function
    def tokenize_function(examples):
        """
        Tokenize the input text data

        Args:
            examples: batch of examples from the dataset

        Returns:
            tokenized examples
        """
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512  # Maximum length of input text; padding is handled
                            # dynamically by DataCollatorWithPadding below
        )

    # Step 5: Tokenize the dataset
    # Remove only the raw "text" column; keep "label" so the Trainer can compute the loss
    print("Tokenizing dataset...")
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )

    # Step 6: Define training arguments
    print("Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir="./results",              # Directory to save model checkpoints
        learning_rate=2e-5,                  # Learning rate
        per_device_train_batch_size=16,      # Batch size for training
        per_device_eval_batch_size=16,       # Batch size for evaluation
        num_train_epochs=3,                  # Number of training epochs
        weight_decay=0.01,                   # Weight decay for regularization
        evaluation_strategy="epoch",         # Evaluate after each epoch
        save_strategy="epoch",               # Save model after each epoch
        load_best_model_at_end=True,         # Load best model at end of training
        push_to_hub=True,                    # Push model to Hugging Face Hub
        hub_model_id="shaheerawan3/Vibescribe"  # Replace with your username
    )

    # Step 7: Initialize the trainer
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,                                                 # The model to train
        args=training_args,                                          # Training arguments
        train_dataset=tokenized_datasets["train"],                   # Training dataset
        eval_dataset=tokenized_datasets["test"],                     # Evaluation dataset
        tokenizer=tokenizer,                                         # Tokenizer
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # Handles padding
        compute_metrics=compute_metrics                              # Evaluation metrics
    )

    # Step 8: Train the model
    print("Starting training...")
    trainer.train()

    # Step 9: Push model to Hugging Face Hub
    print("Pushing model to Hugging Face Hub...")
    trainer.push_to_hub()


if __name__ == "__main__":
    train_model()
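# Usage note (an assumption, not part of the original script): because
# push_to_hub=True, the script needs Hugging Face Hub credentials before it
# runs, e.g. via `huggingface-cli login` or an HF_TOKEN environment variable.
# Once authenticated, it can be launched directly:
#     python train.py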