Shreshth2002's picture
Upload folder using huggingface_hub
9cbb56b verified
"""
Training and evaluation logic for DistilBERT sentiment analysis
File: train.py
"""
# Hugging Face imports
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
logging
)
# Local imports
from utils import compute_metrics
from datasets import load_dataset
# Standard library imports
import torch
import numpy as np
import pandas as pd
# Sklearn metrics
from sklearn.metrics import (
precision_recall_fscore_support,
accuracy_score,
confusion_matrix
)
# Suppress HF log spam
logging.set_verbosity_error()
# ===== DATASET LOADING =====
def load_imdb_data(subset_size=None):
"""Load IMDB dataset with optional subsampling"""
dataset = load_dataset("imdb")
# Optional subsetting for memory constraints
if subset_size:
dataset["train"] = dataset["train"].select(range(subset_size))
dataset["test"] = dataset["test"].select(range(min(subset_size // 4, len(dataset["test"]))))
print(f"Dataset loaded - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
return dataset
# ===== PREPROCESSING =====
def preprocess_data(dataset, max_length=256):
"""Tokenize and prepare dataset for training"""
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
return tokenizer(
examples["text"],
padding="max_length",
truncation=True,
max_length=max_length
)
# Tokenize both splits
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Rename label column and set format
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
return tokenized_dataset, tokenizer
# ===== MODEL LOADING =====
def load_model():
"""Load pre-trained DistilBERT model for sequence classification"""
model = AutoModelForSequenceClassification.from_pretrained(
"distilbert-base-uncased",
num_labels=2,
return_dict=True
)
return model
# ===== TRAINING SETUP =====
def get_training_args():
"""Define training arguments"""
return TrainingArguments(
output_dir="./model",
per_device_train_batch_size=2,
per_device_eval_batch_size=4,
gradient_accumulation_steps=2,
num_train_epochs=3,
eval_strategy="epoch", # Changed from evaluation_strategy
save_strategy="epoch",
logging_dir="./logs",
logging_steps=50,
load_best_model_at_end=True,
metric_for_best_model="f1",
greater_is_better=True,
seed=42
)
def setup_trainer(model, tokenizer, train_dataset, eval_dataset):
"""Initialize Trainer with model and datasets"""
training_args = get_training_args()
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
return trainer
# ===== TRAIN & EVALUATE =====
def train_model(trainer):
"""Train the model"""
print("Starting training...")
trainer.train()
print("Training completed!")
def evaluate_model(trainer):
"""Evaluate the trained model"""
print("Evaluating model...")
results = trainer.evaluate()
print("=== Evaluation Results ===")
for key, value in results.items():
print(f"{key}: {value:.4f}")
return results
# ===== SAVE MODEL =====
def save_model(trainer, tokenizer, save_path="./model"):
"""Save trained model and tokenizer"""
print(f"Saving model to {save_path}...")
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully!")