"""
Training and evaluation logic for DistilBERT sentiment analysis
File: train.py
"""
# Third-party imports
import torch
import numpy as np
import pandas as pd

# Hugging Face imports
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    logging,
)

# Scikit-learn metrics
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    confusion_matrix,
)

# Local imports
from utils import compute_metrics

# Suppress HF log spam
logging.set_verbosity_error()
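
# NOTE: compute_metrics lives in the local utils module, which is not shown
# here. For reference, a minimal sketch of what it presumably computes (an
# assumption -- the real helper may differ). Trainer passes an EvalPrediction
# whose .predictions are the logits and .label_ids the gold labels:
#
#     def compute_metrics(eval_pred):
#         logits, labels = eval_pred
#         preds = np.argmax(logits, axis=-1)
#         precision, recall, f1, _ = precision_recall_fscore_support(
#             labels, preds, average="binary"
#         )
#         return {"accuracy": accuracy_score(labels, preds),
#                 "precision": precision, "recall": recall, "f1": f1}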
# ===== DATASET LOADING =====
def load_imdb_data(subset_size=None):
    """Load IMDB dataset with optional subsampling"""
    dataset = load_dataset("imdb")
    # Optional subsetting for memory constraints: keep subset_size train
    # examples and a quarter as many test examples
    if subset_size:
        dataset["train"] = dataset["train"].select(range(subset_size))
        dataset["test"] = dataset["test"].select(
            range(min(subset_size // 4, len(dataset["test"])))
        )
    print(f"Dataset loaded - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
    return dataset

# ===== PREPROCESSING =====
def preprocess_data(dataset, max_length=256):
    """Tokenize and prepare dataset for training"""
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    # Tokenize both splits
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    # Rename label column and set format to PyTorch tensors
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return tokenized_dataset, tokenizer

# ===== MODEL LOADING =====
def load_model():
    """Load pre-trained DistilBERT model for sequence classification"""
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        return_dict=True,
    )
    return model

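# Optional sanity check (illustrative sketch, not part of the training flow):
# one untrained forward pass should yield two logits per example.
#
#     tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#     batch = tok(["A quick sanity-check review."], return_tensors="pt")
#     with torch.no_grad():
#         logits = load_model()(**batch).logits  # shape: torch.Size([1, 2])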
# ===== TRAINING SETUP =====
def get_training_args():
    """Define training arguments"""
    return TrainingArguments(
        output_dir="./model",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,  # effective train batch size of 4 per device
        num_train_epochs=3,
        eval_strategy="epoch",  # renamed from evaluation_strategy in newer transformers
        save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
        logging_dir="./logs",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # compute_metrics must return an "f1" entry
        greater_is_better=True,
        seed=42,
    )

def setup_trainer(model, tokenizer, train_dataset, eval_dataset):
    """Initialize Trainer with model and datasets"""
    training_args = get_training_args()
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,  # recent transformers versions prefer processing_class=tokenizer
        compute_metrics=compute_metrics,
    )
    return trainer

# ===== TRAIN & EVALUATE =====
def train_model(trainer):
    """Train the model"""
    print("Starting training...")
    trainer.train()
    print("Training completed!")

def evaluate_model(trainer):
    """Evaluate the trained model"""
    print("Evaluating model...")
    results = trainer.evaluate()
    print("=== Evaluation Results ===")
    for key, value in results.items():
        print(f"{key}: {value:.4f}")
    return results

# ===== SAVE MODEL =====
def save_model(trainer, tokenizer, save_path="./model"):
    """Save trained model and tokenizer"""
    print(f"Saving model to {save_path}...")
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    print("Model saved successfully!")