my-training-data / train.py
maxxcarl's picture
Upload train.py with huggingface_hub
ab0a6e9 verified
Raw
History Blame Contribute Delete
5.53 kB
"""
Training script for ML Pipeline on Kaggle with HF Spaces storage.
This script:
1. Downloads training data from Hugging Face Datasets
2. Trains a model using GPU acceleration
3. Pushes the trained model to Hugging Face Model Hub
"""
import os
from config import MODEL_REPO_ID, DATASET_REPO_ID, validate_config
# Validate configuration before proceeding
validate_config()
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import torch
# Fixed seed so the train/test partition (and therefore the reported metrics)
# is reproducible from one run to the next.
_SPLIT_SEED = 42
_TEST_FRACTION = 0.2


def _load_spotify_splits():
    """Load the Spotify songs dataset and reshape it for text classification.

    The text is "<song_name> by <artist>" and the label is 1 when the row's
    ``energy`` value is above 0.5, else 0. Any failure (download error,
    missing column, ...) propagates to the caller, which handles the fallback.
    """
    from datasets import Dataset

    print("Loading Spotify songs dataset...")
    dataset = load_dataset("gem1925/spotify_songs", split="train")
    print(f"βœ“ Loaded Spotify dataset: {len(dataset)} samples")

    # Single pass over the rows; text and label are built together.
    texts = []
    labels = []
    for row in dataset:
        texts.append(f"{row['song_name']} by {row['artist']}")
        labels.append(1 if row['energy'] > 0.5 else 0)

    reshaped = Dataset.from_dict({"text": texts, "label": labels})
    return reshaped.train_test_split(test_size=_TEST_FRACTION, seed=_SPLIT_SEED)


def _build_sample_splits():
    """Build a tiny in-memory sentiment dataset (400 rows) for demonstration."""
    from datasets import Dataset

    sample_data = {
        "text": [
            "I love this product! It works great.",
            "Terrible experience, would not recommend.",
            "Amazing quality and fast shipping.",
            "Waste of money, broke after one day.",
        ] * 100,
        "label": [1, 0, 1, 0] * 100,
    }
    return Dataset.from_dict(sample_data).train_test_split(
        test_size=_TEST_FRACTION, seed=_SPLIT_SEED
    )


def load_training_data():
    """Load a text-classification dataset, trying three sources in order.

    1. The ``gem1925/spotify_songs`` dataset from the Hugging Face Hub,
       converted to ``text``/``label`` columns and split 80/20.
    2. The ``imdb`` dataset, returned exactly as ``load_dataset`` provides it.
    3. A small hard-coded sample dataset, split 80/20.

    Returns:
        A dataset mapping that callers index with ``"train"`` and ``"test"``.
        Rows carry a ``"text"`` and a ``"label"`` column.
    """
    print("Loading dataset...")

    # Option 1: Load from HF Datasets Hub (public datasets)
    # Using Spotify Songs dataset: https://huggingface.co/datasets/gem1925/spotify_songs
    try:
        return _load_spotify_splits()
    except Exception as e:
        # Deliberately broad: this is a best-effort chain, and any failure
        # (network, schema change, missing column) should trigger the fallback.
        print(f"Could not load Spotify dataset: {e}")
        print("Falling back to IMDB reviews dataset...")

    # Fallback to IMDB
    # NOTE(review): the result is returned unmodified, so it may contain
    # splits beyond "train"/"test" — confirm whether those should be dropped
    # before tokenization.
    try:
        dataset = load_dataset("imdb")
        print("βœ“ Loaded IMDB dataset")
        return dataset
    except Exception as e2:
        print(f"Could not load IMDB: {e2}")
        print("Using sample dataset for demonstration...")

    return _build_sample_splits()
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    for each row is the index of the largest value along axis 1 of
    ``predictions``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=1)

    accuracy = accuracy_score(references, predicted_classes)
    f1 = f1_score(references, predicted_classes)
    return {"accuracy": accuracy, "f1": f1}
def main():
    """Run the end-to-end pipeline: load data, fine-tune, evaluate, publish.

    Steps, in order:
      1. Report whether a CUDA device is available.
      2. Load the dataset via ``load_training_data()``.
      3. Load the ``distilbert-base-uncased`` tokenizer and a 2-label
         sequence-classification model.
      4. Tokenize every split to a fixed length of 128 tokens.
      5. Train for 3 epochs, evaluating and checkpointing once per epoch.
      6. Evaluate again, then push the model and tokenizer to ``MODEL_REPO_ID``.

    Hub authentication uses the ``HF_TOKEN`` environment variable.
    """
    print("=" * 50)
    print("πŸš€ ML Training Pipeline - Starting")
    print("=" * 50)

    # Check GPU availability
    if torch.cuda.is_available():
        print(f"βœ“ GPU Available: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠ No GPU detected, training on CPU")

    # Load data
    print("\nπŸ“Š Loading training data...")
    dataset = load_training_data()
    print(f"βœ“ Dataset loaded: {len(dataset['train'])} training samples")

    # Load tokenizer and model
    print("\nπŸ€– Loading model and tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )

    # Tokenize data
    print("\nπŸ“ Tokenizing data...")

    def tokenize(batch):
        # Fixed-length padding keeps every example at exactly 128 tokens.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    tokenized_dataset = dataset.map(tokenize, batched=True)
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    # Only these columns are exposed (as torch tensors); the raw "text"
    # column stays in the dataset but is hidden from the Trainer.
    tokenized_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

    # Read the token once so the Trainer and the tokenizer upload agree.
    hf_token = os.getenv("HF_TOKEN")

    # Training arguments
    # NOTE(review): warmup_steps=500 can exceed the total number of optimizer
    # steps when the small fallback dataset is used — confirm this is intended.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        hub_model_id=MODEL_REPO_ID,
        hub_token=hf_token,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics,
    )

    # Train
    print("\nπŸ”₯ Starting training...")
    trainer.train()

    # Evaluate
    print("\nπŸ“ˆ Evaluating model...")
    results = trainer.evaluate()
    print(f"βœ“ Evaluation results: {results}")

    # Push to HF Hub
    print(f"\nπŸ’Ύ Pushing model to Hugging Face: {MODEL_REPO_ID}")
    trainer.push_to_hub()
    # The Trainer was not given the tokenizer, so it does not upload tokenizer
    # files. Push them explicitly so the Hub repo can be loaded end to end.
    tokenizer.push_to_hub(MODEL_REPO_ID, token=hf_token)
    print(f"βœ“ Model successfully pushed to: https://huggingface.co/{MODEL_REPO_ID}")

    print("\n" + "=" * 50)
    print("βœ… Training Complete!")
    print("=" * 50)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()