"""
Training script for ML Pipeline on Kaggle with HF Spaces storage.

This script:
1. Downloads training data from Hugging Face Datasets
2. Trains a model using GPU acceleration
3. Pushes the trained model to Hugging Face Model Hub
"""

import os

from config import MODEL_REPO_ID, DATASET_REPO_ID, validate_config

# Validate configuration before proceeding.
# NOTE(review): this call intentionally stays ahead of the ML imports below,
# presumably so a bad configuration fails before those libraries are loaded.
# Keep the ordering unless that is confirmed to be unnecessary.
validate_config()

from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import torch


def _load_spotify_dataset():
    """Load the Spotify songs dataset and convert it to text/label splits.

    The text is "<song_name> by <artist>" and the label is 1 when the row's
    ``energy`` is above 0.5, otherwise 0. The converted data is split 80/20
    into "train" and "test".

    Any exception raised while loading or converting (e.g. network failure,
    missing column) propagates to the caller, which falls back to another
    data source.
    """
    # Using Spotify Songs dataset:
    # https://huggingface.co/datasets/gem1925/spotify_songs
    print("Loading Spotify songs dataset...")
    dataset = load_dataset("gem1925/spotify_songs", split="train")
    print(f"✓ Loaded Spotify dataset: {len(dataset)} samples")

    # Convert to text classification format in a single pass over the rows
    # (iterating the dataset once instead of once per output column).
    texts = []
    labels = []
    for row in dataset:
        texts.append(f"{row['song_name']} by {row['artist']}")
        labels.append(1 if row["energy"] > 0.5 else 0)

    converted = Dataset.from_dict({"text": texts, "label": labels})
    return converted.train_test_split(test_size=0.2)


def _build_sample_dataset():
    """Build a tiny in-memory sentiment dataset (400 rows) split 80/20."""
    sample_data = {
        "text": [
            "I love this product! It works great.",
            "Terrible experience, would not recommend.",
            "Amazing quality and fast shipping.",
            "Waste of money, broke after one day.",
        ] * 100,
        "label": [1, 0, 1, 0] * 100,
    }
    return Dataset.from_dict(sample_data).train_test_split(test_size=0.2)


def load_training_data():
    """Load dataset from Hugging Face, falling back when a source fails.

    Sources are tried in this order:

    1. The ``gem1925/spotify_songs`` dataset, converted to text/label form.
    2. The ``imdb`` dataset, returned exactly as ``load_dataset`` provides it.
    3. A small hard-coded sample dataset, for demonstration only.

    A failure in source 1 or 2 is reported on stdout and the next source is
    tried; this best-effort chain is deliberate, so the broad ``except`` is
    kept. A failure while building the sample dataset is not caught.

    Returns:
        A split collection. ``main()`` reads its "train" and "test" splits
        and their "text" and "label" columns.
    """
    print("Loading dataset...")

    # Option 1: Load from HF Datasets Hub (public datasets)
    try:
        return _load_spotify_dataset()
    except Exception as e:
        print(f"Could not load Spotify dataset: {e}")
        print("Falling back to IMDB reviews dataset...")

    # Fallback to IMDB
    try:
        dataset = load_dataset("imdb")
        print("✓ Loaded IMDB dataset")
        return dataset
    except Exception as e2:
        print(f"Could not load IMDB: {e2}")
        print("Using sample dataset for demonstration...")

    return _build_sample_dataset()


def compute_metrics(eval_pred):
    """Compute evaluation metrics.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must
            support ``.argmax(axis=1)``, which picks the predicted class
            per row.

    Returns:
        A dict with "accuracy" and "f1" computed by scikit-learn from the
        labels and the argmax class predictions.
    """
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }


def main():
    """Main training function.

    Reports GPU availability, loads the data, tokenizes it, fine-tunes
    ``distilbert-base-uncased`` with two labels, evaluates the result and
    pushes the model to the Hub repository named by ``MODEL_REPO_ID``.
    Progress is printed to stdout throughout.
    """
    print("=" * 50)
    print("🚀 ML Training Pipeline - Starting")
    print("=" * 50)

    # Check GPU availability
    if torch.cuda.is_available():
        print(f"✓ GPU Available: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠ No GPU detected, training on CPU")

    # Load data
    print("\n📊 Loading training data...")
    dataset = load_training_data()
    print(f"✓ Dataset loaded: {len(dataset['train'])} training samples")

    # Load tokenizer and model
    print("\n🤖 Loading model and tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    # Tokenize data
    print("\n📝 Tokenizing data...")

    def tokenize(batch):
        # Pad/truncate every example to exactly 128 tokens.
        return tokenizer(
            batch["text"], padding="max_length", truncation=True, max_length=128
        )

    tokenized_dataset = dataset.map(tokenize, batched=True)
    # Rename "label" -> "labels" before handing the data to the Trainer.
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"]
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        hub_model_id=MODEL_REPO_ID,
        hub_token=os.getenv("HF_TOKEN"),
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics,
    )

    # Train
    print("\n🔄 Starting training...")
    trainer.train()

    # Evaluate
    print("\n📈 Evaluating model...")
    results = trainer.evaluate()
    print(f"✓ Evaluation results: {results}")

    # Push to HF Hub
    print(f"\n💾 Pushing model to Hugging Face: {MODEL_REPO_ID}")
    trainer.push_to_hub()
    print(f"✓ Model successfully pushed to: https://huggingface.co/{MODEL_REPO_ID}")

    print("\n" + "=" * 50)
    print("✅ Training Complete!")
    print("=" * 50)


if __name__ == "__main__":
    main()