"""
Training script for ML Pipeline on Kaggle with HF Spaces storage.

This script:
1. Downloads training data from Hugging Face Datasets
2. Trains a model using GPU acceleration
3. Pushes the trained model to Hugging Face Model Hub
"""


import os
# NOTE(review): DATASET_REPO_ID is imported here but is not referenced anywhere
# else in the visible part of this file.
from config import MODEL_REPO_ID, DATASET_REPO_ID, validate_config

# This call sits ahead of the third-party imports below.
# NOTE(review): presumably so that a bad configuration is reported before the
# heavier libraries are loaded - confirm against config.validate_config.
validate_config()

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import torch


def _load_spotify_dataset():
    """Build a binary text dataset from the ``gem1925/spotify_songs`` train split.

    Each row becomes the text ``"<song_name> by <artist>"`` with label 1 when
    ``energy > 0.5`` and 0 otherwise; the result is split 80/20.

    Raises:
        Exception: Whatever ``load_dataset`` or the row conversion raises
            (for example a missing column); the caller treats any failure as
            "source unavailable".
    """
    from datasets import Dataset

    print("Loading Spotify songs dataset...")
    dataset = load_dataset("gem1925/spotify_songs", split="train")
    print(f"✓ Loaded Spotify dataset: {len(dataset)} samples")

    # Single pass over the rows: texts and labels are built together instead
    # of iterating the whole dataset once per column.
    texts = []
    labels = []
    for row in dataset:
        texts.append(f"{row['song_name']} by {row['artist']}")
        labels.append(1 if row['energy'] > 0.5 else 0)

    new_dataset = Dataset.from_dict({"text": texts, "label": labels})
    # No seed is passed, so the split differs from run to run.
    return new_dataset.train_test_split(test_size=0.2)


def _build_sample_dataset():
    """Return a 400-row in-memory sentiment dataset split 80/20 (demo fallback)."""
    from datasets import Dataset

    sample_data = {
        "text": [
            "I love this product! It works great.",
            "Terrible experience, would not recommend.",
            "Amazing quality and fast shipping.",
            "Waste of money, broke after one day.",
        ] * 100,
        "label": [1, 0, 1, 0] * 100,
    }
    return Dataset.from_dict(sample_data).train_test_split(test_size=0.2)


def load_training_data():
    """Load a binary text-classification dataset, trying three sources in order.

    1. The Spotify songs dataset, converted to text/label pairs.
    2. The ``imdb`` dataset, returned exactly as ``load_dataset`` provides it.
    3. A small hard-coded sample dataset, so the pipeline can still run.

    A failure of one source is printed and the next source is tried; this
    function itself only raises if building the in-memory sample fails.

    Returns:
        The loaded dataset object. ``main()`` indexes it with ``"train"`` and
        ``"test"``; the Spotify and sample paths get those splits from
        ``train_test_split``.
        NOTE(review): presumably ``imdb`` ships the same split names - confirm.
    """
    print("Loading dataset...")

    # The broad ``except Exception`` clauses are deliberate: any problem with a
    # source (network, missing column, bad value) means "try the next one".
    try:
        return _load_spotify_dataset()
    except Exception as e:
        print(f"Could not load Spotify dataset: {e}")
        print("Falling back to IMDB reviews dataset...")

    try:
        dataset = load_dataset("imdb")
    except Exception as e2:
        print(f"Could not load IMDB: {e2}")
        print("Using sample dataset for demonstration...")
        return _build_sample_dataset()

    print("✓ Loaded IMDB dataset")
    return dataset


def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and F1 scores.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``.
            ``predictions`` must support ``argmax(axis=1)``; the highest
            scoring column of each row is taken as the predicted class.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` as computed by
        ``accuracy_score`` and ``f1_score`` (both called with their default
        arguments).
    """
    scores, references = eval_pred
    # Collapse per-class scores to a single predicted class id per row.
    predicted_classes = scores.argmax(axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted_classes)
    metrics["f1"] = f1_score(references, predicted_classes)
    return metrics


def main():
    """Run the end-to-end fine-tuning pipeline and publish the result.

    Steps, in order:
      1. Report whether a CUDA GPU is available.
      2. Load the dataset via ``load_training_data()``.
      3. Load ``distilbert-base-uncased`` with a 2-label classification head.
      4. Tokenize the ``text`` column to a fixed length of 128 tokens.
      5. Train for 3 epochs with ``Trainer``, evaluating and saving once per
         epoch.
      6. Evaluate, then push the model to ``MODEL_REPO_ID``.

    Side effects: prints progress to stdout, uses ``./results`` as the output
    directory and ``./logs`` as the logging directory, and uploads to the
    Hugging Face Hub with the token read from the ``HF_TOKEN`` environment
    variable.

    Returns:
        None.
    """
    # NOTE(review): the status glyphs in the messages below were re-typed from
    # a mis-decoded copy of this file. Where the garbled text left no
    # distinguishing character, the glyph is a best guess - confirm against the
    # original script if the exact symbols matter.
    print("=" * 50)
    print("🚀 ML Training Pipeline - Starting")
    print("=" * 50)

    if torch.cuda.is_available():
        print(f"✓ GPU Available: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠ No GPU detected, training on CPU")

    print("\n📊 Loading training data...")
    dataset = load_training_data()
    print(f"✓ Dataset loaded: {len(dataset['train'])} training samples")

    print("\n🤖 Loading model and tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )

    print("\n📝 Tokenizing data...")

    def tokenize(batch):
        """Tokenize a batch of rows, padding/truncating to exactly 128 tokens."""
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    tokenized_dataset = dataset.map(tokenize, batched=True)
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    # Expose only the tensors the model consumes; the raw text stays out of
    # the batches.
    tokenized_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

    # NOTE(review): warmup_steps=500 is fixed. With the 400-row sample
    # fallback (320 training rows, batch size 16, 3 epochs) a single-device
    # run has only about 60 optimizer steps, so warmup would never finish
    # there - consider warmup_ratio if that path matters.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        hub_model_id=MODEL_REPO_ID,
        # os.getenv returns None when HF_TOKEN is not set.
        hub_token=os.getenv("HF_TOKEN"),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics,
    )

    print("\n🔥 Starting training...")
    trainer.train()

    print("\n📈 Evaluating model...")
    results = trainer.evaluate()
    print(f"✓ Evaluation results: {results}")

    print(f"\n💾 Pushing model to Hugging Face: {MODEL_REPO_ID}")
    trainer.push_to_hub()
    print(f"✓ Model successfully pushed to: https://huggingface.co/{MODEL_REPO_ID}")

    print("\n" + "=" * 50)
    # In the garbled copy this literal was broken across two physical lines,
    # which is a syntax error; it is a single-line string again.
    print("✅ Training Complete!")
    print("=" * 50)


# Run the pipeline only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()