""" Model Training Tests These tests verify that the model training process works correctly: - Training completes without errors - Loss decreases over epochs - No overfitting on a single batch - Training works on different devices (CPU, GPU if available) Based on the "Testing Models" section from the behavioral testing framework. """ import pytest import numpy as np import torch from sklearn.ensemble import RandomForestClassifier from sklearn.multioutput import MultiOutputClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import f1_score from pathlib import Path from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG @pytest.mark.training class TestModelTraining: """Test suite for model training validation.""" def test_training_completes_without_errors(self): """ Test that training completes without raising exceptions. Uses a small subset of data for fast testing. """ # Load small subset of data X = np.load(DATA_PATHS["features"])[:100] # First 100 samples Y = np.load(DATA_PATHS["labels"])[:100] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=42 ) # Train simple model base_model = RandomForestClassifier( n_estimators=10, # Small number for speed max_depth=5, random_state=42, n_jobs=-1 ) model = MultiOutputClassifier(base_model) # Should not raise any exceptions try: model.fit(X_train, Y_train) predictions = model.predict(X_test) assert predictions.shape == Y_test.shape, "Prediction shape mismatch" except Exception as e: pytest.fail(f"Training failed with error: {e}") def test_decreasing_loss_after_training(self): """ Test that loss decreases after one training epoch. We verify this by checking that the model performs better than random. """ # Load small subset X = np.load(DATA_PATHS["features"])[:200] Y = np.load(DATA_PATHS["labels"])[:200] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=42 ) # Train model base_model = RandomForestClassifier( n_estimators=20, max_depth=5, random_state=42, n_jobs=-1 ) model = MultiOutputClassifier(base_model) model.fit(X_train, Y_train) # Get predictions Y_pred = model.predict(X_test) # Calculate F1 score f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0) print(f"\nF1 Score after training: {f1:.4f}") # Model should perform better than random (F1 > 0.1) # Random would be around 0.05-0.1 for multi-label assert f1 > 0.1, ( f"Model F1 score ({f1:.4f}) is too low, " "suggests training didn't improve performance" ) def test_overfitting_on_single_batch(self): """ Test that model can overfit on a single batch. A model should be able to memorize a small dataset (overfitting check). This verifies the model has sufficient capacity to learn. """ # Use very small dataset (single "batch") X = np.load(DATA_PATHS["features"])[:20] Y = np.load(DATA_PATHS["labels"])[:20] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] # Train on the same small dataset base_model = RandomForestClassifier( n_estimators=50, max_depth=None, # No limit for overfitting min_samples_split=2, random_state=42, n_jobs=-1 ) model = MultiOutputClassifier(base_model) model.fit(X, Y) # Predict on training data Y_pred = model.predict(X) # Calculate accuracy on training data accuracy = (Y_pred == Y).mean() print(f"\nTraining accuracy (should overfit): {accuracy:.4f}") # Should achieve high accuracy on training data (overfitting) assert accuracy > 0.7, ( f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). " "This suggests the model lacks capacity to learn." ) def test_training_on_cpu(self): """ Test that training works on CPU. """ # Small dataset X = np.load(DATA_PATHS["features"])[:50] Y = np.load(DATA_PATHS["labels"])[:50] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] # Train on CPU (RandomForest uses CPU by default) base_model = RandomForestClassifier( n_estimators=10, max_depth=5, random_state=42, n_jobs=1 # Single CPU core ) model = MultiOutputClassifier(base_model) try: model.fit(X, Y) predictions = model.predict(X) assert predictions.shape == Y.shape print("\n[PASS] Training on CPU successful") except Exception as e: pytest.fail(f"Training on CPU failed: {e}") def test_training_on_multiple_cores(self): """ Test that training works with parallel processing (multiple CPU cores). """ # Small dataset X = np.load(DATA_PATHS["features"])[:50] Y = np.load(DATA_PATHS["labels"])[:50] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] # Train with all CPU cores base_model = RandomForestClassifier( n_estimators=10, max_depth=5, random_state=42, n_jobs=-1 # Use all cores ) model = MultiOutputClassifier(base_model) try: model.fit(X, Y) predictions = model.predict(X) assert predictions.shape == Y.shape print("\n[PASS] Training with multiple CPU cores successful") except Exception as e: pytest.fail(f"Training with multiple cores failed: {e}") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_training_on_gpu(self): """ Test that training works on GPU (if available). Note: RandomForest doesn't use GPU, but this test demonstrates the pattern for models that do (like neural networks). """ # This test is skipped if no GPU is available # For RandomForest, we just verify CUDA is detected assert torch.cuda.is_available(), "GPU should be available" print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}") print("Note: RandomForest uses CPU. This test verifies GPU availability.") def test_reproducibility_with_random_seed(self): """ Test that training is reproducible when using the same random seed. """ # Small dataset X = np.load(DATA_PATHS["features"])[:50] Y = np.load(DATA_PATHS["labels"])[:50] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] # Train first model model1 = MultiOutputClassifier( RandomForestClassifier( n_estimators=10, max_depth=5, random_state=42, n_jobs=-1 ) ) model1.fit(X, Y) pred1 = model1.predict(X) # Train second model with same seed model2 = MultiOutputClassifier( RandomForestClassifier( n_estimators=10, max_depth=5, random_state=42, n_jobs=-1 ) ) model2.fit(X, Y) pred2 = model2.predict(X) # Predictions should be identical assert np.array_equal(pred1, pred2), ( "Models with same random seed should produce identical predictions" ) print("\n[PASS] Training is reproducible with random seed") def test_model_improves_with_more_data(self): """ Test that model performance improves with more training data. """ X_full = np.load(DATA_PATHS["features"])[:500] Y_full = np.load(DATA_PATHS["labels"])[:500] # Remove zero-columns col_sums = Y_full.sum(axis=0) valid_cols = col_sums > 0 Y_full = Y_full[:, valid_cols] # Split for testing X_train_full, X_test, Y_train_full, Y_test = train_test_split( X_full, Y_full, test_size=0.2, random_state=42 ) # Train with small dataset X_small = X_train_full[:50] Y_small = Y_train_full[:50] model_small = MultiOutputClassifier( RandomForestClassifier( n_estimators=20, max_depth=5, random_state=42, n_jobs=-1 ) ) model_small.fit(X_small, Y_small) pred_small = model_small.predict(X_test) f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0) # Train with larger dataset model_large = MultiOutputClassifier( RandomForestClassifier( n_estimators=20, max_depth=5, random_state=42, n_jobs=-1 ) ) model_large.fit(X_train_full, Y_train_full) pred_large = model_large.predict(X_test) f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0) print(f"\nF1 with 50 samples: {f1_small:.4f}") print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}") # More data should generally improve performance (or at least not degrade) # Allow small tolerance for variance assert f1_large >= f1_small * 0.9, ( f"Model with more data ({f1_large:.4f}) should not perform " f"significantly worse than with less data ({f1_small:.4f})" ) def test_model_saves_and_loads_correctly(self, tmp_path): """ Test that trained model can be saved and loaded without errors. """ import joblib # Small dataset X = np.load(DATA_PATHS["features"])[:50] Y = np.load(DATA_PATHS["labels"])[:50] # Remove zero-columns col_sums = Y.sum(axis=0) valid_cols = col_sums > 0 Y = Y[:, valid_cols] # Train model model = MultiOutputClassifier( RandomForestClassifier( n_estimators=10, max_depth=5, random_state=42, n_jobs=-1 ) ) model.fit(X, Y) pred_original = model.predict(X) # Save model model_path = tmp_path / "test_model.pkl" joblib.dump(model, model_path) # Load model loaded_model = joblib.load(model_path) pred_loaded = loaded_model.predict(X) # Predictions should be identical assert np.array_equal(pred_original, pred_loaded), ( "Loaded model should produce identical predictions" ) print("\n[PASS] Model saves and loads correctly")