"""
Model Training Tests

These tests verify that the model training process works correctly:
- Training completes without errors
- Training improves performance beyond a chance baseline
- The model can overfit a single small batch (capacity check)
- Training works on different devices (CPU, GPU if available)

Based on the "Testing Models" section from the behavioral testing framework.
"""
import pytest
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from hopcroft_skill_classification_tool_competition.config import DATA_PATHS


@pytest.mark.training
class TestModelTraining:
    """Test suite for model training validation."""

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.

        Uses a small subset of data for fast testing.
        """
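        # Load a small slice of the saved feature and label arrays so the test runs quickly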
        X = np.load(DATA_PATHS["features"])[:100]
        Y = np.load(DATA_PATHS["labels"])[:100]

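        # Keep only label columns that contain at least one positive sample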
        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

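        # A small, shallow forest wrapped for multi-label output keeps the test fast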
        base_model = RandomForestClassifier(
            n_estimators=10,
            max_depth=5,
            random_state=42,
            n_jobs=-1
        )
        model = MultiOutputClassifier(base_model)

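        # Any exception here is converted into an explicit test failure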
        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that training produces a model that beats chance performance.

        A random forest has no per-epoch loss curve, so the "decreasing loss"
        check is approximated by requiring the micro-averaged F1 score on a
        held-out split to clear a low baseline.
        """
        X = np.load(DATA_PATHS["features"])[:200]
        Y = np.load(DATA_PATHS["labels"])[:200]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        base_model = RandomForestClassifier(
            n_estimators=20,
            max_depth=5,
            random_state=42,
            n_jobs=-1
        )
        model = MultiOutputClassifier(base_model)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

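        # Micro-averaged F1 aggregates over all labels, which suits sparse multi-label targets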
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)

        print(f"\nF1 Score after training: {f1:.4f}")

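        # A trained model should comfortably clear this deliberately low bar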
        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "which suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that the model can overfit on a single batch.

        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
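        # A tiny batch of 20 samples that a high-capacity model should memorize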
        X = np.load(DATA_PATHS["features"])[:20]
        Y = np.load(DATA_PATHS["labels"])[:20]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

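        # Unbounded depth and more trees give the forest enough capacity to memorize the batch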
        base_model = RandomForestClassifier(
            n_estimators=50,
            max_depth=None,
            min_samples_split=2,
            random_state=42,
            n_jobs=-1
        )
        model = MultiOutputClassifier(base_model)
        model.fit(X, Y)

        Y_pred = model.predict(X)

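        # Element-wise accuracy over the full multi-label matrix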
        accuracy = (Y_pred == Y).mean()

        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")

        assert accuracy > 0.7, (
            f"Model cannot overfit on a small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU.
        """
        X = np.load(DATA_PATHS["features"])[:50]
        Y = np.load(DATA_PATHS["labels"])[:50]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

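        # n_jobs=1 restricts training to a single CPU core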
        base_model = RandomForestClassifier(
            n_estimators=10,
            max_depth=5,
            random_state=42,
            n_jobs=1
        )
        model = MultiOutputClassifier(base_model)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (multiple CPU cores).
        """
        X = np.load(DATA_PATHS["features"])[:50]
        Y = np.load(DATA_PATHS["labels"])[:50]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

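        # n_jobs=-1 lets scikit-learn use all available CPU cores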
        base_model = RandomForestClassifier(
            n_estimators=10,
            max_depth=5,
            random_state=42,
            n_jobs=-1
        )
        model = MultiOutputClassifier(base_model)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).

        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        assert torch.cuda.is_available(), "GPU should be available"
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")
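        # A minimal sketch of the device-placement pattern for a model that does use
        # the GPU (hypothetical torch module; not executed by this test):
        #     device = torch.device("cuda")
        #     net = SomeTorchModel().to(device)  # hypothetical nn.Module
        #     inputs = torch.randn(8, 16, device=device)
        #     outputs = net(inputs)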
    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X = np.load(DATA_PATHS["features"])[:50]
        Y = np.load(DATA_PATHS["labels"])[:50]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

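        # Two identically configured models trained on the same data should agree exactly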
        model1 = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=10,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            )
        )
        model1.fit(X, Y)
        pred1 = model1.predict(X)

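        # Retrain with an identical configuration and seed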
        model2 = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=10,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            )
        )
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with the same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that more training data does not hurt model performance.

        Ideally F1 improves with more data, but scores on a small test split
        are noisy, so the assertion only requires that the model trained on
        more data not perform significantly worse.
        """
        X_full = np.load(DATA_PATHS["features"])[:500]
        Y_full = np.load(DATA_PATHS["labels"])[:500]

        col_sums = Y_full.sum(axis=0)
        valid_cols = col_sums > 0
        Y_full = Y_full[:, valid_cols]

        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )

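        # Model trained on only the first 50 training samples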
        X_small = X_train_full[:50]
        Y_small = Y_train_full[:50]

        model_small = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=20,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            )
        )
        model_small.fit(X_small, Y_small)
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)

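        # Model trained on the full training split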
        model_large = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=20,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            )
        )
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)

        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")

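        # Allow a small tolerance: only flag a clear regression with more data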
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that a trained model can be saved and loaded without errors.
        """
        import joblib

        X = np.load(DATA_PATHS["features"])[:50]
        Y = np.load(DATA_PATHS["labels"])[:50]

        col_sums = Y.sum(axis=0)
        valid_cols = col_sums > 0
        Y = Y[:, valid_cols]

        model = MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=10,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            )
        )
        model.fit(X, Y)
        pred_original = model.predict(X)

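        # Serialize with joblib into a pytest-managed temporary directory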
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)

        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)

        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")