Spaces:
Sleeping
Sleeping
"""
Model Training Tests

These tests verify that the model training process works correctly:
- Training completes without errors
- Loss decreases over epochs
- No overfitting on a single batch
- Training works on different devices (CPU, GPU if available)

Based on the "Testing Models" section from the behavioral testing framework.
"""
import pytest
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pathlib import Path

from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG
class TestModelTraining:
    """Test suite for model training validation.

    Covers: error-free training, better-than-random performance,
    overfitting capacity (model has enough capacity to memorize a tiny
    dataset), CPU / multi-core / GPU execution, seed reproducibility,
    performance scaling with data size, and model (de)serialization.
    """

    @staticmethod
    def _load_subset(n_samples):
        """Load the first *n_samples* rows of features and labels.

        Drops all-zero label columns (classes with no positive example in
        the subset) so per-column classifiers always see both classes.

        Returns:
            tuple: (X, Y) numpy arrays with aligned first dimension.
        """
        X = np.load(DATA_PATHS["features"])[:n_samples]
        Y = np.load(DATA_PATHS["labels"])[:n_samples]
        # Keep only label columns with at least one positive sample.
        Y = Y[:, Y.sum(axis=0) > 0]
        return X, Y

    @staticmethod
    def _make_model(n_estimators=10, max_depth=5, n_jobs=-1):
        """Build a small multi-label model: one RandomForest per label.

        Args:
            n_estimators: Trees per forest (kept small for test speed).
            max_depth: Tree depth limit; ``None`` allows full growth
                (used by the overfitting-capacity test).
            n_jobs: Parallelism for the forest (-1 = all cores).
        """
        return MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,  # fixed seed for reproducibility
                n_jobs=n_jobs,
            )
        )

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.
        Uses a small subset of data for fast testing.
        """
        X, Y = self._load_subset(100)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        model = self._make_model(n_estimators=10, max_depth=5)
        # Should not raise any exceptions
        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that loss decreases after one training epoch.
        We verify this by checking that the model performs better than random.
        """
        X, Y = self._load_subset(200)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        model = self._make_model(n_estimators=20, max_depth=5)
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)
        print(f"\nF1 Score after training: {f1:.4f}")
        # Model should perform better than random (F1 > 0.1)
        # Random would be around 0.05-0.1 for multi-label
        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that model can overfit on a single batch.
        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
        # Very small dataset (single "batch"); unlimited depth so the
        # forest can memorize it.
        X, Y = self._load_subset(20)
        model = self._make_model(n_estimators=50, max_depth=None)
        model.fit(X, Y)
        # Predict on the training data itself.
        Y_pred = model.predict(X)
        accuracy = (Y_pred == Y).mean()
        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")
        # Should achieve high accuracy on training data (overfitting)
        assert accuracy > 0.7, (
            f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU (single core).
        """
        X, Y = self._load_subset(50)
        # RandomForest runs on CPU by default; n_jobs=1 pins to one core.
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=1)
        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (multiple CPU cores).
        """
        X, Y = self._load_subset(50)
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=-1)
        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).
        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        # Skip (not fail) on CPU-only machines, as the docstring promises.
        # The original `assert torch.cuda.is_available()` made this test
        # fail wherever no GPU exists.
        if not torch.cuda.is_available():
            pytest.skip("No CUDA-capable GPU available")
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")

    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X, Y = self._load_subset(50)
        # Two independently-constructed models with identical seeds.
        model1 = self._make_model(n_estimators=10, max_depth=5)
        model1.fit(X, Y)
        pred1 = model1.predict(X)

        model2 = self._make_model(n_estimators=10, max_depth=5)
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that model performance improves with more training data.
        """
        X_full, Y_full = self._load_subset(500)
        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )
        # Train with a small slice of the training set.
        X_small = X_train_full[:50]
        Y_small = Y_train_full[:50]
        model_small = self._make_model(n_estimators=20, max_depth=5)
        model_small.fit(X_small, Y_small)
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)
        # Train with the full training set.
        model_large = self._make_model(n_estimators=20, max_depth=5)
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)
        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")
        # More data should generally improve performance (or at least not degrade)
        # Allow small tolerance for variance
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that trained model can be saved and loaded without errors.
        """
        import joblib

        X, Y = self._load_subset(50)
        model = self._make_model(n_estimators=10, max_depth=5)
        model.fit(X, Y)
        pred_original = model.predict(X)
        # Round-trip through joblib in a pytest-managed temp directory.
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)
        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)
        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")