|
|
""" |
|
|
Model Training Tests |
|
|
|
|
|
These tests verify that the model training process works correctly: |
|
|
- Training completes without errors |
|
|
- Loss decreases over epochs |
|
|
- No overfitting on a single batch |
|
|
- Training works on different devices (CPU, GPU if available) |
|
|
|
|
|
Based on the "Testing Models" section from the behavioral testing framework. |
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
import torch |
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.multioutput import MultiOutputClassifier |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.metrics import f1_score |
|
|
from pathlib import Path |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG |
|
|
|
|
|
|
|
|
@pytest.mark.training
class TestModelTraining:
    """Test suite for model training validation.

    Each test trains a small ``MultiOutputClassifier(RandomForestClassifier)``
    on a slice of the project dataset (loaded from ``DATA_PATHS``) and checks
    one behavioral property: training completes, performance beats chance,
    the model can overfit a tiny batch, it runs on different device/core
    configurations, it is reproducible with a fixed seed, it benefits from
    more data, and it round-trips through serialization.
    """

    @staticmethod
    def _load_filtered_data(n_samples):
        """Load the first *n_samples* rows of features/labels and drop empty label columns.

        Label columns with no positive example in the slice carry no signal
        and can break per-column metrics, so they are filtered out.

        Returns:
            tuple[np.ndarray, np.ndarray]: ``(X, Y)`` with all-zero label
            columns removed from ``Y``.
        """
        X = np.load(DATA_PATHS["features"])[:n_samples]
        Y = np.load(DATA_PATHS["labels"])[:n_samples]
        valid_cols = Y.sum(axis=0) > 0
        return X, Y[:, valid_cols]

    @staticmethod
    def _make_model(n_estimators=10, max_depth=5, n_jobs=-1, **rf_kwargs):
        """Build a seeded multi-output random forest.

        ``random_state=42`` is fixed so every test is deterministic; extra
        ``RandomForestClassifier`` keyword arguments pass through unchanged.
        """
        return MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=n_jobs,
                **rf_kwargs,
            )
        )

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.

        Uses a small subset of data (100 rows) for fast testing.
        """
        X, Y = self._load_filtered_data(100)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = self._make_model(n_estimators=10)

        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that training actually improved the model.

        Random forests have no per-epoch loss, so this is verified by proxy:
        the trained model's micro-F1 on held-out data must beat a low
        better-than-chance threshold.
        """
        X, Y = self._load_filtered_data(200)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = self._make_model(n_estimators=20)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Micro-averaged F1 over all label columns; zero_division=0 guards
        # columns with no predicted positives.
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)

        print(f"\nF1 Score after training: {f1:.4f}")

        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that model can overfit on a single batch.

        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
        X, Y = self._load_filtered_data(20)

        # Unconstrained depth + many trees so the forest can memorize
        # the 20-sample batch.
        model = self._make_model(
            n_estimators=50, max_depth=None, min_samples_split=2
        )
        model.fit(X, Y)

        Y_pred = model.predict(X)

        # Element-wise label accuracy on the training data itself.
        accuracy = (Y_pred == Y).mean()

        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")

        assert accuracy > 0.7, (
            f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU (single core, ``n_jobs=1``).
        """
        X, Y = self._load_filtered_data(50)

        model = self._make_model(n_estimators=10, n_jobs=1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (``n_jobs=-1``).
        """
        X, Y = self._load_filtered_data(50)

        model = self._make_model(n_estimators=10, n_jobs=-1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).

        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        assert torch.cuda.is_available(), "GPU should be available"
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")

    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X, Y = self._load_filtered_data(50)

        # Two independently constructed models with identical seeds must
        # produce identical predictions.
        model1 = self._make_model(n_estimators=10)
        model1.fit(X, Y)
        pred1 = model1.predict(X)

        model2 = self._make_model(n_estimators=10)
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that model performance improves with more training data.

        Compares a model trained on 50 samples against one trained on the
        full training split; the larger model may not win outright on noisy
        data, so it is only required not to be significantly worse.
        """
        X_full, Y_full = self._load_filtered_data(500)

        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )

        # Small-data model: first 50 training rows only.
        X_small = X_train_full[:50]
        Y_small = Y_train_full[:50]

        model_small = self._make_model(n_estimators=20)
        model_small.fit(X_small, Y_small)
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)

        # Full-data model: entire training split.
        model_large = self._make_model(n_estimators=20)
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)

        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")

        # 0.9 factor tolerates sampling noise on small evaluation sets.
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that trained model can be saved and loaded without errors.

        Args:
            tmp_path: pytest fixture providing a per-test temporary directory.
        """
        import joblib

        X, Y = self._load_filtered_data(50)

        model = self._make_model(n_estimators=10)
        model.fit(X, Y)
        pred_original = model.predict(X)

        # Round-trip through joblib serialization.
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)

        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)

        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")
|
|
|