File size: 12,112 Bytes
225af6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 |
"""
Model Training Tests
These tests verify that the model training process works correctly:
- Training completes without errors
- Loss decreases over epochs
- No overfitting on a single batch
- Training works on different devices (CPU, GPU if available)
Based on the "Testing Models" section from the behavioral testing framework.
"""
import pytest
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pathlib import Path
from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG
@pytest.mark.training
class TestModelTraining:
    """Test suite for model training validation.

    All tests operate on small prefixes of the project dataset (paths come
    from ``DATA_PATHS``) so they stay fast. Shared setup — loading a data
    subset and building the baseline multi-output random forest — lives in
    the two private helpers below instead of being repeated in every test.
    """

    @staticmethod
    def _load_subset(n_samples):
        """Load the first ``n_samples`` rows of features/labels.

        Label columns whose sum is zero (no positive examples in the subset)
        are dropped, because they carry no signal and degrade per-label
        metrics on such a small slice.

        Returns:
            tuple: ``(X, Y)`` numpy arrays with zero-columns removed from Y.
        """
        X = np.load(DATA_PATHS["features"])[:n_samples]
        Y = np.load(DATA_PATHS["labels"])[:n_samples]
        # Keep only label columns that have at least one positive example.
        Y = Y[:, Y.sum(axis=0) > 0]
        return X, Y

    @staticmethod
    def _make_model(n_estimators=10, max_depth=5, n_jobs=-1):
        """Build a multi-output random forest with a fixed seed (42).

        The fixed ``random_state`` makes every test deterministic and is
        what the reproducibility test relies on.
        """
        return MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=n_jobs,
            )
        )

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.
        Uses a small subset of data for fast testing.
        """
        X, Y = self._load_subset(100)  # First 100 samples
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        # Small forest for speed; any exception here is a test failure.
        model = self._make_model(n_estimators=10, max_depth=5)
        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that loss decreases after one training epoch.
        We verify this by checking that the model performs better than random.
        """
        X, Y = self._load_subset(200)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        model = self._make_model(n_estimators=20, max_depth=5)
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        # Micro-averaged F1 over all labels; zero_division=0 guards labels
        # with no predicted positives.
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)
        print(f"\nF1 Score after training: {f1:.4f}")
        # Model should perform better than random (F1 > 0.1)
        # Random would be around 0.05-0.1 for multi-label
        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that model can overfit on a single batch.
        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
        # Use very small dataset (single "batch"); unlimited depth so the
        # forest can memorize it.
        X, Y = self._load_subset(20)
        model = self._make_model(n_estimators=50, max_depth=None)
        model.fit(X, Y)
        # Evaluate on the training data itself — we *want* memorization here.
        Y_pred = model.predict(X)
        accuracy = (Y_pred == Y).mean()
        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")
        assert accuracy > 0.7, (
            f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU.
        """
        X, Y = self._load_subset(50)
        # n_jobs=1 forces a single CPU core (RandomForest is CPU-only anyway).
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=1)
        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (multiple CPU cores).
        """
        X, Y = self._load_subset(50)
        # n_jobs=-1 uses all available cores.
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=-1)
        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).
        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        # This test is skipped if no GPU is available
        # For RandomForest, we just verify CUDA is detected
        assert torch.cuda.is_available(), "GPU should be available"
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")

    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X, Y = self._load_subset(50)
        # Two independently constructed models share random_state=42.
        model1 = self._make_model(n_estimators=10, max_depth=5)
        model1.fit(X, Y)
        pred1 = model1.predict(X)

        model2 = self._make_model(n_estimators=10, max_depth=5)
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that model performance improves with more training data.
        """
        X_full, Y_full = self._load_subset(500)
        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )
        # Train with small dataset (first 50 training rows).
        X_small = X_train_full[:50]
        Y_small = Y_train_full[:50]
        model_small = self._make_model(n_estimators=20, max_depth=5)
        model_small.fit(X_small, Y_small)
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)
        # Train with the full training split.
        model_large = self._make_model(n_estimators=20, max_depth=5)
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)
        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")
        # More data should generally improve performance (or at least not degrade)
        # Allow small tolerance for variance
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that trained model can be saved and loaded without errors.
        """
        import joblib
        X, Y = self._load_subset(50)
        model = self._make_model(n_estimators=10, max_depth=5)
        model.fit(X, Y)
        pred_original = model.predict(X)
        # Round-trip through joblib using pytest's tmp_path fixture.
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)
        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)
        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")
|