File size: 12,112 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
"""
Model Training Tests

These tests verify that the model training process works correctly:
- Training completes without errors
- Loss decreases over epochs
- No overfitting on a single batch
- Training works on different devices (CPU, GPU if available)

Based on the "Testing Models" section from the behavioral testing framework.
"""
import pytest
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pathlib import Path

from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG


@pytest.mark.training
class TestModelTraining:
    """Test suite for model training validation.

    All tests train small RandomForest-based multi-output classifiers on
    slices of the on-disk dataset referenced by ``DATA_PATHS``.  Shared
    boilerplate (loading a data subset, dropping empty label columns, and
    constructing the standard model) lives in the private helpers below so
    each test states only what is unique to it.
    """

    @staticmethod
    def _load_subset(n_samples):
        """Load the first ``n_samples`` feature/label rows from disk.

        Label columns that are all-zero within the subset are dropped,
        because a classifier cannot be fit on a target column containing
        only a single class.

        Args:
            n_samples: Number of leading rows to load.

        Returns:
            Tuple ``(X, Y)`` of numpy arrays with empty label columns removed.
        """
        X = np.load(DATA_PATHS["features"])[:n_samples]
        Y = np.load(DATA_PATHS["labels"])[:n_samples]
        # Keep only label columns with at least one positive example.
        Y = Y[:, Y.sum(axis=0) > 0]
        return X, Y

    @staticmethod
    def _make_model(n_estimators=10, max_depth=5, n_jobs=-1, **rf_kwargs):
        """Build the standard multi-output random-forest model.

        ``random_state`` is pinned to 42 so every test is deterministic.
        Extra keyword arguments are forwarded to ``RandomForestClassifier``.
        """
        return MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=n_jobs,
                **rf_kwargs,
            )
        )

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.

        Uses a small subset of data for fast testing.
        """
        X, Y = self._load_subset(100)  # First 100 samples

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        # Small estimator count for speed.
        model = self._make_model(n_estimators=10)

        # Should not raise any exceptions
        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that loss decreases after one training epoch.

        We verify this by checking that the model performs better than random.
        """
        X, Y = self._load_subset(200)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = self._make_model(n_estimators=20)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)

        print(f"\nF1 Score after training: {f1:.4f}")

        # Model should perform better than random (F1 > 0.1)
        # Random would be around 0.05-0.1 for multi-label
        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that model can overfit on a single batch.

        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
        # Use very small dataset (single "batch")
        X, Y = self._load_subset(20)

        # Unbounded depth so the forest can memorize the batch.
        model = self._make_model(
            n_estimators=50,
            max_depth=None,
            min_samples_split=2,
        )
        model.fit(X, Y)

        # Evaluate on the training data itself.
        Y_pred = model.predict(X)
        accuracy = (Y_pred == Y).mean()

        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")

        # Should achieve high accuracy on training data (overfitting)
        assert accuracy > 0.7, (
            f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU.
        """
        X, Y = self._load_subset(50)

        # RandomForest uses CPU by default; pin to a single core.
        model = self._make_model(n_jobs=1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (multiple CPU cores).
        """
        X, Y = self._load_subset(50)

        # n_jobs=-1 uses all available cores.
        model = self._make_model(n_jobs=-1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).

        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        # This test is skipped if no GPU is available
        # For RandomForest, we just verify CUDA is detected
        assert torch.cuda.is_available(), "GPU should be available"
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")

    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X, Y = self._load_subset(50)

        # Two independently-constructed models with the same (pinned) seed.
        model1 = self._make_model()
        model1.fit(X, Y)
        pred1 = model1.predict(X)

        model2 = self._make_model()
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that model performance improves with more training data.
        """
        X_full, Y_full = self._load_subset(500)

        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )

        # Train with small dataset
        model_small = self._make_model(n_estimators=20)
        model_small.fit(X_train_full[:50], Y_train_full[:50])
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)

        # Train with larger dataset
        model_large = self._make_model(n_estimators=20)
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)

        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")

        # More data should generally improve performance (or at least not degrade)
        # Allow small tolerance for variance
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that trained model can be saved and loaded without errors.
        """
        import joblib

        X, Y = self._load_subset(50)

        model = self._make_model()
        model.fit(X, Y)
        pred_original = model.predict(X)

        # Round-trip through disk using pytest's tmp_path fixture.
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)
        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)

        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")