| import os |
| import pytest |
| import joblib |
| import pandas as pd |
| import numpy as np |
|
|
# Relative directory holding the pickled train/test splits that the tests
# below load (resolved against the current working directory).
PROCESSED_DIR = os.path.join('data', 'processed')

# Relative directory holding fitted artifacts such as scaler.pkl.
MODELS_DIR = 'models'
|
|
|
|
def test_processed_files_exist():
    """Check that each of the six pickled split files is present.

    The files are expected under PROCESSED_DIR once preprocess.py has run.
    The test fails on the first file (in listing order) that is absent.
    """
    expected = (
        'X_train_raw.pkl', 'y_train_raw.pkl',
        'X_train_smote.pkl', 'y_train_smote.pkl',
        'X_test.pkl', 'y_test.pkl',
    )
    # First expected file that is not on disk, or None when all are present.
    f = next(
        (
            name
            for name in expected
            if not os.path.exists(os.path.join(PROCESSED_DIR, name))
        ),
        None,
    )
    assert f is None, f"Missing processed file: {f}"
|
|
|
|
def test_scaler_exists():
    """Check that scaler.pkl is present under MODELS_DIR.

    The API needs the scaler at inference time.
    """
    scaler_path = os.path.join(MODELS_DIR, 'scaler.pkl')
    assert os.path.exists(scaler_path)
|
|
|
|
def test_train_test_shapes():
    """Check the column counts of the SMOTE training set and the test set.

    Both feature sets must have the same number of columns, and that number
    must be exactly 30.
    """
    train_path = os.path.join(PROCESSED_DIR, 'X_train_smote.pkl')
    test_path = os.path.join(PROCESSED_DIR, 'X_test.pkl')
    X_train = joblib.load(train_path)
    X_test = joblib.load(test_path)

    n_train_cols = X_train.shape[1]
    assert n_train_cols == X_test.shape[1], "Column count mismatch between train and test"
    assert n_train_cols == 30, f"Expected 30 features, got {X_train.shape[1]}"
|
|
|
|
def test_smote_balanced():
    """After SMOTE, fraud (1) and normal (0) classes must be equal in size.

    Loads y_train_smote.pkl from PROCESSED_DIR and compares the number of
    rows labelled 0 with the number labelled 1. A class that is missing
    altogether is reported as an assertion failure rather than surfacing as
    a KeyError from the counts lookup.
    """
    y_train = joblib.load(os.path.join(PROCESSED_DIR, 'y_train_smote.pkl'))

    # Flatten to a 1-D Series so the check works whether the labels were
    # pickled as a Series, a single-column DataFrame, or a NumPy array.
    labels = pd.Series(np.asarray(y_train).ravel())
    counts = labels.value_counts()

    # .get() yields 0 for an absent class instead of raising KeyError.
    normal_count = int(counts.get(0, 0))
    fraud_count = int(counts.get(1, 0))

    assert normal_count > 0 and fraud_count > 0, (
        f"Expected both classes in y_train_smote.pkl, "
        f"got normal={normal_count}, fraud={fraud_count}"
    )
    assert normal_count == fraud_count, "SMOTE did not balance the classes equally"
|
|
|
|
def test_test_set_untouched():
    """Check that the test labels are still heavily imbalanced.

    SMOTE should never touch the test set, so the mean of y_test (the share
    of positive labels) must stay below 0.01.
    """
    labels_path = os.path.join(PROCESSED_DIR, 'y_test.pkl')
    fraud_ratio = joblib.load(labels_path).mean()
    assert fraud_ratio < 0.01, f"Test set fraud ratio {fraud_ratio:.4f} is too high — SMOTE may have leaked"
|
|
|
|
def test_no_missing_values():
    """No NaN values in any of the six processed splits.

    Every feature and label file under PROCESSED_DIR is loaded and scanned,
    so the check matches its stated "any split" scope instead of covering
    only the SMOTE training features and the test features.
    """
    split_files = [
        'X_train_raw.pkl', 'y_train_raw.pkl',
        'X_train_smote.pkl', 'y_train_smote.pkl',
        'X_test.pkl', 'y_test.pkl',
    ]
    for fname in split_files:
        data = joblib.load(os.path.join(PROCESSED_DIR, fname))
        # pd.isna returns a same-shaped boolean mask for DataFrame, Series
        # and ndarray alike; converting to an array gives one uniform .any().
        has_nan = bool(np.asarray(pd.isna(data)).any())
        assert not has_nan, f"NaN values found in {fname}"
|
|
|
|
def test_scaler_transforms_correctly():
    """Check that the saved scaler transforms the Amount and Time columns.

    The transform must run without error, keep the array shape, and produce
    values that differ from the inputs.
    """
    scaler = joblib.load(os.path.join(MODELS_DIR, 'scaler.pkl'))
    features = joblib.load(os.path.join(PROCESSED_DIR, 'X_test.pkl'))[['Amount', 'Time']]

    # Snapshot the inputs before transforming so the comparison is against
    # untouched values.
    before = features.values.copy()
    after = scaler.transform(features)

    assert after.shape == before.shape
    assert not np.allclose(after, before), "Scaler did not change the values"
|
|