File size: 2,551 Bytes
a2bc2a9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | import os
import pytest
import joblib
import pandas as pd
import numpy as np
# Relative directory from which the tests below load the pickled data splits.
PROCESSED_DIR = os.path.join('data', 'processed')
# Relative directory in which the tests below look for scaler.pkl.
MODELS_DIR = 'models'
def test_processed_files_exist():
    """Check that each of the six pickled split files exists in PROCESSED_DIR.

    The files are checked in order; the first one that is missing fails
    the test and is named in the assertion message.
    """
    expected_names = (
        'X_train_raw.pkl', 'y_train_raw.pkl',
        'X_train_smote.pkl', 'y_train_smote.pkl',
        'X_test.pkl', 'y_test.pkl',
    )
    for name in expected_names:
        present = os.path.exists(os.path.join(PROCESSED_DIR, name))
        assert present, f"Missing processed file: {name}"
def test_scaler_exists():
    """Check that scaler.pkl is present in MODELS_DIR (the API needs it at inference time)."""
    scaler_path = os.path.join(MODELS_DIR, 'scaler.pkl')
    assert os.path.exists(scaler_path)
def test_train_test_shapes():
    """Check that the SMOTE training features and the test features share a width.

    Both feature sets must have the same number of columns, and that
    number must be exactly 30.
    """
    train_features = joblib.load(os.path.join(PROCESSED_DIR, 'X_train_smote.pkl'))
    test_features = joblib.load(os.path.join(PROCESSED_DIR, 'X_test.pkl'))

    n_train_cols = train_features.shape[1]
    n_test_cols = test_features.shape[1]

    assert n_train_cols == n_test_cols, "Column count mismatch between train and test"
    assert n_train_cols == 30, f"Expected 30 features, got {n_train_cols}"
def test_smote_balanced():
    """Check that labels 0 and 1 occur equally often in the SMOTE training labels."""
    labels = joblib.load(os.path.join(PROCESSED_DIR, 'y_train_smote.pkl'))
    per_class = labels.value_counts()
    # Looked up by label value; a label that is absent raises KeyError here.
    count_zero = per_class[0]
    count_one = per_class[1]
    assert count_zero == count_one, "SMOTE did not balance the classes equally"
def test_test_set_untouched():
    """Check that the test labels are still imbalanced (mean below 0.01).

    SMOTE must only be applied to the training data; a high mean of the
    test labels would suggest the oversampling reached the test set.
    """
    test_labels = joblib.load(os.path.join(PROCESSED_DIR, 'y_test.pkl'))
    # The mean of the labels is used as the fraud ratio
    # (presumably labels are 0/1 with 1 = fraud; verify against preprocess.py).
    fraud_ratio = test_labels.mean()
    assert fraud_ratio < 0.01, (
        f"Test set fraud ratio {fraud_ratio:.4f} is too high — SMOTE may have leaked"
    )
def test_no_missing_values():
    """Check that neither feature file (SMOTE train, test) contains NaN values."""
    for fname in ('X_train_smote.pkl', 'X_test.pkl'):
        features = joblib.load(os.path.join(PROCESSED_DIR, fname))
        # First .any() reduces per column, the second reduces across columns.
        has_nan = features.isnull().any().any()
        assert not has_nan, f"NaN values found in {fname}"
def test_scaler_transforms_correctly():
    """Check that the saved scaler transforms the Amount and Time columns.

    The transform must run without error, keep the input shape, and
    produce values that differ from its input.
    """
    scaler = joblib.load(os.path.join(MODELS_DIR, 'scaler.pkl'))
    test_features = joblib.load(os.path.join(PROCESSED_DIR, 'X_test.pkl'))

    # NOTE(review): if X_test.pkl already holds scaled Amount/Time, this
    # applies the scaler a second time — confirm that is intended.
    scaled_columns = ['Amount', 'Time']
    subset = test_features[scaled_columns]
    # Snapshot the input before transforming so the comparison is independent.
    before = subset.values.copy()
    after = scaler.transform(subset)

    assert after.shape == before.shape
    assert not np.allclose(after, before), "Scaler did not change the values"
|