# PayShield-ML/tests/test_models/test_pipeline.py
# Sibi Krishnamoorthy
# prod
# 8a08300
"""
Tests for Model Training Pipeline.
Tests the pipeline construction and feature extraction logic.
"""
import numpy as np
import pandas as pd
import pytest
from sklearn.base import BaseEstimator
from src.models.pipeline import FraudFeatureExtractor, create_fraud_pipeline
class TestFraudFeatureExtractor:
    """Unit tests for the ``FraudFeatureExtractor`` transformer.

    Every test hands a tiny DataFrame to ``transform`` and inspects the
    column(s) that come back.
    """

    def test_haversine_distance(self):
        """``transform`` adds a ``distance_km`` column from the two coordinate pairs."""
        # New York City -> Los Angeles, roughly 3,940 km apart.
        nyc_lat, nyc_long = 40.7128, -74.0060
        la_lat, la_long = 34.0522, -118.2437
        frame = pd.DataFrame(
            {
                "lat": [nyc_lat],
                "long": [nyc_long],
                "merch_lat": [la_lat],
                "merch_long": [la_long],
            }
        )

        transformed = FraudFeatureExtractor().transform(frame)

        assert "distance_km" in transformed.columns
        # Loose bounds: only the order of magnitude is being verified.
        distance = transformed["distance_km"].iloc[0]
        assert 3900 < distance < 4000

    def test_cyclical_time_features(self):
        """``transform`` adds sine/cosine columns for hour and day."""
        # 2019-01-01 was a Tuesday; the timestamp is exactly 12:00.
        frame = pd.DataFrame({"trans_date_trans_time": ["2019-01-01 12:00:00"]})

        transformed = FraudFeatureExtractor().transform(frame)

        for column in ("hour_sin", "hour_cos", "day_sin", "day_cos"):
            assert column in transformed.columns

        # Hour 12 is expected to sit at angle pi on the 24-hour circle:
        # sine close to zero, cosine negative.
        hour_sin = transformed["hour_sin"].iloc[0]
        hour_cos = transformed["hour_cos"].iloc[0]
        assert abs(hour_sin) < 0.1
        assert hour_cos < 0

    def test_amount_log_transform(self):
        """``transform`` adds an ``amt_log`` column derived from ``amt``."""
        frame = pd.DataFrame({"amt": [100.0, 1000.0]})

        transformed = FraudFeatureExtractor().transform(frame)

        assert "amt_log" in transformed.columns
        # Expected values: log1p(100) ~ 4.615 and log1p(1000) ~ 6.908.
        expected_bounds = [(4.5, 4.7), (6.8, 7.0)]
        for row, (low, high) in enumerate(expected_bounds):
            assert low < transformed["amt_log"].iloc[row] < high

    def test_gender_mapping(self):
        """``gender`` comes back as 1 for "M" and 0 for "F"."""
        frame = pd.DataFrame({"gender": ["M", "F", "M"]})

        encoded = FraudFeatureExtractor().transform(frame)["gender"]

        assert encoded.tolist() == [1, 0, 1]
class TestPipelineCreation:
    """Tests for the ``create_fraud_pipeline`` factory function."""

    def test_create_pipeline(self):
        """The factory returns an estimator exposing the expected named steps."""
        params = {"max_depth": 6, "learning_rate": 0.1, "n_estimators": 50}
        pipeline = create_fraud_pipeline(params)

        # Must be usable wherever a scikit-learn estimator is expected.
        assert isinstance(pipeline, BaseEstimator)

        # All three stages must be registered under these exact names.
        for step_name in ("features", "preprocessor", "model"):
            assert step_name in pipeline.named_steps

    def test_pipeline_fit_predict(self):
        """The pipeline fits on synthetic data and returns well-formed predictions."""
        # Seed the global NumPy RNG so the synthetic frame is reproducible.
        np.random.seed(42)
        n_samples = 100

        # NOTE: the dict entries are evaluated top to bottom, so their order
        # determines which random draws land in which column. Keep it stable.
        data = pd.DataFrame(
            {
                # Lowercase "h" is the hourly alias; uppercase "H" is
                # deprecated since pandas 2.2 and emits a FutureWarning.
                "trans_date_trans_time": pd.date_range("2019-01-01", periods=n_samples, freq="h"),
                "amt": np.random.uniform(10, 500, n_samples),
                "lat": np.random.uniform(30, 45, n_samples),
                "long": np.random.uniform(-120, -70, n_samples),
                "merch_lat": np.random.uniform(30, 45, n_samples),
                "merch_long": np.random.uniform(-120, -70, n_samples),
                "job": np.random.choice(["Engineer, biomedical", "Data scientist"], n_samples),
                "category": np.random.choice(["grocery_pos", "gas_transport"], n_samples),
                "gender": np.random.choice(["M", "F"], n_samples),
                "dob": ["1990-01-01"] * n_samples,
                "trans_count_24h": np.random.randint(1, 10, n_samples),
                "amt_to_avg_ratio_24h": np.random.uniform(0.5, 2.0, n_samples),
                "amt_relative_to_all_time": np.random.uniform(0.5, 2.0, n_samples),
            }
        )
        y = np.random.randint(0, 2, n_samples)  # Random binary labels

        # Small model: this test checks plumbing, not predictive quality.
        params = {"max_depth": 3, "n_estimators": 10}
        pipeline = create_fraud_pipeline(params)

        # Fitting must complete without raising.
        pipeline.fit(data, y)

        # Hard predictions: one binary label per input row.
        predictions = pipeline.predict(data)
        assert len(predictions) == n_samples
        assert set(predictions).issubset({0, 1})

        # Probabilities: one row per sample, one column per class, all in [0, 1].
        probas = pipeline.predict_proba(data)
        assert probas.shape == (n_samples, 2)
        assert np.all((probas >= 0) & (probas <= 1))