File size: 4,751 Bytes
8a08300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Tests for Model Training Pipeline.

Tests the pipeline construction and feature extraction logic.
"""

import numpy as np
import pandas as pd
import pytest
from sklearn.base import BaseEstimator

from src.models.pipeline import FraudFeatureExtractor, create_fraud_pipeline


class TestFraudFeatureExtractor:
    """Checks on the columns that ``FraudFeatureExtractor.transform`` adds or rewrites."""

    def test_haversine_distance(self):
        """A New York -> Los Angeles coordinate pair yields a plausible distance."""
        # First pair of coordinates is NYC, merchant pair is LA (about 3944 km apart).
        frame = pd.DataFrame(
            {
                "lat": [40.7128],
                "long": [-74.0060],
                "merch_lat": [34.0522],
                "merch_long": [-118.2437],
            }
        )

        transformed = FraudFeatureExtractor().transform(frame)

        assert "distance_km" in transformed.columns
        distance = transformed["distance_km"].iloc[0]
        # Deliberately loose bounds around the ~3944 km reference value.
        assert distance > 3900
        assert distance < 4000

    def test_cyclical_time_features(self):
        """Hour and day encodings are present and noon lands near angle pi."""
        # Single transaction at noon on Tuesday, 2019-01-01.
        frame = pd.DataFrame({"trans_date_trans_time": ["2019-01-01 12:00:00"]})

        transformed = FraudFeatureExtractor().transform(frame)

        # Every sine/cosine column must have been produced.
        for column in ("hour_sin", "hour_cos", "day_sin", "day_cos"):
            assert column in transformed.columns

        # At hour 12 the angle is pi, so sin is ~0 and cos is negative (~-1).
        noon_sin = transformed["hour_sin"].iloc[0]
        noon_cos = transformed["hour_cos"].iloc[0]
        assert -0.1 < noon_sin < 0.1
        assert noon_cos < 0

    def test_amount_log_transform(self):
        """``amt_log`` holds values consistent with log1p of the amount."""
        frame = pd.DataFrame({"amt": [100.0, 1000.0]})

        transformed = FraudFeatureExtractor().transform(frame)

        assert "amt_log" in transformed.columns
        log_amounts = transformed["amt_log"]
        # Expected windows: log1p(100) ≈ 4.615 and log1p(1000) ≈ 6.908.
        windows = [(4.5, 4.7), (6.8, 7.0)]
        for position, (lower, upper) in enumerate(windows):
            assert lower < log_amounts.iloc[position] < upper

    def test_gender_mapping(self):
        """``"M"`` is encoded as 1 and ``"F"`` as 0."""
        frame = pd.DataFrame({"gender": ["M", "F", "M"]})

        transformed = FraudFeatureExtractor().transform(frame)

        encoded = transformed["gender"].tolist()
        assert encoded == [1, 0, 1]


class TestPipelineCreation:
    """Tests for the ``create_fraud_pipeline`` factory function."""

    def test_create_pipeline(self):
        """The factory returns an estimator exposing the three expected steps."""
        params = {"max_depth": 6, "learning_rate": 0.1, "n_estimators": 50}

        pipeline = create_fraud_pipeline(params)

        # Check it's a valid estimator
        assert isinstance(pipeline, BaseEstimator)

        # Check steps exist
        assert "features" in pipeline.named_steps
        assert "preprocessor" in pipeline.named_steps
        assert "model" in pipeline.named_steps

    def test_pipeline_fit_predict(self):
        """The pipeline fits on synthetic data and yields valid predictions."""
        # Use a local, seeded Generator instead of np.random.seed(): the data
        # stays reproducible without reseeding NumPy's process-wide legacy RNG,
        # which would leak state into other tests run in the same session.
        rng = np.random.default_rng(42)
        n_samples = 100

        data = pd.DataFrame(
            {
                # Lowercase "h" is the hourly alias; the uppercase "H" spelling
                # has been deprecated since pandas 2.2.
                "trans_date_trans_time": pd.date_range("2019-01-01", periods=n_samples, freq="h"),
                "amt": rng.uniform(10, 500, n_samples),
                "lat": rng.uniform(30, 45, n_samples),
                "long": rng.uniform(-120, -70, n_samples),
                "merch_lat": rng.uniform(30, 45, n_samples),
                "merch_long": rng.uniform(-120, -70, n_samples),
                "job": rng.choice(["Engineer, biomedical", "Data scientist"], n_samples),
                "category": rng.choice(["grocery_pos", "gas_transport"], n_samples),
                "gender": rng.choice(["M", "F"], n_samples),
                "dob": ["1990-01-01"] * n_samples,
                # integers() excludes the upper bound, like the legacy randint().
                "trans_count_24h": rng.integers(1, 10, n_samples),
                "amt_to_avg_ratio_24h": rng.uniform(0.5, 2.0, n_samples),
                "amt_relative_to_all_time": rng.uniform(0.5, 2.0, n_samples),
            }
        )

        y = rng.integers(0, 2, n_samples)  # Random binary labels

        # Small model so the test stays fast.
        params = {"max_depth": 3, "n_estimators": 10}
        pipeline = create_fraud_pipeline(params)

        # Should fit without errors
        pipeline.fit(data, y)

        # Should predict one binary label per input row
        predictions = pipeline.predict(data)
        assert len(predictions) == n_samples
        assert set(predictions).issubset({0, 1})

        # Should predict probabilities: one column per class, each in [0, 1]
        probas = pipeline.predict_proba(data)
        assert probas.shape == (n_samples, 2)
        assert np.all((probas >= 0) & (probas <= 1))