Spaces:
Running
Running
| """ | |
| Tests for Model Training Pipeline. | |
| Tests the pipeline construction and feature extraction logic. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import pytest | |
| from sklearn.base import BaseEstimator | |
| from src.models.pipeline import FraudFeatureExtractor, create_fraud_pipeline | |
class TestFraudFeatureExtractor:
    """Checks on the columns produced by ``FraudFeatureExtractor.transform``.

    Each test feeds the extractor a frame holding only the input columns
    relevant to one derived feature and asserts on the resulting column(s).
    """

    def test_haversine_distance(self):
        """``distance_km`` for a New York -> Los Angeles pair is ~3944 km."""
        # Cardholder located in NYC, merchant located in LA.
        coordinates = {
            "lat": [40.7128],
            "long": [-74.0060],
            "merch_lat": [34.0522],
            "merch_long": [-118.2437],
        }
        transformed = FraudFeatureExtractor().transform(pd.DataFrame(coordinates))

        assert "distance_km" in transformed.columns

        # Loose bounds around the expected ~3944 km.
        distance = transformed["distance_km"].iloc[0]
        assert 3900 < distance < 4000

    def test_cyclical_time_features(self):
        """Hour/day sine and cosine columns exist and noon maps near pi."""
        # 2019-01-01 12:00:00 is noon on a Tuesday.
        frame = pd.DataFrame({"trans_date_trans_time": ["2019-01-01 12:00:00"]})
        transformed = FraudFeatureExtractor().transform(frame)

        # All four cyclical columns must be present.
        for column in ("hour_sin", "hour_cos", "day_sin", "day_cos"):
            assert column in transformed.columns

        # Hour 12 of 24 sits at angle pi: sine close to 0, cosine negative.
        noon_sin = transformed["hour_sin"].iloc[0]
        noon_cos = transformed["hour_cos"].iloc[0]
        assert abs(noon_sin) < 0.1
        assert noon_cos < 0

    def test_amount_log_transform(self):
        """``amt_log`` holds the log-transformed transaction amount."""
        frame = pd.DataFrame({"amt": [100.0, 1000.0]})
        transformed = FraudFeatureExtractor().transform(frame)

        assert "amt_log" in transformed.columns

        # Reference values: log1p(100) ~= 4.615 and log1p(1000) ~= 6.908.
        log_amounts = transformed["amt_log"]
        assert 4.5 < log_amounts.iloc[0] < 4.7
        assert 6.8 < log_amounts.iloc[1] < 7.0

    def test_gender_mapping(self):
        """``gender`` is encoded as 1 for "M" and 0 for "F"."""
        frame = pd.DataFrame({"gender": ["M", "F", "M"]})
        transformed = FraudFeatureExtractor().transform(frame)

        encoded = transformed["gender"].tolist()
        assert encoded == [1, 0, 1]
class TestPipelineCreation:
    """Tests for the ``create_fraud_pipeline`` factory function."""

    def test_create_pipeline(self):
        """The factory returns an estimator exposing the expected named steps."""
        params = {"max_depth": 6, "learning_rate": 0.1, "n_estimators": 50}
        pipeline = create_fraud_pipeline(params)

        # Check it's a valid estimator
        assert isinstance(pipeline, BaseEstimator)

        # Check steps exist
        for step_name in ("features", "preprocessor", "model"):
            assert step_name in pipeline.named_steps

    def test_pipeline_fit_predict(self):
        """The pipeline fits on synthetic data and yields valid predictions.

        Only output shapes and value ranges are asserted, so the exact random
        values in the synthetic frame do not matter.
        """
        # A local Generator keeps the test reproducible without reseeding
        # NumPy's global RNG, which would leak state into other tests.
        rng = np.random.default_rng(42)
        n_samples = 100

        data = pd.DataFrame(
            {
                # An explicit Timedelta avoids the string alias "H", which is
                # deprecated in pandas >= 2.2, while older pandas versions do
                # not accept the lowercase "h" replacement.
                "trans_date_trans_time": pd.date_range(
                    "2019-01-01", periods=n_samples, freq=pd.Timedelta(hours=1)
                ),
                "amt": rng.uniform(10, 500, n_samples),
                "lat": rng.uniform(30, 45, n_samples),
                "long": rng.uniform(-120, -70, n_samples),
                "merch_lat": rng.uniform(30, 45, n_samples),
                "merch_long": rng.uniform(-120, -70, n_samples),
                "job": rng.choice(["Engineer, biomedical", "Data scientist"], n_samples),
                "category": rng.choice(["grocery_pos", "gas_transport"], n_samples),
                "gender": rng.choice(["M", "F"], n_samples),
                "dob": ["1990-01-01"] * n_samples,
                # Upper bound is exclusive, matching the former randint(1, 10).
                "trans_count_24h": rng.integers(1, 10, n_samples),
                "amt_to_avg_ratio_24h": rng.uniform(0.5, 2.0, n_samples),
                "amt_relative_to_all_time": rng.uniform(0.5, 2.0, n_samples),
            }
        )
        y = rng.integers(0, 2, n_samples)  # Random binary labels

        params = {"max_depth": 3, "n_estimators": 10}
        pipeline = create_fraud_pipeline(params)

        # Should fit without errors
        pipeline.fit(data, y)

        # Should predict one binary label per row
        predictions = pipeline.predict(data)
        assert len(predictions) == n_samples
        assert set(predictions).issubset({0, 1})

        # Should predict probabilities: one (negative, positive) pair per row
        probas = pipeline.predict_proba(data)
        assert probas.shape == (n_samples, 2)
        assert np.all((probas >= 0) & (probas <= 1))