File size: 3,590 Bytes
4937cba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import annotations

import json

import numpy as np
import pandas as pd
import yaml

from src.evaluate import rank_models
from src.train import run_training_pipeline, train_single_model


def _synthetic_df(rows: int = 160) -> pd.DataFrame:
    """Build a reproducible, fraud-like dataset for the tests in this module.

    The frame holds ``Time``, ``Amount`` and ``V1``..``V28`` feature columns
    drawn from normal distributions, followed by a binary ``Class`` label.
    ``max(8, rows // 20)`` distinct rows are labelled ``1`` and receive an
    additive shift on ``Amount`` and ``V3``.

    The random generator is seeded with a fixed value, so calling this twice
    with the same ``rows`` yields identical frames.
    """
    generator = np.random.default_rng(7)

    # Draw order is part of the reproducibility contract:
    # Time, Amount, then V1..V28, and only then the positive-row sample.
    columns: dict[str, np.ndarray] = {
        "Time": generator.normal(loc=1000, scale=250, size=rows),
        "Amount": generator.normal(loc=80, scale=20, size=rows),
    }
    columns.update({f"V{idx}": generator.normal(size=rows) for idx in range(1, 29)})

    positive_count = max(8, rows // 20)
    positives = generator.choice(rows, size=positive_count, replace=False)

    labels = np.zeros(rows, dtype=int)
    labels[positives] = 1

    # Inject weak signal for separability.
    for column, shift in (("Amount", 40), ("V3", 1.5)):
        columns[column][positives] += shift

    columns["Class"] = labels
    return pd.DataFrame(columns)


def test_rank_models_orders_by_recall_then_precision() -> None:
    """Expect ``rank_models`` to order by recall first, then by precision.

    Models ``b`` and ``c`` share the top recall (0.9); ``c`` has the higher
    precision of the two, so the expected order is ``c``, ``b``, ``a``.
    """
    # model name -> (recall, precision, roc_auc)
    scores = {
        "a": (0.8, 0.9, 0.9),
        "b": (0.9, 0.7, 0.95),
        "c": (0.9, 0.8, 0.85),
    }
    candidates = [
        {
            "model_name": name,
            "metrics": {"recall": recall, "precision": precision, "roc_auc": roc_auc},
        }
        for name, (recall, precision, roc_auc) in scores.items()
    ]

    ordered_names = [entry["model_name"] for entry in rank_models(candidates)]

    assert ordered_names == ["c", "b", "a"]


def test_train_single_model_returns_expected_metrics() -> None:
    """Train one logistic-regression model and check the shape of its metrics.

    The metrics mapping must expose exactly the expected keys, and the
    reported recall must lie in the closed interval [0, 1].
    """
    frame = _synthetic_df(200)
    features = frame.drop(columns=["Class"])
    target = frame["Class"]

    # Simple positional split for a unit test: first 160 rows train, rest test.
    split_at = 160
    split_kwargs = {
        "X_train": features.iloc[:split_at],
        "y_train": target.iloc[:split_at],
        "X_test": features.iloc[split_at:],
        "y_test": target.iloc[split_at:],
    }

    _, metrics = train_single_model(
        model_name="logistic_regression",
        random_state=42,
        **split_kwargs,
    )

    expected_keys = {"precision", "recall", "f1", "roc_auc", "pr_auc", "confusion_matrix"}
    assert set(metrics.keys()) == expected_keys
    assert 0.0 <= metrics["recall"] <= 1.0


def test_run_training_pipeline_creates_report_and_model(tmp_path) -> None:
    """Run the training pipeline end to end on synthetic data inside ``tmp_path``.

    Checks that every requested output file exists afterwards, that the only
    configured model is reported as the best one with a threshold strictly
    between 0 and 1, and that the persisted training report carries a truthy
    ``run_id``.
    """
    # Inputs consumed by the pipeline.
    data_path = tmp_path / "creditcard.csv"
    config_path = tmp_path / "train.yaml"

    # Outputs, keyed by the pipeline keyword argument that receives each path.
    outputs = {
        "model_path": tmp_path / "best_model.pkl",
        "preprocessor_path": tmp_path / "preprocessor.pkl",
        "report_path": tmp_path / "training_report.json",
        "model_report_path": tmp_path / "model_report.json",
        "validation_report_path": tmp_path / "data_validation.json",
    }

    _synthetic_df(240).to_csv(data_path, index=False)

    training_section = {
        "test_size": 0.2,
        "random_state": 42,
        "imbalance_method": "class_weight",
        "models": ["logistic_regression"],
    }
    settings = {
        "experiment": {"name": "test-experiment"},
        "training": training_section,
        # Keep MLflow's tracking store inside the temporary directory.
        "mlflow": {"tracking_uri": f"file:{tmp_path / 'mlruns'}"},
    }
    config_path.write_text(yaml.safe_dump(settings), encoding="utf-8")

    report = run_training_pipeline(
        config_path=config_path,
        data_path=data_path,
        **outputs,
    )

    for artifact in outputs.values():
        assert artifact.exists()

    best = report["best_model"]
    assert best["model_name"] == "logistic_regression"
    assert 0.0 < best["selected_threshold"] < 1.0

    stored = json.loads(outputs["report_path"].read_text(encoding="utf-8"))
    assert stored["best_model"]["run_id"]