# fraud-detection-mlops-api/tests/test_training.py
# github-actions[bot]
# deploy: sync snapshot from github
# 4937cba
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import yaml
from src.evaluate import rank_models
from src.train import run_training_pipeline, train_single_model
def _synthetic_df(rows: int = 160) -> pd.DataFrame:
rng = np.random.default_rng(7)
data: dict[str, np.ndarray] = {
"Time": rng.normal(loc=1000, scale=250, size=rows),
"Amount": rng.normal(loc=80, scale=20, size=rows),
}
for i in range(1, 29):
data[f"V{i}"] = rng.normal(size=rows)
y = np.zeros(rows, dtype=int)
fraud_indices = rng.choice(rows, size=max(8, rows // 20), replace=False)
y[fraud_indices] = 1
# Inject weak signal for separability.
data["Amount"][fraud_indices] += 40
data["V3"][fraud_indices] += 1.5
data["Class"] = y
return pd.DataFrame(data)
def test_rank_models_orders_by_recall_then_precision() -> None:
    """Ranking is expected to sort by recall first, then by precision."""
    candidates = [
        {"model_name": "a", "metrics": {"recall": 0.8, "precision": 0.9, "roc_auc": 0.9}},
        {"model_name": "b", "metrics": {"recall": 0.9, "precision": 0.7, "roc_auc": 0.95}},
        {"model_name": "c", "metrics": {"recall": 0.9, "precision": 0.8, "roc_auc": 0.85}},
    ]

    ranked_names = [entry["model_name"] for entry in rank_models(candidates)]

    # "b" and "c" tie on recall (0.9), so the higher precision of "c" should
    # put it first; "a" has the lowest recall and comes last.
    assert ranked_names == ["c", "b", "a"]
def test_train_single_model_returns_expected_metrics() -> None:
    """A single logistic-regression fit reports the full metric set."""
    frame = _synthetic_df(200)
    features = frame.drop(columns=["Class"])
    labels = frame["Class"]

    # Simple positional split for a unit test: first 160 rows train, rest test.
    # NOTE(review): the positive rows are placed by a seeded random draw, so
    # whether both slices contain positives depends on that seed — confirm.
    cutoff = 160
    _, metrics = train_single_model(
        model_name="logistic_regression",
        X_train=features.iloc[:cutoff],
        y_train=labels.iloc[:cutoff],
        X_test=features.iloc[cutoff:],
        y_test=labels.iloc[cutoff:],
        random_state=42,
    )

    expected_keys = {"precision", "recall", "f1", "roc_auc", "pr_auc", "confusion_matrix"}
    assert set(metrics.keys()) == expected_keys
    assert 0.0 <= metrics["recall"] <= 1.0
def test_run_training_pipeline_creates_report_and_model(tmp_path) -> None:
    """An end-to-end pipeline run writes every output file and a usable report."""
    data_path = tmp_path / "creditcard.csv"
    config_path = tmp_path / "train.yaml"
    # Output locations, keyed by the pipeline keyword argument they are passed as.
    outputs = {
        "model_path": tmp_path / "best_model.pkl",
        "preprocessor_path": tmp_path / "preprocessor.pkl",
        "report_path": tmp_path / "training_report.json",
        "model_report_path": tmp_path / "model_report.json",
        "validation_report_path": tmp_path / "data_validation.json",
    }

    # Inputs: a synthetic CSV dataset plus a minimal single-model YAML config.
    _synthetic_df(240).to_csv(data_path, index=False)
    config = {
        "experiment": {"name": "test-experiment"},
        "training": {
            "test_size": 0.2,
            "random_state": 42,
            "imbalance_method": "class_weight",
            "models": ["logistic_regression"],
        },
        "mlflow": {"tracking_uri": f"file:{tmp_path / 'mlruns'}"},
    }
    config_path.write_text(yaml.safe_dump(config), encoding="utf-8")

    report = run_training_pipeline(
        config_path=config_path,
        data_path=data_path,
        **outputs,
    )

    # Every requested output file must exist after the run.
    for produced in outputs.values():
        assert produced.exists()

    best = report["best_model"]
    assert best["model_name"] == "logistic_regression"
    assert 0.0 < best["selected_threshold"] < 1.0

    # The report written to disk must carry a non-empty run id.
    stored = json.loads(outputs["report_path"].read_text(encoding="utf-8"))
    assert stored["best_model"]["run_id"]