github-actions[bot] committed on
Commit
4937cba
·
0 Parent(s):

deploy: sync snapshot from github

Browse files
.dockerignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ __pycache__
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .pytest_cache
8
+ .coverage
9
+ .coverage.*
10
+ htmlcov
11
+ .venv
12
+ uv.lock
13
+ pytest.ini
14
+ venv
15
+ env
16
+ .env
17
+ logs
18
+ mlruns
19
+ notebooks
20
+ data/raw
21
+ tests
22
+ .github
.github/workflows/ci.yml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI-CD
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+ workflow_dispatch:
8
+
9
+ concurrency:
10
+ group: ci-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ env:
14
+ PYTHON_VERSION: "3.11"
15
+ IMAGE_NAME: fraud-detection-api
16
+
17
+ jobs:
18
+ test:
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - name: Checkout
22
+ uses: actions/checkout@v4
23
+
24
+ - name: Set up Python
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ env.PYTHON_VERSION }}
28
+
29
+ - name: Set up uv
30
+ uses: astral-sh/setup-uv@v5
31
+
32
+ - name: Install dependencies
33
+ run: |
34
+ uv pip install --system -r requirements.txt
35
+
36
+ - name: Run tests
37
+ run: python -m pytest
38
+
39
+ build-image:
40
+ runs-on: ubuntu-latest
41
+ needs: test
42
+ steps:
43
+ - name: Checkout
44
+ uses: actions/checkout@v4
45
+
46
+ - name: Build Docker image
47
+ run: docker build -t $IMAGE_NAME:${{ github.sha }} .
48
+
49
+ - name: Smoke check image metadata
50
+ run: docker image inspect $IMAGE_NAME:${{ github.sha }}
51
+
52
+ deploy:
53
+ runs-on: ubuntu-latest
54
+ needs: build-image
55
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
56
+ steps:
57
+ - name: Trigger deployment webhook (if configured)
58
+ run: |
59
+ if [ -z "$DEPLOY_WEBHOOK_URL" ]; then
60
+ echo "DEPLOY_WEBHOOK_URL secret is not set; skipping deploy trigger."
61
+ exit 0
62
+ fi
63
+ curl -fsS -X POST "$DEPLOY_WEBHOOK_URL"
64
+ env:
65
+ DEPLOY_WEBHOOK_URL: ${{ secrets.DEPLOY_WEBHOOK_URL }}
.github/workflows/deploy-hf-space.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ workflow_dispatch:
7
+
8
+ concurrency:
9
+ group: deploy-hf-space-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ deploy:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Validate required secrets
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
20
+ run: |
21
+ if [ -z "$HF_TOKEN" ] || [ -z "$HF_SPACE_REPO" ]; then
22
+ echo "HF_TOKEN or HF_SPACE_REPO is not set. Configure repository secrets."
23
+ exit 1
24
+ fi
25
+
26
+ - name: Checkout repository
27
+ uses: actions/checkout@v4
28
+ with:
29
+ fetch-depth: 0
30
+
31
+ - name: Configure git
32
+ run: |
33
+ git config user.name "github-actions[bot]"
34
+ git config user.email "github-actions[bot]@users.noreply.github.com"
35
+
36
+ - name: Remove non-serving artifacts for HF push
37
+ run: |
38
+ # Space runtime only needs selected serving artifacts.
39
+ rm -f models/xgboost.pkl
40
+
41
+ - name: Push to Hugging Face Space
42
+ env:
43
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
44
+ HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
45
+ run: |
46
+ TMP_DIR="$(mktemp -d)"
47
+ rsync -a --delete --exclude=".git" ./ "${TMP_DIR}/"
48
+
49
+ # Exclude artifacts not needed for serving in Space.
50
+ rm -f "${TMP_DIR}/models/xgboost.pkl"
51
+
52
+ cd "${TMP_DIR}"
53
+ git init -b main
54
+ git config user.name "github-actions[bot]"
55
+ git config user.email "github-actions[bot]@users.noreply.github.com"
56
+ git add -A
57
+ git commit -m "deploy: sync snapshot from github"
58
+
59
+ git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_REPO}"
60
+ git push hf main --force
.github/workflows/keepalive-hf-space.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Keep HF Space Warm
2
+
3
+ on:
4
+ schedule:
5
+ # Monday, Wednesday, Friday at 09:00 UTC
6
+ - cron: "0 9 * * 1,3,5"
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ ping:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Validate HF Space URL secret
14
+ env:
15
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
16
+ run: |
17
+ if [ -z "$HF_SPACE_URL" ]; then
18
+ echo "HF_SPACE_URL secret is not set."
19
+ exit 1
20
+ fi
21
+
22
+ - name: Ping health endpoint
23
+ env:
24
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
25
+ run: |
26
+ set -e
27
+ curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/health"
28
+
29
+ - name: Ping metrics endpoint
30
+ env:
31
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
32
+ run: |
33
+ set -e
34
+ curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/metrics"
.gitignore ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Project-specific
210
+ data/raw/
211
+ data/processed/
212
+ logs/
213
+ mlruns/
214
+
215
+ IMPLEMENTATION_PLAN.md
216
+ End-to-End MLOps Project Documentation.txt
217
+ uv.lock
218
+
219
+ explaintovithu.md
220
+ interview_explanation.md
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ # Install Python dependencies first for better layer caching.
10
+ COPY requirements.txt ./
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy application code and runtime artifacts.
14
+ COPY api ./api
15
+ COPY src ./src
16
+ COPY configs ./configs
17
+ COPY models ./models
18
+ COPY artifacts ./artifacts
19
+
20
+ # Run API as non-root user.
21
+ RUN useradd --create-home --shell /usr/sbin/nologin appuser \
22
+ && chown -R appuser:appuser /app
23
+ USER appuser
24
+
25
+ EXPOSE 8000
26
+
27
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
28
+ CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"
29
+
30
+ CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vimalathas Vithusan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Fraud Detection MLOps API
3
+ emoji: 🚨
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 8000
8
+ ---
9
+
10
+ # Fraud Detection MLOps Pipeline
11
+
12
+ Production-style end-to-end fraud detection system with training, experiment tracking, API serving, containerization, CI/CD, and runtime monitoring.
13
+
14
+ ## Highlights
15
+
16
+ - End-to-end ML lifecycle: data validation -> preprocessing -> training -> threshold tuning -> API inference.
17
+ - Imbalanced classification handling with recall-first model ranking.
18
+ - MLflow experiment tracking and artifact logging.
19
+ - FastAPI inference service with single/batch prediction endpoints.
20
+ - Dockerized deployment with health checks and non-root runtime.
21
+ - CI/CD with automated tests, coverage gates, image build, and HF deployment sync.
22
+ - Runtime observability via request IDs, structured logs, and `/metrics`.
23
+
24
+ ## Live Deployment
25
+
26
+ - Hugging Face Space: `https://thasvithu-fraud-detection-mlops-api.hf.space`
27
+ - API Docs: `https://thasvithu-fraud-detection-mlops-api.hf.space/docs`
28
+
29
+ ## Architecture
30
+
31
+ ```mermaid
32
+ flowchart LR
33
+ A[Raw Data<br/>data/raw/creditcard.csv] --> B[Data Validation<br/>src/data_ingestion.py]
34
+ B --> C[Preprocessing<br/>src/preprocessing.py]
35
+ C --> D[Model Training<br/>src/train.py]
36
+ D --> E[Evaluation + Threshold Tuning<br/>src/evaluate.py]
37
+ E --> F[Artifacts<br/>models/*.pkl<br/>artifacts/*.json]
38
+ F --> G[Inference Service<br/>api/service.py]
39
+ G --> H[FastAPI App<br/>api/app.py]
40
+ H --> I["/predict"]
41
+ H --> J["/predict/batch"]
42
+ H --> K["/health"]
43
+ H --> L["/metrics"]
44
+ ```
45
+
46
+ ## ML Training Workflow
47
+
48
+ ```mermaid
49
+ flowchart TD
50
+ T1[Load Config<br/>configs/train.yaml] --> T2[Validate Dataset]
51
+ T2 --> T3[Split + Scale + Imbalance Handling]
52
+ T3 --> T4[Train Candidate Models]
53
+ T4 --> T5[Compute Metrics]
54
+ T5 --> T6[Log Runs to MLflow]
55
+ T6 --> T7[Rank by recall -> precision -> roc_auc]
56
+ T7 --> T8[Select Best Model]
57
+ T8 --> T9[Threshold Sweep + Selection]
58
+ T9 --> T10[Save model + preprocessor + reports]
59
+ ```
60
+
61
+ ## Inference Request Flow
62
+
63
+ ```mermaid
64
+ sequenceDiagram
65
+ autonumber
66
+ participant Client
67
+ participant API as FastAPI
68
+ participant Svc as InferenceService
69
+ participant Art as Artifacts
70
+
71
+ Client->>API: POST /predict (transaction payload)
72
+ API->>Svc: load_inference_service() [cached]
73
+ Svc->>Art: model.pkl + preprocessor.pkl + threshold reports
74
+ Svc-->>API: prediction + probability + risk level
75
+ API-->>Client: JSON response (+ request headers)
76
+ ```
77
+
78
+ ## CI/CD and Deployment Workflows
79
+
80
+ ```mermaid
81
+ flowchart LR
82
+ P[Push / PR] --> C1[ci.yml]
83
+ C1 --> C2[Test + Coverage Gate]
84
+ C2 --> C3[Build Docker Image]
85
+ C3 --> C4[Optional Deploy Webhook]
86
+
87
+ M[Push main] --> H1[deploy-hf-space.yml]
88
+ H1 --> H2[Snapshot Sync to HF Space]
89
+
90
+ S[Schedule Mon/Wed/Fri] --> K1[keepalive-hf-space.yml]
91
+ K1 --> K2[Ping /health and /metrics]
92
+ ```
93
+
94
+ ## Project Structure
95
+
96
+ ```text
97
+ fraud-detection-mlops-pipeline/
98
+ ├── api/
99
+ │ ├── app.py
100
+ │ ├── schemas.py
101
+ │ └── service.py
102
+ ├── src/
103
+ │ ├── data_ingestion.py
104
+ │ ├── preprocessing.py
105
+ │ ├── train.py
106
+ │ ├── evaluate.py
107
+ │ ├── predict.py
108
+ │ └── register_model.py
109
+ ├── configs/
110
+ │ ├── train.yaml
111
+ │ └── logging.yaml
112
+ ├── data/
113
+ │ ├── raw/
114
+ │ └── processed/
115
+ ├── models/
116
+ ├── artifacts/
117
+ ├── tests/
118
+ ├── .github/workflows/
119
+ │ ├── ci.yml
120
+ │ ├── deploy-hf-space.yml
121
+ │ └── keepalive-hf-space.yml
122
+ ├── Dockerfile
123
+ ├── docker-compose.yml
124
+ ├── requirements.txt
125
+ └── pytest.ini
126
+ ```
127
+
128
+ ## Tech Stack
129
+
130
+ - Python 3.11
131
+ - Pandas, NumPy, scikit-learn, imbalanced-learn, XGBoost
132
+ - MLflow
133
+ - FastAPI + Pydantic
134
+ - Docker + Docker Compose
135
+ - GitHub Actions
136
+ - Hugging Face Spaces (Docker SDK)
137
+
138
+ ## API Endpoints
139
+
140
+ - `GET /health`: Service and model readiness
141
+ - `GET /metrics`: Runtime operational counters
142
+ - `POST /predict`: Single transaction prediction
143
+ - `POST /predict/batch`: Batch transaction predictions
144
+ - `GET /docs`: Swagger UI
145
+
146
+ ### Example: Single Prediction
147
+
148
+ ```bash
149
+ BASE="https://thasvithu-fraud-detection-mlops-api.hf.space"
150
+
151
+ curl -X POST "$BASE/predict" \
152
+ -H "Content-Type: application/json" \
153
+ -d '{
154
+ "Time": 0,
155
+ "Amount": 149.62,
156
+ "V1": -1.359807, "V2": -0.072781, "V3": 2.536347, "V4": 1.378155,
157
+ "V5": -0.338321, "V6": 0.462388, "V7": 0.239599, "V8": 0.098698,
158
+ "V9": 0.363787, "V10": 0.090794, "V11": -0.551600, "V12": -0.617801,
159
+ "V13": -0.991390, "V14": -0.311169, "V15": 1.468177, "V16": -0.470401,
160
+ "V17": 0.207971, "V18": 0.025791, "V19": 0.403993, "V20": 0.251412,
161
+ "V21": -0.018307, "V22": 0.277838, "V23": -0.110474, "V24": 0.066928,
162
+ "V25": 0.128539, "V26": -0.189115, "V27": 0.133558, "V28": -0.021053
163
+ }'
164
+ ```
165
+
166
+ ## Local Setup
167
+
168
+ ### Prerequisites
169
+
170
+ - Python 3.11+
171
+ - `uv`
172
+ - Docker (optional, for container run)
173
+
174
+ ### Install
175
+
176
+ ```bash
177
+ uv pip install -r requirements.txt
178
+ ```
179
+
180
+ ### Train
181
+
182
+ ```bash
183
+ uv run python -m src.train
184
+ ```
185
+
186
+ ### Test
187
+
188
+ ```bash
189
+ uv run pytest
190
+ ```
191
+
192
+ ### Run API
193
+
194
+ ```bash
195
+ uv run uvicorn api.app:app --reload --host 0.0.0.0 --port 8000
196
+ ```
197
+
198
+ ## Docker Usage
199
+
200
+ ### Build
201
+
202
+ ```bash
203
+ docker build -t fraud-detection-api:latest .
204
+ ```
205
+
206
+ ### Run
207
+
208
+ ```bash
209
+ docker run --rm -p 8000:8000 fraud-detection-api:latest
210
+ ```
211
+
212
+ ### Compose
213
+
214
+ ```bash
215
+ docker compose up --build
216
+ ```
217
+
218
+ ## Quality Gates
219
+
220
+ - Test coverage enforced via `pytest.ini`
221
+ - Minimum coverage: `>= 80%` across `src` + `api`
222
+ - Current status: passing (see GitHub Actions)
223
+
224
+ ## Monitoring and Operations
225
+
226
+ Runtime metrics exposed by `/metrics`:
227
+ - `total_requests`
228
+ - `error_count`
229
+ - `error_rate`
230
+ - `total_predictions`
231
+ - `fraud_predictions`
232
+ - `fraud_prediction_rate`
233
+ - `avg_latency_ms`
234
+
235
+ Request-level observability:
236
+ - `X-Request-ID`
237
+ - `X-Process-Time-Ms`
238
+ - Structured JSON logs for request and prediction events
239
+
240
+ ## GitHub Actions Workflows
241
+
242
+ - `ci.yml`: test + coverage + image build (+ optional webhook deploy)
243
+ - `deploy-hf-space.yml`: sync `main` to Hugging Face Space
244
+ - `keepalive-hf-space.yml`: scheduled pings to reduce Space inactivity sleep
245
+
246
+ ## Required GitHub Secrets
247
+
248
+ For Hugging Face deploy:
249
+ - `HF_TOKEN`
250
+ - `HF_SPACE_REPO` (format: `username/space-name`)
251
+
252
+ For HF keepalive:
253
+ - `HF_SPACE_URL`
254
+
255
+ Optional webhook deploy:
256
+ - `DEPLOY_WEBHOOK_URL`
257
+
258
+ ## Milestone Status
259
+
260
+ All planned phases (0-9) are complete:
261
+ - Foundation
262
+ - Data validation
263
+ - Preprocessing
264
+ - Training + MLflow tracking
265
+ - Evaluation + threshold tuning
266
+ - FastAPI inference service
267
+ - Testing + quality gates
268
+ - Containerization
269
+ - CI/CD automation
270
+ - Monitoring and operations
271
+
272
+ ## License
273
+
274
+ MIT (see `LICENSE`)
api/__init__.py ADDED
File without changes
api/app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import time
6
+ from dataclasses import dataclass, field
7
+ from threading import Lock
8
+ from typing import Annotated
9
+ from uuid import uuid4
10
+
11
+ from fastapi import Depends, FastAPI, HTTPException, Request
12
+ from fastapi.responses import JSONResponse
13
+
14
+ from api.schemas import (
15
+ BatchPredictionRequest,
16
+ BatchPredictionResponse,
17
+ HealthResponse,
18
+ MetricsResponse,
19
+ PredictionResponse,
20
+ Transaction,
21
+ )
22
+ from api.service import InferenceService, load_inference_service
23
+
24
+ logger = logging.getLogger("api")
25
+ if not logger.handlers:
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
+ @dataclass
30
+ class MonitoringState:
31
+ total_requests: int = 0
32
+ error_count: int = 0
33
+ total_predictions: int = 0
34
+ fraud_predictions: int = 0
35
+ total_latency_ms: float = 0.0
36
+ _lock: Lock = field(default_factory=Lock)
37
+
38
+ def record_request(self, *, latency_ms: float, status_code: int) -> None:
39
+ with self._lock:
40
+ self.total_requests += 1
41
+ self.total_latency_ms += latency_ms
42
+ if status_code >= 400:
43
+ self.error_count += 1
44
+
45
+ def record_predictions(self, predictions: list[dict[str, object]]) -> None:
46
+ fraud_count = sum(1 for p in predictions if bool(p.get("is_fraud")))
47
+ with self._lock:
48
+ self.total_predictions += len(predictions)
49
+ self.fraud_predictions += fraud_count
50
+
51
+ def snapshot(self) -> dict[str, float | int]:
52
+ with self._lock:
53
+ avg_latency = self.total_latency_ms / self.total_requests if self.total_requests else 0.0
54
+ error_rate = self.error_count / self.total_requests if self.total_requests else 0.0
55
+ fraud_rate = (
56
+ self.fraud_predictions / self.total_predictions if self.total_predictions else 0.0
57
+ )
58
+ return {
59
+ "total_requests": self.total_requests,
60
+ "error_count": self.error_count,
61
+ "error_rate": float(error_rate),
62
+ "total_predictions": self.total_predictions,
63
+ "fraud_predictions": self.fraud_predictions,
64
+ "fraud_prediction_rate": float(fraud_rate),
65
+ "avg_latency_ms": float(avg_latency),
66
+ }
67
+
68
+
69
+ app = FastAPI(title="Fraud Detection API", version="0.3.0")
70
+ monitoring_state = MonitoringState()
71
+
72
+
73
@app.middleware("http")
async def add_observability(request: Request, call_next):
    """Attach a request ID, time every request, and emit structured JSON logs."""
    # Reuse a caller-supplied ID when present so traces correlate end to end.
    request_id = request.headers.get("X-Request-ID", str(uuid4()))
    started = time.perf_counter()

    def _elapsed_ms() -> float:
        return (time.perf_counter() - started) * 1000

    try:
        response = await call_next(request)
    except Exception:
        elapsed = _elapsed_ms()
        # Exceptions that escape the handler are counted as HTTP 500.
        monitoring_state.record_request(latency_ms=elapsed, status_code=500)
        logger.exception(
            json.dumps(
                {
                    "event": "request_error",
                    "request_id": request_id,
                    "path": request.url.path,
                    "method": request.method,
                    "latency_ms": round(elapsed, 2),
                }
            )
        )
        raise

    elapsed = _elapsed_ms()
    monitoring_state.record_request(latency_ms=elapsed, status_code=response.status_code)

    response.headers["X-Process-Time-Ms"] = f"{elapsed:.2f}"
    response.headers["X-Request-ID"] = request_id

    logger.info(
        json.dumps(
            {
                "event": "request_complete",
                "request_id": request_id,
                "path": request.url.path,
                "method": request.method,
                "status_code": response.status_code,
                "latency_ms": round(elapsed, 2),
            }
        )
    )
    return response
117
+
118
+
119
def get_inference_service() -> InferenceService:
    """Return the cached inference service, or fail with 503 when unavailable.

    A FileNotFoundError from artifact loading means the model/preprocessor
    files are absent, so the API reports "service unavailable" rather than 500.
    """
    try:
        return load_inference_service()
    except FileNotFoundError as exc:
        raise HTTPException(status_code=503, detail=str(exc)) from exc
124
+
125
+
126
+ ServiceDep = Annotated[InferenceService, Depends(get_inference_service)]
127
+
128
+
129
@app.exception_handler(ValueError)
async def value_error_handler(_: Request, exc: ValueError) -> JSONResponse:
    """Translate ValueError raised below the route layer into an HTTP 400."""
    detail = str(exc)
    return JSONResponse(status_code=400, content={"detail": detail})
132
+
133
+
134
@app.get("/health", response_model=HealthResponse)
def health(service: ServiceDep) -> HealthResponse:
    """Report readiness; resolving the service dependency proves artifacts load."""
    payload = {
        "status": "ok",
        "model_loaded": True,
        "model_path": str(service.model_path),
        "preprocessor_path": str(service.preprocessor_path),
        "threshold": service.threshold,
    }
    return HealthResponse(**payload)
143
+
144
+
145
@app.get("/metrics", response_model=MetricsResponse)
def metrics() -> MetricsResponse:
    """Expose a snapshot of runtime counters for external monitoring."""
    snapshot = monitoring_state.snapshot()
    return MetricsResponse(**snapshot)
148
+
149
+
150
@app.post("/predict", response_model=PredictionResponse)
def predict(transaction: Transaction, service: ServiceDep) -> PredictionResponse:
    """Score a single transaction and log a structured prediction event."""
    result = service.predict_records([transaction.model_dump()])[0]
    monitoring_state.record_predictions([result])
    log_payload = {
        "event": "prediction",
        "prediction_count": 1,
        "fraud_predictions": int(result["is_fraud"]),
        "avg_probability": round(float(result["fraud_probability"]), 6),
        "threshold": float(result["threshold"]),
    }
    logger.info(json.dumps(log_payload))
    return PredictionResponse(**result)
166
+
167
+
168
@app.post("/predict/batch", response_model=BatchPredictionResponse)
def predict_batch(request: BatchPredictionRequest, service: ServiceDep) -> BatchPredictionResponse:
    """Score a batch of transactions and log aggregate batch statistics."""
    records = [item.model_dump() for item in request.transactions]
    predictions = service.predict_records(records)
    monitoring_state.record_predictions(predictions)

    fraud_total = sum(1 for row in predictions if row["is_fraud"])
    # The request schema enforces at least one transaction, so the mean is defined.
    mean_probability = sum(float(row["fraud_probability"]) for row in predictions) / len(predictions)
    logger.info(
        json.dumps(
            {
                "event": "prediction_batch",
                "prediction_count": len(predictions),
                "fraud_predictions": fraud_total,
                "avg_probability": round(mean_probability, 6),
                "threshold": float(predictions[0]["threshold"]),
            }
        )
    )

    return BatchPredictionResponse(predictions=[PredictionResponse(**row) for row in predictions])
api/schemas.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic request/response schemas for the inference API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class Transaction(BaseModel):
    """One credit-card transaction: Time, PCA components V1-V28, and Amount."""

    # Reject unknown keys so malformed payloads fail fast at validation time.
    model_config = ConfigDict(extra="forbid")

    Time: float
    V1: float
    V2: float
    V3: float
    V4: float
    V5: float
    V6: float
    V7: float
    V8: float
    V9: float
    V10: float
    V11: float
    V12: float
    V13: float
    V14: float
    V15: float
    V16: float
    V17: float
    V18: float
    V19: float
    V20: float
    V21: float
    V22: float
    V23: float
    V24: float
    V25: float
    V26: float
    V27: float
    V28: float
    # Monetary amounts cannot be negative.
    Amount: float = Field(ge=0)
41
+
42
+
43
class PredictionResponse(BaseModel):
    """Outcome of scoring a single transaction."""

    is_fraud: bool
    fraud_probability: float
    risk_level: str
    threshold: float
48
+
49
+
50
class BatchPredictionRequest(BaseModel):
    """Request wrapper for scoring several transactions in one call."""

    # Unknown top-level keys are rejected outright.
    model_config = ConfigDict(extra="forbid")

    # At least one transaction is required so batch statistics stay defined.
    transactions: list[Transaction] = Field(min_length=1)
54
+
55
+
56
class BatchPredictionResponse(BaseModel):
    """Per-transaction results, in the same order as the request."""

    predictions: list[PredictionResponse]
58
+
59
+
60
class HealthResponse(BaseModel):
    """Service readiness report returned by GET /health."""

    status: str
    model_loaded: bool
    model_path: str
    preprocessor_path: str
    threshold: float
66
+
67
+
68
class MetricsResponse(BaseModel):
    """Runtime operational counters returned by GET /metrics."""

    total_requests: int
    error_count: int
    error_rate: float
    total_predictions: int
    fraud_predictions: int
    fraud_prediction_rate: float
    avg_latency_ms: float
api/service.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model loading and prediction service helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from functools import lru_cache
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import joblib
12
+ import pandas as pd
13
+ import yaml
14
+
15
+ from src.data_ingestion import EXPECTED_COLUMNS
16
+
17
+ DEFAULT_MODEL_PATH = Path("models/model.pkl")
18
+ DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
19
+ DEFAULT_TRAINING_REPORT_PATH = Path("artifacts/model_training_report.json")
20
+ DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
21
+ DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
22
+ FEATURE_COLUMNS = [column for column in EXPECTED_COLUMNS if column != "Class"]
23
+
24
+
25
@dataclass
class InferenceService:
    """Bundle the trained model, fitted preprocessor, and decision threshold."""

    model: Any  # estimator exposing predict_proba
    preprocessor: Any  # fitted transformer exposing transform
    threshold: float  # decision boundary applied to the fraud probability
    model_path: Path
    preprocessor_path: Path
    feature_columns: list[str]  # column order the preprocessor was fitted on

    def predict_records(self, records: list[dict[str, float]]) -> list[dict[str, Any]]:
        """Score transaction records, returning one result dict per input."""
        # Select/reorder columns to match the training-time feature layout.
        features = pd.DataFrame(records)[self.feature_columns]
        scores = self.model.predict_proba(self.preprocessor.transform(features))[:, 1]

        cutoff = float(self.threshold)
        return [
            {
                "is_fraud": bool(float(score) >= cutoff),
                "fraud_probability": float(score),
                "risk_level": _risk_level(float(score)),
                "threshold": cutoff,
            }
            for score in scores
        ]
+ return outputs
56
+
57
+
58
+ def _risk_level(probability: float) -> str:
59
+ if probability >= 0.7:
60
+ return "high"
61
+ if probability >= 0.3:
62
+ return "medium"
63
+ return "low"
64
+
65
+
66
+ def _threshold_from_training_report(training_report_path: Path) -> float | None:
67
+ if not training_report_path.exists():
68
+ return None
69
+ payload = json.loads(training_report_path.read_text(encoding="utf-8"))
70
+ best = payload.get("best_model", {})
71
+ threshold = best.get("selected_threshold")
72
+ return float(threshold) if threshold is not None else None
73
+
74
+
75
+ def _threshold_from_model_report(model_report_path: Path) -> float | None:
76
+ if not model_report_path.exists():
77
+ return None
78
+ payload = json.loads(model_report_path.read_text(encoding="utf-8"))
79
+ selection = payload.get("threshold_selection", {})
80
+ threshold = selection.get("selected_threshold")
81
+ return float(threshold) if threshold is not None else None
82
+
83
+
84
def _threshold_from_config(config_path: Path) -> float | None:
    """Read the decision threshold from the training config file.

    Returns None when the config file is absent or carries no
    ``threshold.decision_threshold`` entry.
    """
    if not config_path.exists():
        return None
    loaded = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    section = (loaded or {}).get("threshold", {})
    value = section.get("decision_threshold")
    if value is None:
        return None
    return float(value)
91
+
92
+
93
def resolve_threshold(
    *,
    training_report_path: Path = DEFAULT_TRAINING_REPORT_PATH,
    model_report_path: Path = DEFAULT_MODEL_REPORT_PATH,
    config_path: Path = DEFAULT_CONFIG_PATH,
) -> float:
    """Resolve runtime threshold from artifacts, then fallback config/default."""
    # Candidate sources in priority order; the first that yields a value wins.
    sources = (
        lambda: _threshold_from_training_report(training_report_path),
        lambda: _threshold_from_model_report(model_report_path),
        lambda: _threshold_from_config(config_path),
    )
    for source in sources:
        candidate = source()
        if candidate is not None:
            return candidate
    # Conservative default when no artifact or config provides a threshold.
    return 0.5
110
+
111
+
112
@lru_cache(maxsize=1)
def load_inference_service(
    *,
    model_path: str = str(DEFAULT_MODEL_PATH),
    preprocessor_path: str = str(DEFAULT_PREPROCESSOR_PATH),
    training_report_path: str = str(DEFAULT_TRAINING_REPORT_PATH),
    model_report_path: str = str(DEFAULT_MODEL_REPORT_PATH),
    config_path: str = str(DEFAULT_CONFIG_PATH),
) -> InferenceService:
    """Load model + preprocessor + threshold and cache service singleton.

    Raises:
        FileNotFoundError: if the model or preprocessor artifact is missing.
    """
    resolved_model = Path(model_path)
    resolved_preprocessor = Path(preprocessor_path)

    # Fail fast with a clear message before attempting to deserialize anything.
    if not resolved_model.exists():
        raise FileNotFoundError(f"Model artifact not found: {resolved_model}")
    if not resolved_preprocessor.exists():
        raise FileNotFoundError(f"Preprocessor artifact not found: {resolved_preprocessor}")

    loaded_model = joblib.load(resolved_model)
    loaded_preprocessor = joblib.load(resolved_preprocessor)

    decision_threshold = resolve_threshold(
        training_report_path=Path(training_report_path),
        model_report_path=Path(model_report_path),
        config_path=Path(config_path),
    )

    # Prefer the columns the preprocessor recorded at fit time; fall back to
    # the static dataset schema when the attribute is absent.
    columns = list(getattr(loaded_preprocessor, "feature_names_in_", FEATURE_COLUMNS))

    return InferenceService(
        model=loaded_model,
        preprocessor=loaded_preprocessor,
        threshold=decision_threshold,
        model_path=resolved_model,
        preprocessor_path=resolved_preprocessor,
        feature_columns=columns,
    )
artifacts/data_validation.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "is_valid": true,
3
+ "errors": [],
4
+ "warnings": [],
5
+ "statistics": {
6
+ "row_count": 284807,
7
+ "column_count": 31,
8
+ "missing_values_total": 0,
9
+ "duplicate_rows": 1081,
10
+ "class_counts": {
11
+ "0": 284315,
12
+ "1": 492
13
+ },
14
+ "fraud_ratio": 0.001727485630620034
15
+ }
16
+ }
artifacts/metrics_logistic_regression.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "precision": 0.06097560975609756,
3
+ "recall": 0.9183673469387755,
4
+ "f1": 0.11435832274459974,
5
+ "roc_auc": 0.9721687370080279,
6
+ "pr_auc": 0.7159122424484009,
7
+ "confusion_matrix": [
8
+ [
9
+ 55478,
10
+ 1386
11
+ ],
12
+ [
13
+ 8,
14
+ 90
15
+ ]
16
+ ]
17
+ }
artifacts/metrics_xgboost.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "precision": 0.9186046511627907,
3
+ "recall": 0.8061224489795918,
4
+ "f1": 0.8586956521739131,
5
+ "roc_auc": 0.9775147361983623,
6
+ "pr_auc": 0.87487299490182,
7
+ "confusion_matrix": [
8
+ [
9
+ 56857,
10
+ 7
11
+ ],
12
+ [
13
+ 19,
14
+ 79
15
+ ]
16
+ ]
17
+ }
artifacts/model_report.json ADDED
@@ -0,0 +1,1834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-02-18T04:09:02.995799+00:00",
3
+ "best_model_name": "logistic_regression",
4
+ "default_threshold_metrics": {
5
+ "precision": 0.06097560975609756,
6
+ "recall": 0.9183673469387755,
7
+ "f1": 0.11435832274459974,
8
+ "roc_auc": 0.9721687370080279,
9
+ "pr_auc": 0.7159122424484009,
10
+ "confusion_matrix": [
11
+ [
12
+ 55478,
13
+ 1386
14
+ ],
15
+ [
16
+ 8,
17
+ 90
18
+ ]
19
+ ]
20
+ },
21
+ "threshold_selection": {
22
+ "selection_reason": "meets_min_recall",
23
+ "min_recall_target": 0.9,
24
+ "selected_threshold": 0.74,
25
+ "selected_metrics": {
26
+ "precision": 0.13650306748466257,
27
+ "recall": 0.9081632653061225,
28
+ "f1": 0.23733333333333334,
29
+ "roc_auc": 0.9721687370080279,
30
+ "pr_auc": 0.7159122424484009,
31
+ "confusion_matrix": [
32
+ [
33
+ 56301,
34
+ 563
35
+ ],
36
+ [
37
+ 9,
38
+ 89
39
+ ]
40
+ ],
41
+ "threshold": 0.74
42
+ },
43
+ "threshold_grid_size": 99,
44
+ "thresholds_evaluated": [
45
+ {
46
+ "precision": 0.0024050381830804323,
47
+ "recall": 0.9897959183673469,
48
+ "f1": 0.004798417017066535,
49
+ "roc_auc": 0.9721687370080279,
50
+ "pr_auc": 0.7159122424484009,
51
+ "confusion_matrix": [
52
+ [
53
+ 16629,
54
+ 40235
55
+ ],
56
+ [
57
+ 1,
58
+ 97
59
+ ]
60
+ ],
61
+ "threshold": 0.01
62
+ },
63
+ {
64
+ "precision": 0.0030859288009416853,
65
+ "recall": 0.9897959183673469,
66
+ "f1": 0.006152675145095303,
67
+ "roc_auc": 0.9721687370080279,
68
+ "pr_auc": 0.7159122424484009,
69
+ "confusion_matrix": [
70
+ [
71
+ 25528,
72
+ 31336
73
+ ],
74
+ [
75
+ 1,
76
+ 97
77
+ ]
78
+ ],
79
+ "threshold": 0.02
80
+ },
81
+ {
82
+ "precision": 0.0037338104313328927,
83
+ "recall": 0.9795918367346939,
84
+ "f1": 0.007439265372544461,
85
+ "roc_auc": 0.9721687370080279,
86
+ "pr_auc": 0.7159122424484009,
87
+ "confusion_matrix": [
88
+ [
89
+ 31249,
90
+ 25615
91
+ ],
92
+ [
93
+ 2,
94
+ 96
95
+ ]
96
+ ],
97
+ "threshold": 0.03
98
+ },
99
+ {
100
+ "precision": 0.004303044174868391,
101
+ "recall": 0.9591836734693877,
102
+ "f1": 0.00856765255434535,
103
+ "roc_auc": 0.9721687370080279,
104
+ "pr_auc": 0.7159122424484009,
105
+ "confusion_matrix": [
106
+ [
107
+ 35113,
108
+ 21751
109
+ ],
110
+ [
111
+ 4,
112
+ 94
113
+ ]
114
+ ],
115
+ "threshold": 0.04
116
+ },
117
+ {
118
+ "precision": 0.004967499867885642,
119
+ "recall": 0.9591836734693877,
120
+ "f1": 0.009883812628147836,
121
+ "roc_auc": 0.9721687370080279,
122
+ "pr_auc": 0.7159122424484009,
123
+ "confusion_matrix": [
124
+ [
125
+ 38035,
126
+ 18829
127
+ ],
128
+ [
129
+ 4,
130
+ 94
131
+ ]
132
+ ],
133
+ "threshold": 0.05
134
+ },
135
+ {
136
+ "precision": 0.005584932564909988,
137
+ "recall": 0.9591836734693877,
138
+ "f1": 0.011105204087660228,
139
+ "roc_auc": 0.9721687370080279,
140
+ "pr_auc": 0.7159122424484009,
141
+ "confusion_matrix": [
142
+ [
143
+ 40127,
144
+ 16737
145
+ ],
146
+ [
147
+ 4,
148
+ 94
149
+ ]
150
+ ],
151
+ "threshold": 0.060000000000000005
152
+ },
153
+ {
154
+ "precision": 0.006171610591280112,
155
+ "recall": 0.9489795918367347,
156
+ "f1": 0.01226346673699479,
157
+ "roc_auc": 0.9721687370080279,
158
+ "pr_auc": 0.7159122424484009,
159
+ "confusion_matrix": [
160
+ [
161
+ 41888,
162
+ 14976
163
+ ],
164
+ [
165
+ 5,
166
+ 93
167
+ ]
168
+ ],
169
+ "threshold": 0.06999999999999999
170
+ },
171
+ {
172
+ "precision": 0.006826189078097475,
173
+ "recall": 0.9489795918367347,
174
+ "f1": 0.01355487538259729,
175
+ "roc_auc": 0.9721687370080279,
176
+ "pr_auc": 0.7159122424484009,
177
+ "confusion_matrix": [
178
+ [
179
+ 43333,
180
+ 13531
181
+ ],
182
+ [
183
+ 5,
184
+ 93
185
+ ]
186
+ ],
187
+ "threshold": 0.08
188
+ },
189
+ {
190
+ "precision": 0.007441190590494479,
191
+ "recall": 0.9489795918367347,
192
+ "f1": 0.014766592569069545,
193
+ "roc_auc": 0.9721687370080279,
194
+ "pr_auc": 0.7159122424484009,
195
+ "confusion_matrix": [
196
+ [
197
+ 44459,
198
+ 12405
199
+ ],
200
+ [
201
+ 5,
202
+ 93
203
+ ]
204
+ ],
205
+ "threshold": 0.09
206
+ },
207
+ {
208
+ "precision": 0.008117308195862791,
209
+ "recall": 0.9489795918367347,
210
+ "f1": 0.01609692773691043,
211
+ "roc_auc": 0.9721687370080279,
212
+ "pr_auc": 0.7159122424484009,
213
+ "confusion_matrix": [
214
+ [
215
+ 45500,
216
+ 11364
217
+ ],
218
+ [
219
+ 5,
220
+ 93
221
+ ]
222
+ ],
223
+ "threshold": 0.09999999999999999
224
+ },
225
+ {
226
+ "precision": 0.008798486281929991,
227
+ "recall": 0.9489795918367347,
228
+ "f1": 0.017435320584926885,
229
+ "roc_auc": 0.9721687370080279,
230
+ "pr_auc": 0.7159122424484009,
231
+ "confusion_matrix": [
232
+ [
233
+ 46387,
234
+ 10477
235
+ ],
236
+ [
237
+ 5,
238
+ 93
239
+ ]
240
+ ],
241
+ "threshold": 0.11
242
+ },
243
+ {
244
+ "precision": 0.009562982005141388,
245
+ "recall": 0.9489795918367347,
246
+ "f1": 0.018935152193830806,
247
+ "roc_auc": 0.9721687370080279,
248
+ "pr_auc": 0.7159122424484009,
249
+ "confusion_matrix": [
250
+ [
251
+ 47232,
252
+ 9632
253
+ ],
254
+ [
255
+ 5,
256
+ 93
257
+ ]
258
+ ],
259
+ "threshold": 0.12
260
+ },
261
+ {
262
+ "precision": 0.01033103754721173,
263
+ "recall": 0.9489795918367347,
264
+ "f1": 0.02043956043956044,
265
+ "roc_auc": 0.9721687370080279,
266
+ "pr_auc": 0.7159122424484009,
267
+ "confusion_matrix": [
268
+ [
269
+ 47955,
270
+ 8909
271
+ ],
272
+ [
273
+ 5,
274
+ 93
275
+ ]
276
+ ],
277
+ "threshold": 0.13
278
+ },
279
+ {
280
+ "precision": 0.011143062544931704,
281
+ "recall": 0.9489795918367347,
282
+ "f1": 0.022027475130270015,
283
+ "roc_auc": 0.9721687370080279,
284
+ "pr_auc": 0.7159122424484009,
285
+ "confusion_matrix": [
286
+ [
287
+ 48611,
288
+ 8253
289
+ ],
290
+ [
291
+ 5,
292
+ 93
293
+ ]
294
+ ],
295
+ "threshold": 0.14
296
+ },
297
+ {
298
+ "precision": 0.011935318275154004,
299
+ "recall": 0.9489795918367347,
300
+ "f1": 0.023574144486692015,
301
+ "roc_auc": 0.9721687370080279,
302
+ "pr_auc": 0.7159122424484009,
303
+ "confusion_matrix": [
304
+ [
305
+ 49165,
306
+ 7699
307
+ ],
308
+ [
309
+ 5,
310
+ 93
311
+ ]
312
+ ],
313
+ "threshold": 0.15000000000000002
314
+ },
315
+ {
316
+ "precision": 0.012781748213304012,
317
+ "recall": 0.9489795918367347,
318
+ "f1": 0.025223759153783564,
319
+ "roc_auc": 0.9721687370080279,
320
+ "pr_auc": 0.7159122424484009,
321
+ "confusion_matrix": [
322
+ [
323
+ 49681,
324
+ 7183
325
+ ],
326
+ [
327
+ 5,
328
+ 93
329
+ ]
330
+ ],
331
+ "threshold": 0.16
332
+ },
333
+ {
334
+ "precision": 0.013650374284456186,
335
+ "recall": 0.9489795918367347,
336
+ "f1": 0.02691361597453335,
337
+ "roc_auc": 0.9721687370080279,
338
+ "pr_auc": 0.7159122424484009,
339
+ "confusion_matrix": [
340
+ [
341
+ 50144,
342
+ 6720
343
+ ],
344
+ [
345
+ 5,
346
+ 93
347
+ ]
348
+ ],
349
+ "threshold": 0.17
350
+ },
351
+ {
352
+ "precision": 0.014563106796116505,
353
+ "recall": 0.9489795918367347,
354
+ "f1": 0.028685996298581123,
355
+ "roc_auc": 0.9721687370080279,
356
+ "pr_auc": 0.7159122424484009,
357
+ "confusion_matrix": [
358
+ [
359
+ 50571,
360
+ 6293
361
+ ],
362
+ [
363
+ 5,
364
+ 93
365
+ ]
366
+ ],
367
+ "threshold": 0.18000000000000002
368
+ },
369
+ {
370
+ "precision": 0.015567458988952126,
371
+ "recall": 0.9489795918367347,
372
+ "f1": 0.030632411067193676,
373
+ "roc_auc": 0.9721687370080279,
374
+ "pr_auc": 0.7159122424484009,
375
+ "confusion_matrix": [
376
+ [
377
+ 50983,
378
+ 5881
379
+ ],
380
+ [
381
+ 5,
382
+ 93
383
+ ]
384
+ ],
385
+ "threshold": 0.19
386
+ },
387
+ {
388
+ "precision": 0.016358463726884778,
389
+ "recall": 0.9387755102040817,
390
+ "f1": 0.03215658860538273,
391
+ "roc_auc": 0.9721687370080279,
392
+ "pr_auc": 0.7159122424484009,
393
+ "confusion_matrix": [
394
+ [
395
+ 51332,
396
+ 5532
397
+ ],
398
+ [
399
+ 6,
400
+ 92
401
+ ]
402
+ ],
403
+ "threshold": 0.2
404
+ },
405
+ {
406
+ "precision": 0.017355215996981702,
407
+ "recall": 0.9387755102040817,
408
+ "f1": 0.03408038525652899,
409
+ "roc_auc": 0.9721687370080279,
410
+ "pr_auc": 0.7159122424484009,
411
+ "confusion_matrix": [
412
+ [
413
+ 51655,
414
+ 5209
415
+ ],
416
+ [
417
+ 6,
418
+ 92
419
+ ]
420
+ ],
421
+ "threshold": 0.21000000000000002
422
+ },
423
+ {
424
+ "precision": 0.018236472945891785,
425
+ "recall": 0.9285714285714286,
426
+ "f1": 0.035770440251572326,
427
+ "roc_auc": 0.9721687370080279,
428
+ "pr_auc": 0.7159122424484009,
429
+ "confusion_matrix": [
430
+ [
431
+ 51965,
432
+ 4899
433
+ ],
434
+ [
435
+ 7,
436
+ 91
437
+ ]
438
+ ],
439
+ "threshold": 0.22
440
+ },
441
+ {
442
+ "precision": 0.01904761904761905,
443
+ "recall": 0.9183673469387755,
444
+ "f1": 0.037321169396641096,
445
+ "roc_auc": 0.9721687370080279,
446
+ "pr_auc": 0.7159122424484009,
447
+ "confusion_matrix": [
448
+ [
449
+ 52229,
450
+ 4635
451
+ ],
452
+ [
453
+ 8,
454
+ 90
455
+ ]
456
+ ],
457
+ "threshold": 0.23
458
+ },
459
+ {
460
+ "precision": 0.020049008687903765,
461
+ "recall": 0.9183673469387755,
462
+ "f1": 0.03924133420536298,
463
+ "roc_auc": 0.9721687370080279,
464
+ "pr_auc": 0.7159122424484009,
465
+ "confusion_matrix": [
466
+ [
467
+ 52465,
468
+ 4399
469
+ ],
470
+ [
471
+ 8,
472
+ 90
473
+ ]
474
+ ],
475
+ "threshold": 0.24000000000000002
476
+ },
477
+ {
478
+ "precision": 0.021216407355021217,
479
+ "recall": 0.9183673469387755,
480
+ "f1": 0.041474654377880185,
481
+ "roc_auc": 0.9721687370080279,
482
+ "pr_auc": 0.7159122424484009,
483
+ "confusion_matrix": [
484
+ [
485
+ 52712,
486
+ 4152
487
+ ],
488
+ [
489
+ 8,
490
+ 90
491
+ ]
492
+ ],
493
+ "threshold": 0.25
494
+ },
495
+ {
496
+ "precision": 0.0224159402241594,
497
+ "recall": 0.9183673469387755,
498
+ "f1": 0.0437636761487965,
499
+ "roc_auc": 0.9721687370080279,
500
+ "pr_auc": 0.7159122424484009,
501
+ "confusion_matrix": [
502
+ [
503
+ 52939,
504
+ 3925
505
+ ],
506
+ [
507
+ 8,
508
+ 90
509
+ ]
510
+ ],
511
+ "threshold": 0.26
512
+ },
513
+ {
514
+ "precision": 0.023578726748755566,
515
+ "recall": 0.9183673469387755,
516
+ "f1": 0.04597701149425287,
517
+ "roc_auc": 0.9721687370080279,
518
+ "pr_auc": 0.7159122424484009,
519
+ "confusion_matrix": [
520
+ [
521
+ 53137,
522
+ 3727
523
+ ],
524
+ [
525
+ 8,
526
+ 90
527
+ ]
528
+ ],
529
+ "threshold": 0.27
530
+ },
531
+ {
532
+ "precision": 0.024725274725274724,
533
+ "recall": 0.9183673469387755,
534
+ "f1": 0.048154093097913325,
535
+ "roc_auc": 0.9721687370080279,
536
+ "pr_auc": 0.7159122424484009,
537
+ "confusion_matrix": [
538
+ [
539
+ 53314,
540
+ 3550
541
+ ],
542
+ [
543
+ 8,
544
+ 90
545
+ ]
546
+ ],
547
+ "threshold": 0.28
548
+ },
549
+ {
550
+ "precision": 0.02601156069364162,
551
+ "recall": 0.9183673469387755,
552
+ "f1": 0.050590219224283306,
553
+ "roc_auc": 0.9721687370080279,
554
+ "pr_auc": 0.7159122424484009,
555
+ "confusion_matrix": [
556
+ [
557
+ 53494,
558
+ 3370
559
+ ],
560
+ [
561
+ 8,
562
+ 90
563
+ ]
564
+ ],
565
+ "threshold": 0.29000000000000004
566
+ },
567
+ {
568
+ "precision": 0.0272975432211101,
569
+ "recall": 0.9183673469387755,
570
+ "f1": 0.053019145802650956,
571
+ "roc_auc": 0.9721687370080279,
572
+ "pr_auc": 0.7159122424484009,
573
+ "confusion_matrix": [
574
+ [
575
+ 53657,
576
+ 3207
577
+ ],
578
+ [
579
+ 8,
580
+ 90
581
+ ]
582
+ ],
583
+ "threshold": 0.3
584
+ },
585
+ {
586
+ "precision": 0.028598665395614873,
587
+ "recall": 0.9183673469387755,
588
+ "f1": 0.05546995377503852,
589
+ "roc_auc": 0.9721687370080279,
590
+ "pr_auc": 0.7159122424484009,
591
+ "confusion_matrix": [
592
+ [
593
+ 53807,
594
+ 3057
595
+ ],
596
+ [
597
+ 8,
598
+ 90
599
+ ]
600
+ ],
601
+ "threshold": 0.31
602
+ },
603
+ {
604
+ "precision": 0.030010003334444816,
605
+ "recall": 0.9183673469387755,
606
+ "f1": 0.05812076202776881,
607
+ "roc_auc": 0.9721687370080279,
608
+ "pr_auc": 0.7159122424484009,
609
+ "confusion_matrix": [
610
+ [
611
+ 53955,
612
+ 2909
613
+ ],
614
+ [
615
+ 8,
616
+ 90
617
+ ]
618
+ ],
619
+ "threshold": 0.32
620
+ },
621
+ {
622
+ "precision": 0.031315240083507306,
623
+ "recall": 0.9183673469387755,
624
+ "f1": 0.06056527590847914,
625
+ "roc_auc": 0.9721687370080279,
626
+ "pr_auc": 0.7159122424484009,
627
+ "confusion_matrix": [
628
+ [
629
+ 54080,
630
+ 2784
631
+ ],
632
+ [
633
+ 8,
634
+ 90
635
+ ]
636
+ ],
637
+ "threshold": 0.33
638
+ },
639
+ {
640
+ "precision": 0.03278688524590164,
641
+ "recall": 0.9183673469387755,
642
+ "f1": 0.06331340133661625,
643
+ "roc_auc": 0.9721687370080279,
644
+ "pr_auc": 0.7159122424484009,
645
+ "confusion_matrix": [
646
+ [
647
+ 54209,
648
+ 2655
649
+ ],
650
+ [
651
+ 8,
652
+ 90
653
+ ]
654
+ ],
655
+ "threshold": 0.34
656
+ },
657
+ {
658
+ "precision": 0.03425961172440046,
659
+ "recall": 0.9183673469387755,
660
+ "f1": 0.06605504587155964,
661
+ "roc_auc": 0.9721687370080279,
662
+ "pr_auc": 0.7159122424484009,
663
+ "confusion_matrix": [
664
+ [
665
+ 54327,
666
+ 2537
667
+ ],
668
+ [
669
+ 8,
670
+ 90
671
+ ]
672
+ ],
673
+ "threshold": 0.35000000000000003
674
+ },
675
+ {
676
+ "precision": 0.03587086488640893,
677
+ "recall": 0.9183673469387755,
678
+ "f1": 0.06904487917146145,
679
+ "roc_auc": 0.9721687370080279,
680
+ "pr_auc": 0.7159122424484009,
681
+ "confusion_matrix": [
682
+ [
683
+ 54445,
684
+ 2419
685
+ ],
686
+ [
687
+ 8,
688
+ 90
689
+ ]
690
+ ],
691
+ "threshold": 0.36000000000000004
692
+ },
693
+ {
694
+ "precision": 0.037282518641259324,
695
+ "recall": 0.9183673469387755,
696
+ "f1": 0.07165605095541401,
697
+ "roc_auc": 0.9721687370080279,
698
+ "pr_auc": 0.7159122424484009,
699
+ "confusion_matrix": [
700
+ [
701
+ 54540,
702
+ 2324
703
+ ],
704
+ [
705
+ 8,
706
+ 90
707
+ ]
708
+ ],
709
+ "threshold": 0.37
710
+ },
711
+ {
712
+ "precision": 0.038860103626943004,
713
+ "recall": 0.9183673469387755,
714
+ "f1": 0.07456503728251865,
715
+ "roc_auc": 0.9721687370080279,
716
+ "pr_auc": 0.7159122424484009,
717
+ "confusion_matrix": [
718
+ [
719
+ 54638,
720
+ 2226
721
+ ],
722
+ [
723
+ 8,
724
+ 90
725
+ ]
726
+ ],
727
+ "threshold": 0.38
728
+ },
729
+ {
730
+ "precision": 0.04025044722719141,
731
+ "recall": 0.9183673469387755,
732
+ "f1": 0.07712082262210797,
733
+ "roc_auc": 0.9721687370080279,
734
+ "pr_auc": 0.7159122424484009,
735
+ "confusion_matrix": [
736
+ [
737
+ 54718,
738
+ 2146
739
+ ],
740
+ [
741
+ 8,
742
+ 90
743
+ ]
744
+ ],
745
+ "threshold": 0.39
746
+ },
747
+ {
748
+ "precision": 0.04205607476635514,
749
+ "recall": 0.9183673469387755,
750
+ "f1": 0.08042895442359249,
751
+ "roc_auc": 0.9721687370080279,
752
+ "pr_auc": 0.7159122424484009,
753
+ "confusion_matrix": [
754
+ [
755
+ 54814,
756
+ 2050
757
+ ],
758
+ [
759
+ 8,
760
+ 90
761
+ ]
762
+ ],
763
+ "threshold": 0.4
764
+ },
765
+ {
766
+ "precision": 0.043923865300146414,
767
+ "recall": 0.9183673469387755,
768
+ "f1": 0.08383791336748952,
769
+ "roc_auc": 0.9721687370080279,
770
+ "pr_auc": 0.7159122424484009,
771
+ "confusion_matrix": [
772
+ [
773
+ 54905,
774
+ 1959
775
+ ],
776
+ [
777
+ 8,
778
+ 90
779
+ ]
780
+ ],
781
+ "threshold": 0.41000000000000003
782
+ },
783
+ {
784
+ "precision": 0.045754956786985254,
785
+ "recall": 0.9183673469387755,
786
+ "f1": 0.08716707021791767,
787
+ "roc_auc": 0.9721687370080279,
788
+ "pr_auc": 0.7159122424484009,
789
+ "confusion_matrix": [
790
+ [
791
+ 54987,
792
+ 1877
793
+ ],
794
+ [
795
+ 8,
796
+ 90
797
+ ]
798
+ ],
799
+ "threshold": 0.42000000000000004
800
+ },
801
+ {
802
+ "precision": 0.04736842105263158,
803
+ "recall": 0.9183673469387755,
804
+ "f1": 0.09009009009009009,
805
+ "roc_auc": 0.9721687370080279,
806
+ "pr_auc": 0.7159122424484009,
807
+ "confusion_matrix": [
808
+ [
809
+ 55054,
810
+ 1810
811
+ ],
812
+ [
813
+ 8,
814
+ 90
815
+ ]
816
+ ],
817
+ "threshold": 0.43
818
+ },
819
+ {
820
+ "precision": 0.049099836333878884,
821
+ "recall": 0.9183673469387755,
822
+ "f1": 0.09321595028482652,
823
+ "roc_auc": 0.9721687370080279,
824
+ "pr_auc": 0.7159122424484009,
825
+ "confusion_matrix": [
826
+ [
827
+ 55121,
828
+ 1743
829
+ ],
830
+ [
831
+ 8,
832
+ 90
833
+ ]
834
+ ],
835
+ "threshold": 0.44
836
+ },
837
+ {
838
+ "precision": 0.050818746470920384,
839
+ "recall": 0.9183673469387755,
840
+ "f1": 0.09630818619582665,
841
+ "roc_auc": 0.9721687370080279,
842
+ "pr_auc": 0.7159122424484009,
843
+ "confusion_matrix": [
844
+ [
845
+ 55183,
846
+ 1681
847
+ ],
848
+ [
849
+ 8,
850
+ 90
851
+ ]
852
+ ],
853
+ "threshold": 0.45
854
+ },
855
+ {
856
+ "precision": 0.052508751458576426,
857
+ "recall": 0.9183673469387755,
858
+ "f1": 0.09933774834437085,
859
+ "roc_auc": 0.9721687370080279,
860
+ "pr_auc": 0.7159122424484009,
861
+ "confusion_matrix": [
862
+ [
863
+ 55240,
864
+ 1624
865
+ ],
866
+ [
867
+ 8,
868
+ 90
869
+ ]
870
+ ],
871
+ "threshold": 0.46
872
+ },
873
+ {
874
+ "precision": 0.054678007290400975,
875
+ "recall": 0.9183673469387755,
876
+ "f1": 0.10321100917431193,
877
+ "roc_auc": 0.9721687370080279,
878
+ "pr_auc": 0.7159122424484009,
879
+ "confusion_matrix": [
880
+ [
881
+ 55308,
882
+ 1556
883
+ ],
884
+ [
885
+ 8,
886
+ 90
887
+ ]
888
+ ],
889
+ "threshold": 0.47000000000000003
890
+ },
891
+ {
892
+ "precision": 0.056568196103079824,
893
+ "recall": 0.9183673469387755,
894
+ "f1": 0.10657193605683836,
895
+ "roc_auc": 0.9721687370080279,
896
+ "pr_auc": 0.7159122424484009,
897
+ "confusion_matrix": [
898
+ [
899
+ 55363,
900
+ 1501
901
+ ],
902
+ [
903
+ 8,
904
+ 90
905
+ ]
906
+ ],
907
+ "threshold": 0.48000000000000004
908
+ },
909
+ {
910
+ "precision": 0.05870841487279843,
911
+ "recall": 0.9183673469387755,
912
+ "f1": 0.11036174126302882,
913
+ "roc_auc": 0.9721687370080279,
914
+ "pr_auc": 0.7159122424484009,
915
+ "confusion_matrix": [
916
+ [
917
+ 55421,
918
+ 1443
919
+ ],
920
+ [
921
+ 8,
922
+ 90
923
+ ]
924
+ ],
925
+ "threshold": 0.49
926
+ },
927
+ {
928
+ "precision": 0.06097560975609756,
929
+ "recall": 0.9183673469387755,
930
+ "f1": 0.11435832274459974,
931
+ "roc_auc": 0.9721687370080279,
932
+ "pr_auc": 0.7159122424484009,
933
+ "confusion_matrix": [
934
+ [
935
+ 55478,
936
+ 1386
937
+ ],
938
+ [
939
+ 8,
940
+ 90
941
+ ]
942
+ ],
943
+ "threshold": 0.5
944
+ },
945
+ {
946
+ "precision": 0.06382978723404255,
947
+ "recall": 0.9183673469387755,
948
+ "f1": 0.11936339522546419,
949
+ "roc_auc": 0.9721687370080279,
950
+ "pr_auc": 0.7159122424484009,
951
+ "confusion_matrix": [
952
+ [
953
+ 55544,
954
+ 1320
955
+ ],
956
+ [
957
+ 8,
958
+ 90
959
+ ]
960
+ ],
961
+ "threshold": 0.51
962
+ },
963
+ {
964
+ "precision": 0.06642066420664207,
965
+ "recall": 0.9183673469387755,
966
+ "f1": 0.12388162422573985,
967
+ "roc_auc": 0.9721687370080279,
968
+ "pr_auc": 0.7159122424484009,
969
+ "confusion_matrix": [
970
+ [
971
+ 55599,
972
+ 1265
973
+ ],
974
+ [
975
+ 8,
976
+ 90
977
+ ]
978
+ ],
979
+ "threshold": 0.52
980
+ },
981
+ {
982
+ "precision": 0.06813020439061317,
983
+ "recall": 0.9183673469387755,
984
+ "f1": 0.12684989429175475,
985
+ "roc_auc": 0.9721687370080279,
986
+ "pr_auc": 0.7159122424484009,
987
+ "confusion_matrix": [
988
+ [
989
+ 55633,
990
+ 1231
991
+ ],
992
+ [
993
+ 8,
994
+ 90
995
+ ]
996
+ ],
997
+ "threshold": 0.53
998
+ },
999
+ {
1000
+ "precision": 0.0706436420722135,
1001
+ "recall": 0.9183673469387755,
1002
+ "f1": 0.13119533527696792,
1003
+ "roc_auc": 0.9721687370080279,
1004
+ "pr_auc": 0.7159122424484009,
1005
+ "confusion_matrix": [
1006
+ [
1007
+ 55680,
1008
+ 1184
1009
+ ],
1010
+ [
1011
+ 8,
1012
+ 90
1013
+ ]
1014
+ ],
1015
+ "threshold": 0.54
1016
+ },
1017
+ {
1018
+ "precision": 0.07317073170731707,
1019
+ "recall": 0.9183673469387755,
1020
+ "f1": 0.1355421686746988,
1021
+ "roc_auc": 0.9721687370080279,
1022
+ "pr_auc": 0.7159122424484009,
1023
+ "confusion_matrix": [
1024
+ [
1025
+ 55724,
1026
+ 1140
1027
+ ],
1028
+ [
1029
+ 8,
1030
+ 90
1031
+ ]
1032
+ ],
1033
+ "threshold": 0.55
1034
+ },
1035
+ {
1036
+ "precision": 0.0760777683854607,
1037
+ "recall": 0.9183673469387755,
1038
+ "f1": 0.1405152224824356,
1039
+ "roc_auc": 0.9721687370080279,
1040
+ "pr_auc": 0.7159122424484009,
1041
+ "confusion_matrix": [
1042
+ [
1043
+ 55771,
1044
+ 1093
1045
+ ],
1046
+ [
1047
+ 8,
1048
+ 90
1049
+ ]
1050
+ ],
1051
+ "threshold": 0.56
1052
+ },
1053
+ {
1054
+ "precision": 0.07853403141361257,
1055
+ "recall": 0.9183673469387755,
1056
+ "f1": 0.14469453376205788,
1057
+ "roc_auc": 0.9721687370080279,
1058
+ "pr_auc": 0.7159122424484009,
1059
+ "confusion_matrix": [
1060
+ [
1061
+ 55808,
1062
+ 1056
1063
+ ],
1064
+ [
1065
+ 8,
1066
+ 90
1067
+ ]
1068
+ ],
1069
+ "threshold": 0.5700000000000001
1070
+ },
1071
+ {
1072
+ "precision": 0.0820419325432999,
1073
+ "recall": 0.9183673469387755,
1074
+ "f1": 0.1506276150627615,
1075
+ "roc_auc": 0.9721687370080279,
1076
+ "pr_auc": 0.7159122424484009,
1077
+ "confusion_matrix": [
1078
+ [
1079
+ 55857,
1080
+ 1007
1081
+ ],
1082
+ [
1083
+ 8,
1084
+ 90
1085
+ ]
1086
+ ],
1087
+ "threshold": 0.5800000000000001
1088
+ },
1089
+ {
1090
+ "precision": 0.08458646616541353,
1091
+ "recall": 0.9183673469387755,
1092
+ "f1": 0.1549053356282272,
1093
+ "roc_auc": 0.9721687370080279,
1094
+ "pr_auc": 0.7159122424484009,
1095
+ "confusion_matrix": [
1096
+ [
1097
+ 55890,
1098
+ 974
1099
+ ],
1100
+ [
1101
+ 8,
1102
+ 90
1103
+ ]
1104
+ ],
1105
+ "threshold": 0.59
1106
+ },
1107
+ {
1108
+ "precision": 0.0866601752677702,
1109
+ "recall": 0.9081632653061225,
1110
+ "f1": 0.1582222222222222,
1111
+ "roc_auc": 0.9721687370080279,
1112
+ "pr_auc": 0.7159122424484009,
1113
+ "confusion_matrix": [
1114
+ [
1115
+ 55926,
1116
+ 938
1117
+ ],
1118
+ [
1119
+ 9,
1120
+ 89
1121
+ ]
1122
+ ],
1123
+ "threshold": 0.6
1124
+ },
1125
+ {
1126
+ "precision": 0.09035532994923857,
1127
+ "recall": 0.9081632653061225,
1128
+ "f1": 0.16435826408125578,
1129
+ "roc_auc": 0.9721687370080279,
1130
+ "pr_auc": 0.7159122424484009,
1131
+ "confusion_matrix": [
1132
+ [
1133
+ 55968,
1134
+ 896
1135
+ ],
1136
+ [
1137
+ 9,
1138
+ 89
1139
+ ]
1140
+ ],
1141
+ "threshold": 0.61
1142
+ },
1143
+ {
1144
+ "precision": 0.09290187891440502,
1145
+ "recall": 0.9081632653061225,
1146
+ "f1": 0.16856060606060605,
1147
+ "roc_auc": 0.9721687370080279,
1148
+ "pr_auc": 0.7159122424484009,
1149
+ "confusion_matrix": [
1150
+ [
1151
+ 55995,
1152
+ 869
1153
+ ],
1154
+ [
1155
+ 9,
1156
+ 89
1157
+ ]
1158
+ ],
1159
+ "threshold": 0.62
1160
+ },
1161
+ {
1162
+ "precision": 0.09611231101511879,
1163
+ "recall": 0.9081632653061225,
1164
+ "f1": 0.173828125,
1165
+ "roc_auc": 0.9721687370080279,
1166
+ "pr_auc": 0.7159122424484009,
1167
+ "confusion_matrix": [
1168
+ [
1169
+ 56027,
1170
+ 837
1171
+ ],
1172
+ [
1173
+ 9,
1174
+ 89
1175
+ ]
1176
+ ],
1177
+ "threshold": 0.63
1178
+ },
1179
+ {
1180
+ "precision": 0.09866962305986696,
1181
+ "recall": 0.9081632653061225,
1182
+ "f1": 0.178,
1183
+ "roc_auc": 0.9721687370080279,
1184
+ "pr_auc": 0.7159122424484009,
1185
+ "confusion_matrix": [
1186
+ [
1187
+ 56051,
1188
+ 813
1189
+ ],
1190
+ [
1191
+ 9,
1192
+ 89
1193
+ ]
1194
+ ],
1195
+ "threshold": 0.64
1196
+ },
1197
+ {
1198
+ "precision": 0.10194730813287514,
1199
+ "recall": 0.9081632653061225,
1200
+ "f1": 0.18331616889804325,
1201
+ "roc_auc": 0.9721687370080279,
1202
+ "pr_auc": 0.7159122424484009,
1203
+ "confusion_matrix": [
1204
+ [
1205
+ 56080,
1206
+ 784
1207
+ ],
1208
+ [
1209
+ 9,
1210
+ 89
1211
+ ]
1212
+ ],
1213
+ "threshold": 0.65
1214
+ },
1215
+ {
1216
+ "precision": 0.10620525059665871,
1217
+ "recall": 0.9081632653061225,
1218
+ "f1": 0.19017094017094016,
1219
+ "roc_auc": 0.9721687370080279,
1220
+ "pr_auc": 0.7159122424484009,
1221
+ "confusion_matrix": [
1222
+ [
1223
+ 56115,
1224
+ 749
1225
+ ],
1226
+ [
1227
+ 9,
1228
+ 89
1229
+ ]
1230
+ ],
1231
+ "threshold": 0.66
1232
+ },
1233
+ {
1234
+ "precision": 0.11014851485148515,
1235
+ "recall": 0.9081632653061225,
1236
+ "f1": 0.19646799116997793,
1237
+ "roc_auc": 0.9721687370080279,
1238
+ "pr_auc": 0.7159122424484009,
1239
+ "confusion_matrix": [
1240
+ [
1241
+ 56145,
1242
+ 719
1243
+ ],
1244
+ [
1245
+ 9,
1246
+ 89
1247
+ ]
1248
+ ],
1249
+ "threshold": 0.67
1250
+ },
1251
+ {
1252
+ "precision": 0.11424903722721438,
1253
+ "recall": 0.9081632653061225,
1254
+ "f1": 0.20296465222348917,
1255
+ "roc_auc": 0.9721687370080279,
1256
+ "pr_auc": 0.7159122424484009,
1257
+ "confusion_matrix": [
1258
+ [
1259
+ 56174,
1260
+ 690
1261
+ ],
1262
+ [
1263
+ 9,
1264
+ 89
1265
+ ]
1266
+ ],
1267
+ "threshold": 0.68
1268
+ },
1269
+ {
1270
+ "precision": 0.11772486772486772,
1271
+ "recall": 0.9081632653061225,
1272
+ "f1": 0.20843091334894615,
1273
+ "roc_auc": 0.9721687370080279,
1274
+ "pr_auc": 0.7159122424484009,
1275
+ "confusion_matrix": [
1276
+ [
1277
+ 56197,
1278
+ 667
1279
+ ],
1280
+ [
1281
+ 9,
1282
+ 89
1283
+ ]
1284
+ ],
1285
+ "threshold": 0.6900000000000001
1286
+ },
1287
+ {
1288
+ "precision": 0.12141882673942701,
1289
+ "recall": 0.9081632653061225,
1290
+ "f1": 0.21419975932611313,
1291
+ "roc_auc": 0.9721687370080279,
1292
+ "pr_auc": 0.7159122424484009,
1293
+ "confusion_matrix": [
1294
+ [
1295
+ 56220,
1296
+ 644
1297
+ ],
1298
+ [
1299
+ 9,
1300
+ 89
1301
+ ]
1302
+ ],
1303
+ "threshold": 0.7000000000000001
1304
+ },
1305
+ {
1306
+ "precision": 0.12588401697312587,
1307
+ "recall": 0.9081632653061225,
1308
+ "f1": 0.22111801242236026,
1309
+ "roc_auc": 0.9721687370080279,
1310
+ "pr_auc": 0.7159122424484009,
1311
+ "confusion_matrix": [
1312
+ [
1313
+ 56246,
1314
+ 618
1315
+ ],
1316
+ [
1317
+ 9,
1318
+ 89
1319
+ ]
1320
+ ],
1321
+ "threshold": 0.7100000000000001
1322
+ },
1323
+ {
1324
+ "precision": 0.12936046511627908,
1325
+ "recall": 0.9081632653061225,
1326
+ "f1": 0.22646310432569974,
1327
+ "roc_auc": 0.9721687370080279,
1328
+ "pr_auc": 0.7159122424484009,
1329
+ "confusion_matrix": [
1330
+ [
1331
+ 56265,
1332
+ 599
1333
+ ],
1334
+ [
1335
+ 9,
1336
+ 89
1337
+ ]
1338
+ ],
1339
+ "threshold": 0.72
1340
+ },
1341
+ {
1342
+ "precision": 0.13343328335832083,
1343
+ "recall": 0.9081632653061225,
1344
+ "f1": 0.2326797385620915,
1345
+ "roc_auc": 0.9721687370080279,
1346
+ "pr_auc": 0.7159122424484009,
1347
+ "confusion_matrix": [
1348
+ [
1349
+ 56286,
1350
+ 578
1351
+ ],
1352
+ [
1353
+ 9,
1354
+ 89
1355
+ ]
1356
+ ],
1357
+ "threshold": 0.73
1358
+ },
1359
+ {
1360
+ "precision": 0.13650306748466257,
1361
+ "recall": 0.9081632653061225,
1362
+ "f1": 0.23733333333333334,
1363
+ "roc_auc": 0.9721687370080279,
1364
+ "pr_auc": 0.7159122424484009,
1365
+ "confusion_matrix": [
1366
+ [
1367
+ 56301,
1368
+ 563
1369
+ ],
1370
+ [
1371
+ 9,
1372
+ 89
1373
+ ]
1374
+ ],
1375
+ "threshold": 0.74
1376
+ },
1377
+ {
1378
+ "precision": 0.14012738853503184,
1379
+ "recall": 0.8979591836734694,
1380
+ "f1": 0.24242424242424243,
1381
+ "roc_auc": 0.9721687370080279,
1382
+ "pr_auc": 0.7159122424484009,
1383
+ "confusion_matrix": [
1384
+ [
1385
+ 56324,
1386
+ 540
1387
+ ],
1388
+ [
1389
+ 10,
1390
+ 88
1391
+ ]
1392
+ ],
1393
+ "threshold": 0.75
1394
+ },
1395
+ {
1396
+ "precision": 0.14402618657937807,
1397
+ "recall": 0.8979591836734694,
1398
+ "f1": 0.24823695345557123,
1399
+ "roc_auc": 0.9721687370080279,
1400
+ "pr_auc": 0.7159122424484009,
1401
+ "confusion_matrix": [
1402
+ [
1403
+ 56341,
1404
+ 523
1405
+ ],
1406
+ [
1407
+ 10,
1408
+ 88
1409
+ ]
1410
+ ],
1411
+ "threshold": 0.76
1412
+ },
1413
+ {
1414
+ "precision": 0.14864864864864866,
1415
+ "recall": 0.8979591836734694,
1416
+ "f1": 0.25507246376811593,
1417
+ "roc_auc": 0.9721687370080279,
1418
+ "pr_auc": 0.7159122424484009,
1419
+ "confusion_matrix": [
1420
+ [
1421
+ 56360,
1422
+ 504
1423
+ ],
1424
+ [
1425
+ 10,
1426
+ 88
1427
+ ]
1428
+ ],
1429
+ "threshold": 0.77
1430
+ },
1431
+ {
1432
+ "precision": 0.15198618307426598,
1433
+ "recall": 0.8979591836734694,
1434
+ "f1": 0.25997045790251105,
1435
+ "roc_auc": 0.9721687370080279,
1436
+ "pr_auc": 0.7159122424484009,
1437
+ "confusion_matrix": [
1438
+ [
1439
+ 56373,
1440
+ 491
1441
+ ],
1442
+ [
1443
+ 10,
1444
+ 88
1445
+ ]
1446
+ ],
1447
+ "threshold": 0.78
1448
+ },
1449
+ {
1450
+ "precision": 0.15630550621669628,
1451
+ "recall": 0.8979591836734694,
1452
+ "f1": 0.26626323751891073,
1453
+ "roc_auc": 0.9721687370080279,
1454
+ "pr_auc": 0.7159122424484009,
1455
+ "confusion_matrix": [
1456
+ [
1457
+ 56389,
1458
+ 475
1459
+ ],
1460
+ [
1461
+ 10,
1462
+ 88
1463
+ ]
1464
+ ],
1465
+ "threshold": 0.79
1466
+ },
1467
+ {
1468
+ "precision": 0.16087751371115175,
1469
+ "recall": 0.8979591836734694,
1470
+ "f1": 0.27286821705426356,
1471
+ "roc_auc": 0.9721687370080279,
1472
+ "pr_auc": 0.7159122424484009,
1473
+ "confusion_matrix": [
1474
+ [
1475
+ 56405,
1476
+ 459
1477
+ ],
1478
+ [
1479
+ 10,
1480
+ 88
1481
+ ]
1482
+ ],
1483
+ "threshold": 0.8
1484
+ },
1485
+ {
1486
+ "precision": 0.1638418079096045,
1487
+ "recall": 0.8877551020408163,
1488
+ "f1": 0.2766295707472178,
1489
+ "roc_auc": 0.9721687370080279,
1490
+ "pr_auc": 0.7159122424484009,
1491
+ "confusion_matrix": [
1492
+ [
1493
+ 56420,
1494
+ 444
1495
+ ],
1496
+ [
1497
+ 11,
1498
+ 87
1499
+ ]
1500
+ ],
1501
+ "threshold": 0.81
1502
+ },
1503
+ {
1504
+ "precision": 0.17058823529411765,
1505
+ "recall": 0.8877551020408163,
1506
+ "f1": 0.28618421052631576,
1507
+ "roc_auc": 0.9721687370080279,
1508
+ "pr_auc": 0.7159122424484009,
1509
+ "confusion_matrix": [
1510
+ [
1511
+ 56441,
1512
+ 423
1513
+ ],
1514
+ [
1515
+ 11,
1516
+ 87
1517
+ ]
1518
+ ],
1519
+ "threshold": 0.8200000000000001
1520
+ },
1521
+ {
1522
+ "precision": 0.174,
1523
+ "recall": 0.8877551020408163,
1524
+ "f1": 0.2909698996655518,
1525
+ "roc_auc": 0.9721687370080279,
1526
+ "pr_auc": 0.7159122424484009,
1527
+ "confusion_matrix": [
1528
+ [
1529
+ 56451,
1530
+ 413
1531
+ ],
1532
+ [
1533
+ 11,
1534
+ 87
1535
+ ]
1536
+ ],
1537
+ "threshold": 0.8300000000000001
1538
+ },
1539
+ {
1540
+ "precision": 0.1797520661157025,
1541
+ "recall": 0.8877551020408163,
1542
+ "f1": 0.29896907216494845,
1543
+ "roc_auc": 0.9721687370080279,
1544
+ "pr_auc": 0.7159122424484009,
1545
+ "confusion_matrix": [
1546
+ [
1547
+ 56467,
1548
+ 397
1549
+ ],
1550
+ [
1551
+ 11,
1552
+ 87
1553
+ ]
1554
+ ],
1555
+ "threshold": 0.8400000000000001
1556
+ },
1557
+ {
1558
+ "precision": 0.18471337579617833,
1559
+ "recall": 0.8877551020408163,
1560
+ "f1": 0.30579964850615116,
1561
+ "roc_auc": 0.9721687370080279,
1562
+ "pr_auc": 0.7159122424484009,
1563
+ "confusion_matrix": [
1564
+ [
1565
+ 56480,
1566
+ 384
1567
+ ],
1568
+ [
1569
+ 11,
1570
+ 87
1571
+ ]
1572
+ ],
1573
+ "threshold": 0.85
1574
+ },
1575
+ {
1576
+ "precision": 0.19506726457399104,
1577
+ "recall": 0.8877551020408163,
1578
+ "f1": 0.31985294117647056,
1579
+ "roc_auc": 0.9721687370080279,
1580
+ "pr_auc": 0.7159122424484009,
1581
+ "confusion_matrix": [
1582
+ [
1583
+ 56505,
1584
+ 359
1585
+ ],
1586
+ [
1587
+ 11,
1588
+ 87
1589
+ ]
1590
+ ],
1591
+ "threshold": 0.86
1592
+ },
1593
+ {
1594
+ "precision": 0.20374707259953162,
1595
+ "recall": 0.8877551020408163,
1596
+ "f1": 0.3314285714285714,
1597
+ "roc_auc": 0.9721687370080279,
1598
+ "pr_auc": 0.7159122424484009,
1599
+ "confusion_matrix": [
1600
+ [
1601
+ 56524,
1602
+ 340
1603
+ ],
1604
+ [
1605
+ 11,
1606
+ 87
1607
+ ]
1608
+ ],
1609
+ "threshold": 0.87
1610
+ },
1611
+ {
1612
+ "precision": 0.21375921375921375,
1613
+ "recall": 0.8877551020408163,
1614
+ "f1": 0.3445544554455445,
1615
+ "roc_auc": 0.9721687370080279,
1616
+ "pr_auc": 0.7159122424484009,
1617
+ "confusion_matrix": [
1618
+ [
1619
+ 56544,
1620
+ 320
1621
+ ],
1622
+ [
1623
+ 11,
1624
+ 87
1625
+ ]
1626
+ ],
1627
+ "threshold": 0.88
1628
+ },
1629
+ {
1630
+ "precision": 0.2265625,
1631
+ "recall": 0.8877551020408163,
1632
+ "f1": 0.36099585062240663,
1633
+ "roc_auc": 0.9721687370080279,
1634
+ "pr_auc": 0.7159122424484009,
1635
+ "confusion_matrix": [
1636
+ [
1637
+ 56567,
1638
+ 297
1639
+ ],
1640
+ [
1641
+ 11,
1642
+ 87
1643
+ ]
1644
+ ],
1645
+ "threshold": 0.89
1646
+ },
1647
+ {
1648
+ "precision": 0.24507042253521127,
1649
+ "recall": 0.8877551020408163,
1650
+ "f1": 0.3841059602649007,
1651
+ "roc_auc": 0.9721687370080279,
1652
+ "pr_auc": 0.7159122424484009,
1653
+ "confusion_matrix": [
1654
+ [
1655
+ 56596,
1656
+ 268
1657
+ ],
1658
+ [
1659
+ 11,
1660
+ 87
1661
+ ]
1662
+ ],
1663
+ "threshold": 0.9
1664
+ },
1665
+ {
1666
+ "precision": 0.2636363636363636,
1667
+ "recall": 0.8877551020408163,
1668
+ "f1": 0.40654205607476634,
1669
+ "roc_auc": 0.9721687370080279,
1670
+ "pr_auc": 0.7159122424484009,
1671
+ "confusion_matrix": [
1672
+ [
1673
+ 56621,
1674
+ 243
1675
+ ],
1676
+ [
1677
+ 11,
1678
+ 87
1679
+ ]
1680
+ ],
1681
+ "threshold": 0.91
1682
+ },
1683
+ {
1684
+ "precision": 0.28618421052631576,
1685
+ "recall": 0.8877551020408163,
1686
+ "f1": 0.43283582089552236,
1687
+ "roc_auc": 0.9721687370080279,
1688
+ "pr_auc": 0.7159122424484009,
1689
+ "confusion_matrix": [
1690
+ [
1691
+ 56647,
1692
+ 217
1693
+ ],
1694
+ [
1695
+ 11,
1696
+ 87
1697
+ ]
1698
+ ],
1699
+ "threshold": 0.92
1700
+ },
1701
+ {
1702
+ "precision": 0.3246268656716418,
1703
+ "recall": 0.8877551020408163,
1704
+ "f1": 0.47540983606557374,
1705
+ "roc_auc": 0.9721687370080279,
1706
+ "pr_auc": 0.7159122424484009,
1707
+ "confusion_matrix": [
1708
+ [
1709
+ 56683,
1710
+ 181
1711
+ ],
1712
+ [
1713
+ 11,
1714
+ 87
1715
+ ]
1716
+ ],
1717
+ "threshold": 0.93
1718
+ },
1719
+ {
1720
+ "precision": 0.35080645161290325,
1721
+ "recall": 0.8877551020408163,
1722
+ "f1": 0.5028901734104047,
1723
+ "roc_auc": 0.9721687370080279,
1724
+ "pr_auc": 0.7159122424484009,
1725
+ "confusion_matrix": [
1726
+ [
1727
+ 56703,
1728
+ 161
1729
+ ],
1730
+ [
1731
+ 11,
1732
+ 87
1733
+ ]
1734
+ ],
1735
+ "threshold": 0.9400000000000001
1736
+ },
1737
+ {
1738
+ "precision": 0.3918918918918919,
1739
+ "recall": 0.8877551020408163,
1740
+ "f1": 0.54375,
1741
+ "roc_auc": 0.9721687370080279,
1742
+ "pr_auc": 0.7159122424484009,
1743
+ "confusion_matrix": [
1744
+ [
1745
+ 56729,
1746
+ 135
1747
+ ],
1748
+ [
1749
+ 11,
1750
+ 87
1751
+ ]
1752
+ ],
1753
+ "threshold": 0.9500000000000001
1754
+ },
1755
+ {
1756
+ "precision": 0.44387755102040816,
1757
+ "recall": 0.8877551020408163,
1758
+ "f1": 0.5918367346938775,
1759
+ "roc_auc": 0.9721687370080279,
1760
+ "pr_auc": 0.7159122424484009,
1761
+ "confusion_matrix": [
1762
+ [
1763
+ 56755,
1764
+ 109
1765
+ ],
1766
+ [
1767
+ 11,
1768
+ 87
1769
+ ]
1770
+ ],
1771
+ "threshold": 0.9600000000000001
1772
+ },
1773
+ {
1774
+ "precision": 0.47802197802197804,
1775
+ "recall": 0.8877551020408163,
1776
+ "f1": 0.6214285714285714,
1777
+ "roc_auc": 0.9721687370080279,
1778
+ "pr_auc": 0.7159122424484009,
1779
+ "confusion_matrix": [
1780
+ [
1781
+ 56769,
1782
+ 95
1783
+ ],
1784
+ [
1785
+ 11,
1786
+ 87
1787
+ ]
1788
+ ],
1789
+ "threshold": 0.97
1790
+ },
1791
+ {
1792
+ "precision": 0.5151515151515151,
1793
+ "recall": 0.8673469387755102,
1794
+ "f1": 0.6463878326996197,
1795
+ "roc_auc": 0.9721687370080279,
1796
+ "pr_auc": 0.7159122424484009,
1797
+ "confusion_matrix": [
1798
+ [
1799
+ 56784,
1800
+ 80
1801
+ ],
1802
+ [
1803
+ 13,
1804
+ 85
1805
+ ]
1806
+ ],
1807
+ "threshold": 0.98
1808
+ },
1809
+ {
1810
+ "precision": 0.5763888888888888,
1811
+ "recall": 0.8469387755102041,
1812
+ "f1": 0.6859504132231405,
1813
+ "roc_auc": 0.9721687370080279,
1814
+ "pr_auc": 0.7159122424484009,
1815
+ "confusion_matrix": [
1816
+ [
1817
+ 56803,
1818
+ 61
1819
+ ],
1820
+ [
1821
+ 15,
1822
+ 83
1823
+ ]
1824
+ ],
1825
+ "threshold": 0.99
1826
+ }
1827
+ ]
1828
+ },
1829
+ "evaluation_summary": {
1830
+ "test_rows": 56962,
1831
+ "min_recall_target": 0.9,
1832
+ "selection_reason": "meets_min_recall"
1833
+ }
1834
+ }
artifacts/model_training_report.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-02-18T04:09:02.997602+00:00",
3
+ "experiment_name": "fraud-detection-baseline",
4
+ "tracking_uri": "file:./mlruns",
5
+ "data_path": "data/raw/creditcard.csv",
6
+ "preprocessor_path": "models/preprocessor.pkl",
7
+ "model_path": "models/model.pkl",
8
+ "model_report_path": "artifacts/model_report.json",
9
+ "best_model": {
10
+ "model_name": "logistic_regression",
11
+ "run_id": "f953d6a1c2d944338f8fc210408267a9",
12
+ "metrics": {
13
+ "precision": 0.06097560975609756,
14
+ "recall": 0.9183673469387755,
15
+ "f1": 0.11435832274459974,
16
+ "roc_auc": 0.9721687370080279,
17
+ "pr_auc": 0.7159122424484009,
18
+ "confusion_matrix": [
19
+ [
20
+ 55478,
21
+ 1386
22
+ ],
23
+ [
24
+ 8,
25
+ 90
26
+ ]
27
+ ]
28
+ },
29
+ "selected_threshold": 0.74,
30
+ "threshold_metrics": {
31
+ "precision": 0.13650306748466257,
32
+ "recall": 0.9081632653061225,
33
+ "f1": 0.23733333333333334,
34
+ "roc_auc": 0.9721687370080279,
35
+ "pr_auc": 0.7159122424484009,
36
+ "confusion_matrix": [
37
+ [
38
+ 56301,
39
+ 563
40
+ ],
41
+ [
42
+ 9,
43
+ 89
44
+ ]
45
+ ],
46
+ "threshold": 0.74
47
+ }
48
+ },
49
+ "all_results": [
50
+ {
51
+ "model_name": "logistic_regression",
52
+ "run_id": "f953d6a1c2d944338f8fc210408267a9",
53
+ "metrics": {
54
+ "precision": 0.06097560975609756,
55
+ "recall": 0.9183673469387755,
56
+ "f1": 0.11435832274459974,
57
+ "roc_auc": 0.9721687370080279,
58
+ "pr_auc": 0.7159122424484009,
59
+ "confusion_matrix": [
60
+ [
61
+ 55478,
62
+ 1386
63
+ ],
64
+ [
65
+ 8,
66
+ 90
67
+ ]
68
+ ]
69
+ }
70
+ },
71
+ {
72
+ "model_name": "xgboost",
73
+ "run_id": "0ad9425817db4958a142b29f816108f4",
74
+ "metrics": {
75
+ "precision": 0.9186046511627907,
76
+ "recall": 0.8061224489795918,
77
+ "f1": 0.8586956521739131,
78
+ "roc_auc": 0.9775147361983623,
79
+ "pr_auc": 0.87487299490182,
80
+ "confusion_matrix": [
81
+ [
82
+ 56857,
83
+ 7
84
+ ],
85
+ [
86
+ 19,
87
+ 79
88
+ ]
89
+ ]
90
+ }
91
+ }
92
+ ],
93
+ "skipped_models": []
94
+ }
configs/logging.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1
2
+ formatters:
3
+ standard:
4
+ format: '%(asctime)s | %(levelname)s | %(name)s | %(message)s'
5
+ handlers:
6
+ console:
7
+ class: logging.StreamHandler
8
+ formatter: standard
9
+ level: INFO
10
+ root:
11
+ handlers: [console]
12
+ level: INFO
configs/train.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment:
2
+ name: fraud-detection-baseline
3
+
4
+ training:
5
+ test_size: 0.2
6
+ random_state: 42
7
+ imbalance_method: class_weight
8
+ models:
9
+ - logistic_regression
10
+ - xgboost
11
+
12
+ mlflow:
13
+ tracking_uri: file:./mlruns
14
+
15
+ threshold:
16
+ decision_threshold: 0.5
17
+ min_recall_target: 0.9
18
+ min_threshold: 0.01
19
+ max_threshold: 0.99
20
+ grid_size: 99
docker-compose.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ api:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ image: fraud-detection-api:latest
7
+ container_name: fraud-detection-api
8
+ restart: unless-stopped
9
+ ports:
10
+ - "8000:8000"
11
+ environment:
12
+ - PYTHONUNBUFFERED=1
13
+ healthcheck:
14
+ test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"]
15
+ interval: 30s
16
+ timeout: 5s
17
+ retries: 3
18
+ start_period: 20s
models/logistic_regression.pkl ADDED
Binary file (1.54 kB). View file
 
models/model.pkl ADDED
Binary file (1.54 kB). View file
 
models/preprocessor.pkl ADDED
Binary file (2.68 kB). View file
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "fraud-detection-mlops-pipeline"
3
+ version = "0.1.0"
4
+ description = "Fraud detection MLOps pipeline: data validation, training, evaluation, and serving API"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "fastapi>=0.110,<0.116",
9
+ "httpx>=0.27,<0.29",
10
+ "imbalanced-learn>=0.12,<0.15",
11
+ "mlflow>=2.11,<3.0",
12
+ "numpy>=1.26,<3.0",
13
+ "pandas>=2.1,<2.4",
14
+ "pydantic>=2.6,<3.0",
15
+ "pytest>=8.0,<9.0",
16
+ "pytest-cov>=5.0,<7.0",
17
+ "python-dotenv>=1.0,<2.0",
18
+ "pyyaml>=6.0,<7.0",
19
+ "scikit-learn>=1.4,<1.8",
20
+ "uvicorn[standard]>=0.29,<0.36",
21
+ "xgboost>=2.0,<3.0",
22
+ ]
pytest.ini ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [pytest]
2
+ addopts = -q --cov=src --cov=api --cov-report=term-missing --cov-fail-under=80
3
+ testpaths = tests
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.26,<3.0
2
+ pandas>=2.1,<2.4
3
+ scikit-learn>=1.4,<1.8
4
+ imbalanced-learn>=0.12,<0.15
5
+ xgboost>=2.0,<3.0
6
+ mlflow>=2.11,<3.0
7
+ fastapi>=0.110,<0.116
8
+ uvicorn[standard]>=0.29,<0.36
9
+ pydantic>=2.6,<3.0
10
+ python-dotenv>=1.0,<2.0
11
+ pyyaml>=6.0,<7.0
12
+ pytest>=8.0,<9.0
13
+ pytest-cov>=5.0,<7.0
14
+ httpx>=0.27,<0.29
src/__init__.py ADDED
File without changes
src/data_ingestion.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data ingestion and validation utilities for the fraud dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pandas as pd
11
+
12
+ EXPECTED_ROW_COUNT = 284_807
13
+ EXPECTED_COLUMNS = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount", "Class"]
14
+ EXPECTED_CLASS_VALUES = {0, 1}
15
+
16
+
17
def load_data(file_path: str | Path) -> pd.DataFrame:
    """Read the raw fraud dataset from a CSV file into a DataFrame.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the file extension is not ``.csv``.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Dataset not found: {csv_path}")
    if csv_path.suffix.lower() != ".csv":
        raise ValueError(f"Expected a CSV file, got: {csv_path.suffix}")
    return pd.read_csv(csv_path)
25
+
26
+
27
def get_data_statistics(df: pd.DataFrame) -> dict[str, Any]:
    """Summarize the dataset for validation and monitoring.

    ``class_counts`` and ``fraud_ratio`` are populated only when a
    ``Class`` column is present; ``fraud_ratio`` stays ``None`` for an
    empty frame to avoid a zero division.
    """
    counts_by_class: dict[str, int] = {}
    fraud_ratio: float | None = None

    if "Class" in df.columns:
        counts_by_class = {
            str(label): int(count)
            for label, count in df["Class"].value_counts(dropna=False).items()
        }
        if len(df) > 0:
            fraud_ratio = float((df["Class"] == 1).sum() / len(df))

    n_rows, n_cols = df.shape
    return {
        "row_count": int(n_rows),
        "column_count": int(n_cols),
        "missing_values_total": int(df.isna().sum().sum()),
        "duplicate_rows": int(df.duplicated().sum()),
        "class_counts": counts_by_class,
        "fraud_ratio": fraud_ratio,
    }
46
+
47
+
48
def validate_data(df: pd.DataFrame, expected_rows: int = EXPECTED_ROW_COUNT) -> dict[str, Any]:
    """Check schema and data quality; return a structured validation report.

    Schema problems (missing columns, invalid/absent ``Class``) are hard
    errors; row-count drift, missing values, unexpected columns, and a
    single-class target are soft warnings.
    """
    errors: list[str] = []
    warnings: list[str] = []

    present = list(df.columns)
    missing_columns = [column for column in EXPECTED_COLUMNS if column not in present]
    unexpected_columns = [column for column in present if column not in EXPECTED_COLUMNS]

    if missing_columns:
        errors.append(f"Missing required columns: {missing_columns}")
    if unexpected_columns:
        warnings.append(f"Unexpected columns present: {unexpected_columns}")

    stats = get_data_statistics(df)

    row_count = stats["row_count"]
    if expected_rows and row_count != expected_rows:
        warnings.append(f"Row count differs from expected {expected_rows}: got {row_count}")

    if stats["missing_values_total"] > 0:
        warnings.append(f"Dataset contains {stats['missing_values_total']} missing values")

    if "Class" not in df.columns:
        errors.append("Class column not found")
    else:
        observed = set(df["Class"].dropna().unique().tolist())
        invalid_class_values = sorted(observed - EXPECTED_CLASS_VALUES)
        if invalid_class_values:
            errors.append(f"Class contains invalid values: {invalid_class_values}")
        if len(observed) == 1:
            warnings.append("Class column has only one class present")

    return {
        "is_valid": not errors,
        "errors": errors,
        "warnings": warnings,
        "statistics": stats,
    }
84
+
85
+
86
def save_validation_report(report: dict[str, Any], output_path: str | Path) -> Path:
    """Serialize the validation report as pretty-printed JSON; return its path."""
    destination = Path(output_path)
    # Create intermediate directories so a fresh checkout can write artifacts.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return destination
92
+
93
+
94
def run_data_validation(
    file_path: str | Path = "data/raw/creditcard.csv",
    report_path: str | Path = "artifacts/data_validation.json",
) -> dict[str, Any]:
    """Validate the dataset end-to-end: load, check, persist the report.

    The report is written even on failure so the artifact records why the
    run was rejected; schema errors then raise ``ValueError``.
    """
    frame = load_data(file_path)
    report = validate_data(frame)
    save_validation_report(report, report_path)
    if report["is_valid"]:
        return report
    raise ValueError(f"Data validation failed: {report['errors']}")
105
+
106
+
107
+ def _build_parser() -> argparse.ArgumentParser:
108
+ parser = argparse.ArgumentParser(description="Validate fraud dataset schema and quality.")
109
+ parser.add_argument(
110
+ "--data-path",
111
+ default="data/raw/creditcard.csv",
112
+ help="Path to the raw CSV dataset.",
113
+ )
114
+ parser.add_argument(
115
+ "--report-path",
116
+ default="artifacts/data_validation.json",
117
+ help="Path to write the validation report JSON.",
118
+ )
119
+ return parser
120
+
121
+
122
def main() -> None:
    """CLI entrypoint: run validation and print summary statistics."""
    cli_args = _build_parser().parse_args()
    validation_report = run_data_validation(cli_args.data_path, cli_args.report_path)
    print("Data validation passed.")
    print(json.dumps(validation_report["statistics"], indent=2))
127
+
128
+
129
if __name__ == "__main__":  # pragma: no cover - script entry point
    main()
src/evaluate.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model evaluation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ from sklearn.metrics import (
9
+ average_precision_score,
10
+ confusion_matrix,
11
+ f1_score,
12
+ precision_score,
13
+ recall_score,
14
+ roc_auc_score,
15
+ )
16
+
17
+
18
def _safe_roc_auc(y_true, y_pred_proba) -> float:
    """ROC-AUC that degrades to NaN when the score is undefined (e.g. one class)."""
    try:
        score = roc_auc_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
23
+
24
+
25
def _safe_pr_auc(y_true, y_pred_proba) -> float:
    """PR-AUC (average precision) that degrades to NaN when undefined."""
    try:
        score = average_precision_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
30
+
31
+
32
def calculate_metrics(y_true, y_pred, y_pred_proba) -> dict[str, Any]:
    """Compute the classification metrics used for model comparison.

    ``zero_division=0`` keeps precision/recall/F1 finite when a class is
    never predicted; AUC metrics fall back to NaN via the safe wrappers.
    """
    metrics: dict[str, Any] = {
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "roc_auc": _safe_roc_auc(y_true, y_pred_proba),
        "pr_auc": _safe_pr_auc(y_true, y_pred_proba),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
    return metrics
43
+
44
+
45
def rank_models(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Order candidate results best-first by recall, then precision, then ROC-AUC."""

    def sort_key(result: dict[str, Any]) -> tuple[float, float, float]:
        metrics = result["metrics"]
        return (metrics["recall"], metrics["precision"], metrics["roc_auc"])

    return sorted(results, key=sort_key, reverse=True)
52
+
53
+
54
def calculate_metrics_at_threshold(
    y_true,
    y_pred_proba,
    *,
    threshold: float,
) -> dict[str, Any]:
    """Binarize probabilities at *threshold* and compute the standard metric set.

    Predictions use ``proba >= threshold``, so the threshold itself counts
    as positive; the threshold used is recorded in the returned dict.
    """
    hard_predictions = (np.asarray(y_pred_proba) >= threshold).astype(int)
    result = calculate_metrics(y_true, hard_predictions, y_pred_proba)
    result["threshold"] = float(threshold)
    return result
65
+
66
+
67
def evaluate_thresholds(
    y_true,
    y_pred_proba,
    *,
    thresholds: list[float] | None = None,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> list[dict[str, Any]]:
    """Sweep a threshold grid and return the metric set at each point.

    An explicit *thresholds* list overrides the evenly-spaced grid built
    from ``min_threshold``/``max_threshold``/``grid_size``.
    """
    if thresholds is None:
        grid = np.linspace(min_threshold, max_threshold, grid_size).tolist()
    else:
        grid = thresholds
    return [calculate_metrics_at_threshold(y_true, y_pred_proba, threshold=t) for t in grid]
83
+
84
+
85
def select_best_threshold(
    y_true,
    y_pred_proba,
    *,
    min_recall: float = 0.90,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> dict[str, Any]:
    """Pick the threshold maximizing precision among those meeting *min_recall*.

    When no grid point reaches the recall target, the search falls back to
    the full grid (and the report says so via ``selection_reason``). Ties
    break on F1, then recall; among equal keys the lowest threshold wins
    because the grid is evaluated in ascending order.
    """
    evaluations = evaluate_thresholds(
        y_true,
        y_pred_proba,
        min_threshold=min_threshold,
        max_threshold=max_threshold,
        grid_size=grid_size,
    )

    meeting_target = [entry for entry in evaluations if entry["recall"] >= min_recall]
    if meeting_target:
        candidates, reason = meeting_target, "meets_min_recall"
    else:
        candidates, reason = evaluations, "fallback_max_recall"

    # max() returns the first maximal element, matching the original
    # stable sorted(..., reverse=True)[0] behavior.
    best = max(candidates, key=lambda entry: (entry["precision"], entry["f1"], entry["recall"]))

    return {
        "selection_reason": reason,
        "min_recall_target": float(min_recall),
        "selected_threshold": float(best["threshold"]),
        "selected_metrics": best,
        "threshold_grid_size": int(grid_size),
        "thresholds_evaluated": evaluations,
    }
src/predict.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Batch/single prediction helper functions."""
src/preprocessing.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Training/inference preprocessing pipeline utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import joblib
9
+ import numpy as np
10
+ import pandas as pd
11
+ from imblearn.over_sampling import SMOTE
12
+ from sklearn.compose import ColumnTransformer
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.utils.class_weight import compute_class_weight
16
+
17
+ TARGET_COLUMN = "Class"
18
+ SCALE_COLUMNS = ["Time", "Amount"]
19
+
20
+
21
def split_data(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Stratified train/test split of *df* on the target column.

    Returns ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if the target column is absent or *test_size* is not
            strictly between 0 and 1.
    """
    if target_column not in df.columns:
        raise ValueError(f"Missing target column: {target_column}")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be between 0 and 1")

    features = df.drop(columns=[target_column])
    labels = df[target_column]
    # Stratify so the rare fraud class appears in both splits.
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
44
+
45
+
46
def scale_features(
    df: pd.DataFrame,
    *,
    columns: list[str] | None = None,
    scaler: StandardScaler | None = None,
) -> tuple[pd.DataFrame, StandardScaler]:
    """Scale selected columns; return the transformed copy and the scaler.

    When *scaler* is ``None`` a new ``StandardScaler`` is fitted on *df*.
    When a scaler is supplied it is assumed to be already fitted and is
    only applied with ``transform`` — the previous implementation refit a
    supplied scaler via ``fit_transform``, which silently leaked the
    statistics of the data being transformed (e.g. at inference time).

    Raises:
        ValueError: if any requested column is missing from *df*.
    """
    scale_columns = columns or SCALE_COLUMNS
    missing = [column for column in scale_columns if column not in df.columns]
    if missing:
        raise ValueError(f"Columns not found for scaling: {missing}")

    result = df.copy()
    if scaler is None:
        scaler = StandardScaler()
        result[scale_columns] = scaler.fit_transform(df[scale_columns])
    else:
        # Reuse the caller's fitted statistics; do NOT refit.
        result[scale_columns] = scaler.transform(df[scale_columns])
    return result, scaler
62
+
63
+
64
def build_preprocessor(
    feature_columns: list[str],
    *,
    scale_columns: list[str] | None = None,
) -> ColumnTransformer:
    """Create the column transformer applied identically at train and serve time.

    Scales the chosen columns with ``StandardScaler`` and passes the rest
    through untouched; output is configured as a pandas DataFrame.

    Raises:
        ValueError: if any scale column is not among *feature_columns*.
    """
    to_scale = scale_columns if scale_columns else SCALE_COLUMNS
    absent = [column for column in to_scale if column not in feature_columns]
    if absent:
        raise ValueError(f"Scale columns missing from features: {absent}")

    transformer = ColumnTransformer(
        transformers=[("scale", StandardScaler(), to_scale)],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    transformer.set_output(transform="pandas")
    return transformer
82
+
83
+
84
def transform_features(
    preprocessor: ColumnTransformer,
    X: pd.DataFrame,
) -> pd.DataFrame:
    """Apply a fitted preprocessor, always returning a pandas DataFrame.

    If the transformer emits a raw array (set_output not in effect), the
    result is wrapped using the transformer's reported feature names.
    """
    output = preprocessor.transform(X)
    if isinstance(output, pd.DataFrame):
        return output
    return pd.DataFrame(output, columns=preprocessor.get_feature_names_out())
93
+
94
+
95
def handle_imbalance(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    method: str = "class_weight",
    random_state: int = 42,
    sampling_strategy: float = 0.5,
) -> tuple[pd.DataFrame, pd.Series, dict[str, Any]]:
    """Apply the configured imbalance strategy.

    Strategies: ``none`` (passthrough), ``class_weight`` (passthrough plus
    balanced per-class weights in the metadata), ``smote`` (synthetic
    oversampling of the minority class).

    Raises:
        ValueError: for an unrecognized *method*.
    """
    strategy = method.lower()
    if strategy == "none":
        return X_train, y_train, {"method": "none", "class_weight": None}
    if strategy == "class_weight":
        classes = np.array(sorted(y_train.unique().tolist()))
        weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
        mapping = {int(label): float(weight) for label, weight in zip(classes, weights)}
        return X_train, y_train, {"method": "class_weight", "class_weight": mapping}
    if strategy == "smote":
        sampler = SMOTE(random_state=random_state, sampling_strategy=sampling_strategy)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        return (
            pd.DataFrame(X_resampled, columns=X_train.columns),
            pd.Series(y_resampled, name=y_train.name),
            {"method": "smote", "class_weight": None},
        )
    raise ValueError("method must be one of: none, class_weight, smote")
122
+
123
+
124
def save_preprocessor(preprocessor: ColumnTransformer, output_path: str | Path) -> Path:
    """Serialize the fitted preprocessor with joblib; return its path."""
    destination = Path(output_path)
    # Ensure the models/ directory exists on a fresh checkout.
    destination.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(preprocessor, destination)
    return destination
130
+
131
+
132
def load_preprocessor(preprocessor_path: str | Path) -> ColumnTransformer:
    """Deserialize a previously persisted preprocessor from disk."""
    artifact_path = Path(preprocessor_path)
    return joblib.load(artifact_path)
135
+
136
+
137
def preprocess_for_training(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
    imbalance_method: str = "class_weight",
    preprocessor_path: str | Path = "models/preprocessor.pkl",
) -> dict[str, Any]:
    """Run the full preprocessing pipeline for a training run.

    Splits the data, fits the preprocessor on the training partition only
    (avoiding test-set leakage), transforms both partitions, applies the
    imbalance strategy to the training set, and persists the fitted
    preprocessor for serving.
    """
    X_train_raw, X_test_raw, y_train, y_test = split_data(
        df,
        target_column=target_column,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit on training data only; the same transformer is reused at inference.
    preprocessor = build_preprocessor(feature_columns=X_train_raw.columns.tolist())
    preprocessor.fit(X_train_raw)

    X_train = transform_features(preprocessor, X_train_raw)
    X_test = transform_features(preprocessor, X_test_raw)

    X_balanced, y_balanced, imbalance_metadata = handle_imbalance(
        X_train,
        y_train,
        method=imbalance_method,
        random_state=random_state,
    )

    save_preprocessor(preprocessor, preprocessor_path)

    return {
        "X_train": X_balanced,
        "X_test": X_test,
        "y_train": y_balanced,
        "y_test": y_test,
        "preprocessor": preprocessor,
        "imbalance_metadata": imbalance_metadata,
    }
src/register_model.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Model registration helpers (local/MLflow registry)."""
src/train.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Training entrypoint for fraud detection models with MLflow tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import joblib
12
+ import mlflow
13
+ import pandas as pd
14
+ import yaml
15
+ from sklearn.linear_model import LogisticRegression
16
+
17
+ from src.data_ingestion import load_data, run_data_validation
18
+ from src.evaluate import calculate_metrics, rank_models, select_best_threshold
19
+ from src.preprocessing import preprocess_for_training
20
+
21
+ try:
22
+ from xgboost import XGBClassifier
23
+ except Exception: # pragma: no cover - handled at runtime
24
+ XGBClassifier = None
25
+
26
+
27
+ DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
28
+ DEFAULT_DATA_PATH = Path("data/raw/creditcard.csv")
29
+ DEFAULT_MODEL_PATH = Path("models/model.pkl")
30
+ DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
31
+ DEFAULT_REPORT_PATH = Path("artifacts/model_training_report.json")
32
+ DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
33
+ DEFAULT_VALIDATION_REPORT_PATH = Path("artifacts/data_validation.json")
34
+
35
+
36
def load_training_config(config_path: str | Path = DEFAULT_CONFIG_PATH) -> dict[str, Any]:
    """Parse the YAML training config, guaranteeing the top-level sections exist."""
    raw_text = Path(config_path).read_text(encoding="utf-8")
    # An empty file parses to None; normalize to an empty mapping.
    config = yaml.safe_load(raw_text) or {}
    for section in ("experiment", "training", "mlflow"):
        config.setdefault(section, {})
    return config
43
+
44
+
45
def create_model(model_name: str, random_state: int) -> Any:
    """Instantiate the estimator matching *model_name*.

    Raises:
        RuntimeError: when xgboost is requested but not installed.
        ValueError: when the name is not recognised.
    """
    if model_name == "logistic_regression":
        return LogisticRegression(
            max_iter=500,
            solver="lbfgs",
            class_weight="balanced",
            random_state=random_state,
        )

    if model_name != "xgboost":
        raise ValueError(f"Unsupported model: {model_name}")

    # xgboost is an optional dependency; fail loudly if it is absent.
    if XGBClassifier is None:
        raise RuntimeError("xgboost is not available in the environment")
    xgb_params = dict(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=random_state,
        n_jobs=2,
    )
    return XGBClassifier(**xgb_params)
70
+
71
+
72
def train_single_model(
    model_name: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    random_state: int,
) -> tuple[Any, dict[str, Any]]:
    """Fit the named model on the train split and score it on the test split."""
    estimator = create_model(model_name, random_state=random_state)
    estimator.fit(X_train, y_train)

    # Hard labels plus positive-class probabilities feed the metric suite.
    predictions = estimator.predict(X_test)
    fraud_probabilities = estimator.predict_proba(X_test)[:, 1]
    return estimator, calculate_metrics(y_test, predictions, fraud_probabilities)
89
+
90
+
91
def log_run_to_mlflow(
    *,
    experiment_name: str,
    model_name: str,
    params: dict[str, Any],
    metrics: dict[str, Any],
    preprocessor_path: Path,
    model_temp_path: Path,
    artifact_dir: Path,
) -> str:
    """Log one training run (params, metrics, artifacts) to MLflow.

    Returns:
        The MLflow run id of the created run.
    """
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params)
        # Log every real-numbered metric. The previous float-only filter
        # silently dropped integer-valued metrics (e.g. confusion-matrix
        # counts). Exclude bools, which subclass int but are not metrics.
        metric_values = {
            k: v
            for k, v in metrics.items()
            if isinstance(v, (int, float)) and not isinstance(v, bool)
        }
        mlflow.log_metrics(metric_values)

        # Structured artifacts for debugging and reproducibility: the full
        # metrics dict (including non-numeric entries) goes to JSON.
        metrics_path = artifact_dir / f"metrics_{model_name}.json"
        metrics_path.parent.mkdir(parents=True, exist_ok=True)
        metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

        mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")
        mlflow.log_artifact(str(model_temp_path), artifact_path="model")
        mlflow.log_artifact(str(metrics_path), artifact_path="metrics")

        return run.info.run_id
118
+
119
+
120
def save_model(model: Any, output_path: str | Path = DEFAULT_MODEL_PATH) -> Path:
    """Write the trained model to *output_path*, creating parent dirs as needed."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, target)
    return target
126
+
127
+
128
def run_training_pipeline(
    *,
    config_path: str | Path = DEFAULT_CONFIG_PATH,
    data_path: str | Path = DEFAULT_DATA_PATH,
    model_path: str | Path = DEFAULT_MODEL_PATH,
    preprocessor_path: str | Path = DEFAULT_PREPROCESSOR_PATH,
    report_path: str | Path = DEFAULT_REPORT_PATH,
    model_report_path: str | Path = DEFAULT_MODEL_REPORT_PATH,
    validation_report_path: str | Path = DEFAULT_VALIDATION_REPORT_PATH,
) -> dict[str, Any]:
    """Execute end-to-end training and experiment tracking pipeline.

    Phases: load config -> validate data -> preprocess -> train each
    configured model (logging every run to MLflow) -> rank models ->
    tune the decision threshold on the best model -> persist the model,
    a model-evaluation report, and a full training report.

    Args:
        config_path: YAML training configuration file.
        data_path: Raw dataset CSV.
        model_path: Destination for the best model's pickle.
        preprocessor_path: Destination for the fitted preprocessor pickle.
        report_path: Destination for the full training-report JSON.
        model_report_path: Destination for the model-evaluation JSON.
        validation_report_path: Destination for the data-validation JSON.

    Returns:
        The training report dict (also written to ``report_path``).

    Raises:
        RuntimeError: if no model in the configured list trains successfully.
    """
    config = load_training_config(config_path)

    # --- Experiment-tracking setup -------------------------------------
    experiment_name = config["experiment"].get("name", "fraud-detection-baseline")
    tracking_uri = config["mlflow"].get("tracking_uri", "file:./mlruns")
    mlflow.set_tracking_uri(tracking_uri)

    # --- Pull training hyperparameters out of the config, with defaults -
    training_cfg = config["training"]
    random_state = int(training_cfg.get("random_state", 42))
    test_size = float(training_cfg.get("test_size", 0.2))
    imbalance_method = str(training_cfg.get("imbalance_method", "class_weight"))
    # A "models" list wins; otherwise fall back to the single "model" key.
    models = training_cfg.get("models") or [training_cfg.get("model", "logistic_regression")]
    threshold_cfg = config.get("threshold", {})
    min_recall_target = float(threshold_cfg.get("min_recall_target", 0.90))
    threshold_grid_size = int(threshold_cfg.get("grid_size", 99))
    threshold_min = float(threshold_cfg.get("min_threshold", 0.01))
    threshold_max = float(threshold_cfg.get("max_threshold", 0.99))

    # --- Validate and preprocess the data (validation raises on failure) -
    run_data_validation(file_path=data_path, report_path=validation_report_path)
    raw_df = load_data(data_path)
    prep = preprocess_for_training(
        raw_df,
        test_size=test_size,
        random_state=random_state,
        imbalance_method=imbalance_method,
        preprocessor_path=preprocessor_path,
    )

    # --- Train every configured model, logging each run to MLflow -------
    results: list[dict[str, Any]] = []
    skipped_models: list[dict[str, str]] = []
    artifact_dir = Path(report_path).parent
    artifact_dir.mkdir(parents=True, exist_ok=True)
    preprocessor_path_obj = Path(preprocessor_path)
    for model_name in models:
        try:
            model, metrics = train_single_model(
                model_name=model_name,
                X_train=prep["X_train"],
                y_train=prep["y_train"],
                X_test=prep["X_test"],
                y_test=prep["y_test"],
                random_state=random_state,
            )
        except RuntimeError as exc:
            # e.g. xgboost not installed — record and keep going.
            skipped_models.append({"model_name": model_name, "reason": str(exc)})
            continue

        # Persist a per-model pickle so the MLflow run can attach it.
        temp_model_path = Path(model_path).parent / f"{model_name}.pkl"
        save_model(model, temp_model_path)

        run_id = log_run_to_mlflow(
            experiment_name=experiment_name,
            model_name=model_name,
            params={
                "model_name": model_name,
                "test_size": test_size,
                "random_state": random_state,
                "imbalance_method": imbalance_method,
            },
            metrics=metrics,
            preprocessor_path=preprocessor_path_obj,
            model_temp_path=temp_model_path,
            artifact_dir=artifact_dir,
        )

        results.append({"model_name": model_name, "model": model, "metrics": metrics, "run_id": run_id})

    if not results:
        raise RuntimeError("No models were successfully trained.")

    # --- Pick the best model and tune its decision threshold ------------
    ranked = rank_models(results)
    best = ranked[0]
    y_test_proba_best = best["model"].predict_proba(prep["X_test"])[:, 1]
    threshold_selection = select_best_threshold(
        prep["y_test"],
        y_test_proba_best,
        min_recall=min_recall_target,
        min_threshold=threshold_min,
        max_threshold=threshold_max,
        grid_size=threshold_grid_size,
    )

    # --- Write the model-evaluation report -------------------------------
    model_report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "best_model_name": best["model_name"],
        "default_threshold_metrics": best["metrics"],
        "threshold_selection": threshold_selection,
        "evaluation_summary": {
            "test_rows": int(len(prep["y_test"])),
            "min_recall_target": min_recall_target,
            "selection_reason": threshold_selection["selection_reason"],
        },
    }
    model_report_path_obj = Path(model_report_path)
    model_report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    model_report_path_obj.write_text(json.dumps(model_report, indent=2), encoding="utf-8")

    # The best model becomes the canonical serving artifact.
    final_model_path = save_model(best["model"], model_path)

    # --- Write the full training report ---------------------------------
    report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "experiment_name": experiment_name,
        "tracking_uri": tracking_uri,
        "data_path": str(data_path),
        "preprocessor_path": str(preprocessor_path),
        "model_path": str(final_model_path),
        "model_report_path": str(model_report_path_obj),
        "best_model": {
            "model_name": best["model_name"],
            "run_id": best["run_id"],
            "metrics": best["metrics"],
            "selected_threshold": threshold_selection["selected_threshold"],
            "threshold_metrics": threshold_selection["selected_metrics"],
        },
        "all_results": [
            {"model_name": entry["model_name"], "run_id": entry["run_id"], "metrics": entry["metrics"]}
            for entry in ranked
        ],
        "skipped_models": skipped_models,
    }

    report_path_obj = Path(report_path)
    report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    report_path_obj.write_text(json.dumps(report, indent=2), encoding="utf-8")

    return report
264
+
265
+
266
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the training entrypoint."""
    parser = argparse.ArgumentParser(description="Train fraud model and log to MLflow.")
    # (flag, default path constant, help text) — all flags are plain strings.
    flag_specs = [
        ("--config", DEFAULT_CONFIG_PATH, "Training config YAML path."),
        ("--data-path", DEFAULT_DATA_PATH, "Dataset CSV path."),
        ("--model-path", DEFAULT_MODEL_PATH, "Output model artifact path."),
        ("--preprocessor-path", DEFAULT_PREPROCESSOR_PATH, "Output preprocessor artifact path."),
        ("--report-path", DEFAULT_REPORT_PATH, "Training report JSON path."),
        ("--model-report-path", DEFAULT_MODEL_REPORT_PATH, "Model evaluation report JSON path."),
    ]
    for flag, default_path, help_text in flag_specs:
        parser.add_argument(flag, default=str(default_path), help=help_text)
    return parser
283
+
284
+
285
def main() -> None:
    """CLI entrypoint: run the pipeline and print a short summary."""
    args = _build_parser().parse_args()
    report = run_training_pipeline(
        config_path=args.config,
        data_path=args.data_path,
        model_path=args.model_path,
        preprocessor_path=args.preprocessor_path,
        report_path=args.report_path,
        model_report_path=args.model_report_path,
    )

    best = report["best_model"]
    summary_lines = [
        "Training completed.",
        f"Best model: {best['model_name']}",
        f"Selected threshold: {best['selected_threshold']:.4f}",
        json.dumps(best["threshold_metrics"], indent=2),
    ]
    for line in summary_lines:
        print(line)
301
+
302
+
303
+ if __name__ == "__main__":
304
+ main()
tests/conftest.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Ensure repository root is importable in pytest (for `src.*` imports).
7
+ ROOT = Path(__file__).resolve().parents[1]
8
+ if str(ROOT) not in sys.path:
9
+ sys.path.insert(0, str(ROOT))
tests/test_api.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from fastapi import HTTPException
6
+ from fastapi.testclient import TestClient
7
+
8
+ from api.app import app, get_inference_service
9
+
10
+
11
class DummyService:
    """Stand-in inference service: flags any transaction over 200 as fraud."""

    threshold = 0.74
    model_path = Path("models/model.pkl")
    preprocessor_path = Path("models/preprocessor.pkl")

    def predict_records(self, records):
        results = []
        for record in records:
            probability = 0.9 if float(record["Amount"]) > 200 else 0.1
            results.append(
                {
                    "is_fraud": probability >= self.threshold,
                    "fraud_probability": probability,
                    "risk_level": "high" if probability >= 0.7 else "low",
                    "threshold": self.threshold,
                }
            )
        return results
30
+
31
+
32
+ def _transaction(amount: float = 10.0) -> dict[str, float]:
33
+ payload = {"Time": 0.0, "Amount": amount}
34
+ for i in range(1, 29):
35
+ payload[f"V{i}"] = 0.0
36
+ return payload
37
+
38
+
39
def test_health_endpoint() -> None:
    """/health reports ok and model_loaded once the service dependency resolves."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.get("/health")

        assert response.status_code == 200
        body = response.json()
        assert body["status"] == "ok"
        assert body["model_loaded"] is True
    finally:
        # Always drop the override so a failing assertion can't leak it
        # into other tests in the session.
        app.dependency_overrides.clear()
50
+
51
+
52
def test_predict_endpoint_valid_payload() -> None:
    """A high-amount transaction is classified fraud and gets a request id header."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.post("/predict", json=_transaction(amount=350.0))

        assert response.status_code == 200
        body = response.json()
        assert body["is_fraud"] is True
        assert body["risk_level"] == "high"
        assert response.headers.get("X-Request-ID")
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
64
+
65
+
66
def test_predict_endpoint_invalid_payload() -> None:
    """Omitting a required feature must be rejected with a 422 validation error."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        payload = _transaction()
        payload.pop("V28")
        response = client.post("/predict", json=payload)

        assert response.status_code == 422
    finally:
        # finally guarantees cleanup even when the assertion fails.
        app.dependency_overrides.clear()
76
+
77
+
78
def test_batch_prediction_endpoint() -> None:
    """Batch endpoint returns one prediction per transaction, in order."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.post(
            "/predict/batch",
            json={"transactions": [_transaction(20.0), _transaction(300.0)]},
        )

        assert response.status_code == 200
        body = response.json()
        assert len(body["predictions"]) == 2
        assert body["predictions"][0]["is_fraud"] is False
        assert body["predictions"][1]["is_fraud"] is True
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
93
+
94
+
95
def test_metrics_endpoint_tracks_predictions_and_requests() -> None:
    """/metrics counters advance with requests/predictions and rates stay in [0, 1]."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        before = client.get("/metrics")
        assert before.status_code == 200
        before_body = before.json()

        predict_response = client.post("/predict", json=_transaction(amount=350.0))
        assert predict_response.status_code == 200

        after = client.get("/metrics")
        assert after.status_code == 200
        after_body = after.json()

        # >= (not ==) because middleware may count the /metrics calls too.
        assert after_body["total_requests"] >= before_body["total_requests"] + 2
        assert after_body["total_predictions"] >= before_body["total_predictions"] + 1
        assert 0.0 <= after_body["error_rate"] <= 1.0
        assert 0.0 <= after_body["fraud_prediction_rate"] <= 1.0
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
115
+
116
+
117
def test_health_returns_503_when_service_unavailable() -> None:
    """When the dependency raises, /health propagates the 503 and its detail."""

    def _raise():
        raise HTTPException(status_code=503, detail="Model artifact not found")

    app.dependency_overrides[get_inference_service] = _raise
    try:
        client = TestClient(app)

        response = client.get("/health")

        assert response.status_code == 503
        assert "Model artifact not found" in response.json()["detail"]
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
tests/test_data_ingestion.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ import pandas as pd
6
+ import pytest
7
+
8
+ from src.data_ingestion import (
9
+ EXPECTED_COLUMNS,
10
+ load_data,
11
+ run_data_validation,
12
+ validate_data,
13
+ )
14
+
15
+
16
def _valid_df() -> pd.DataFrame:
    """Single all-zero row covering every expected column, labelled non-fraud."""
    record = dict.fromkeys(EXPECTED_COLUMNS, 0.0)
    record["Class"] = 0
    return pd.DataFrame([record])
20
+
21
+
22
def test_load_data_reads_csv(tmp_path) -> None:
    """load_data round-trips a CSV written with the expected schema."""
    source = _valid_df()
    csv_path = tmp_path / "creditcard.csv"
    source.to_csv(csv_path, index=False)

    result = load_data(csv_path)

    assert list(result.columns) == EXPECTED_COLUMNS
    assert result.shape == (1, len(EXPECTED_COLUMNS))
31
+
32
+
33
def test_validate_data_invalid_when_required_column_missing() -> None:
    """Dropping a required column must invalidate the report."""
    frame = _valid_df().drop(columns=["Amount"])

    report = validate_data(frame)

    assert report["is_valid"] is False
    assert any("Missing required columns" in message for message in report["errors"])
40
+
41
+
42
def test_validate_data_invalid_when_class_has_invalid_values() -> None:
    """A Class value outside {0, 1} must invalidate the report."""
    frame = _valid_df()
    frame.loc[0, "Class"] = 3

    report = validate_data(frame)

    assert report["is_valid"] is False
    assert any("Class contains invalid values" in message for message in report["errors"])
50
+
51
+
52
def test_run_data_validation_writes_report_and_fails_fast(tmp_path) -> None:
    """Invalid data raises, but the JSON report is still written first."""
    broken = _valid_df().drop(columns=["Class"])
    csv_path = tmp_path / "creditcard.csv"
    json_path = tmp_path / "data_validation.json"
    broken.to_csv(csv_path, index=False)

    with pytest.raises(ValueError):
        run_data_validation(csv_path, json_path)

    assert json_path.exists()
    written = json.loads(json_path.read_text(encoding="utf-8"))
    assert written["is_valid"] is False
64
+
65
+
66
def test_run_data_validation_passes_for_valid_schema(tmp_path) -> None:
    """A schema-conforming frame validates cleanly and persists its report."""
    good = _valid_df()
    csv_path = tmp_path / "creditcard.csv"
    json_path = tmp_path / "data_validation.json"
    good.to_csv(csv_path, index=False)

    outcome = run_data_validation(csv_path, json_path)

    assert outcome["is_valid"] is True
    assert json_path.exists()
tests/test_evaluate.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+ from src.evaluate import (
6
+ calculate_metrics_at_threshold,
7
+ evaluate_thresholds,
8
+ select_best_threshold,
9
+ )
10
+
11
+
12
def test_calculate_metrics_at_threshold_contains_threshold() -> None:
    """The metrics dict echoes the threshold, with recall/precision in [0, 1]."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    result = calculate_metrics_at_threshold(labels, scores, threshold=0.5)

    assert result["threshold"] == 0.5
    assert 0.0 <= result["recall"] <= 1.0
    assert 0.0 <= result["precision"] <= 1.0
21
+
22
+
23
def test_evaluate_thresholds_returns_expected_grid_size() -> None:
    """The grid is evaluated at exactly grid_size points, starting at the minimum."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    rows = evaluate_thresholds(labels, scores, min_threshold=0.1, max_threshold=0.9, grid_size=9)

    assert len(rows) == 9
    assert rows[0]["threshold"] == 0.1
31
+
32
+
33
def test_select_best_threshold_prefers_precision_under_recall_constraint() -> None:
    """Selection honours the recall floor and stays inside the search interval."""
    labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    scores = np.array([0.02, 0.15, 0.20, 0.30, 0.55, 0.65, 0.80, 0.95])

    choice = select_best_threshold(
        labels,
        scores,
        min_recall=0.75,
        min_threshold=0.1,
        max_threshold=0.9,
        grid_size=17,
    )

    assert choice["selected_metrics"]["recall"] >= 0.75
    assert 0.1 <= choice["selected_threshold"] <= 0.9
    assert choice["selection_reason"] in {"meets_min_recall", "fallback_max_recall"}
tests/test_preprocessing.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import pytest
6
+
7
+ from src.preprocessing import (
8
+ SCALE_COLUMNS,
9
+ build_preprocessor,
10
+ handle_imbalance,
11
+ load_preprocessor,
12
+ preprocess_for_training,
13
+ save_preprocessor,
14
+ scale_features,
15
+ split_data,
16
+ transform_features,
17
+ )
18
+
19
+
20
@pytest.fixture
def sample_df() -> pd.DataFrame:
    """200-row synthetic dataset with 20 fraud rows, matching the raw schema."""
    rng = np.random.default_rng(42)
    n_rows, n_fraud = 200, 20

    # RNG draws happen in the same order as before: Time, Amount, then V1..V28.
    columns: dict[str, np.ndarray] = {
        "Time": rng.normal(loc=5000, scale=1000, size=n_rows),
        "Amount": rng.normal(loc=120, scale=50, size=n_rows),
    }
    for idx in range(1, 29):
        columns[f"V{idx}"] = rng.normal(size=n_rows)

    labels = np.array([0] * (n_rows - n_fraud) + [1] * n_fraud)
    rng.shuffle(labels)
    columns["Class"] = labels

    return pd.DataFrame(columns)
38
+
39
+
40
def test_split_data_is_stratified(sample_df: pd.DataFrame) -> None:
    """An 80/20 split keeps the fraud ratio stable in both partitions."""
    X_train, X_test, y_train, y_test = split_data(sample_df, test_size=0.2, random_state=42)

    overall_ratio = sample_df["Class"].mean()

    assert X_train.shape[0] == 160
    assert X_test.shape[0] == 40
    assert abs(y_train.mean() - overall_ratio) < 0.02
    assert abs(y_test.mean() - overall_ratio) < 0.02
51
+
52
+
53
def test_scale_features_transforms_only_selected_columns(sample_df: pd.DataFrame) -> None:
    """Scaling centres the configured columns and leaves the rest untouched."""
    raw_features = sample_df.drop(columns=["Class"])

    scaled, fitted_scaler = scale_features(raw_features)

    assert fitted_scaler is not None
    for name in SCALE_COLUMNS:
        # Standard-scaled columns should be (numerically) zero-mean.
        assert abs(float(scaled[name].mean())) < 1e-6
    assert np.allclose(raw_features["V1"].values, scaled["V1"].values)
62
+
63
+
64
def test_handle_imbalance_smote_increases_minority_class(sample_df: pd.DataFrame) -> None:
    """SMOTE oversampling must grow the minority class without row mismatch."""
    X_train, _, y_train, _ = split_data(sample_df, test_size=0.2, random_state=42)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)
    X_transformed = transform_features(fitted, X_train)

    original_counts = y_train.value_counts().to_dict()
    X_resampled, y_resampled, info = handle_imbalance(
        X_transformed, y_train, method="smote", sampling_strategy=0.8
    )
    resampled_counts = y_resampled.value_counts().to_dict()

    assert info["method"] == "smote"
    assert resampled_counts[1] > original_counts[1]
    assert X_resampled.shape[0] == y_resampled.shape[0]
79
+
80
+
81
def test_preprocessor_save_load_roundtrip(sample_df: pd.DataFrame, tmp_path) -> None:
    """A persisted preprocessor reloads and transforms with unchanged columns."""
    X_train, _, _, _ = split_data(sample_df, test_size=0.2, random_state=42)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)

    artifact_path = tmp_path / "preprocessor.pkl"
    save_preprocessor(fitted, artifact_path)
    reloaded = load_preprocessor(artifact_path)

    sample = X_train.head(5)
    output = transform_features(reloaded, sample)
    assert list(output.columns) == X_train.columns.tolist()
    assert output.shape == (5, X_train.shape[1])
93
+
94
+
95
def test_preprocess_for_training_creates_artifact(sample_df: pd.DataFrame, tmp_path) -> None:
    """End-to-end preprocessing writes the artifact and returns 30 features."""
    artifact_path = tmp_path / "preprocessor.pkl"

    outcome = preprocess_for_training(
        sample_df,
        test_size=0.2,
        random_state=42,
        imbalance_method="class_weight",
        preprocessor_path=artifact_path,
    )

    assert artifact_path.exists()
    assert outcome["X_train"].shape[1] == 30
    assert outcome["X_test"].shape[1] == 30
    assert outcome["imbalance_metadata"]["method"] == "class_weight"
    assert outcome["imbalance_metadata"]["class_weight"] is not None
111
+
112
+
113
def test_handle_imbalance_rejects_unknown_method(sample_df: pd.DataFrame) -> None:
    """Unknown rebalancing strategies raise a ValueError."""
    X_train, _, y_train, _ = split_data(sample_df)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)
    X_transformed = transform_features(fitted, X_train)

    with pytest.raises(ValueError):
        handle_imbalance(X_transformed, y_train, method="unknown")
tests/test_service.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import joblib
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from api.service import InferenceService, load_inference_service, resolve_threshold
11
+
12
+
13
class DummyPreprocessor:
    """Identity preprocessor advertising the 30 expected feature names."""

    feature_names_in_ = np.array(["Time", *[f"V{i}" for i in range(1, 29)], "Amount"])

    def transform(self, frame: pd.DataFrame) -> pd.DataFrame:
        # Pass-through: the dummy model reads the raw values directly.
        return frame
18
+
19
+
20
class DummyModel:
    """Rule-based stand-in: probability buckets keyed on transaction amount."""

    def predict_proba(self, frame: pd.DataFrame) -> np.ndarray:
        rows = []
        for amount in frame["Amount"].tolist():
            if amount >= 300:
                rows.append([0.1, 0.9])
            elif amount >= 100:
                rows.append([0.55, 0.45])
            else:
                rows.append([0.95, 0.05])
        return np.array(rows)
31
+
32
+
33
+ def _record(amount: float) -> dict[str, float]:
34
+ payload = {"Time": 0.0, "Amount": amount}
35
+ for i in range(1, 29):
36
+ payload[f"V{i}"] = 0.0
37
+ return payload
38
+
39
+
40
def test_inference_service_predict_records_risk_levels() -> None:
    """Risk levels track the dummy model's probability buckets."""
    feature_names = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount"]
    service = InferenceService(
        model=DummyModel(),
        preprocessor=DummyPreprocessor(),
        threshold=0.5,
        model_path=Path("models/model.pkl"),
        preprocessor_path=Path("models/preprocessor.pkl"),
        feature_columns=feature_names,
    )

    outputs = service.predict_records([_record(20), _record(120), _record(320)])

    assert outputs[0]["risk_level"] == "low"
    assert outputs[1]["risk_level"] == "medium"
    assert outputs[2]["risk_level"] == "high"
    assert outputs[2]["is_fraud"] is True
56
+
57
+
58
def test_resolve_threshold_precedence(tmp_path) -> None:
    """The training report wins over the model report and the YAML config."""
    training_report = tmp_path / "model_training_report.json"
    model_report = tmp_path / "model_report.json"
    config_path = tmp_path / "train.yaml"

    # Three candidate sources, each with a different value.
    config_path.write_text("threshold:\n  decision_threshold: 0.51\n", encoding="utf-8")
    model_report.write_text(
        json.dumps({"threshold_selection": {"selected_threshold": 0.63}}), encoding="utf-8"
    )
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.74}}), encoding="utf-8"
    )

    resolved = resolve_threshold(
        training_report_path=training_report,
        model_report_path=model_report,
        config_path=config_path,
    )

    assert resolved == 0.74
78
+
79
+
80
def test_load_inference_service_reads_artifacts_and_threshold(tmp_path) -> None:
    """The service loads pickled artifacts and the training-report threshold."""
    # The loader is cached; clear it so this test sees fresh artifacts.
    load_inference_service.cache_clear()

    model_path = tmp_path / "model.pkl"
    preprocessor_path = tmp_path / "preprocessor.pkl"
    training_report = tmp_path / "model_training_report.json"

    joblib.dump(DummyModel(), model_path)
    joblib.dump(DummyPreprocessor(), preprocessor_path)
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.66}}), encoding="utf-8"
    )

    service = load_inference_service(
        model_path=str(model_path),
        preprocessor_path=str(preprocessor_path),
        training_report_path=str(training_report),
        model_report_path=str(tmp_path / "missing_model_report.json"),
        config_path=str(tmp_path / "missing_config.yaml"),
    )

    assert service.threshold == 0.66
    predictions = service.predict_records([_record(300.0)])
    assert predictions[0]["is_fraud"] is True
tests/test_smoke.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def test_smoke() -> None:
    """Trivial canary proving the test harness itself is wired up."""
    assert True
tests/test_training.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import yaml
8
+
9
+ from src.evaluate import rank_models
10
+ from src.train import run_training_pipeline, train_single_model
11
+
12
+
13
+ def _synthetic_df(rows: int = 160) -> pd.DataFrame:
14
+ rng = np.random.default_rng(7)
15
+ data: dict[str, np.ndarray] = {
16
+ "Time": rng.normal(loc=1000, scale=250, size=rows),
17
+ "Amount": rng.normal(loc=80, scale=20, size=rows),
18
+ }
19
+ for i in range(1, 29):
20
+ data[f"V{i}"] = rng.normal(size=rows)
21
+
22
+ y = np.zeros(rows, dtype=int)
23
+ fraud_indices = rng.choice(rows, size=max(8, rows // 20), replace=False)
24
+ y[fraud_indices] = 1
25
+
26
+ # Inject weak signal for separability.
27
+ data["Amount"][fraud_indices] += 40
28
+ data["V3"][fraud_indices] += 1.5
29
+ data["Class"] = y
30
+ return pd.DataFrame(data)
31
+
32
+
33
def test_rank_models_orders_by_recall_then_precision() -> None:
    """Ranking sorts by recall first and breaks ties on precision."""
    candidates = [
        {"model_name": "a", "metrics": {"recall": 0.8, "precision": 0.9, "roc_auc": 0.9}},
        {"model_name": "b", "metrics": {"recall": 0.9, "precision": 0.7, "roc_auc": 0.95}},
        {"model_name": "c", "metrics": {"recall": 0.9, "precision": 0.8, "roc_auc": 0.85}},
    ]

    ranked = rank_models(candidates)

    # c and b tie on recall (0.9) and are ordered by precision; a trails on recall.
    order = [candidate["model_name"] for candidate in ranked]
    assert order == ["c", "b", "a"]
42
+
43
+
44
def test_train_single_model_returns_expected_metrics() -> None:
    """Training one model yields the full metric set with recall in [0, 1]."""
    frame = _synthetic_df(200)
    features = frame.drop(columns=["Class"])
    labels = frame["Class"]

    # A deterministic head/tail split is enough for a unit test.
    split = 160
    _, metrics = train_single_model(
        model_name="logistic_regression",
        X_train=features.iloc[:split],
        y_train=labels.iloc[:split],
        X_test=features.iloc[split:],
        y_test=labels.iloc[split:],
        random_state=42,
    )

    expected_keys = {"precision", "recall", "f1", "roc_auc", "pr_auc", "confusion_matrix"}
    assert set(metrics.keys()) == expected_keys
    assert 0.0 <= metrics["recall"] <= 1.0
64
+
65
+
66
def test_run_training_pipeline_creates_report_and_model(tmp_path) -> None:
    """End-to-end pipeline run writes every artifact and reports a sane best model."""
    data_path = tmp_path / "creditcard.csv"
    config_path = tmp_path / "train.yaml"
    artifact_paths = {
        "model_path": tmp_path / "best_model.pkl",
        "preprocessor_path": tmp_path / "preprocessor.pkl",
        "report_path": tmp_path / "training_report.json",
        "model_report_path": tmp_path / "model_report.json",
        "validation_report_path": tmp_path / "data_validation.json",
    }

    _synthetic_df(240).to_csv(data_path, index=False)

    # Minimal config: a single fast model and a file-backed MLflow store under tmp_path.
    config = {
        "experiment": {"name": "test-experiment"},
        "training": {
            "test_size": 0.2,
            "random_state": 42,
            "imbalance_method": "class_weight",
            "models": ["logistic_regression"],
        },
        "mlflow": {"tracking_uri": f"file:{tmp_path / 'mlruns'}"},
    }
    config_path.write_text(yaml.safe_dump(config), encoding="utf-8")

    report = run_training_pipeline(
        config_path=config_path,
        data_path=data_path,
        **artifact_paths,
    )

    for artifact in artifact_paths.values():
        assert artifact.exists()
    best = report["best_model"]
    assert best["model_name"] == "logistic_regression"
    assert 0.0 < best["selected_threshold"] < 1.0

    # The on-disk report must round-trip and carry the MLflow run id.
    stored = json.loads(artifact_paths["report_path"].read_text(encoding="utf-8"))
    assert stored["best_model"]["run_id"]