github-actions[bot] committed on
Commit
4937cba
·
0 Parent(s):

deploy: sync snapshot from github

Browse files
.dockerignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ __pycache__
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .pytest_cache
8
+ .coverage
9
+ .coverage.*
10
+ htmlcov
11
+ .venv
12
+ uv.lock
13
+ pytest.ini
14
+ venv
15
+ env
16
+ .env
17
+ logs
18
+ mlruns
19
+ notebooks
20
+ data/raw
21
+ tests
22
+ .github
.github/workflows/ci.yml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI-CD
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+ workflow_dispatch:
8
+
9
+ concurrency:
10
+ group: ci-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ env:
14
+ PYTHON_VERSION: "3.11"
15
+ IMAGE_NAME: fraud-detection-api
16
+
17
+ jobs:
18
+ test:
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - name: Checkout
22
+ uses: actions/checkout@v4
23
+
24
+ - name: Set up Python
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ env.PYTHON_VERSION }}
28
+
29
+ - name: Set up uv
30
+ uses: astral-sh/setup-uv@v5
31
+
32
+ - name: Install dependencies
33
+ run: |
34
+ uv pip install --system -r requirements.txt
35
+
36
+ - name: Run tests
37
+ run: python -m pytest
38
+
39
+ build-image:
40
+ runs-on: ubuntu-latest
41
+ needs: test
42
+ steps:
43
+ - name: Checkout
44
+ uses: actions/checkout@v4
45
+
46
+ - name: Build Docker image
47
+ run: docker build -t $IMAGE_NAME:${{ github.sha }} .
48
+
49
+ - name: Smoke check image metadata
50
+ run: docker image inspect $IMAGE_NAME:${{ github.sha }}
51
+
52
+ deploy:
53
+ runs-on: ubuntu-latest
54
+ needs: build-image
55
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
56
+ steps:
57
+ - name: Trigger deployment webhook (if configured)
58
+ run: |
59
+ if [ -z "$DEPLOY_WEBHOOK_URL" ]; then
60
+ echo "DEPLOY_WEBHOOK_URL secret is not set; skipping deploy trigger."
61
+ exit 0
62
+ fi
63
+ curl -fsS -X POST "$DEPLOY_WEBHOOK_URL"
64
+ env:
65
+ DEPLOY_WEBHOOK_URL: ${{ secrets.DEPLOY_WEBHOOK_URL }}
.github/workflows/deploy-hf-space.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ workflow_dispatch:
7
+
8
+ concurrency:
9
+ group: deploy-hf-space-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ deploy:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Validate required secrets
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
20
+ run: |
21
+ if [ -z "$HF_TOKEN" ] || [ -z "$HF_SPACE_REPO" ]; then
22
+ echo "HF_TOKEN or HF_SPACE_REPO is not set. Configure repository secrets."
23
+ exit 1
24
+ fi
25
+
26
+ - name: Checkout repository
27
+ uses: actions/checkout@v4
28
+ with:
29
+ fetch-depth: 0
30
+
31
+ - name: Configure git
32
+ run: |
33
+ git config user.name "github-actions[bot]"
34
+ git config user.email "github-actions[bot]@users.noreply.github.com"
35
+
36
+ - name: Remove non-serving artifacts for HF push
37
+ run: |
38
+ # Space runtime only needs selected serving artifacts.
39
+ rm -f models/xgboost.pkl
40
+
41
+ - name: Push to Hugging Face Space
42
+ env:
43
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
44
+ HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
45
+ run: |
46
+ TMP_DIR="$(mktemp -d)"
47
+ rsync -a --delete --exclude=".git" ./ "${TMP_DIR}/"
48
+
49
+ # Exclude artifacts not needed for serving in Space.
50
+ rm -f "${TMP_DIR}/models/xgboost.pkl"
51
+
52
+ cd "${TMP_DIR}"
53
+ git init -b main
54
+ git config user.name "github-actions[bot]"
55
+ git config user.email "github-actions[bot]@users.noreply.github.com"
56
+ git add -A
57
+ git commit -m "deploy: sync snapshot from github"
58
+
59
+ git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_REPO}"
60
+ git push hf main --force
.github/workflows/keepalive-hf-space.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Keep HF Space Warm
2
+
3
+ on:
4
+ schedule:
5
+ # Monday, Wednesday, Friday at 09:00 UTC
6
+ - cron: "0 9 * * 1,3,5"
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ ping:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Validate HF Space URL secret
14
+ env:
15
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
16
+ run: |
17
+ if [ -z "$HF_SPACE_URL" ]; then
18
+ echo "HF_SPACE_URL secret is not set."
19
+ exit 1
20
+ fi
21
+
22
+ - name: Ping health endpoint
23
+ env:
24
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
25
+ run: |
26
+ set -e
27
+ curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/health"
28
+
29
+ - name: Ping metrics endpoint
30
+ env:
31
+ HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
32
+ run: |
33
+ set -e
34
+ curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/metrics"
.gitignore ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Project-specific
210
+ data/raw/
211
+ data/processed/
212
+ logs/
213
+ mlruns/
214
+
215
+ IMPLEMENTATION_PLAN.md
216
+ End-to-End MLOps Project Documentation.txt
217
+ uv.lock
218
+
219
+ explaintovithu.md
220
+ interview_explanation.md
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ # Install Python dependencies first for better layer caching.
10
+ COPY requirements.txt ./
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy application code and runtime artifacts.
14
+ COPY api ./api
15
+ COPY src ./src
16
+ COPY configs ./configs
17
+ COPY models ./models
18
+ COPY artifacts ./artifacts
19
+
20
+ # Run API as non-root user.
21
+ RUN useradd --create-home --shell /usr/sbin/nologin appuser \
22
+ && chown -R appuser:appuser /app
23
+ USER appuser
24
+
25
+ EXPOSE 8000
26
+
27
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
28
+ CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"
29
+
30
+ CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vimalathas Vithusan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Fraud Detection MLOps API
3
+ emoji: 🚨
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 8000
8
+ ---
9
+
10
+ # Fraud Detection MLOps Pipeline
11
+
12
+ Production-style end-to-end fraud detection system with training, experiment tracking, API serving, containerization, CI/CD, and runtime monitoring.
13
+
14
+ ## Highlights
15
+
16
+ - End-to-end ML lifecycle: data validation -> preprocessing -> training -> threshold tuning -> API inference.
17
+ - Imbalanced classification handling with recall-first model ranking.
18
+ - MLflow experiment tracking and artifact logging.
19
+ - FastAPI inference service with single/batch prediction endpoints.
20
+ - Dockerized deployment with health checks and non-root runtime.
21
+ - CI/CD with automated tests, coverage gates, image build, and HF deployment sync.
22
+ - Runtime observability via request IDs, structured logs, and `/metrics`.
23
+
24
+ ## Live Deployment
25
+
26
+ - Hugging Face Space: `https://thasvithu-fraud-detection-mlops-api.hf.space`
27
+ - API Docs: `https://thasvithu-fraud-detection-mlops-api.hf.space/docs`
28
+
29
+ ## Architecture
30
+
31
+ ```mermaid
32
+ flowchart LR
33
+ A[Raw Data<br/>data/raw/creditcard.csv] --> B[Data Validation<br/>src/data_ingestion.py]
34
+ B --> C[Preprocessing<br/>src/preprocessing.py]
35
+ C --> D[Model Training<br/>src/train.py]
36
+ D --> E[Evaluation + Threshold Tuning<br/>src/evaluate.py]
37
+ E --> F[Artifacts<br/>models/*.pkl<br/>artifacts/*.json]
38
+ F --> G[Inference Service<br/>api/service.py]
39
+ G --> H[FastAPI App<br/>api/app.py]
40
+ H --> I["/predict"]
41
+ H --> J["/predict/batch"]
42
+ H --> K["/health"]
43
+ H --> L["/metrics"]
44
+ ```
45
+
46
+ ## ML Training Workflow
47
+
48
+ ```mermaid
49
+ flowchart TD
50
+ T1[Load Config<br/>configs/train.yaml] --> T2[Validate Dataset]
51
+ T2 --> T3[Split + Scale + Imbalance Handling]
52
+ T3 --> T4[Train Candidate Models]
53
+ T4 --> T5[Compute Metrics]
54
+ T5 --> T6[Log Runs to MLflow]
55
+ T6 --> T7[Rank by recall -> precision -> roc_auc]
56
+ T7 --> T8[Select Best Model]
57
+ T8 --> T9[Threshold Sweep + Selection]
58
+ T9 --> T10[Save model + preprocessor + reports]
59
+ ```
60
+
61
+ ## Inference Request Flow
62
+
63
+ ```mermaid
64
+ sequenceDiagram
65
+ autonumber
66
+ participant Client
67
+ participant API as FastAPI
68
+ participant Svc as InferenceService
69
+ participant Art as Artifacts
70
+
71
+ Client->>API: POST /predict (transaction payload)
72
+ API->>Svc: load_inference_service() [cached]
73
+ Svc->>Art: model.pkl + preprocessor.pkl + threshold reports
74
+ Svc-->>API: prediction + probability + risk level
75
+ API-->>Client: JSON response (+ request headers)
76
+ ```
77
+
78
+ ## CI/CD and Deployment Workflows
79
+
80
+ ```mermaid
81
+ flowchart LR
82
+ P[Push / PR] --> C1[ci.yml]
83
+ C1 --> C2[Test + Coverage Gate]
84
+ C2 --> C3[Build Docker Image]
85
+ C3 --> C4[Optional Deploy Webhook]
86
+
87
+ M[Push main] --> H1[deploy-hf-space.yml]
88
+ H1 --> H2[Snapshot Sync to HF Space]
89
+
90
+ S[Schedule Mon/Wed/Fri] --> K1[keepalive-hf-space.yml]
91
+ K1 --> K2[Ping /health and /metrics]
92
+ ```
93
+
94
+ ## Project Structure
95
+
96
+ ```text
97
+ fraud-detection-mlops-pipeline/
98
+ ├── api/
99
+ │ ├── app.py
100
+ │ ├── schemas.py
101
+ │ └── service.py
102
+ ├── src/
103
+ │ ├── data_ingestion.py
104
+ │ ├── preprocessing.py
105
+ │ ├── train.py
106
+ │ ├── evaluate.py
107
+ │ ├── predict.py
108
+ │ └── register_model.py
109
+ ├── configs/
110
+ │ ├── train.yaml
111
+ │ └── logging.yaml
112
+ ├── data/
113
+ │ ├── raw/
114
+ │ └── processed/
115
+ ├── models/
116
+ ├── artifacts/
117
+ ├── tests/
118
+ ├── .github/workflows/
119
+ │ ├── ci.yml
120
+ │ ├── deploy-hf-space.yml
121
+ │ └── keepalive-hf-space.yml
122
+ ├── Dockerfile
123
+ ├── docker-compose.yml
124
+ ├── requirements.txt
125
+ └── pytest.ini
126
+ ```
127
+
128
+ ## Tech Stack
129
+
130
+ - Python 3.11
131
+ - Pandas, NumPy, scikit-learn, imbalanced-learn, XGBoost
132
+ - MLflow
133
+ - FastAPI + Pydantic
134
+ - Docker + Docker Compose
135
+ - GitHub Actions
136
+ - Hugging Face Spaces (Docker SDK)
137
+
138
+ ## API Endpoints
139
+
140
+ - `GET /health`: Service and model readiness
141
+ - `GET /metrics`: Runtime operational counters
142
+ - `POST /predict`: Single transaction prediction
143
+ - `POST /predict/batch`: Batch transaction predictions
144
+ - `GET /docs`: Swagger UI
145
+
146
+ ### Example: Single Prediction
147
+
148
+ ```bash
149
+ BASE="https://thasvithu-fraud-detection-mlops-api.hf.space"
150
+
151
+ curl -X POST "$BASE/predict" \
152
+ -H "Content-Type: application/json" \
153
+ -d '{
154
+ "Time": 0,
155
+ "Amount": 149.62,
156
+ "V1": -1.359807, "V2": -0.072781, "V3": 2.536347, "V4": 1.378155,
157
+ "V5": -0.338321, "V6": 0.462388, "V7": 0.239599, "V8": 0.098698,
158
+ "V9": 0.363787, "V10": 0.090794, "V11": -0.551600, "V12": -0.617801,
159
+ "V13": -0.991390, "V14": -0.311169, "V15": 1.468177, "V16": -0.470401,
160
+ "V17": 0.207971, "V18": 0.025791, "V19": 0.403993, "V20": 0.251412,
161
+ "V21": -0.018307, "V22": 0.277838, "V23": -0.110474, "V24": 0.066928,
162
+ "V25": 0.128539, "V26": -0.189115, "V27": 0.133558, "V28": -0.021053
163
+ }'
164
+ ```
165
+
166
+ ## Local Setup
167
+
168
+ ### Prerequisites
169
+
170
+ - Python 3.11+
171
+ - `uv`
172
+ - Docker (optional, for container run)
173
+
174
+ ### Install
175
+
176
+ ```bash
177
+ uv pip install -r requirements.txt
178
+ ```
179
+
180
+ ### Train
181
+
182
+ ```bash
183
+ uv run python -m src.train
184
+ ```
185
+
186
+ ### Test
187
+
188
+ ```bash
189
+ uv run pytest
190
+ ```
191
+
192
+ ### Run API
193
+
194
+ ```bash
195
+ uv run uvicorn api.app:app --reload --host 0.0.0.0 --port 8000
196
+ ```
197
+
198
+ ## Docker Usage
199
+
200
+ ### Build
201
+
202
+ ```bash
203
+ docker build -t fraud-detection-api:latest .
204
+ ```
205
+
206
+ ### Run
207
+
208
+ ```bash
209
+ docker run --rm -p 8000:8000 fraud-detection-api:latest
210
+ ```
211
+
212
+ ### Compose
213
+
214
+ ```bash
215
+ docker compose up --build
216
+ ```
217
+
218
+ ## Quality Gates
219
+
220
+ - Test coverage enforced via `pytest.ini`
221
+ - Minimum coverage: `>= 80%` across `src` + `api`
222
+ - Current status: passing (see GitHub Actions)
223
+
224
+ ## Monitoring and Operations
225
+
226
+ Runtime metrics exposed by `/metrics`:
227
+ - `total_requests`
228
+ - `error_count`
229
+ - `error_rate`
230
+ - `total_predictions`
231
+ - `fraud_predictions`
232
+ - `fraud_prediction_rate`
233
+ - `avg_latency_ms`
234
+
235
+ Request-level observability:
236
+ - `X-Request-ID`
237
+ - `X-Process-Time-Ms`
238
+ - Structured JSON logs for request and prediction events
239
+
240
+ ## GitHub Actions Workflows
241
+
242
+ - `ci.yml`: test + coverage + image build (+ optional webhook deploy)
243
+ - `deploy-hf-space.yml`: sync `main` to Hugging Face Space
244
+ - `keepalive-hf-space.yml`: scheduled pings to reduce Space inactivity sleep
245
+
246
+ ## Required GitHub Secrets
247
+
248
+ For Hugging Face deploy:
249
+ - `HF_TOKEN`
250
+ - `HF_SPACE_REPO` (format: `username/space-name`)
251
+
252
+ For HF keepalive:
253
+ - `HF_SPACE_URL`
254
+
255
+ Optional webhook deploy:
256
+ - `DEPLOY_WEBHOOK_URL`
257
+
258
+ ## Milestone Status
259
+
260
+ All planned phases (0-9) are complete:
261
+ - Foundation
262
+ - Data validation
263
+ - Preprocessing
264
+ - Training + MLflow tracking
265
+ - Evaluation + threshold tuning
266
+ - FastAPI inference service
267
+ - Testing + quality gates
268
+ - Containerization
269
+ - CI/CD automation
270
+ - Monitoring and operations
271
+
272
+ ## License
273
+
274
+ MIT (see `LICENSE`)
api/__init__.py ADDED
File without changes
api/app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import time
6
+ from dataclasses import dataclass, field
7
+ from threading import Lock
8
+ from typing import Annotated
9
+ from uuid import uuid4
10
+
11
+ from fastapi import Depends, FastAPI, HTTPException, Request
12
+ from fastapi.responses import JSONResponse
13
+
14
+ from api.schemas import (
15
+ BatchPredictionRequest,
16
+ BatchPredictionResponse,
17
+ HealthResponse,
18
+ MetricsResponse,
19
+ PredictionResponse,
20
+ Transaction,
21
+ )
22
+ from api.service import InferenceService, load_inference_service
23
+
24
+ logger = logging.getLogger("api")
25
+ if not logger.handlers:
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
+ @dataclass
30
+ class MonitoringState:
31
+ total_requests: int = 0
32
+ error_count: int = 0
33
+ total_predictions: int = 0
34
+ fraud_predictions: int = 0
35
+ total_latency_ms: float = 0.0
36
+ _lock: Lock = field(default_factory=Lock)
37
+
38
+ def record_request(self, *, latency_ms: float, status_code: int) -> None:
39
+ with self._lock:
40
+ self.total_requests += 1
41
+ self.total_latency_ms += latency_ms
42
+ if status_code >= 400:
43
+ self.error_count += 1
44
+
45
+ def record_predictions(self, predictions: list[dict[str, object]]) -> None:
46
+ fraud_count = sum(1 for p in predictions if bool(p.get("is_fraud")))
47
+ with self._lock:
48
+ self.total_predictions += len(predictions)
49
+ self.fraud_predictions += fraud_count
50
+
51
+ def snapshot(self) -> dict[str, float | int]:
52
+ with self._lock:
53
+ avg_latency = self.total_latency_ms / self.total_requests if self.total_requests else 0.0
54
+ error_rate = self.error_count / self.total_requests if self.total_requests else 0.0
55
+ fraud_rate = (
56
+ self.fraud_predictions / self.total_predictions if self.total_predictions else 0.0
57
+ )
58
+ return {
59
+ "total_requests": self.total_requests,
60
+ "error_count": self.error_count,
61
+ "error_rate": float(error_rate),
62
+ "total_predictions": self.total_predictions,
63
+ "fraud_predictions": self.fraud_predictions,
64
+ "fraud_prediction_rate": float(fraud_rate),
65
+ "avg_latency_ms": float(avg_latency),
66
+ }
67
+
68
+
69
+ app = FastAPI(title="Fraud Detection API", version="0.3.0")
70
+ monitoring_state = MonitoringState()
71
+
72
+
73
@app.middleware("http")
async def add_observability(request: Request, call_next):
    """Attach a request ID, time every request, and emit structured JSON logs."""
    # Reuse a caller-supplied ID when present so traces correlate end to end.
    request_id = request.headers.get("X-Request-ID", str(uuid4()))
    started = time.perf_counter()

    def _elapsed_ms() -> float:
        return (time.perf_counter() - started) * 1000

    try:
        response = await call_next(request)
    except Exception:
        elapsed = _elapsed_ms()
        # Exceptions that escape the handler are counted as HTTP 500.
        monitoring_state.record_request(latency_ms=elapsed, status_code=500)
        logger.exception(
            json.dumps(
                {
                    "event": "request_error",
                    "request_id": request_id,
                    "path": request.url.path,
                    "method": request.method,
                    "latency_ms": round(elapsed, 2),
                }
            )
        )
        raise

    elapsed = _elapsed_ms()
    monitoring_state.record_request(latency_ms=elapsed, status_code=response.status_code)

    response.headers["X-Process-Time-Ms"] = f"{elapsed:.2f}"
    response.headers["X-Request-ID"] = request_id

    logger.info(
        json.dumps(
            {
                "event": "request_complete",
                "request_id": request_id,
                "path": request.url.path,
                "method": request.method,
                "status_code": response.status_code,
                "latency_ms": round(elapsed, 2),
            }
        )
    )
    return response
117
+
118
+
119
def get_inference_service() -> InferenceService:
    """Return the cached inference service, or fail with 503 when unavailable.

    A FileNotFoundError from artifact loading means the model/preprocessor
    files are absent, so the API reports "service unavailable" rather than 500.
    """
    try:
        return load_inference_service()
    except FileNotFoundError as exc:
        raise HTTPException(status_code=503, detail=str(exc)) from exc
124
+
125
+
126
+ ServiceDep = Annotated[InferenceService, Depends(get_inference_service)]
127
+
128
+
129
@app.exception_handler(ValueError)
async def value_error_handler(_: Request, exc: ValueError) -> JSONResponse:
    """Translate ValueError raised below the route layer into an HTTP 400."""
    detail = str(exc)
    return JSONResponse(status_code=400, content={"detail": detail})
132
+
133
+
134
@app.get("/health", response_model=HealthResponse)
def health(service: ServiceDep) -> HealthResponse:
    """Report readiness; resolving the service dependency proves artifacts load."""
    payload = {
        "status": "ok",
        "model_loaded": True,
        "model_path": str(service.model_path),
        "preprocessor_path": str(service.preprocessor_path),
        "threshold": service.threshold,
    }
    return HealthResponse(**payload)
143
+
144
+
145
@app.get("/metrics", response_model=MetricsResponse)
def metrics() -> MetricsResponse:
    """Expose a snapshot of runtime counters for external monitoring."""
    snapshot = monitoring_state.snapshot()
    return MetricsResponse(**snapshot)
148
+
149
+
150
@app.post("/predict", response_model=PredictionResponse)
def predict(transaction: Transaction, service: ServiceDep) -> PredictionResponse:
    """Score a single transaction and log a structured prediction event."""
    result = service.predict_records([transaction.model_dump()])[0]
    monitoring_state.record_predictions([result])
    log_payload = {
        "event": "prediction",
        "prediction_count": 1,
        "fraud_predictions": int(result["is_fraud"]),
        "avg_probability": round(float(result["fraud_probability"]), 6),
        "threshold": float(result["threshold"]),
    }
    logger.info(json.dumps(log_payload))
    return PredictionResponse(**result)
166
+
167
+
168
@app.post("/predict/batch", response_model=BatchPredictionResponse)
def predict_batch(request: BatchPredictionRequest, service: ServiceDep) -> BatchPredictionResponse:
    """Score a batch of transactions and log aggregate batch statistics."""
    records = [item.model_dump() for item in request.transactions]
    predictions = service.predict_records(records)
    monitoring_state.record_predictions(predictions)

    fraud_total = sum(1 for row in predictions if row["is_fraud"])
    # The request schema enforces at least one transaction, so the mean is defined.
    mean_probability = sum(float(row["fraud_probability"]) for row in predictions) / len(predictions)
    logger.info(
        json.dumps(
            {
                "event": "prediction_batch",
                "prediction_count": len(predictions),
                "fraud_predictions": fraud_total,
                "avg_probability": round(mean_probability, 6),
                "threshold": float(predictions[0]["threshold"]),
            }
        )
    )

    return BatchPredictionResponse(predictions=[PredictionResponse(**row) for row in predictions])
api/schemas.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic request/response schemas for the inference API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class Transaction(BaseModel):
    """One credit-card transaction: Time, PCA components V1-V28, and Amount."""

    # Reject unknown keys so malformed payloads fail fast at validation time.
    model_config = ConfigDict(extra="forbid")

    Time: float
    V1: float
    V2: float
    V3: float
    V4: float
    V5: float
    V6: float
    V7: float
    V8: float
    V9: float
    V10: float
    V11: float
    V12: float
    V13: float
    V14: float
    V15: float
    V16: float
    V17: float
    V18: float
    V19: float
    V20: float
    V21: float
    V22: float
    V23: float
    V24: float
    V25: float
    V26: float
    V27: float
    V28: float
    # Monetary amounts cannot be negative.
    Amount: float = Field(ge=0)
41
+
42
+
43
class PredictionResponse(BaseModel):
    """Outcome of scoring a single transaction."""

    is_fraud: bool
    fraud_probability: float
    risk_level: str
    threshold: float
48
+
49
+
50
class BatchPredictionRequest(BaseModel):
    """Request wrapper for scoring several transactions in one call."""

    # Unknown top-level keys are rejected outright.
    model_config = ConfigDict(extra="forbid")

    # At least one transaction is required so batch statistics stay defined.
    transactions: list[Transaction] = Field(min_length=1)
54
+
55
+
56
class BatchPredictionResponse(BaseModel):
    """Per-transaction results, in the same order as the request."""

    predictions: list[PredictionResponse]
58
+
59
+
60
class HealthResponse(BaseModel):
    """Service readiness report returned by GET /health."""

    status: str
    model_loaded: bool
    model_path: str
    preprocessor_path: str
    threshold: float
66
+
67
+
68
class MetricsResponse(BaseModel):
    """Runtime operational counters returned by GET /metrics."""

    total_requests: int
    error_count: int
    error_rate: float
    total_predictions: int
    fraud_predictions: int
    fraud_prediction_rate: float
    avg_latency_ms: float
api/service.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model loading and prediction service helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from functools import lru_cache
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import joblib
12
+ import pandas as pd
13
+ import yaml
14
+
15
+ from src.data_ingestion import EXPECTED_COLUMNS
16
+
17
+ DEFAULT_MODEL_PATH = Path("models/model.pkl")
18
+ DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
19
+ DEFAULT_TRAINING_REPORT_PATH = Path("artifacts/model_training_report.json")
20
+ DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
21
+ DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
22
+ FEATURE_COLUMNS = [column for column in EXPECTED_COLUMNS if column != "Class"]
23
+
24
+
25
@dataclass
class InferenceService:
    """Bundle the trained model, fitted preprocessor, and decision threshold."""

    model: Any  # estimator exposing predict_proba
    preprocessor: Any  # fitted transformer exposing transform
    threshold: float  # decision boundary applied to the fraud probability
    model_path: Path
    preprocessor_path: Path
    feature_columns: list[str]  # column order the preprocessor was fitted on

    def predict_records(self, records: list[dict[str, float]]) -> list[dict[str, Any]]:
        """Score transaction records, returning one result dict per input."""
        # Select/reorder columns to match the training-time feature layout.
        features = pd.DataFrame(records)[self.feature_columns]
        scores = self.model.predict_proba(self.preprocessor.transform(features))[:, 1]

        cutoff = float(self.threshold)
        return [
            {
                "is_fraud": bool(float(score) >= cutoff),
                "fraud_probability": float(score),
                "risk_level": _risk_level(float(score)),
                "threshold": cutoff,
            }
            for score in scores
        ]
+ return outputs
56
+
57
+
58
+ def _risk_level(probability: float) -> str:
59
+ if probability >= 0.7:
60
+ return "high"
61
+ if probability >= 0.3:
62
+ return "medium"
63
+ return "low"
64
+
65
+
66
+ def _threshold_from_training_report(training_report_path: Path) -> float | None:
67
+ if not training_report_path.exists():
68
+ return None
69
+ payload = json.loads(training_report_path.read_text(encoding="utf-8"))
70
+ best = payload.get("best_model", {})
71
+ threshold = best.get("selected_threshold")
72
+ return float(threshold) if threshold is not None else None
73
+
74
+
75
+ def _threshold_from_model_report(model_report_path: Path) -> float | None:
76
+ if not model_report_path.exists():
77
+ return None
78
+ payload = json.loads(model_report_path.read_text(encoding="utf-8"))
79
+ selection = payload.get("threshold_selection", {})
80
+ threshold = selection.get("selected_threshold")
81
+ return float(threshold) if threshold is not None else None
82
+
83
+
84
def _threshold_from_config(config_path: Path) -> float | None:
    """Read the decision threshold from the training config file.

    Returns None when the config file is absent or carries no
    ``threshold.decision_threshold`` entry.
    """
    if not config_path.exists():
        return None
    loaded = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    section = (loaded or {}).get("threshold", {})
    value = section.get("decision_threshold")
    if value is None:
        return None
    return float(value)
91
+
92
+
93
def resolve_threshold(
    *,
    training_report_path: Path = DEFAULT_TRAINING_REPORT_PATH,
    model_report_path: Path = DEFAULT_MODEL_REPORT_PATH,
    config_path: Path = DEFAULT_CONFIG_PATH,
) -> float:
    """Resolve runtime threshold from artifacts, then fallback config/default."""
    # Candidate sources in priority order; the first that yields a value wins.
    sources = (
        lambda: _threshold_from_training_report(training_report_path),
        lambda: _threshold_from_model_report(model_report_path),
        lambda: _threshold_from_config(config_path),
    )
    for source in sources:
        candidate = source()
        if candidate is not None:
            return candidate
    # Conservative default when no artifact or config provides a threshold.
    return 0.5
110
+
111
+
112
@lru_cache(maxsize=1)
def load_inference_service(
    *,
    model_path: str = str(DEFAULT_MODEL_PATH),
    preprocessor_path: str = str(DEFAULT_PREPROCESSOR_PATH),
    training_report_path: str = str(DEFAULT_TRAINING_REPORT_PATH),
    model_report_path: str = str(DEFAULT_MODEL_REPORT_PATH),
    config_path: str = str(DEFAULT_CONFIG_PATH),
) -> InferenceService:
    """Load model + preprocessor + threshold and cache service singleton.

    Raises:
        FileNotFoundError: if the model or preprocessor artifact is missing.
    """
    resolved_model = Path(model_path)
    resolved_preprocessor = Path(preprocessor_path)

    # Fail fast with a clear message before attempting to deserialize anything.
    if not resolved_model.exists():
        raise FileNotFoundError(f"Model artifact not found: {resolved_model}")
    if not resolved_preprocessor.exists():
        raise FileNotFoundError(f"Preprocessor artifact not found: {resolved_preprocessor}")

    loaded_model = joblib.load(resolved_model)
    loaded_preprocessor = joblib.load(resolved_preprocessor)

    decision_threshold = resolve_threshold(
        training_report_path=Path(training_report_path),
        model_report_path=Path(model_report_path),
        config_path=Path(config_path),
    )

    # Prefer the columns the preprocessor recorded at fit time; fall back to
    # the static dataset schema when the attribute is absent.
    columns = list(getattr(loaded_preprocessor, "feature_names_in_", FEATURE_COLUMNS))

    return InferenceService(
        model=loaded_model,
        preprocessor=loaded_preprocessor,
        threshold=decision_threshold,
        model_path=resolved_model,
        preprocessor_path=resolved_preprocessor,
        feature_columns=columns,
    )
artifacts/data_validation.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "is_valid": true,
3
+ "errors": [],
4
+ "warnings": [],
5
+ "statistics": {
6
+ "row_count": 284807,
7
+ "column_count": 31,
8
+ "missing_values_total": 0,
9
+ "duplicate_rows": 1081,
10
+ "class_counts": {
11
+ "0": 284315,
12
+ "1": 492
13
+ },
14
+ "fraud_ratio": 0.001727485630620034
15
+ }
16
+ }
artifacts/metrics_logistic_regression.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "precision": 0.06097560975609756,
3
+ "recall": 0.9183673469387755,
4
+ "f1": 0.11435832274459974,
5
+ "roc_auc": 0.9721687370080279,
6
+ "pr_auc": 0.7159122424484009,
7
+ "confusion_matrix": [
8
+ [
9
+ 55478,
10
+ 1386
11
+ ],
12
+ [
13
+ 8,
14
+ 90
15
+ ]
16
+ ]
17
+ }
artifacts/metrics_xgboost.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "precision": 0.9186046511627907,
3
+ "recall": 0.8061224489795918,
4
+ "f1": 0.8586956521739131,
5
+ "roc_auc": 0.9775147361983623,
6
+ "pr_auc": 0.87487299490182,
7
+ "confusion_matrix": [
8
+ [
9
+ 56857,
10
+ 7
11
+ ],
12
+ [
13
+ 19,
14
+ 79
15
+ ]
16
+ ]
17
+ }
artifacts/model_report.json ADDED
@@ -0,0 +1,1834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-02-18T04:09:02.995799+00:00",
3
+ "best_model_name": "logistic_regression",
4
+ "default_threshold_metrics": {
5
+ "precision": 0.06097560975609756,
6
+ "recall": 0.9183673469387755,
7
+ "f1": 0.11435832274459974,
8
+ "roc_auc": 0.9721687370080279,
9
+ "pr_auc": 0.7159122424484009,
10
+ "confusion_matrix": [
11
+ [
12
+ 55478,
13
+ 1386
14
+ ],
15
+ [
16
+ 8,
17
+ 90
18
+ ]
19
+ ]
20
+ },
21
+ "threshold_selection": {
22
+ "selection_reason": "meets_min_recall",
23
+ "min_recall_target": 0.9,
24
+ "selected_threshold": 0.74,
25
+ "selected_metrics": {
26
+ "precision": 0.13650306748466257,
27
+ "recall": 0.9081632653061225,
28
+ "f1": 0.23733333333333334,
29
+ "roc_auc": 0.9721687370080279,
30
+ "pr_auc": 0.7159122424484009,
31
+ "confusion_matrix": [
32
+ [
33
+ 56301,
34
+ 563
35
+ ],
36
+ [
37
+ 9,
38
+ 89
39
+ ]
40
+ ],
41
+ "threshold": 0.74
42
+ },
43
+ "threshold_grid_size": 99,
44
+ "thresholds_evaluated": [
45
+ {
46
+ "precision": 0.0024050381830804323,
47
+ "recall": 0.9897959183673469,
48
+ "f1": 0.004798417017066535,
49
+ "roc_auc": 0.9721687370080279,
50
+ "pr_auc": 0.7159122424484009,
51
+ "confusion_matrix": [
52
+ [
53
+ 16629,
54
+ 40235
55
+ ],
56
+ [
57
+ 1,
58
+ 97
59
+ ]
60
+ ],
61
+ "threshold": 0.01
62
+ },
63
+ {
64
+ "precision": 0.0030859288009416853,
65
+ "recall": 0.9897959183673469,
66
+ "f1": 0.006152675145095303,
67
+ "roc_auc": 0.9721687370080279,
68
+ "pr_auc": 0.7159122424484009,
69
+ "confusion_matrix": [
70
+ [
71
+ 25528,
72
+ 31336
73
+ ],
74
+ [
75
+ 1,
76
+ 97
77
+ ]
78
+ ],
79
+ "threshold": 0.02
80
+ },
81
+ {
82
+ "precision": 0.0037338104313328927,
83
+ "recall": 0.9795918367346939,
84
+ "f1": 0.007439265372544461,
85
+ "roc_auc": 0.9721687370080279,
86
+ "pr_auc": 0.7159122424484009,
87
+ "confusion_matrix": [
88
+ [
89
+ 31249,
90
+ 25615
91
+ ],
92
+ [
93
+ 2,
94
+ 96
95
+ ]
96
+ ],
97
+ "threshold": 0.03
98
+ },
99
+ {
100
+ "precision": 0.004303044174868391,
101
+ "recall": 0.9591836734693877,
102
+ "f1": 0.00856765255434535,
103
+ "roc_auc": 0.9721687370080279,
104
+ "pr_auc": 0.7159122424484009,
105
+ "confusion_matrix": [
106
+ [
107
+ 35113,
108
+ 21751
109
+ ],
110
+ [
111
+ 4,
112
+ 94
113
+ ]
114
+ ],
115
+ "threshold": 0.04
116
+ },
117
+ {
118
+ "precision": 0.004967499867885642,
119
+ "recall": 0.9591836734693877,
120
+ "f1": 0.009883812628147836,
121
+ "roc_auc": 0.9721687370080279,
122
+ "pr_auc": 0.7159122424484009,
123
+ "confusion_matrix": [
124
+ [
125
+ 38035,
126
+ 18829
127
+ ],
128
+ [
129
+ 4,
130
+ 94
131
+ ]
132
+ ],
133
+ "threshold": 0.05
134
+ },
135
+ {
136
+ "precision": 0.005584932564909988,
137
+ "recall": 0.9591836734693877,
138
+ "f1": 0.011105204087660228,
139
+ "roc_auc": 0.9721687370080279,
140
+ "pr_auc": 0.7159122424484009,
141
+ "confusion_matrix": [
142
+ [
143
+ 40127,
144
+ 16737
145
+ ],
146
+ [
147
+ 4,
148
+ 94
149
+ ]
150
+ ],
151
+ "threshold": 0.060000000000000005
152
+ },
153
+ {
154
+ "precision": 0.006171610591280112,
155
+ "recall": 0.9489795918367347,
156
+ "f1": 0.01226346673699479,
157
+ "roc_auc": 0.9721687370080279,
158
+ "pr_auc": 0.7159122424484009,
159
+ "confusion_matrix": [
160
+ [
161
+ 41888,
162
+ 14976
163
+ ],
164
+ [
165
+ 5,
166
+ 93
167
+ ]
168
+ ],
169
+ "threshold": 0.06999999999999999
170
+ },
171
+ {
172
+ "precision": 0.006826189078097475,
173
+ "recall": 0.9489795918367347,
174
+ "f1": 0.01355487538259729,
175
+ "roc_auc": 0.9721687370080279,
176
+ "pr_auc": 0.7159122424484009,
177
+ "confusion_matrix": [
178
+ [
179
+ 43333,
180
+ 13531
181
+ ],
182
+ [
183
+ 5,
184
+ 93
185
+ ]
186
+ ],
187
+ "threshold": 0.08
188
+ },
189
+ {
190
+ "precision": 0.007441190590494479,
191
+ "recall": 0.9489795918367347,
192
+ "f1": 0.014766592569069545,
193
+ "roc_auc": 0.9721687370080279,
194
+ "pr_auc": 0.7159122424484009,
195
+ "confusion_matrix": [
196
+ [
197
+ 44459,
198
+ 12405
199
+ ],
200
+ [
201
+ 5,
202
+ 93
203
+ ]
204
+ ],
205
+ "threshold": 0.09
206
+ },
207
+ {
208
+ "precision": 0.008117308195862791,
209
+ "recall": 0.9489795918367347,
210
+ "f1": 0.01609692773691043,
211
+ "roc_auc": 0.9721687370080279,
212
+ "pr_auc": 0.7159122424484009,
213
+ "confusion_matrix": [
214
+ [
215
+ 45500,
216
+ 11364
217
+ ],
218
+ [
219
+ 5,
220
+ 93
221
+ ]
222
+ ],
223
+ "threshold": 0.09999999999999999
224
+ },
225
+ {
226
+ "precision": 0.008798486281929991,
227
+ "recall": 0.9489795918367347,
228
+ "f1": 0.017435320584926885,
229
+ "roc_auc": 0.9721687370080279,
230
+ "pr_auc": 0.7159122424484009,
231
+ "confusion_matrix": [
232
+ [
233
+ 46387,
234
+ 10477
235
+ ],
236
+ [
237
+ 5,
238
+ 93
239
+ ]
240
+ ],
241
+ "threshold": 0.11
242
+ },
243
+ {
244
+ "precision": 0.009562982005141388,
245
+ "recall": 0.9489795918367347,
246
+ "f1": 0.018935152193830806,
247
+ "roc_auc": 0.9721687370080279,
248
+ "pr_auc": 0.7159122424484009,
249
+ "confusion_matrix": [
250
+ [
251
+ 47232,
252
+ 9632
253
+ ],
254
+ [
255
+ 5,
256
+ 93
257
+ ]
258
+ ],
259
+ "threshold": 0.12
260
+ },
261
+ {
262
+ "precision": 0.01033103754721173,
263
+ "recall": 0.9489795918367347,
264
+ "f1": 0.02043956043956044,
265
+ "roc_auc": 0.9721687370080279,
266
+ "pr_auc": 0.7159122424484009,
267
+ "confusion_matrix": [
268
+ [
269
+ 47955,
270
+ 8909
271
+ ],
272
+ [
273
+ 5,
274
+ 93
275
+ ]
276
+ ],
277
+ "threshold": 0.13
278
+ },
279
+ {
280
+ "precision": 0.011143062544931704,
281
+ "recall": 0.9489795918367347,
282
+ "f1": 0.022027475130270015,
283
+ "roc_auc": 0.9721687370080279,
284
+ "pr_auc": 0.7159122424484009,
285
+ "confusion_matrix": [
286
+ [
287
+ 48611,
288
+ 8253
289
+ ],
290
+ [
291
+ 5,
292
+ 93
293
+ ]
294
+ ],
295
+ "threshold": 0.14
296
+ },
297
+ {
298
+ "precision": 0.011935318275154004,
299
+ "recall": 0.9489795918367347,
300
+ "f1": 0.023574144486692015,
301
+ "roc_auc": 0.9721687370080279,
302
+ "pr_auc": 0.7159122424484009,
303
+ "confusion_matrix": [
304
+ [
305
+ 49165,
306
+ 7699
307
+ ],
308
+ [
309
+ 5,
310
+ 93
311
+ ]
312
+ ],
313
+ "threshold": 0.15000000000000002
314
+ },
315
+ {
316
+ "precision": 0.012781748213304012,
317
+ "recall": 0.9489795918367347,
318
+ "f1": 0.025223759153783564,
319
+ "roc_auc": 0.9721687370080279,
320
+ "pr_auc": 0.7159122424484009,
321
+ "confusion_matrix": [
322
+ [
323
+ 49681,
324
+ 7183
325
+ ],
326
+ [
327
+ 5,
328
+ 93
329
+ ]
330
+ ],
331
+ "threshold": 0.16
332
+ },
333
+ {
334
+ "precision": 0.013650374284456186,
335
+ "recall": 0.9489795918367347,
336
+ "f1": 0.02691361597453335,
337
+ "roc_auc": 0.9721687370080279,
338
+ "pr_auc": 0.7159122424484009,
339
+ "confusion_matrix": [
340
+ [
341
+ 50144,
342
+ 6720
343
+ ],
344
+ [
345
+ 5,
346
+ 93
347
+ ]
348
+ ],
349
+ "threshold": 0.17
350
+ },
351
+ {
352
+ "precision": 0.014563106796116505,
353
+ "recall": 0.9489795918367347,
354
+ "f1": 0.028685996298581123,
355
+ "roc_auc": 0.9721687370080279,
356
+ "pr_auc": 0.7159122424484009,
357
+ "confusion_matrix": [
358
+ [
359
+ 50571,
360
+ 6293
361
+ ],
362
+ [
363
+ 5,
364
+ 93
365
+ ]
366
+ ],
367
+ "threshold": 0.18000000000000002
368
+ },
369
+ {
370
+ "precision": 0.015567458988952126,
371
+ "recall": 0.9489795918367347,
372
+ "f1": 0.030632411067193676,
373
+ "roc_auc": 0.9721687370080279,
374
+ "pr_auc": 0.7159122424484009,
375
+ "confusion_matrix": [
376
+ [
377
+ 50983,
378
+ 5881
379
+ ],
380
+ [
381
+ 5,
382
+ 93
383
+ ]
384
+ ],
385
+ "threshold": 0.19
386
+ },
387
+ {
388
+ "precision": 0.016358463726884778,
389
+ "recall": 0.9387755102040817,
390
+ "f1": 0.03215658860538273,
391
+ "roc_auc": 0.9721687370080279,
392
+ "pr_auc": 0.7159122424484009,
393
+ "confusion_matrix": [
394
+ [
395
+ 51332,
396
+ 5532
397
+ ],
398
+ [
399
+ 6,
400
+ 92
401
+ ]
402
+ ],
403
+ "threshold": 0.2
404
+ },
405
+ {
406
+ "precision": 0.017355215996981702,
407
+ "recall": 0.9387755102040817,
408
+ "f1": 0.03408038525652899,
409
+ "roc_auc": 0.9721687370080279,
410
+ "pr_auc": 0.7159122424484009,
411
+ "confusion_matrix": [
412
+ [
413
+ 51655,
414
+ 5209
415
+ ],
416
+ [
417
+ 6,
418
+ 92
419
+ ]
420
+ ],
421
+ "threshold": 0.21000000000000002
422
+ },
423
+ {
424
+ "precision": 0.018236472945891785,
425
+ "recall": 0.9285714285714286,
426
+ "f1": 0.035770440251572326,
427
+ "roc_auc": 0.9721687370080279,
428
+ "pr_auc": 0.7159122424484009,
429
+ "confusion_matrix": [
430
+ [
431
+ 51965,
432
+ 4899
433
+ ],
434
+ [
435
+ 7,
436
+ 91
437
+ ]
438
+ ],
439
+ "threshold": 0.22
440
+ },
441
+ {
442
+ "precision": 0.01904761904761905,
443
+ "recall": 0.9183673469387755,
444
+ "f1": 0.037321169396641096,
445
+ "roc_auc": 0.9721687370080279,
446
+ "pr_auc": 0.7159122424484009,
447
+ "confusion_matrix": [
448
+ [
449
+ 52229,
450
+ 4635
451
+ ],
452
+ [
453
+ 8,
454
+ 90
455
+ ]
456
+ ],
457
+ "threshold": 0.23
458
+ },
459
+ {
460
+ "precision": 0.020049008687903765,
461
+ "recall": 0.9183673469387755,
462
+ "f1": 0.03924133420536298,
463
+ "roc_auc": 0.9721687370080279,
464
+ "pr_auc": 0.7159122424484009,
465
+ "confusion_matrix": [
466
+ [
467
+ 52465,
468
+ 4399
469
+ ],
470
+ [
471
+ 8,
472
+ 90
473
+ ]
474
+ ],
475
+ "threshold": 0.24000000000000002
476
+ },
477
+ {
478
+ "precision": 0.021216407355021217,
479
+ "recall": 0.9183673469387755,
480
+ "f1": 0.041474654377880185,
481
+ "roc_auc": 0.9721687370080279,
482
+ "pr_auc": 0.7159122424484009,
483
+ "confusion_matrix": [
484
+ [
485
+ 52712,
486
+ 4152
487
+ ],
488
+ [
489
+ 8,
490
+ 90
491
+ ]
492
+ ],
493
+ "threshold": 0.25
494
+ },
495
+ {
496
+ "precision": 0.0224159402241594,
497
+ "recall": 0.9183673469387755,
498
+ "f1": 0.0437636761487965,
499
+ "roc_auc": 0.9721687370080279,
500
+ "pr_auc": 0.7159122424484009,
501
+ "confusion_matrix": [
502
+ [
503
+ 52939,
504
+ 3925
505
+ ],
506
+ [
507
+ 8,
508
+ 90
509
+ ]
510
+ ],
511
+ "threshold": 0.26
512
+ },
513
+ {
514
+ "precision": 0.023578726748755566,
515
+ "recall": 0.9183673469387755,
516
+ "f1": 0.04597701149425287,
517
+ "roc_auc": 0.9721687370080279,
518
+ "pr_auc": 0.7159122424484009,
519
+ "confusion_matrix": [
520
+ [
521
+ 53137,
522
+ 3727
523
+ ],
524
+ [
525
+ 8,
526
+ 90
527
+ ]
528
+ ],
529
+ "threshold": 0.27
530
+ },
531
+ {
532
+ "precision": 0.024725274725274724,
533
+ "recall": 0.9183673469387755,
534
+ "f1": 0.048154093097913325,
535
+ "roc_auc": 0.9721687370080279,
536
+ "pr_auc": 0.7159122424484009,
537
+ "confusion_matrix": [
538
+ [
539
+ 53314,
540
+ 3550
541
+ ],
542
+ [
543
+ 8,
544
+ 90
545
+ ]
546
+ ],
547
+ "threshold": 0.28
548
+ },
549
+ {
550
+ "precision": 0.02601156069364162,
551
+ "recall": 0.9183673469387755,
552
+ "f1": 0.050590219224283306,
553
+ "roc_auc": 0.9721687370080279,
554
+ "pr_auc": 0.7159122424484009,
555
+ "confusion_matrix": [
556
+ [
557
+ 53494,
558
+ 3370
559
+ ],
560
+ [
561
+ 8,
562
+ 90
563
+ ]
564
+ ],
565
+ "threshold": 0.29000000000000004
566
+ },
567
+ {
568
+ "precision": 0.0272975432211101,
569
+ "recall": 0.9183673469387755,
570
+ "f1": 0.053019145802650956,
571
+ "roc_auc": 0.9721687370080279,
572
+ "pr_auc": 0.7159122424484009,
573
+ "confusion_matrix": [
574
+ [
575
+ 53657,
576
+ 3207
577
+ ],
578
+ [
579
+ 8,
580
+ 90
581
+ ]
582
+ ],
583
+ "threshold": 0.3
584
+ },
585
+ {
586
+ "precision": 0.028598665395614873,
587
+ "recall": 0.9183673469387755,
588
+ "f1": 0.05546995377503852,
589
+ "roc_auc": 0.9721687370080279,
590
+ "pr_auc": 0.7159122424484009,
591
+ "confusion_matrix": [
592
+ [
593
+ 53807,
594
+ 3057
595
+ ],
596
+ [
597
+ 8,
598
+ 90
599
+ ]
600
+ ],
601
+ "threshold": 0.31
602
+ },
603
+ {
604
+ "precision": 0.030010003334444816,
605
+ "recall": 0.9183673469387755,
606
+ "f1": 0.05812076202776881,
607
+ "roc_auc": 0.9721687370080279,
608
+ "pr_auc": 0.7159122424484009,
609
+ "confusion_matrix": [
610
+ [
611
+ 53955,
612
+ 2909
613
+ ],
614
+ [
615
+ 8,
616
+ 90
617
+ ]
618
+ ],
619
+ "threshold": 0.32
620
+ },
621
+ {
622
+ "precision": 0.031315240083507306,
623
+ "recall": 0.9183673469387755,
624
+ "f1": 0.06056527590847914,
625
+ "roc_auc": 0.9721687370080279,
626
+ "pr_auc": 0.7159122424484009,
627
+ "confusion_matrix": [
628
+ [
629
+ 54080,
630
+ 2784
631
+ ],
632
+ [
633
+ 8,
634
+ 90
635
+ ]
636
+ ],
637
+ "threshold": 0.33
638
+ },
639
+ {
640
+ "precision": 0.03278688524590164,
641
+ "recall": 0.9183673469387755,
642
+ "f1": 0.06331340133661625,
643
+ "roc_auc": 0.9721687370080279,
644
+ "pr_auc": 0.7159122424484009,
645
+ "confusion_matrix": [
646
+ [
647
+ 54209,
648
+ 2655
649
+ ],
650
+ [
651
+ 8,
652
+ 90
653
+ ]
654
+ ],
655
+ "threshold": 0.34
656
+ },
657
+ {
658
+ "precision": 0.03425961172440046,
659
+ "recall": 0.9183673469387755,
660
+ "f1": 0.06605504587155964,
661
+ "roc_auc": 0.9721687370080279,
662
+ "pr_auc": 0.7159122424484009,
663
+ "confusion_matrix": [
664
+ [
665
+ 54327,
666
+ 2537
667
+ ],
668
+ [
669
+ 8,
670
+ 90
671
+ ]
672
+ ],
673
+ "threshold": 0.35000000000000003
674
+ },
675
+ {
676
+ "precision": 0.03587086488640893,
677
+ "recall": 0.9183673469387755,
678
+ "f1": 0.06904487917146145,
679
+ "roc_auc": 0.9721687370080279,
680
+ "pr_auc": 0.7159122424484009,
681
+ "confusion_matrix": [
682
+ [
683
+ 54445,
684
+ 2419
685
+ ],
686
+ [
687
+ 8,
688
+ 90
689
+ ]
690
+ ],
691
+ "threshold": 0.36000000000000004
692
+ },
693
+ {
694
+ "precision": 0.037282518641259324,
695
+ "recall": 0.9183673469387755,
696
+ "f1": 0.07165605095541401,
697
+ "roc_auc": 0.9721687370080279,
698
+ "pr_auc": 0.7159122424484009,
699
+ "confusion_matrix": [
700
+ [
701
+ 54540,
702
+ 2324
703
+ ],
704
+ [
705
+ 8,
706
+ 90
707
+ ]
708
+ ],
709
+ "threshold": 0.37
710
+ },
711
+ {
712
+ "precision": 0.038860103626943004,
713
+ "recall": 0.9183673469387755,
714
+ "f1": 0.07456503728251865,
715
+ "roc_auc": 0.9721687370080279,
716
+ "pr_auc": 0.7159122424484009,
717
+ "confusion_matrix": [
718
+ [
719
+ 54638,
720
+ 2226
721
+ ],
722
+ [
723
+ 8,
724
+ 90
725
+ ]
726
+ ],
727
+ "threshold": 0.38
728
+ },
729
+ {
730
+ "precision": 0.04025044722719141,
731
+ "recall": 0.9183673469387755,
732
+ "f1": 0.07712082262210797,
733
+ "roc_auc": 0.9721687370080279,
734
+ "pr_auc": 0.7159122424484009,
735
+ "confusion_matrix": [
736
+ [
737
+ 54718,
738
+ 2146
739
+ ],
740
+ [
741
+ 8,
742
+ 90
743
+ ]
744
+ ],
745
+ "threshold": 0.39
746
+ },
747
+ {
748
+ "precision": 0.04205607476635514,
749
+ "recall": 0.9183673469387755,
750
+ "f1": 0.08042895442359249,
751
+ "roc_auc": 0.9721687370080279,
752
+ "pr_auc": 0.7159122424484009,
753
+ "confusion_matrix": [
754
+ [
755
+ 54814,
756
+ 2050
757
+ ],
758
+ [
759
+ 8,
760
+ 90
761
+ ]
762
+ ],
763
+ "threshold": 0.4
764
+ },
765
+ {
766
+ "precision": 0.043923865300146414,
767
+ "recall": 0.9183673469387755,
768
+ "f1": 0.08383791336748952,
769
+ "roc_auc": 0.9721687370080279,
770
+ "pr_auc": 0.7159122424484009,
771
+ "confusion_matrix": [
772
+ [
773
+ 54905,
774
+ 1959
775
+ ],
776
+ [
777
+ 8,
778
+ 90
779
+ ]
780
+ ],
781
+ "threshold": 0.41000000000000003
782
+ },
783
+ {
784
+ "precision": 0.045754956786985254,
785
+ "recall": 0.9183673469387755,
786
+ "f1": 0.08716707021791767,
787
+ "roc_auc": 0.9721687370080279,
788
+ "pr_auc": 0.7159122424484009,
789
+ "confusion_matrix": [
790
+ [
791
+ 54987,
792
+ 1877
793
+ ],
794
+ [
795
+ 8,
796
+ 90
797
+ ]
798
+ ],
799
+ "threshold": 0.42000000000000004
800
+ },
801
+ {
802
+ "precision": 0.04736842105263158,
803
+ "recall": 0.9183673469387755,
804
+ "f1": 0.09009009009009009,
805
+ "roc_auc": 0.9721687370080279,
806
+ "pr_auc": 0.7159122424484009,
807
+ "confusion_matrix": [
808
+ [
809
+ 55054,
810
+ 1810
811
+ ],
812
+ [
813
+ 8,
814
+ 90
815
+ ]
816
+ ],
817
+ "threshold": 0.43
818
+ },
819
+ {
820
+ "precision": 0.049099836333878884,
821
+ "recall": 0.9183673469387755,
822
+ "f1": 0.09321595028482652,
823
+ "roc_auc": 0.9721687370080279,
824
+ "pr_auc": 0.7159122424484009,
825
+ "confusion_matrix": [
826
+ [
827
+ 55121,
828
+ 1743
829
+ ],
830
+ [
831
+ 8,
832
+ 90
833
+ ]
834
+ ],
835
+ "threshold": 0.44
836
+ },
837
+ {
838
+ "precision": 0.050818746470920384,
839
+ "recall": 0.9183673469387755,
840
+ "f1": 0.09630818619582665,
841
+ "roc_auc": 0.9721687370080279,
842
+ "pr_auc": 0.7159122424484009,
843
+ "confusion_matrix": [
844
+ [
845
+ 55183,
846
+ 1681
847
+ ],
848
+ [
849
+ 8,
850
+ 90
851
+ ]
852
+ ],
853
+ "threshold": 0.45
854
+ },
855
+ {
856
+ "precision": 0.052508751458576426,
857
+ "recall": 0.9183673469387755,
858
+ "f1": 0.09933774834437085,
859
+ "roc_auc": 0.9721687370080279,
860
+ "pr_auc": 0.7159122424484009,
861
+ "confusion_matrix": [
862
+ [
863
+ 55240,
864
+ 1624
865
+ ],
866
+ [
867
+ 8,
868
+ 90
869
+ ]
870
+ ],
871
+ "threshold": 0.46
872
+ },
873
+ {
874
+ "precision": 0.054678007290400975,
875
+ "recall": 0.9183673469387755,
876
+ "f1": 0.10321100917431193,
877
+ "roc_auc": 0.9721687370080279,
878
+ "pr_auc": 0.7159122424484009,
879
+ "confusion_matrix": [
880
+ [
881
+ 55308,
882
+ 1556
883
+ ],
884
+ [
885
+ 8,
886
+ 90
887
+ ]
888
+ ],
889
+ "threshold": 0.47000000000000003
890
+ },
891
+ {
892
+ "precision": 0.056568196103079824,
893
+ "recall": 0.9183673469387755,
894
+ "f1": 0.10657193605683836,
895
+ "roc_auc": 0.9721687370080279,
896
+ "pr_auc": 0.7159122424484009,
897
+ "confusion_matrix": [
898
+ [
899
+ 55363,
900
+ 1501
901
+ ],
902
+ [
903
+ 8,
904
+ 90
905
+ ]
906
+ ],
907
+ "threshold": 0.48000000000000004
908
+ },
909
+ {
910
+ "precision": 0.05870841487279843,
911
+ "recall": 0.9183673469387755,
912
+ "f1": 0.11036174126302882,
913
+ "roc_auc": 0.9721687370080279,
914
+ "pr_auc": 0.7159122424484009,
915
+ "confusion_matrix": [
916
+ [
917
+ 55421,
918
+ 1443
919
+ ],
920
+ [
921
+ 8,
922
+ 90
923
+ ]
924
+ ],
925
+ "threshold": 0.49
926
+ },
927
+ {
928
+ "precision": 0.06097560975609756,
929
+ "recall": 0.9183673469387755,
930
+ "f1": 0.11435832274459974,
931
+ "roc_auc": 0.9721687370080279,
932
+ "pr_auc": 0.7159122424484009,
933
+ "confusion_matrix": [
934
+ [
935
+ 55478,
936
+ 1386
937
+ ],
938
+ [
939
+ 8,
940
+ 90
941
+ ]
942
+ ],
943
+ "threshold": 0.5
944
+ },
945
+ {
946
+ "precision": 0.06382978723404255,
947
+ "recall": 0.9183673469387755,
948
+ "f1": 0.11936339522546419,
949
+ "roc_auc": 0.9721687370080279,
950
+ "pr_auc": 0.7159122424484009,
951
+ "confusion_matrix": [
952
+ [
953
+ 55544,
954
+ 1320
955
+ ],
956
+ [
957
+ 8,
958
+ 90
959
+ ]
960
+ ],
961
+ "threshold": 0.51
962
+ },
963
+ {
964
+ "precision": 0.06642066420664207,
965
+ "recall": 0.9183673469387755,
966
+ "f1": 0.12388162422573985,
967
+ "roc_auc": 0.9721687370080279,
968
+ "pr_auc": 0.7159122424484009,
969
+ "confusion_matrix": [
970
+ [
971
+ 55599,
972
+ 1265
973
+ ],
974
+ [
975
+ 8,
976
+ 90
977
+ ]
978
+ ],
979
+ "threshold": 0.52
980
+ },
981
+ {
982
+ "precision": 0.06813020439061317,
983
+ "recall": 0.9183673469387755,
984
+ "f1": 0.12684989429175475,
985
+ "roc_auc": 0.9721687370080279,
986
+ "pr_auc": 0.7159122424484009,
987
+ "confusion_matrix": [
988
+ [
989
+ 55633,
990
+ 1231
991
+ ],
992
+ [
993
+ 8,
994
+ 90
995
+ ]
996
+ ],
997
+ "threshold": 0.53
998
+ },
999
+ {
1000
+ "precision": 0.0706436420722135,
1001
+ "recall": 0.9183673469387755,
1002
+ "f1": 0.13119533527696792,
1003
+ "roc_auc": 0.9721687370080279,
1004
+ "pr_auc": 0.7159122424484009,
1005
+ "confusion_matrix": [
1006
+ [
1007
+ 55680,
1008
+ 1184
1009
+ ],
1010
+ [
1011
+ 8,
1012
+ 90
1013
+ ]
1014
+ ],
1015
+ "threshold": 0.54
1016
+ },
1017
+ {
1018
+ "precision": 0.07317073170731707,
1019
+ "recall": 0.9183673469387755,
1020
+ "f1": 0.1355421686746988,
1021
+ "roc_auc": 0.9721687370080279,
1022
+ "pr_auc": 0.7159122424484009,
1023
+ "confusion_matrix": [
1024
+ [
1025
+ 55724,
1026
+ 1140
1027
+ ],
1028
+ [
1029
+ 8,
1030
+ 90
1031
+ ]
1032
+ ],
1033
+ "threshold": 0.55
1034
+ },
1035
+ {
1036
+ "precision": 0.0760777683854607,
1037
+ "recall": 0.9183673469387755,
1038
+ "f1": 0.1405152224824356,
1039
+ "roc_auc": 0.9721687370080279,
1040
+ "pr_auc": 0.7159122424484009,
1041
+ "confusion_matrix": [
1042
+ [
1043
+ 55771,
1044
+ 1093
1045
+ ],
1046
+ [
1047
+ 8,
1048
+ 90
1049
+ ]
1050
+ ],
1051
+ "threshold": 0.56
1052
+ },
1053
+ {
1054
+ "precision": 0.07853403141361257,
1055
+ "recall": 0.9183673469387755,
1056
+ "f1": 0.14469453376205788,
1057
+ "roc_auc": 0.9721687370080279,
1058
+ "pr_auc": 0.7159122424484009,
1059
+ "confusion_matrix": [
1060
+ [
1061
+ 55808,
1062
+ 1056
1063
+ ],
1064
+ [
1065
+ 8,
1066
+ 90
1067
+ ]
1068
+ ],
1069
+ "threshold": 0.5700000000000001
1070
+ },
1071
+ {
1072
+ "precision": 0.0820419325432999,
1073
+ "recall": 0.9183673469387755,
1074
+ "f1": 0.1506276150627615,
1075
+ "roc_auc": 0.9721687370080279,
1076
+ "pr_auc": 0.7159122424484009,
1077
+ "confusion_matrix": [
1078
+ [
1079
+ 55857,
1080
+ 1007
1081
+ ],
1082
+ [
1083
+ 8,
1084
+ 90
1085
+ ]
1086
+ ],
1087
+ "threshold": 0.5800000000000001
1088
+ },
1089
+ {
1090
+ "precision": 0.08458646616541353,
1091
+ "recall": 0.9183673469387755,
1092
+ "f1": 0.1549053356282272,
1093
+ "roc_auc": 0.9721687370080279,
1094
+ "pr_auc": 0.7159122424484009,
1095
+ "confusion_matrix": [
1096
+ [
1097
+ 55890,
1098
+ 974
1099
+ ],
1100
+ [
1101
+ 8,
1102
+ 90
1103
+ ]
1104
+ ],
1105
+ "threshold": 0.59
1106
+ },
1107
+ {
1108
+ "precision": 0.0866601752677702,
1109
+ "recall": 0.9081632653061225,
1110
+ "f1": 0.1582222222222222,
1111
+ "roc_auc": 0.9721687370080279,
1112
+ "pr_auc": 0.7159122424484009,
1113
+ "confusion_matrix": [
1114
+ [
1115
+ 55926,
1116
+ 938
1117
+ ],
1118
+ [
1119
+ 9,
1120
+ 89
1121
+ ]
1122
+ ],
1123
+ "threshold": 0.6
1124
+ },
1125
+ {
1126
+ "precision": 0.09035532994923857,
1127
+ "recall": 0.9081632653061225,
1128
+ "f1": 0.16435826408125578,
1129
+ "roc_auc": 0.9721687370080279,
1130
+ "pr_auc": 0.7159122424484009,
1131
+ "confusion_matrix": [
1132
+ [
1133
+ 55968,
1134
+ 896
1135
+ ],
1136
+ [
1137
+ 9,
1138
+ 89
1139
+ ]
1140
+ ],
1141
+ "threshold": 0.61
1142
+ },
1143
+ {
1144
+ "precision": 0.09290187891440502,
1145
+ "recall": 0.9081632653061225,
1146
+ "f1": 0.16856060606060605,
1147
+ "roc_auc": 0.9721687370080279,
1148
+ "pr_auc": 0.7159122424484009,
1149
+ "confusion_matrix": [
1150
+ [
1151
+ 55995,
1152
+ 869
1153
+ ],
1154
+ [
1155
+ 9,
1156
+ 89
1157
+ ]
1158
+ ],
1159
+ "threshold": 0.62
1160
+ },
1161
+ {
1162
+ "precision": 0.09611231101511879,
1163
+ "recall": 0.9081632653061225,
1164
+ "f1": 0.173828125,
1165
+ "roc_auc": 0.9721687370080279,
1166
+ "pr_auc": 0.7159122424484009,
1167
+ "confusion_matrix": [
1168
+ [
1169
+ 56027,
1170
+ 837
1171
+ ],
1172
+ [
1173
+ 9,
1174
+ 89
1175
+ ]
1176
+ ],
1177
+ "threshold": 0.63
1178
+ },
1179
+ {
1180
+ "precision": 0.09866962305986696,
1181
+ "recall": 0.9081632653061225,
1182
+ "f1": 0.178,
1183
+ "roc_auc": 0.9721687370080279,
1184
+ "pr_auc": 0.7159122424484009,
1185
+ "confusion_matrix": [
1186
+ [
1187
+ 56051,
1188
+ 813
1189
+ ],
1190
+ [
1191
+ 9,
1192
+ 89
1193
+ ]
1194
+ ],
1195
+ "threshold": 0.64
1196
+ },
1197
+ {
1198
+ "precision": 0.10194730813287514,
1199
+ "recall": 0.9081632653061225,
1200
+ "f1": 0.18331616889804325,
1201
+ "roc_auc": 0.9721687370080279,
1202
+ "pr_auc": 0.7159122424484009,
1203
+ "confusion_matrix": [
1204
+ [
1205
+ 56080,
1206
+ 784
1207
+ ],
1208
+ [
1209
+ 9,
1210
+ 89
1211
+ ]
1212
+ ],
1213
+ "threshold": 0.65
1214
+ },
1215
+ {
1216
+ "precision": 0.10620525059665871,
1217
+ "recall": 0.9081632653061225,
1218
+ "f1": 0.19017094017094016,
1219
+ "roc_auc": 0.9721687370080279,
1220
+ "pr_auc": 0.7159122424484009,
1221
+ "confusion_matrix": [
1222
+ [
1223
+ 56115,
1224
+ 749
1225
+ ],
1226
+ [
1227
+ 9,
1228
+ 89
1229
+ ]
1230
+ ],
1231
+ "threshold": 0.66
1232
+ },
1233
+ {
1234
+ "precision": 0.11014851485148515,
1235
+ "recall": 0.9081632653061225,
1236
+ "f1": 0.19646799116997793,
1237
+ "roc_auc": 0.9721687370080279,
1238
+ "pr_auc": 0.7159122424484009,
1239
+ "confusion_matrix": [
1240
+ [
1241
+ 56145,
1242
+ 719
1243
+ ],
1244
+ [
1245
+ 9,
1246
+ 89
1247
+ ]
1248
+ ],
1249
+ "threshold": 0.67
1250
+ },
1251
+ {
1252
+ "precision": 0.11424903722721438,
1253
+ "recall": 0.9081632653061225,
1254
+ "f1": 0.20296465222348917,
1255
+ "roc_auc": 0.9721687370080279,
1256
+ "pr_auc": 0.7159122424484009,
1257
+ "confusion_matrix": [
1258
+ [
1259
+ 56174,
1260
+ 690
1261
+ ],
1262
+ [
1263
+ 9,
1264
+ 89
1265
+ ]
1266
+ ],
1267
+ "threshold": 0.68
1268
+ },
1269
+ {
1270
+ "precision": 0.11772486772486772,
1271
+ "recall": 0.9081632653061225,
1272
+ "f1": 0.20843091334894615,
1273
+ "roc_auc": 0.9721687370080279,
1274
+ "pr_auc": 0.7159122424484009,
1275
+ "confusion_matrix": [
1276
+ [
1277
+ 56197,
1278
+ 667
1279
+ ],
1280
+ [
1281
+ 9,
1282
+ 89
1283
+ ]
1284
+ ],
1285
+ "threshold": 0.6900000000000001
1286
+ },
1287
+ {
1288
+ "precision": 0.12141882673942701,
1289
+ "recall": 0.9081632653061225,
1290
+ "f1": 0.21419975932611313,
1291
+ "roc_auc": 0.9721687370080279,
1292
+ "pr_auc": 0.7159122424484009,
1293
+ "confusion_matrix": [
1294
+ [
1295
+ 56220,
1296
+ 644
1297
+ ],
1298
+ [
1299
+ 9,
1300
+ 89
1301
+ ]
1302
+ ],
1303
+ "threshold": 0.7000000000000001
1304
+ },
1305
+ {
1306
+ "precision": 0.12588401697312587,
1307
+ "recall": 0.9081632653061225,
1308
+ "f1": 0.22111801242236026,
1309
+ "roc_auc": 0.9721687370080279,
1310
+ "pr_auc": 0.7159122424484009,
1311
+ "confusion_matrix": [
1312
+ [
1313
+ 56246,
1314
+ 618
1315
+ ],
1316
+ [
1317
+ 9,
1318
+ 89
1319
+ ]
1320
+ ],
1321
+ "threshold": 0.7100000000000001
1322
+ },
1323
+ {
1324
+ "precision": 0.12936046511627908,
1325
+ "recall": 0.9081632653061225,
1326
+ "f1": 0.22646310432569974,
1327
+ "roc_auc": 0.9721687370080279,
1328
+ "pr_auc": 0.7159122424484009,
1329
+ "confusion_matrix": [
1330
+ [
1331
+ 56265,
1332
+ 599
1333
+ ],
1334
+ [
1335
+ 9,
1336
+ 89
1337
+ ]
1338
+ ],
1339
+ "threshold": 0.72
1340
+ },
1341
+ {
1342
+ "precision": 0.13343328335832083,
1343
+ "recall": 0.9081632653061225,
1344
+ "f1": 0.2326797385620915,
1345
+ "roc_auc": 0.9721687370080279,
1346
+ "pr_auc": 0.7159122424484009,
1347
+ "confusion_matrix": [
1348
+ [
1349
+ 56286,
1350
+ 578
1351
+ ],
1352
+ [
1353
+ 9,
1354
+ 89
1355
+ ]
1356
+ ],
1357
+ "threshold": 0.73
1358
+ },
1359
+ {
1360
+ "precision": 0.13650306748466257,
1361
+ "recall": 0.9081632653061225,
1362
+ "f1": 0.23733333333333334,
1363
+ "roc_auc": 0.9721687370080279,
1364
+ "pr_auc": 0.7159122424484009,
1365
+ "confusion_matrix": [
1366
+ [
1367
+ 56301,
1368
+ 563
1369
+ ],
1370
+ [
1371
+ 9,
1372
+ 89
1373
+ ]
1374
+ ],
1375
+ "threshold": 0.74
1376
+ },
1377
+ {
1378
+ "precision": 0.14012738853503184,
1379
+ "recall": 0.8979591836734694,
1380
+ "f1": 0.24242424242424243,
1381
+ "roc_auc": 0.9721687370080279,
1382
+ "pr_auc": 0.7159122424484009,
1383
+ "confusion_matrix": [
1384
+ [
1385
+ 56324,
1386
+ 540
1387
+ ],
1388
+ [
1389
+ 10,
1390
+ 88
1391
+ ]
1392
+ ],
1393
+ "threshold": 0.75
1394
+ },
1395
+ {
1396
+ "precision": 0.14402618657937807,
1397
+ "recall": 0.8979591836734694,
1398
+ "f1": 0.24823695345557123,
1399
+ "roc_auc": 0.9721687370080279,
1400
+ "pr_auc": 0.7159122424484009,
1401
+ "confusion_matrix": [
1402
+ [
1403
+ 56341,
1404
+ 523
1405
+ ],
1406
+ [
1407
+ 10,
1408
+ 88
1409
+ ]
1410
+ ],
1411
+ "threshold": 0.76
1412
+ },
1413
+ {
1414
+ "precision": 0.14864864864864866,
1415
+ "recall": 0.8979591836734694,
1416
+ "f1": 0.25507246376811593,
1417
+ "roc_auc": 0.9721687370080279,
1418
+ "pr_auc": 0.7159122424484009,
1419
+ "confusion_matrix": [
1420
+ [
1421
+ 56360,
1422
+ 504
1423
+ ],
1424
+ [
1425
+ 10,
1426
+ 88
1427
+ ]
1428
+ ],
1429
+ "threshold": 0.77
1430
+ },
1431
+ {
1432
+ "precision": 0.15198618307426598,
1433
+ "recall": 0.8979591836734694,
1434
+ "f1": 0.25997045790251105,
1435
+ "roc_auc": 0.9721687370080279,
1436
+ "pr_auc": 0.7159122424484009,
1437
+ "confusion_matrix": [
1438
+ [
1439
+ 56373,
1440
+ 491
1441
+ ],
1442
+ [
1443
+ 10,
1444
+ 88
1445
+ ]
1446
+ ],
1447
+ "threshold": 0.78
1448
+ },
1449
+ {
1450
+ "precision": 0.15630550621669628,
1451
+ "recall": 0.8979591836734694,
1452
+ "f1": 0.26626323751891073,
1453
+ "roc_auc": 0.9721687370080279,
1454
+ "pr_auc": 0.7159122424484009,
1455
+ "confusion_matrix": [
1456
+ [
1457
+ 56389,
1458
+ 475
1459
+ ],
1460
+ [
1461
+ 10,
1462
+ 88
1463
+ ]
1464
+ ],
1465
+ "threshold": 0.79
1466
+ },
1467
+ {
1468
+ "precision": 0.16087751371115175,
1469
+ "recall": 0.8979591836734694,
1470
+ "f1": 0.27286821705426356,
1471
+ "roc_auc": 0.9721687370080279,
1472
+ "pr_auc": 0.7159122424484009,
1473
+ "confusion_matrix": [
1474
+ [
1475
+ 56405,
1476
+ 459
1477
+ ],
1478
+ [
1479
+ 10,
1480
+ 88
1481
+ ]
1482
+ ],
1483
+ "threshold": 0.8
1484
+ },
1485
+ {
1486
+ "precision": 0.1638418079096045,
1487
+ "recall": 0.8877551020408163,
1488
+ "f1": 0.2766295707472178,
1489
+ "roc_auc": 0.9721687370080279,
1490
+ "pr_auc": 0.7159122424484009,
1491
+ "confusion_matrix": [
1492
+ [
1493
+ 56420,
1494
+ 444
1495
+ ],
1496
+ [
1497
+ 11,
1498
+ 87
1499
+ ]
1500
+ ],
1501
+ "threshold": 0.81
1502
+ },
1503
+ {
1504
+ "precision": 0.17058823529411765,
1505
+ "recall": 0.8877551020408163,
1506
+ "f1": 0.28618421052631576,
1507
+ "roc_auc": 0.9721687370080279,
1508
+ "pr_auc": 0.7159122424484009,
1509
+ "confusion_matrix": [
1510
+ [
1511
+ 56441,
1512
+ 423
1513
+ ],
1514
+ [
1515
+ 11,
1516
+ 87
1517
+ ]
1518
+ ],
1519
+ "threshold": 0.8200000000000001
1520
+ },
1521
+ {
1522
+ "precision": 0.174,
1523
+ "recall": 0.8877551020408163,
1524
+ "f1": 0.2909698996655518,
1525
+ "roc_auc": 0.9721687370080279,
1526
+ "pr_auc": 0.7159122424484009,
1527
+ "confusion_matrix": [
1528
+ [
1529
+ 56451,
1530
+ 413
1531
+ ],
1532
+ [
1533
+ 11,
1534
+ 87
1535
+ ]
1536
+ ],
1537
+ "threshold": 0.8300000000000001
1538
+ },
1539
+ {
1540
+ "precision": 0.1797520661157025,
1541
+ "recall": 0.8877551020408163,
1542
+ "f1": 0.29896907216494845,
1543
+ "roc_auc": 0.9721687370080279,
1544
+ "pr_auc": 0.7159122424484009,
1545
+ "confusion_matrix": [
1546
+ [
1547
+ 56467,
1548
+ 397
1549
+ ],
1550
+ [
1551
+ 11,
1552
+ 87
1553
+ ]
1554
+ ],
1555
+ "threshold": 0.8400000000000001
1556
+ },
1557
+ {
1558
+ "precision": 0.18471337579617833,
1559
+ "recall": 0.8877551020408163,
1560
+ "f1": 0.30579964850615116,
1561
+ "roc_auc": 0.9721687370080279,
1562
+ "pr_auc": 0.7159122424484009,
1563
+ "confusion_matrix": [
1564
+ [
1565
+ 56480,
1566
+ 384
1567
+ ],
1568
+ [
1569
+ 11,
1570
+ 87
1571
+ ]
1572
+ ],
1573
+ "threshold": 0.85
1574
+ },
1575
+ {
1576
+ "precision": 0.19506726457399104,
1577
+ "recall": 0.8877551020408163,
1578
+ "f1": 0.31985294117647056,
1579
+ "roc_auc": 0.9721687370080279,
1580
+ "pr_auc": 0.7159122424484009,
1581
+ "confusion_matrix": [
1582
+ [
1583
+ 56505,
1584
+ 359
1585
+ ],
1586
+ [
1587
+ 11,
1588
+ 87
1589
+ ]
1590
+ ],
1591
+ "threshold": 0.86
1592
+ },
1593
+ {
1594
+ "precision": 0.20374707259953162,
1595
+ "recall": 0.8877551020408163,
1596
+ "f1": 0.3314285714285714,
1597
+ "roc_auc": 0.9721687370080279,
1598
+ "pr_auc": 0.7159122424484009,
1599
+ "confusion_matrix": [
1600
+ [
1601
+ 56524,
1602
+ 340
1603
+ ],
1604
+ [
1605
+ 11,
1606
+ 87
1607
+ ]
1608
+ ],
1609
+ "threshold": 0.87
1610
+ },
1611
+ {
1612
+ "precision": 0.21375921375921375,
1613
+ "recall": 0.8877551020408163,
1614
+ "f1": 0.3445544554455445,
1615
+ "roc_auc": 0.9721687370080279,
1616
+ "pr_auc": 0.7159122424484009,
1617
+ "confusion_matrix": [
1618
+ [
1619
+ 56544,
1620
+ 320
1621
+ ],
1622
+ [
1623
+ 11,
1624
+ 87
1625
+ ]
1626
+ ],
1627
+ "threshold": 0.88
1628
+ },
1629
+ {
1630
+ "precision": 0.2265625,
1631
+ "recall": 0.8877551020408163,
1632
+ "f1": 0.36099585062240663,
1633
+ "roc_auc": 0.9721687370080279,
1634
+ "pr_auc": 0.7159122424484009,
1635
+ "confusion_matrix": [
1636
+ [
1637
+ 56567,
1638
+ 297
1639
+ ],
1640
+ [
1641
+ 11,
1642
+ 87
1643
+ ]
1644
+ ],
1645
+ "threshold": 0.89
1646
+ },
1647
+ {
1648
+ "precision": 0.24507042253521127,
1649
+ "recall": 0.8877551020408163,
1650
+ "f1": 0.3841059602649007,
1651
+ "roc_auc": 0.9721687370080279,
1652
+ "pr_auc": 0.7159122424484009,
1653
+ "confusion_matrix": [
1654
+ [
1655
+ 56596,
1656
+ 268
1657
+ ],
1658
+ [
1659
+ 11,
1660
+ 87
1661
+ ]
1662
+ ],
1663
+ "threshold": 0.9
1664
+ },
1665
+ {
1666
+ "precision": 0.2636363636363636,
1667
+ "recall": 0.8877551020408163,
1668
+ "f1": 0.40654205607476634,
1669
+ "roc_auc": 0.9721687370080279,
1670
+ "pr_auc": 0.7159122424484009,
1671
+ "confusion_matrix": [
1672
+ [
1673
+ 56621,
1674
+ 243
1675
+ ],
1676
+ [
1677
+ 11,
1678
+ 87
1679
+ ]
1680
+ ],
1681
+ "threshold": 0.91
1682
+ },
1683
+ {
1684
+ "precision": 0.28618421052631576,
1685
+ "recall": 0.8877551020408163,
1686
+ "f1": 0.43283582089552236,
1687
+ "roc_auc": 0.9721687370080279,
1688
+ "pr_auc": 0.7159122424484009,
1689
+ "confusion_matrix": [
1690
+ [
1691
+ 56647,
1692
+ 217
1693
+ ],
1694
+ [
1695
+ 11,
1696
+ 87
1697
+ ]
1698
+ ],
1699
+ "threshold": 0.92
1700
+ },
1701
+ {
1702
+ "precision": 0.3246268656716418,
1703
+ "recall": 0.8877551020408163,
1704
+ "f1": 0.47540983606557374,
1705
+ "roc_auc": 0.9721687370080279,
1706
+ "pr_auc": 0.7159122424484009,
1707
+ "confusion_matrix": [
1708
+ [
1709
+ 56683,
1710
+ 181
1711
+ ],
1712
+ [
1713
+ 11,
1714
+ 87
1715
+ ]
1716
+ ],
1717
+ "threshold": 0.93
1718
+ },
1719
+ {
1720
+ "precision": 0.35080645161290325,
1721
+ "recall": 0.8877551020408163,
1722
+ "f1": 0.5028901734104047,
1723
+ "roc_auc": 0.9721687370080279,
1724
+ "pr_auc": 0.7159122424484009,
1725
+ "confusion_matrix": [
1726
+ [
1727
+ 56703,
1728
+ 161
1729
+ ],
1730
+ [
1731
+ 11,
1732
+ 87
1733
+ ]
1734
+ ],
1735
+ "threshold": 0.9400000000000001
1736
+ },
1737
+ {
1738
+ "precision": 0.3918918918918919,
1739
+ "recall": 0.8877551020408163,
1740
+ "f1": 0.54375,
1741
+ "roc_auc": 0.9721687370080279,
1742
+ "pr_auc": 0.7159122424484009,
1743
+ "confusion_matrix": [
1744
+ [
1745
+ 56729,
1746
+ 135
1747
+ ],
1748
+ [
1749
+ 11,
1750
+ 87
1751
+ ]
1752
+ ],
1753
+ "threshold": 0.9500000000000001
1754
+ },
1755
+ {
1756
+ "precision": 0.44387755102040816,
1757
+ "recall": 0.8877551020408163,
1758
+ "f1": 0.5918367346938775,
1759
+ "roc_auc": 0.9721687370080279,
1760
+ "pr_auc": 0.7159122424484009,
1761
+ "confusion_matrix": [
1762
+ [
1763
+ 56755,
1764
+ 109
1765
+ ],
1766
+ [
1767
+ 11,
1768
+ 87
1769
+ ]
1770
+ ],
1771
+ "threshold": 0.9600000000000001
1772
+ },
1773
+ {
1774
+ "precision": 0.47802197802197804,
1775
+ "recall": 0.8877551020408163,
1776
+ "f1": 0.6214285714285714,
1777
+ "roc_auc": 0.9721687370080279,
1778
+ "pr_auc": 0.7159122424484009,
1779
+ "confusion_matrix": [
1780
+ [
1781
+ 56769,
1782
+ 95
1783
+ ],
1784
+ [
1785
+ 11,
1786
+ 87
1787
+ ]
1788
+ ],
1789
+ "threshold": 0.97
1790
+ },
1791
+ {
1792
+ "precision": 0.5151515151515151,
1793
+ "recall": 0.8673469387755102,
1794
+ "f1": 0.6463878326996197,
1795
+ "roc_auc": 0.9721687370080279,
1796
+ "pr_auc": 0.7159122424484009,
1797
+ "confusion_matrix": [
1798
+ [
1799
+ 56784,
1800
+ 80
1801
+ ],
1802
+ [
1803
+ 13,
1804
+ 85
1805
+ ]
1806
+ ],
1807
+ "threshold": 0.98
1808
+ },
1809
+ {
1810
+ "precision": 0.5763888888888888,
1811
+ "recall": 0.8469387755102041,
1812
+ "f1": 0.6859504132231405,
1813
+ "roc_auc": 0.9721687370080279,
1814
+ "pr_auc": 0.7159122424484009,
1815
+ "confusion_matrix": [
1816
+ [
1817
+ 56803,
1818
+ 61
1819
+ ],
1820
+ [
1821
+ 15,
1822
+ 83
1823
+ ]
1824
+ ],
1825
+ "threshold": 0.99
1826
+ }
1827
+ ]
1828
+ },
1829
+ "evaluation_summary": {
1830
+ "test_rows": 56962,
1831
+ "min_recall_target": 0.9,
1832
+ "selection_reason": "meets_min_recall"
1833
+ }
1834
+ }
artifacts/model_training_report.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-02-18T04:09:02.997602+00:00",
3
+ "experiment_name": "fraud-detection-baseline",
4
+ "tracking_uri": "file:./mlruns",
5
+ "data_path": "data/raw/creditcard.csv",
6
+ "preprocessor_path": "models/preprocessor.pkl",
7
+ "model_path": "models/model.pkl",
8
+ "model_report_path": "artifacts/model_report.json",
9
+ "best_model": {
10
+ "model_name": "logistic_regression",
11
+ "run_id": "f953d6a1c2d944338f8fc210408267a9",
12
+ "metrics": {
13
+ "precision": 0.06097560975609756,
14
+ "recall": 0.9183673469387755,
15
+ "f1": 0.11435832274459974,
16
+ "roc_auc": 0.9721687370080279,
17
+ "pr_auc": 0.7159122424484009,
18
+ "confusion_matrix": [
19
+ [
20
+ 55478,
21
+ 1386
22
+ ],
23
+ [
24
+ 8,
25
+ 90
26
+ ]
27
+ ]
28
+ },
29
+ "selected_threshold": 0.74,
30
+ "threshold_metrics": {
31
+ "precision": 0.13650306748466257,
32
+ "recall": 0.9081632653061225,
33
+ "f1": 0.23733333333333334,
34
+ "roc_auc": 0.9721687370080279,
35
+ "pr_auc": 0.7159122424484009,
36
+ "confusion_matrix": [
37
+ [
38
+ 56301,
39
+ 563
40
+ ],
41
+ [
42
+ 9,
43
+ 89
44
+ ]
45
+ ],
46
+ "threshold": 0.74
47
+ }
48
+ },
49
+ "all_results": [
50
+ {
51
+ "model_name": "logistic_regression",
52
+ "run_id": "f953d6a1c2d944338f8fc210408267a9",
53
+ "metrics": {
54
+ "precision": 0.06097560975609756,
55
+ "recall": 0.9183673469387755,
56
+ "f1": 0.11435832274459974,
57
+ "roc_auc": 0.9721687370080279,
58
+ "pr_auc": 0.7159122424484009,
59
+ "confusion_matrix": [
60
+ [
61
+ 55478,
62
+ 1386
63
+ ],
64
+ [
65
+ 8,
66
+ 90
67
+ ]
68
+ ]
69
+ }
70
+ },
71
+ {
72
+ "model_name": "xgboost",
73
+ "run_id": "0ad9425817db4958a142b29f816108f4",
74
+ "metrics": {
75
+ "precision": 0.9186046511627907,
76
+ "recall": 0.8061224489795918,
77
+ "f1": 0.8586956521739131,
78
+ "roc_auc": 0.9775147361983623,
79
+ "pr_auc": 0.87487299490182,
80
+ "confusion_matrix": [
81
+ [
82
+ 56857,
83
+ 7
84
+ ],
85
+ [
86
+ 19,
87
+ 79
88
+ ]
89
+ ]
90
+ }
91
+ }
92
+ ],
93
+ "skipped_models": []
94
+ }
configs/logging.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1
2
+ formatters:
3
+ standard:
4
+ format: '%(asctime)s | %(levelname)s | %(name)s | %(message)s'
5
+ handlers:
6
+ console:
7
+ class: logging.StreamHandler
8
+ formatter: standard
9
+ level: INFO
10
+ root:
11
+ handlers: [console]
12
+ level: INFO
configs/train.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment:
2
+ name: fraud-detection-baseline
3
+
4
+ training:
5
+ test_size: 0.2
6
+ random_state: 42
7
+ imbalance_method: class_weight
8
+ models:
9
+ - logistic_regression
10
+ - xgboost
11
+
12
+ mlflow:
13
+ tracking_uri: file:./mlruns
14
+
15
+ threshold:
16
+ decision_threshold: 0.5
17
+ min_recall_target: 0.9
18
+ min_threshold: 0.01
19
+ max_threshold: 0.99
20
+ grid_size: 99
docker-compose.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ api:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ image: fraud-detection-api:latest
7
+ container_name: fraud-detection-api
8
+ restart: unless-stopped
9
+ ports:
10
+ - "8000:8000"
11
+ environment:
12
+ - PYTHONUNBUFFERED=1
13
+ healthcheck:
14
+ test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"]
15
+ interval: 30s
16
+ timeout: 5s
17
+ retries: 3
18
+ start_period: 20s
models/logistic_regression.pkl ADDED
Binary file (1.54 kB). View file
 
models/model.pkl ADDED
Binary file (1.54 kB). View file
 
models/preprocessor.pkl ADDED
Binary file (2.68 kB). View file
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "fraud-detection-mlops-pipeline"
3
+ version = "0.1.0"
4
+ description = "Fraud detection MLOps pipeline: data validation, training, evaluation, and serving API"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "fastapi>=0.110,<0.116",
9
+ "httpx>=0.27,<0.29",
10
+ "imbalanced-learn>=0.12,<0.15",
11
+ "mlflow>=2.11,<3.0",
12
+ "numpy>=1.26,<3.0",
13
+ "pandas>=2.1,<2.4",
14
+ "pydantic>=2.6,<3.0",
15
+ "pytest>=8.0,<9.0",
16
+ "pytest-cov>=5.0,<7.0",
17
+ "python-dotenv>=1.0,<2.0",
18
+ "pyyaml>=6.0,<7.0",
19
+ "scikit-learn>=1.4,<1.8",
20
+ "uvicorn[standard]>=0.29,<0.36",
21
+ "xgboost>=2.0,<3.0",
22
+ ]
pytest.ini ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [pytest]
2
+ addopts = -q --cov=src --cov=api --cov-report=term-missing --cov-fail-under=80
3
+ testpaths = tests
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.26,<3.0
2
+ pandas>=2.1,<2.4
3
+ scikit-learn>=1.4,<1.8
4
+ imbalanced-learn>=0.12,<0.15
5
+ xgboost>=2.0,<3.0
6
+ mlflow>=2.11,<3.0
7
+ fastapi>=0.110,<0.116
8
+ uvicorn[standard]>=0.29,<0.36
9
+ pydantic>=2.6,<3.0
10
+ python-dotenv>=1.0,<2.0
11
+ pyyaml>=6.0,<7.0
12
+ pytest>=8.0,<9.0
13
+ pytest-cov>=5.0,<7.0
14
+ httpx>=0.27,<0.29
src/__init__.py ADDED
File without changes
src/data_ingestion.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data ingestion and validation utilities for the fraud dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pandas as pd
11
+
12
+ EXPECTED_ROW_COUNT = 284_807
13
+ EXPECTED_COLUMNS = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount", "Class"]
14
+ EXPECTED_CLASS_VALUES = {0, 1}
15
+
16
+
17
def load_data(file_path: str | Path) -> pd.DataFrame:
    """Read the raw fraud dataset from a CSV file into a DataFrame.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the file extension is not ``.csv``.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Dataset not found: {csv_path}")
    if csv_path.suffix.lower() != ".csv":
        raise ValueError(f"Expected a CSV file, got: {csv_path.suffix}")
    return pd.read_csv(csv_path)
25
+
26
+
27
def get_data_statistics(df: pd.DataFrame) -> dict[str, Any]:
    """Summarize the dataset for validation and monitoring.

    ``class_counts`` and ``fraud_ratio`` are populated only when a
    ``Class`` column is present; ``fraud_ratio`` stays ``None`` for an
    empty frame to avoid a zero division.
    """
    counts_by_class: dict[str, int] = {}
    fraud_ratio: float | None = None

    if "Class" in df.columns:
        counts_by_class = {
            str(label): int(count)
            for label, count in df["Class"].value_counts(dropna=False).items()
        }
        if len(df) > 0:
            fraud_ratio = float((df["Class"] == 1).sum() / len(df))

    n_rows, n_cols = df.shape
    return {
        "row_count": int(n_rows),
        "column_count": int(n_cols),
        "missing_values_total": int(df.isna().sum().sum()),
        "duplicate_rows": int(df.duplicated().sum()),
        "class_counts": counts_by_class,
        "fraud_ratio": fraud_ratio,
    }
46
+
47
+
48
def validate_data(df: pd.DataFrame, expected_rows: int = EXPECTED_ROW_COUNT) -> dict[str, Any]:
    """Check schema and data quality; return a structured validation report.

    Schema problems (missing columns, invalid/absent ``Class``) are hard
    errors; row-count drift, missing values, unexpected columns, and a
    single-class target are soft warnings.
    """
    errors: list[str] = []
    warnings: list[str] = []

    present = list(df.columns)
    missing_columns = [column for column in EXPECTED_COLUMNS if column not in present]
    unexpected_columns = [column for column in present if column not in EXPECTED_COLUMNS]

    if missing_columns:
        errors.append(f"Missing required columns: {missing_columns}")
    if unexpected_columns:
        warnings.append(f"Unexpected columns present: {unexpected_columns}")

    stats = get_data_statistics(df)

    row_count = stats["row_count"]
    if expected_rows and row_count != expected_rows:
        warnings.append(f"Row count differs from expected {expected_rows}: got {row_count}")

    if stats["missing_values_total"] > 0:
        warnings.append(f"Dataset contains {stats['missing_values_total']} missing values")

    if "Class" not in df.columns:
        errors.append("Class column not found")
    else:
        observed = set(df["Class"].dropna().unique().tolist())
        invalid_class_values = sorted(observed - EXPECTED_CLASS_VALUES)
        if invalid_class_values:
            errors.append(f"Class contains invalid values: {invalid_class_values}")
        if len(observed) == 1:
            warnings.append("Class column has only one class present")

    return {
        "is_valid": not errors,
        "errors": errors,
        "warnings": warnings,
        "statistics": stats,
    }
84
+
85
+
86
def save_validation_report(report: dict[str, Any], output_path: str | Path) -> Path:
    """Serialize the validation report as pretty-printed JSON; return its path."""
    destination = Path(output_path)
    # Create intermediate directories so a fresh checkout can write artifacts.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return destination
92
+
93
+
94
def run_data_validation(
    file_path: str | Path = "data/raw/creditcard.csv",
    report_path: str | Path = "artifacts/data_validation.json",
) -> dict[str, Any]:
    """Validate the dataset end-to-end: load, check, persist the report.

    The report is written even on failure so the artifact records why the
    run was rejected; schema errors then raise ``ValueError``.
    """
    frame = load_data(file_path)
    report = validate_data(frame)
    save_validation_report(report, report_path)
    if report["is_valid"]:
        return report
    raise ValueError(f"Data validation failed: {report['errors']}")
105
+
106
+
107
+ def _build_parser() -> argparse.ArgumentParser:
108
+ parser = argparse.ArgumentParser(description="Validate fraud dataset schema and quality.")
109
+ parser.add_argument(
110
+ "--data-path",
111
+ default="data/raw/creditcard.csv",
112
+ help="Path to the raw CSV dataset.",
113
+ )
114
+ parser.add_argument(
115
+ "--report-path",
116
+ default="artifacts/data_validation.json",
117
+ help="Path to write the validation report JSON.",
118
+ )
119
+ return parser
120
+
121
+
122
def main() -> None:
    """CLI entrypoint: run validation and print summary statistics."""
    cli_args = _build_parser().parse_args()
    validation_report = run_data_validation(cli_args.data_path, cli_args.report_path)
    print("Data validation passed.")
    print(json.dumps(validation_report["statistics"], indent=2))
127
+
128
+
129
if __name__ == "__main__":  # pragma: no cover - script entry point
    main()
src/evaluate.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model evaluation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ from sklearn.metrics import (
9
+ average_precision_score,
10
+ confusion_matrix,
11
+ f1_score,
12
+ precision_score,
13
+ recall_score,
14
+ roc_auc_score,
15
+ )
16
+
17
+
18
def _safe_roc_auc(y_true, y_pred_proba) -> float:
    """ROC-AUC that degrades to NaN when the score is undefined (e.g. one class)."""
    try:
        score = roc_auc_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
23
+
24
+
25
def _safe_pr_auc(y_true, y_pred_proba) -> float:
    """PR-AUC (average precision) that degrades to NaN when undefined."""
    try:
        score = average_precision_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
30
+
31
+
32
def calculate_metrics(y_true, y_pred, y_pred_proba) -> dict[str, Any]:
    """Compute the classification metrics used for model comparison.

    ``zero_division=0`` keeps precision/recall/F1 finite when a class is
    never predicted; AUC metrics fall back to NaN via the safe wrappers.
    """
    metrics: dict[str, Any] = {
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "roc_auc": _safe_roc_auc(y_true, y_pred_proba),
        "pr_auc": _safe_pr_auc(y_true, y_pred_proba),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
    return metrics
43
+
44
+
45
def rank_models(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Order candidate results best-first by recall, then precision, then ROC-AUC."""

    def sort_key(result: dict[str, Any]) -> tuple[float, float, float]:
        metrics = result["metrics"]
        return (metrics["recall"], metrics["precision"], metrics["roc_auc"])

    return sorted(results, key=sort_key, reverse=True)
52
+
53
+
54
def calculate_metrics_at_threshold(
    y_true,
    y_pred_proba,
    *,
    threshold: float,
) -> dict[str, Any]:
    """Binarize probabilities at *threshold* and compute the standard metric set.

    Predictions use ``proba >= threshold``, so the threshold itself counts
    as positive; the threshold used is recorded in the returned dict.
    """
    hard_predictions = (np.asarray(y_pred_proba) >= threshold).astype(int)
    result = calculate_metrics(y_true, hard_predictions, y_pred_proba)
    result["threshold"] = float(threshold)
    return result
65
+
66
+
67
def evaluate_thresholds(
    y_true,
    y_pred_proba,
    *,
    thresholds: list[float] | None = None,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> list[dict[str, Any]]:
    """Sweep a threshold grid and return the metric set at each point.

    An explicit *thresholds* list overrides the evenly-spaced grid built
    from ``min_threshold``/``max_threshold``/``grid_size``.
    """
    if thresholds is None:
        grid = np.linspace(min_threshold, max_threshold, grid_size).tolist()
    else:
        grid = thresholds
    return [calculate_metrics_at_threshold(y_true, y_pred_proba, threshold=t) for t in grid]
83
+
84
+
85
def select_best_threshold(
    y_true,
    y_pred_proba,
    *,
    min_recall: float = 0.90,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> dict[str, Any]:
    """Pick the threshold maximizing precision among those meeting *min_recall*.

    When no grid point reaches the recall target, the search falls back to
    the full grid (and the report says so via ``selection_reason``). Ties
    break on F1, then recall; among equal keys the lowest threshold wins
    because the grid is evaluated in ascending order.
    """
    evaluations = evaluate_thresholds(
        y_true,
        y_pred_proba,
        min_threshold=min_threshold,
        max_threshold=max_threshold,
        grid_size=grid_size,
    )

    meeting_target = [entry for entry in evaluations if entry["recall"] >= min_recall]
    if meeting_target:
        candidates, reason = meeting_target, "meets_min_recall"
    else:
        candidates, reason = evaluations, "fallback_max_recall"

    # max() returns the first maximal element, matching the original
    # stable sorted(..., reverse=True)[0] behavior.
    best = max(candidates, key=lambda entry: (entry["precision"], entry["f1"], entry["recall"]))

    return {
        "selection_reason": reason,
        "min_recall_target": float(min_recall),
        "selected_threshold": float(best["threshold"]),
        "selected_metrics": best,
        "threshold_grid_size": int(grid_size),
        "thresholds_evaluated": evaluations,
    }
src/predict.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Batch/single prediction helper functions."""
src/preprocessing.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Training/inference preprocessing pipeline utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import joblib
9
+ import numpy as np
10
+ import pandas as pd
11
+ from imblearn.over_sampling import SMOTE
12
+ from sklearn.compose import ColumnTransformer
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.utils.class_weight import compute_class_weight
16
+
17
+ TARGET_COLUMN = "Class"
18
+ SCALE_COLUMNS = ["Time", "Amount"]
19
+
20
+
21
def split_data(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Stratified train/test split of *df* on the target column.

    Returns ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if the target column is absent or *test_size* is not
            strictly between 0 and 1.
    """
    if target_column not in df.columns:
        raise ValueError(f"Missing target column: {target_column}")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be between 0 and 1")

    features = df.drop(columns=[target_column])
    labels = df[target_column]
    # Stratify so the rare fraud class appears in both splits.
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
44
+
45
+
46
def scale_features(
    df: pd.DataFrame,
    *,
    columns: list[str] | None = None,
    scaler: StandardScaler | None = None,
) -> tuple[pd.DataFrame, StandardScaler]:
    """Scale selected columns; return the transformed copy and the scaler.

    When *scaler* is ``None`` a new ``StandardScaler`` is fitted on *df*.
    When a scaler is supplied it is assumed to be already fitted and is
    only applied with ``transform`` — the previous implementation refit a
    supplied scaler via ``fit_transform``, which silently leaked the
    statistics of the data being transformed (e.g. at inference time).

    Raises:
        ValueError: if any requested column is missing from *df*.
    """
    scale_columns = columns or SCALE_COLUMNS
    missing = [column for column in scale_columns if column not in df.columns]
    if missing:
        raise ValueError(f"Columns not found for scaling: {missing}")

    result = df.copy()
    if scaler is None:
        scaler = StandardScaler()
        result[scale_columns] = scaler.fit_transform(df[scale_columns])
    else:
        # Reuse the caller's fitted statistics; do NOT refit.
        result[scale_columns] = scaler.transform(df[scale_columns])
    return result, scaler
62
+
63
+
64
def build_preprocessor(
    feature_columns: list[str],
    *,
    scale_columns: list[str] | None = None,
) -> ColumnTransformer:
    """Create the column transformer applied identically at train and serve time.

    Scales the chosen columns with ``StandardScaler`` and passes the rest
    through untouched; output is configured as a pandas DataFrame.

    Raises:
        ValueError: if any scale column is not among *feature_columns*.
    """
    to_scale = scale_columns if scale_columns else SCALE_COLUMNS
    absent = [column for column in to_scale if column not in feature_columns]
    if absent:
        raise ValueError(f"Scale columns missing from features: {absent}")

    transformer = ColumnTransformer(
        transformers=[("scale", StandardScaler(), to_scale)],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    transformer.set_output(transform="pandas")
    return transformer
82
+
83
+
84
def transform_features(
    preprocessor: ColumnTransformer,
    X: pd.DataFrame,
) -> pd.DataFrame:
    """Apply a fitted preprocessor, always returning a pandas DataFrame.

    If the transformer emits a raw array (set_output not in effect), the
    result is wrapped using the transformer's reported feature names.
    """
    output = preprocessor.transform(X)
    if isinstance(output, pd.DataFrame):
        return output
    return pd.DataFrame(output, columns=preprocessor.get_feature_names_out())
93
+
94
+
95
def handle_imbalance(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    method: str = "class_weight",
    random_state: int = 42,
    sampling_strategy: float = 0.5,
) -> tuple[pd.DataFrame, pd.Series, dict[str, Any]]:
    """Apply the configured imbalance strategy.

    Strategies: ``none`` (passthrough), ``class_weight`` (passthrough plus
    balanced per-class weights in the metadata), ``smote`` (synthetic
    oversampling of the minority class).

    Raises:
        ValueError: for an unrecognized *method*.
    """
    strategy = method.lower()
    if strategy == "none":
        return X_train, y_train, {"method": "none", "class_weight": None}
    if strategy == "class_weight":
        classes = np.array(sorted(y_train.unique().tolist()))
        weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
        mapping = {int(label): float(weight) for label, weight in zip(classes, weights)}
        return X_train, y_train, {"method": "class_weight", "class_weight": mapping}
    if strategy == "smote":
        sampler = SMOTE(random_state=random_state, sampling_strategy=sampling_strategy)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        return (
            pd.DataFrame(X_resampled, columns=X_train.columns),
            pd.Series(y_resampled, name=y_train.name),
            {"method": "smote", "class_weight": None},
        )
    raise ValueError("method must be one of: none, class_weight, smote")
122
+
123
+
124
def save_preprocessor(preprocessor: ColumnTransformer, output_path: str | Path) -> Path:
    """Serialize the fitted preprocessor with joblib; return its path."""
    destination = Path(output_path)
    # Ensure the models/ directory exists on a fresh checkout.
    destination.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(preprocessor, destination)
    return destination
130
+
131
+
132
def load_preprocessor(preprocessor_path: str | Path) -> ColumnTransformer:
    """Deserialize a previously persisted preprocessor from disk."""
    artifact_path = Path(preprocessor_path)
    return joblib.load(artifact_path)
135
+
136
+
137
def preprocess_for_training(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
    imbalance_method: str = "class_weight",
    preprocessor_path: str | Path = "models/preprocessor.pkl",
) -> dict[str, Any]:
    """Run the full preprocessing pipeline for a training run.

    Splits the data, fits the preprocessor on the training partition only
    (avoiding test-set leakage), transforms both partitions, applies the
    imbalance strategy to the training set, and persists the fitted
    preprocessor for serving.
    """
    X_train_raw, X_test_raw, y_train, y_test = split_data(
        df,
        target_column=target_column,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit on training data only; the same transformer is reused at inference.
    preprocessor = build_preprocessor(feature_columns=X_train_raw.columns.tolist())
    preprocessor.fit(X_train_raw)

    X_train = transform_features(preprocessor, X_train_raw)
    X_test = transform_features(preprocessor, X_test_raw)

    X_balanced, y_balanced, imbalance_metadata = handle_imbalance(
        X_train,
        y_train,
        method=imbalance_method,
        random_state=random_state,
    )

    save_preprocessor(preprocessor, preprocessor_path)

    return {
        "X_train": X_balanced,
        "X_test": X_test,
        "y_train": y_balanced,
        "y_test": y_test,
        "preprocessor": preprocessor,
        "imbalance_metadata": imbalance_metadata,
    }
src/register_model.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Model registration helpers (local/MLflow registry)."""
src/train.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Training entrypoint for fraud detection models with MLflow tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import joblib
12
+ import mlflow
13
+ import pandas as pd
14
+ import yaml
15
+ from sklearn.linear_model import LogisticRegression
16
+
17
+ from src.data_ingestion import load_data, run_data_validation
18
+ from src.evaluate import calculate_metrics, rank_models, select_best_threshold
19
+ from src.preprocessing import preprocess_for_training
20
+
21
+ try:
22
+ from xgboost import XGBClassifier
23
+ except Exception: # pragma: no cover - handled at runtime
24
+ XGBClassifier = None
25
+
26
+
27
+ DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
28
+ DEFAULT_DATA_PATH = Path("data/raw/creditcard.csv")
29
+ DEFAULT_MODEL_PATH = Path("models/model.pkl")
30
+ DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
31
+ DEFAULT_REPORT_PATH = Path("artifacts/model_training_report.json")
32
+ DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
33
+ DEFAULT_VALIDATION_REPORT_PATH = Path("artifacts/data_validation.json")
34
+
35
+
36
def load_training_config(config_path: str | Path = DEFAULT_CONFIG_PATH) -> dict[str, Any]:
    """Parse the YAML training config, guaranteeing the top-level sections exist."""
    raw_text = Path(config_path).read_text(encoding="utf-8")
    # An empty file parses to None; normalize to an empty mapping.
    config = yaml.safe_load(raw_text) or {}
    for section in ("experiment", "training", "mlflow"):
        config.setdefault(section, {})
    return config
43
+
44
+
45
def create_model(model_name: str, random_state: int) -> Any:
    """Instantiate the estimator matching *model_name*.

    Raises:
        RuntimeError: when xgboost is requested but not installed.
        ValueError: when the name is not recognised.
    """
    if model_name == "logistic_regression":
        return LogisticRegression(
            max_iter=500,
            solver="lbfgs",
            class_weight="balanced",
            random_state=random_state,
        )

    if model_name != "xgboost":
        raise ValueError(f"Unsupported model: {model_name}")

    # xgboost is an optional dependency; fail loudly if it is absent.
    if XGBClassifier is None:
        raise RuntimeError("xgboost is not available in the environment")
    xgb_params = dict(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=random_state,
        n_jobs=2,
    )
    return XGBClassifier(**xgb_params)
70
+
71
+
72
def train_single_model(
    model_name: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    random_state: int,
) -> tuple[Any, dict[str, Any]]:
    """Fit the named model on the train split and score it on the test split."""
    estimator = create_model(model_name, random_state=random_state)
    estimator.fit(X_train, y_train)

    # Hard labels plus positive-class probabilities feed the metric suite.
    predictions = estimator.predict(X_test)
    fraud_probabilities = estimator.predict_proba(X_test)[:, 1]
    return estimator, calculate_metrics(y_test, predictions, fraud_probabilities)
89
+
90
+
91
def log_run_to_mlflow(
    *,
    experiment_name: str,
    model_name: str,
    params: dict[str, Any],
    metrics: dict[str, Any],
    preprocessor_path: Path,
    model_temp_path: Path,
    artifact_dir: Path,
) -> str:
    """Log one training run (params, metrics, artifacts) to MLflow.

    Returns:
        The MLflow run id of the created run.
    """
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params)
        # Log every real-numbered metric. The previous float-only filter
        # silently dropped integer-valued metrics (e.g. confusion-matrix
        # counts). Exclude bools, which subclass int but are not metrics.
        metric_values = {
            k: v
            for k, v in metrics.items()
            if isinstance(v, (int, float)) and not isinstance(v, bool)
        }
        mlflow.log_metrics(metric_values)

        # Structured artifacts for debugging and reproducibility: the full
        # metrics dict (including non-numeric entries) goes to JSON.
        metrics_path = artifact_dir / f"metrics_{model_name}.json"
        metrics_path.parent.mkdir(parents=True, exist_ok=True)
        metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

        mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")
        mlflow.log_artifact(str(model_temp_path), artifact_path="model")
        mlflow.log_artifact(str(metrics_path), artifact_path="metrics")

        return run.info.run_id
118
+
119
+
120
def save_model(model: Any, output_path: str | Path = DEFAULT_MODEL_PATH) -> Path:
    """Write the trained model to *output_path*, creating parent dirs as needed."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, target)
    return target
126
+
127
+
128
def run_training_pipeline(
    *,
    config_path: str | Path = DEFAULT_CONFIG_PATH,
    data_path: str | Path = DEFAULT_DATA_PATH,
    model_path: str | Path = DEFAULT_MODEL_PATH,
    preprocessor_path: str | Path = DEFAULT_PREPROCESSOR_PATH,
    report_path: str | Path = DEFAULT_REPORT_PATH,
    model_report_path: str | Path = DEFAULT_MODEL_REPORT_PATH,
    validation_report_path: str | Path = DEFAULT_VALIDATION_REPORT_PATH,
) -> dict[str, Any]:
    """Execute end-to-end training and experiment tracking pipeline.

    Phases: load config -> validate data -> preprocess -> train each
    configured model (logging every run to MLflow) -> rank models ->
    tune the decision threshold on the best model -> persist the model,
    a model-evaluation report, and a full training report.

    Args:
        config_path: YAML training configuration file.
        data_path: Raw dataset CSV.
        model_path: Destination for the best model's pickle.
        preprocessor_path: Destination for the fitted preprocessor pickle.
        report_path: Destination for the full training-report JSON.
        model_report_path: Destination for the model-evaluation JSON.
        validation_report_path: Destination for the data-validation JSON.

    Returns:
        The training report dict (also written to ``report_path``).

    Raises:
        RuntimeError: if no model in the configured list trains successfully.
    """
    config = load_training_config(config_path)

    # --- Experiment-tracking setup -------------------------------------
    experiment_name = config["experiment"].get("name", "fraud-detection-baseline")
    tracking_uri = config["mlflow"].get("tracking_uri", "file:./mlruns")
    mlflow.set_tracking_uri(tracking_uri)

    # --- Pull training hyperparameters out of the config, with defaults -
    training_cfg = config["training"]
    random_state = int(training_cfg.get("random_state", 42))
    test_size = float(training_cfg.get("test_size", 0.2))
    imbalance_method = str(training_cfg.get("imbalance_method", "class_weight"))
    # A "models" list wins; otherwise fall back to the single "model" key.
    models = training_cfg.get("models") or [training_cfg.get("model", "logistic_regression")]
    threshold_cfg = config.get("threshold", {})
    min_recall_target = float(threshold_cfg.get("min_recall_target", 0.90))
    threshold_grid_size = int(threshold_cfg.get("grid_size", 99))
    threshold_min = float(threshold_cfg.get("min_threshold", 0.01))
    threshold_max = float(threshold_cfg.get("max_threshold", 0.99))

    # --- Validate and preprocess the data (validation raises on failure) -
    run_data_validation(file_path=data_path, report_path=validation_report_path)
    raw_df = load_data(data_path)
    prep = preprocess_for_training(
        raw_df,
        test_size=test_size,
        random_state=random_state,
        imbalance_method=imbalance_method,
        preprocessor_path=preprocessor_path,
    )

    # --- Train every configured model, logging each run to MLflow -------
    results: list[dict[str, Any]] = []
    skipped_models: list[dict[str, str]] = []
    artifact_dir = Path(report_path).parent
    artifact_dir.mkdir(parents=True, exist_ok=True)
    preprocessor_path_obj = Path(preprocessor_path)
    for model_name in models:
        try:
            model, metrics = train_single_model(
                model_name=model_name,
                X_train=prep["X_train"],
                y_train=prep["y_train"],
                X_test=prep["X_test"],
                y_test=prep["y_test"],
                random_state=random_state,
            )
        except RuntimeError as exc:
            # e.g. xgboost not installed — record and keep going.
            skipped_models.append({"model_name": model_name, "reason": str(exc)})
            continue

        # Persist a per-model pickle so the MLflow run can attach it.
        temp_model_path = Path(model_path).parent / f"{model_name}.pkl"
        save_model(model, temp_model_path)

        run_id = log_run_to_mlflow(
            experiment_name=experiment_name,
            model_name=model_name,
            params={
                "model_name": model_name,
                "test_size": test_size,
                "random_state": random_state,
                "imbalance_method": imbalance_method,
            },
            metrics=metrics,
            preprocessor_path=preprocessor_path_obj,
            model_temp_path=temp_model_path,
            artifact_dir=artifact_dir,
        )

        results.append({"model_name": model_name, "model": model, "metrics": metrics, "run_id": run_id})

    if not results:
        raise RuntimeError("No models were successfully trained.")

    # --- Pick the best model and tune its decision threshold ------------
    ranked = rank_models(results)
    best = ranked[0]
    y_test_proba_best = best["model"].predict_proba(prep["X_test"])[:, 1]
    threshold_selection = select_best_threshold(
        prep["y_test"],
        y_test_proba_best,
        min_recall=min_recall_target,
        min_threshold=threshold_min,
        max_threshold=threshold_max,
        grid_size=threshold_grid_size,
    )

    # --- Write the model-evaluation report -------------------------------
    model_report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "best_model_name": best["model_name"],
        "default_threshold_metrics": best["metrics"],
        "threshold_selection": threshold_selection,
        "evaluation_summary": {
            "test_rows": int(len(prep["y_test"])),
            "min_recall_target": min_recall_target,
            "selection_reason": threshold_selection["selection_reason"],
        },
    }
    model_report_path_obj = Path(model_report_path)
    model_report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    model_report_path_obj.write_text(json.dumps(model_report, indent=2), encoding="utf-8")

    # The best model becomes the canonical serving artifact.
    final_model_path = save_model(best["model"], model_path)

    # --- Write the full training report ---------------------------------
    report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "experiment_name": experiment_name,
        "tracking_uri": tracking_uri,
        "data_path": str(data_path),
        "preprocessor_path": str(preprocessor_path),
        "model_path": str(final_model_path),
        "model_report_path": str(model_report_path_obj),
        "best_model": {
            "model_name": best["model_name"],
            "run_id": best["run_id"],
            "metrics": best["metrics"],
            "selected_threshold": threshold_selection["selected_threshold"],
            "threshold_metrics": threshold_selection["selected_metrics"],
        },
        "all_results": [
            {"model_name": entry["model_name"], "run_id": entry["run_id"], "metrics": entry["metrics"]}
            for entry in ranked
        ],
        "skipped_models": skipped_models,
    }

    report_path_obj = Path(report_path)
    report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    report_path_obj.write_text(json.dumps(report, indent=2), encoding="utf-8")

    return report
264
+
265
+
266
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the training entrypoint."""
    parser = argparse.ArgumentParser(description="Train fraud model and log to MLflow.")
    # (flag, default path constant, help text) — all flags are plain strings.
    flag_specs = [
        ("--config", DEFAULT_CONFIG_PATH, "Training config YAML path."),
        ("--data-path", DEFAULT_DATA_PATH, "Dataset CSV path."),
        ("--model-path", DEFAULT_MODEL_PATH, "Output model artifact path."),
        ("--preprocessor-path", DEFAULT_PREPROCESSOR_PATH, "Output preprocessor artifact path."),
        ("--report-path", DEFAULT_REPORT_PATH, "Training report JSON path."),
        ("--model-report-path", DEFAULT_MODEL_REPORT_PATH, "Model evaluation report JSON path."),
    ]
    for flag, default_path, help_text in flag_specs:
        parser.add_argument(flag, default=str(default_path), help=help_text)
    return parser
283
+
284
+
285
def main() -> None:
    """CLI entrypoint: run the pipeline and print a short summary."""
    args = _build_parser().parse_args()
    report = run_training_pipeline(
        config_path=args.config,
        data_path=args.data_path,
        model_path=args.model_path,
        preprocessor_path=args.preprocessor_path,
        report_path=args.report_path,
        model_report_path=args.model_report_path,
    )

    best = report["best_model"]
    summary_lines = [
        "Training completed.",
        f"Best model: {best['model_name']}",
        f"Selected threshold: {best['selected_threshold']:.4f}",
        json.dumps(best["threshold_metrics"], indent=2),
    ]
    for line in summary_lines:
        print(line)
301
+
302
+
303
+ if __name__ == "__main__":
304
+ main()
tests/conftest.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Ensure repository root is importable in pytest (for `src.*` imports).
7
+ ROOT = Path(__file__).resolve().parents[1]
8
+ if str(ROOT) not in sys.path:
9
+ sys.path.insert(0, str(ROOT))
tests/test_api.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from fastapi import HTTPException
6
+ from fastapi.testclient import TestClient
7
+
8
+ from api.app import app, get_inference_service
9
+
10
+
11
class DummyService:
    """Stand-in inference service: flags any transaction over 200 as fraud."""

    threshold = 0.74
    model_path = Path("models/model.pkl")
    preprocessor_path = Path("models/preprocessor.pkl")

    def predict_records(self, records):
        results = []
        for record in records:
            probability = 0.9 if float(record["Amount"]) > 200 else 0.1
            results.append(
                {
                    "is_fraud": probability >= self.threshold,
                    "fraud_probability": probability,
                    "risk_level": "high" if probability >= 0.7 else "low",
                    "threshold": self.threshold,
                }
            )
        return results
30
+
31
+
32
+ def _transaction(amount: float = 10.0) -> dict[str, float]:
33
+ payload = {"Time": 0.0, "Amount": amount}
34
+ for i in range(1, 29):
35
+ payload[f"V{i}"] = 0.0
36
+ return payload
37
+
38
+
39
def test_health_endpoint() -> None:
    """/health reports ok and model_loaded once the service dependency resolves."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.get("/health")

        assert response.status_code == 200
        body = response.json()
        assert body["status"] == "ok"
        assert body["model_loaded"] is True
    finally:
        # Always drop the override so a failing assertion can't leak it
        # into other tests in the session.
        app.dependency_overrides.clear()
50
+
51
+
52
def test_predict_endpoint_valid_payload() -> None:
    """A high-amount transaction is classified fraud and gets a request id header."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.post("/predict", json=_transaction(amount=350.0))

        assert response.status_code == 200
        body = response.json()
        assert body["is_fraud"] is True
        assert body["risk_level"] == "high"
        assert response.headers.get("X-Request-ID")
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
64
+
65
+
66
def test_predict_endpoint_invalid_payload() -> None:
    """Omitting a required feature must be rejected with a 422 validation error."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        payload = _transaction()
        payload.pop("V28")
        response = client.post("/predict", json=payload)

        assert response.status_code == 422
    finally:
        # finally guarantees cleanup even when the assertion fails.
        app.dependency_overrides.clear()
76
+
77
+
78
def test_batch_prediction_endpoint() -> None:
    """Batch endpoint returns one prediction per transaction, in order."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        response = client.post(
            "/predict/batch",
            json={"transactions": [_transaction(20.0), _transaction(300.0)]},
        )

        assert response.status_code == 200
        body = response.json()
        assert len(body["predictions"]) == 2
        assert body["predictions"][0]["is_fraud"] is False
        assert body["predictions"][1]["is_fraud"] is True
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
93
+
94
+
95
def test_metrics_endpoint_tracks_predictions_and_requests() -> None:
    """/metrics counters advance with requests/predictions and rates stay in [0, 1]."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    try:
        client = TestClient(app)

        before = client.get("/metrics")
        assert before.status_code == 200
        before_body = before.json()

        predict_response = client.post("/predict", json=_transaction(amount=350.0))
        assert predict_response.status_code == 200

        after = client.get("/metrics")
        assert after.status_code == 200
        after_body = after.json()

        # >= (not ==) because middleware may count the /metrics calls too.
        assert after_body["total_requests"] >= before_body["total_requests"] + 2
        assert after_body["total_predictions"] >= before_body["total_predictions"] + 1
        assert 0.0 <= after_body["error_rate"] <= 1.0
        assert 0.0 <= after_body["fraud_prediction_rate"] <= 1.0
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
115
+
116
+
117
def test_health_returns_503_when_service_unavailable() -> None:
    """When the dependency raises, /health propagates the 503 and its detail."""

    def _raise():
        raise HTTPException(status_code=503, detail="Model artifact not found")

    app.dependency_overrides[get_inference_service] = _raise
    try:
        client = TestClient(app)

        response = client.get("/health")

        assert response.status_code == 503
        assert "Model artifact not found" in response.json()["detail"]
    finally:
        # finally guarantees cleanup even when an assertion above fails.
        app.dependency_overrides.clear()
tests/test_data_ingestion.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ import pandas as pd
6
+ import pytest
7
+
8
+ from src.data_ingestion import (
9
+ EXPECTED_COLUMNS,
10
+ load_data,
11
+ run_data_validation,
12
+ validate_data,
13
+ )
14
+
15
+
16
def _valid_df() -> pd.DataFrame:
    """Single all-zero row covering every expected column, labelled non-fraud."""
    record = dict.fromkeys(EXPECTED_COLUMNS, 0.0)
    record["Class"] = 0
    return pd.DataFrame([record])
20
+
21
+
22
def test_load_data_reads_csv(tmp_path) -> None:
    """load_data round-trips a CSV written with the expected schema."""
    source = _valid_df()
    csv_path = tmp_path / "creditcard.csv"
    source.to_csv(csv_path, index=False)

    result = load_data(csv_path)

    assert list(result.columns) == EXPECTED_COLUMNS
    assert result.shape == (1, len(EXPECTED_COLUMNS))
31
+
32
+
33
def test_validate_data_invalid_when_required_column_missing() -> None:
    """Dropping a required column must invalidate the report."""
    frame = _valid_df().drop(columns=["Amount"])

    report = validate_data(frame)

    assert report["is_valid"] is False
    assert any("Missing required columns" in message for message in report["errors"])
40
+
41
+
42
def test_validate_data_invalid_when_class_has_invalid_values() -> None:
    """A Class value outside {0, 1} must invalidate the report."""
    frame = _valid_df()
    frame.loc[0, "Class"] = 3

    report = validate_data(frame)

    assert report["is_valid"] is False
    assert any("Class contains invalid values" in message for message in report["errors"])
50
+
51
+
52
def test_run_data_validation_writes_report_and_fails_fast(tmp_path) -> None:
    """Invalid data raises, but the JSON report is still written first."""
    broken = _valid_df().drop(columns=["Class"])
    csv_path = tmp_path / "creditcard.csv"
    json_path = tmp_path / "data_validation.json"
    broken.to_csv(csv_path, index=False)

    with pytest.raises(ValueError):
        run_data_validation(csv_path, json_path)

    assert json_path.exists()
    written = json.loads(json_path.read_text(encoding="utf-8"))
    assert written["is_valid"] is False
64
+
65
+
66
def test_run_data_validation_passes_for_valid_schema(tmp_path) -> None:
    """A schema-conforming frame validates cleanly and persists its report."""
    good = _valid_df()
    csv_path = tmp_path / "creditcard.csv"
    json_path = tmp_path / "data_validation.json"
    good.to_csv(csv_path, index=False)

    outcome = run_data_validation(csv_path, json_path)

    assert outcome["is_valid"] is True
    assert json_path.exists()
tests/test_evaluate.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+ from src.evaluate import (
6
+ calculate_metrics_at_threshold,
7
+ evaluate_thresholds,
8
+ select_best_threshold,
9
+ )
10
+
11
+
12
def test_calculate_metrics_at_threshold_contains_threshold() -> None:
    """The metrics dict echoes the threshold, with recall/precision in [0, 1]."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    result = calculate_metrics_at_threshold(labels, scores, threshold=0.5)

    assert result["threshold"] == 0.5
    assert 0.0 <= result["recall"] <= 1.0
    assert 0.0 <= result["precision"] <= 1.0
21
+
22
+
23
def test_evaluate_thresholds_returns_expected_grid_size() -> None:
    """The grid is evaluated at exactly grid_size points, starting at the minimum."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    rows = evaluate_thresholds(labels, scores, min_threshold=0.1, max_threshold=0.9, grid_size=9)

    assert len(rows) == 9
    assert rows[0]["threshold"] == 0.1
31
+
32
+
33
def test_select_best_threshold_prefers_precision_under_recall_constraint() -> None:
    """Selection honours the recall floor and stays inside the search interval."""
    labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    scores = np.array([0.02, 0.15, 0.20, 0.30, 0.55, 0.65, 0.80, 0.95])

    choice = select_best_threshold(
        labels,
        scores,
        min_recall=0.75,
        min_threshold=0.1,
        max_threshold=0.9,
        grid_size=17,
    )

    assert choice["selected_metrics"]["recall"] >= 0.75
    assert 0.1 <= choice["selected_threshold"] <= 0.9
    assert choice["selection_reason"] in {"meets_min_recall", "fallback_max_recall"}
tests/test_preprocessing.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import pytest
6
+
7
+ from src.preprocessing import (
8
+ SCALE_COLUMNS,
9
+ build_preprocessor,
10
+ handle_imbalance,
11
+ load_preprocessor,
12
+ preprocess_for_training,
13
+ save_preprocessor,
14
+ scale_features,
15
+ split_data,
16
+ transform_features,
17
+ )
18
+
19
+
20
@pytest.fixture
def sample_df() -> pd.DataFrame:
    """200-row synthetic dataset with 20 fraud rows, matching the raw schema."""
    rng = np.random.default_rng(42)
    n_rows, n_fraud = 200, 20

    # RNG draws happen in the same order as before: Time, Amount, then V1..V28.
    columns: dict[str, np.ndarray] = {
        "Time": rng.normal(loc=5000, scale=1000, size=n_rows),
        "Amount": rng.normal(loc=120, scale=50, size=n_rows),
    }
    for idx in range(1, 29):
        columns[f"V{idx}"] = rng.normal(size=n_rows)

    labels = np.array([0] * (n_rows - n_fraud) + [1] * n_fraud)
    rng.shuffle(labels)
    columns["Class"] = labels

    return pd.DataFrame(columns)
38
+
39
+
40
def test_split_data_is_stratified(sample_df: pd.DataFrame) -> None:
    """An 80/20 split keeps the fraud ratio stable in both partitions."""
    X_train, X_test, y_train, y_test = split_data(sample_df, test_size=0.2, random_state=42)

    overall_ratio = sample_df["Class"].mean()

    assert X_train.shape[0] == 160
    assert X_test.shape[0] == 40
    assert abs(y_train.mean() - overall_ratio) < 0.02
    assert abs(y_test.mean() - overall_ratio) < 0.02
51
+
52
+
53
def test_scale_features_transforms_only_selected_columns(sample_df: pd.DataFrame) -> None:
    """Scaling centres the configured columns and leaves the rest untouched."""
    raw_features = sample_df.drop(columns=["Class"])

    scaled, fitted_scaler = scale_features(raw_features)

    assert fitted_scaler is not None
    for name in SCALE_COLUMNS:
        # Standard-scaled columns should be (numerically) zero-mean.
        assert abs(float(scaled[name].mean())) < 1e-6
    assert np.allclose(raw_features["V1"].values, scaled["V1"].values)
62
+
63
+
64
def test_handle_imbalance_smote_increases_minority_class(sample_df: pd.DataFrame) -> None:
    """SMOTE oversampling must grow the minority class without row mismatch."""
    X_train, _, y_train, _ = split_data(sample_df, test_size=0.2, random_state=42)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)
    X_transformed = transform_features(fitted, X_train)

    original_counts = y_train.value_counts().to_dict()
    X_resampled, y_resampled, info = handle_imbalance(
        X_transformed, y_train, method="smote", sampling_strategy=0.8
    )
    resampled_counts = y_resampled.value_counts().to_dict()

    assert info["method"] == "smote"
    assert resampled_counts[1] > original_counts[1]
    assert X_resampled.shape[0] == y_resampled.shape[0]
79
+
80
+
81
def test_preprocessor_save_load_roundtrip(sample_df: pd.DataFrame, tmp_path) -> None:
    """A persisted preprocessor reloads and transforms with unchanged columns."""
    X_train, _, _, _ = split_data(sample_df, test_size=0.2, random_state=42)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)

    artifact_path = tmp_path / "preprocessor.pkl"
    save_preprocessor(fitted, artifact_path)
    reloaded = load_preprocessor(artifact_path)

    sample = X_train.head(5)
    output = transform_features(reloaded, sample)
    assert list(output.columns) == X_train.columns.tolist()
    assert output.shape == (5, X_train.shape[1])
93
+
94
+
95
def test_preprocess_for_training_creates_artifact(sample_df: pd.DataFrame, tmp_path) -> None:
    """End-to-end preprocessing writes the artifact and returns 30 features."""
    artifact_path = tmp_path / "preprocessor.pkl"

    outcome = preprocess_for_training(
        sample_df,
        test_size=0.2,
        random_state=42,
        imbalance_method="class_weight",
        preprocessor_path=artifact_path,
    )

    assert artifact_path.exists()
    assert outcome["X_train"].shape[1] == 30
    assert outcome["X_test"].shape[1] == 30
    assert outcome["imbalance_metadata"]["method"] == "class_weight"
    assert outcome["imbalance_metadata"]["class_weight"] is not None
111
+
112
+
113
def test_handle_imbalance_rejects_unknown_method(sample_df: pd.DataFrame) -> None:
    """Unknown rebalancing strategies raise a ValueError."""
    X_train, _, y_train, _ = split_data(sample_df)
    fitted = build_preprocessor(X_train.columns.tolist())
    fitted.fit(X_train)
    X_transformed = transform_features(fitted, X_train)

    with pytest.raises(ValueError):
        handle_imbalance(X_transformed, y_train, method="unknown")
tests/test_service.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import joblib
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from api.service import InferenceService, load_inference_service, resolve_threshold
11
+
12
+
13
class DummyPreprocessor:
    """Identity preprocessor advertising the 30 expected feature names."""

    feature_names_in_ = np.array(["Time", *[f"V{i}" for i in range(1, 29)], "Amount"])

    def transform(self, frame: pd.DataFrame) -> pd.DataFrame:
        # Pass-through: the dummy model reads the raw values directly.
        return frame
18
+
19
+
20
class DummyModel:
    """Rule-based stand-in: probability buckets keyed on transaction amount."""

    def predict_proba(self, frame: pd.DataFrame) -> np.ndarray:
        rows = []
        for amount in frame["Amount"].tolist():
            if amount >= 300:
                rows.append([0.1, 0.9])
            elif amount >= 100:
                rows.append([0.55, 0.45])
            else:
                rows.append([0.95, 0.05])
        return np.array(rows)
31
+
32
+
33
+ def _record(amount: float) -> dict[str, float]:
34
+ payload = {"Time": 0.0, "Amount": amount}
35
+ for i in range(1, 29):
36
+ payload[f"V{i}"] = 0.0
37
+ return payload
38
+
39
+
40
def test_inference_service_predict_records_risk_levels() -> None:
    """Risk levels track the dummy model's probability buckets."""
    feature_names = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount"]
    service = InferenceService(
        model=DummyModel(),
        preprocessor=DummyPreprocessor(),
        threshold=0.5,
        model_path=Path("models/model.pkl"),
        preprocessor_path=Path("models/preprocessor.pkl"),
        feature_columns=feature_names,
    )

    outputs = service.predict_records([_record(20), _record(120), _record(320)])

    assert outputs[0]["risk_level"] == "low"
    assert outputs[1]["risk_level"] == "medium"
    assert outputs[2]["risk_level"] == "high"
    assert outputs[2]["is_fraud"] is True
56
+
57
+
58
def test_resolve_threshold_precedence(tmp_path) -> None:
    """The training report wins over the model report and the YAML config."""
    training_report = tmp_path / "model_training_report.json"
    model_report = tmp_path / "model_report.json"
    config_path = tmp_path / "train.yaml"

    # Three candidate sources, each with a different value.
    config_path.write_text("threshold:\n  decision_threshold: 0.51\n", encoding="utf-8")
    model_report.write_text(
        json.dumps({"threshold_selection": {"selected_threshold": 0.63}}), encoding="utf-8"
    )
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.74}}), encoding="utf-8"
    )

    resolved = resolve_threshold(
        training_report_path=training_report,
        model_report_path=model_report,
        config_path=config_path,
    )

    assert resolved == 0.74
78
+
79
+
80
def test_load_inference_service_reads_artifacts_and_threshold(tmp_path) -> None:
    """The service loads pickled artifacts and the training-report threshold."""
    # The loader is cached; clear it so this test sees fresh artifacts.
    load_inference_service.cache_clear()

    model_path = tmp_path / "model.pkl"
    preprocessor_path = tmp_path / "preprocessor.pkl"
    training_report = tmp_path / "model_training_report.json"

    joblib.dump(DummyModel(), model_path)
    joblib.dump(DummyPreprocessor(), preprocessor_path)
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.66}}), encoding="utf-8"
    )

    service = load_inference_service(
        model_path=str(model_path),
        preprocessor_path=str(preprocessor_path),
        training_report_path=str(training_report),
        model_report_path=str(tmp_path / "missing_model_report.json"),
        config_path=str(tmp_path / "missing_config.yaml"),
    )

    assert service.threshold == 0.66
    predictions = service.predict_records([_record(300.0)])
    assert predictions[0]["is_fraud"] is True
tests/test_smoke.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def test_smoke() -> None:
    """Trivial canary proving the test harness itself is wired up."""
    assert True
tests/test_training.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import yaml
8
+
9
+ from src.evaluate import rank_models
10
+ from src.train import run_training_pipeline, train_single_model
11
+
12
+
13
+ def _synthetic_df(rows: int = 160) -> pd.DataFrame:
14
+ rng = np.random.default_rng(7)
15
+ data: dict[str, np.ndarray] = {
16
+ "Time": rng.normal(loc=1000, scale=250, size=rows),
17
+ "Amount": rng.normal(loc=80, scale=20, size=rows),
18
+ }
19
+ for i in range(1, 29):
20
+ data[f"V{i}"] = rng.normal(size=rows)
21
+
22
+ y = np.zeros(rows, dtype=int)
23
+ fraud_indices = rng.choice(rows, size=max(8, rows // 20), replace=False)
24
+ y[fraud_indices] = 1
25
+
26
+ # Inject weak signal for separability.
27
+ data["Amount"][fraud_indices] += 40
28
+ data["V3"][fraud_indices] += 1.5
29
+ data["Class"] = y
30
+ return pd.DataFrame(data)
31
+
32
+
33
def test_rank_models_orders_by_recall_then_precision() -> None:
    """Ranking sorts by recall first and breaks ties on precision."""
    candidates = [
        {"model_name": "a", "metrics": {"recall": 0.8, "precision": 0.9, "roc_auc": 0.9}},
        {"model_name": "b", "metrics": {"recall": 0.9, "precision": 0.7, "roc_auc": 0.95}},
        {"model_name": "c", "metrics": {"recall": 0.9, "precision": 0.8, "roc_auc": 0.85}},
    ]

    ranked = rank_models(candidates)

    # c and b tie on recall (0.9) and are ordered by precision; a trails on recall.
    order = [candidate["model_name"] for candidate in ranked]
    assert order == ["c", "b", "a"]
42
+
43
+
44
def test_train_single_model_returns_expected_metrics() -> None:
    """Training one model yields the full metric set with recall in [0, 1]."""
    frame = _synthetic_df(200)
    features = frame.drop(columns=["Class"])
    labels = frame["Class"]

    # A deterministic head/tail split is enough for a unit test.
    split = 160
    _, metrics = train_single_model(
        model_name="logistic_regression",
        X_train=features.iloc[:split],
        y_train=labels.iloc[:split],
        X_test=features.iloc[split:],
        y_test=labels.iloc[split:],
        random_state=42,
    )

    expected_keys = {"precision", "recall", "f1", "roc_auc", "pr_auc", "confusion_matrix"}
    assert set(metrics.keys()) == expected_keys
    assert 0.0 <= metrics["recall"] <= 1.0
64
+
65
+
66
def test_run_training_pipeline_creates_report_and_model(tmp_path) -> None:
    """End-to-end pipeline run writes every artifact and reports a sane best model."""
    data_path = tmp_path / "creditcard.csv"
    config_path = tmp_path / "train.yaml"
    artifact_paths = {
        "model_path": tmp_path / "best_model.pkl",
        "preprocessor_path": tmp_path / "preprocessor.pkl",
        "report_path": tmp_path / "training_report.json",
        "model_report_path": tmp_path / "model_report.json",
        "validation_report_path": tmp_path / "data_validation.json",
    }

    _synthetic_df(240).to_csv(data_path, index=False)

    # Minimal config: a single fast model and a file-backed MLflow store under tmp_path.
    config = {
        "experiment": {"name": "test-experiment"},
        "training": {
            "test_size": 0.2,
            "random_state": 42,
            "imbalance_method": "class_weight",
            "models": ["logistic_regression"],
        },
        "mlflow": {"tracking_uri": f"file:{tmp_path / 'mlruns'}"},
    }
    config_path.write_text(yaml.safe_dump(config), encoding="utf-8")

    report = run_training_pipeline(
        config_path=config_path,
        data_path=data_path,
        **artifact_paths,
    )

    for artifact in artifact_paths.values():
        assert artifact.exists()
    best = report["best_model"]
    assert best["model_name"] == "logistic_regression"
    assert 0.0 < best["selected_threshold"] < 1.0

    # The on-disk report must round-trip and carry the MLflow run id.
    stored = json.loads(artifact_paths["report_path"].read_text(encoding="utf-8"))
    assert stored["best_model"]["run_id"]