Spaces:
Sleeping
Sleeping
github-actions[bot] commited on
Commit ·
4937cba
0
Parent(s):
deploy: sync snapshot from github
Browse files- .dockerignore +22 -0
- .github/workflows/ci.yml +65 -0
- .github/workflows/deploy-hf-space.yml +60 -0
- .github/workflows/keepalive-hf-space.yml +34 -0
- .gitignore +220 -0
- .python-version +1 -0
- Dockerfile +30 -0
- LICENSE +21 -0
- README.md +274 -0
- api/__init__.py +0 -0
- api/app.py +187 -0
- api/schemas.py +75 -0
- api/service.py +148 -0
- artifacts/data_validation.json +16 -0
- artifacts/metrics_logistic_regression.json +17 -0
- artifacts/metrics_xgboost.json +17 -0
- artifacts/model_report.json +1834 -0
- artifacts/model_training_report.json +94 -0
- configs/logging.yaml +12 -0
- configs/train.yaml +20 -0
- docker-compose.yml +18 -0
- models/logistic_regression.pkl +0 -0
- models/model.pkl +0 -0
- models/preprocessor.pkl +0 -0
- pyproject.toml +22 -0
- pytest.ini +3 -0
- requirements.txt +14 -0
- src/__init__.py +0 -0
- src/data_ingestion.py +130 -0
- src/evaluate.py +120 -0
- src/predict.py +1 -0
- src/preprocessing.py +176 -0
- src/register_model.py +1 -0
- src/train.py +304 -0
- tests/conftest.py +9 -0
- tests/test_api.py +128 -0
- tests/test_data_ingestion.py +75 -0
- tests/test_evaluate.py +48 -0
- tests/test_preprocessing.py +120 -0
- tests/test_service.py +103 -0
- tests/test_smoke.py +2 -0
- tests/test_training.py +109 -0
.dockerignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
__pycache__
|
| 4 |
+
*.pyc
|
| 5 |
+
*.pyo
|
| 6 |
+
*.pyd
|
| 7 |
+
.pytest_cache
|
| 8 |
+
.coverage
|
| 9 |
+
.coverage.*
|
| 10 |
+
htmlcov
|
| 11 |
+
.venv
|
| 12 |
+
uv.lock
|
| 13 |
+
pytest.ini
|
| 14 |
+
venv
|
| 15 |
+
env
|
| 16 |
+
.env
|
| 17 |
+
logs
|
| 18 |
+
mlruns
|
| 19 |
+
notebooks
|
| 20 |
+
data/raw
|
| 21 |
+
tests
|
| 22 |
+
.github
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI-CD
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
pull_request:
|
| 5 |
+
push:
|
| 6 |
+
branches: [main]
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
concurrency:
|
| 10 |
+
group: ci-${{ github.ref }}
|
| 11 |
+
cancel-in-progress: true
|
| 12 |
+
|
| 13 |
+
env:
|
| 14 |
+
PYTHON_VERSION: "3.11"
|
| 15 |
+
IMAGE_NAME: fraud-detection-api
|
| 16 |
+
|
| 17 |
+
jobs:
|
| 18 |
+
test:
|
| 19 |
+
runs-on: ubuntu-latest
|
| 20 |
+
steps:
|
| 21 |
+
- name: Checkout
|
| 22 |
+
uses: actions/checkout@v4
|
| 23 |
+
|
| 24 |
+
- name: Set up Python
|
| 25 |
+
uses: actions/setup-python@v5
|
| 26 |
+
with:
|
| 27 |
+
python-version: ${{ env.PYTHON_VERSION }}
|
| 28 |
+
|
| 29 |
+
- name: Set up uv
|
| 30 |
+
uses: astral-sh/setup-uv@v5
|
| 31 |
+
|
| 32 |
+
- name: Install dependencies
|
| 33 |
+
run: |
|
| 34 |
+
uv pip install --system -r requirements.txt
|
| 35 |
+
|
| 36 |
+
- name: Run tests
|
| 37 |
+
run: python -m pytest
|
| 38 |
+
|
| 39 |
+
build-image:
|
| 40 |
+
runs-on: ubuntu-latest
|
| 41 |
+
needs: test
|
| 42 |
+
steps:
|
| 43 |
+
- name: Checkout
|
| 44 |
+
uses: actions/checkout@v4
|
| 45 |
+
|
| 46 |
+
- name: Build Docker image
|
| 47 |
+
run: docker build -t $IMAGE_NAME:${{ github.sha }} .
|
| 48 |
+
|
| 49 |
+
- name: Smoke check image metadata
|
| 50 |
+
run: docker image inspect $IMAGE_NAME:${{ github.sha }}
|
| 51 |
+
|
| 52 |
+
deploy:
|
| 53 |
+
runs-on: ubuntu-latest
|
| 54 |
+
needs: build-image
|
| 55 |
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
| 56 |
+
steps:
|
| 57 |
+
- name: Trigger deployment webhook (if configured)
|
| 58 |
+
run: |
|
| 59 |
+
if [ -z "$DEPLOY_WEBHOOK_URL" ]; then
|
| 60 |
+
echo "DEPLOY_WEBHOOK_URL secret is not set; skipping deploy trigger."
|
| 61 |
+
exit 0
|
| 62 |
+
fi
|
| 63 |
+
curl -fsS -X POST "$DEPLOY_WEBHOOK_URL"
|
| 64 |
+
env:
|
| 65 |
+
DEPLOY_WEBHOOK_URL: ${{ secrets.DEPLOY_WEBHOOK_URL }}
|
.github/workflows/deploy-hf-space.yml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Deploy to Hugging Face Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
concurrency:
|
| 9 |
+
group: deploy-hf-space-${{ github.ref }}
|
| 10 |
+
cancel-in-progress: true
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
deploy:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
steps:
|
| 16 |
+
- name: Validate required secrets
|
| 17 |
+
env:
|
| 18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
|
| 20 |
+
run: |
|
| 21 |
+
if [ -z "$HF_TOKEN" ] || [ -z "$HF_SPACE_REPO" ]; then
|
| 22 |
+
echo "HF_TOKEN or HF_SPACE_REPO is not set. Configure repository secrets."
|
| 23 |
+
exit 1
|
| 24 |
+
fi
|
| 25 |
+
|
| 26 |
+
- name: Checkout repository
|
| 27 |
+
uses: actions/checkout@v4
|
| 28 |
+
with:
|
| 29 |
+
fetch-depth: 0
|
| 30 |
+
|
| 31 |
+
- name: Configure git
|
| 32 |
+
run: |
|
| 33 |
+
git config user.name "github-actions[bot]"
|
| 34 |
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 35 |
+
|
| 36 |
+
- name: Remove non-serving artifacts for HF push
|
| 37 |
+
run: |
|
| 38 |
+
# Space runtime only needs selected serving artifacts.
|
| 39 |
+
rm -f models/xgboost.pkl
|
| 40 |
+
|
| 41 |
+
- name: Push to Hugging Face Space
|
| 42 |
+
env:
|
| 43 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 44 |
+
HF_SPACE_REPO: ${{ secrets.HF_SPACE_REPO }}
|
| 45 |
+
run: |
|
| 46 |
+
TMP_DIR="$(mktemp -d)"
|
| 47 |
+
rsync -a --delete --exclude=".git" ./ "${TMP_DIR}/"
|
| 48 |
+
|
| 49 |
+
# Exclude artifacts not needed for serving in Space.
|
| 50 |
+
rm -f "${TMP_DIR}/models/xgboost.pkl"
|
| 51 |
+
|
| 52 |
+
cd "${TMP_DIR}"
|
| 53 |
+
git init -b main
|
| 54 |
+
git config user.name "github-actions[bot]"
|
| 55 |
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 56 |
+
git add -A
|
| 57 |
+
git commit -m "deploy: sync snapshot from github"
|
| 58 |
+
|
| 59 |
+
git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_REPO}"
|
| 60 |
+
git push hf main --force
|
.github/workflows/keepalive-hf-space.yml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Keep HF Space Warm
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
# Monday, Wednesday, Friday at 09:00 UTC
|
| 6 |
+
- cron: "0 9 * * 1,3,5"
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
ping:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- name: Validate HF Space URL secret
|
| 14 |
+
env:
|
| 15 |
+
HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
|
| 16 |
+
run: |
|
| 17 |
+
if [ -z "$HF_SPACE_URL" ]; then
|
| 18 |
+
echo "HF_SPACE_URL secret is not set."
|
| 19 |
+
exit 1
|
| 20 |
+
fi
|
| 21 |
+
|
| 22 |
+
- name: Ping health endpoint
|
| 23 |
+
env:
|
| 24 |
+
HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
|
| 25 |
+
run: |
|
| 26 |
+
set -e
|
| 27 |
+
curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/health"
|
| 28 |
+
|
| 29 |
+
- name: Ping metrics endpoint
|
| 30 |
+
env:
|
| 31 |
+
HF_SPACE_URL: ${{ secrets.HF_SPACE_URL }}
|
| 32 |
+
run: |
|
| 33 |
+
set -e
|
| 34 |
+
curl -fsS --retry 3 --retry-delay 5 "$HF_SPACE_URL/metrics"
|
.gitignore
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# Project-specific
|
| 210 |
+
data/raw/
|
| 211 |
+
data/processed/
|
| 212 |
+
logs/
|
| 213 |
+
mlruns/
|
| 214 |
+
|
| 215 |
+
IMPLEMENTATION_PLAN.md
|
| 216 |
+
End-to-End MLOps Project Documentation.txt
|
| 217 |
+
uv.lock
|
| 218 |
+
|
| 219 |
+
explaintovithu.md
|
| 220 |
+
interview_explanation.md
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.11
|
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_NO_CACHE_DIR=1
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# Install Python dependencies first for better layer caching.
|
| 10 |
+
COPY requirements.txt ./
|
| 11 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
+
|
| 13 |
+
# Copy application code and runtime artifacts.
|
| 14 |
+
COPY api ./api
|
| 15 |
+
COPY src ./src
|
| 16 |
+
COPY configs ./configs
|
| 17 |
+
COPY models ./models
|
| 18 |
+
COPY artifacts ./artifacts
|
| 19 |
+
|
| 20 |
+
# Run API as non-root user.
|
| 21 |
+
RUN useradd --create-home --shell /usr/sbin/nologin appuser \
|
| 22 |
+
&& chown -R appuser:appuser /app
|
| 23 |
+
USER appuser
|
| 24 |
+
|
| 25 |
+
EXPOSE 8000
|
| 26 |
+
|
| 27 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
| 28 |
+
CMD python -c "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"
|
| 29 |
+
|
| 30 |
+
CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Vimalathas Vithusan
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Fraud Detection MLOps API
|
| 3 |
+
emoji: 🚨
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Fraud Detection MLOps Pipeline
|
| 11 |
+
|
| 12 |
+
Production-style end-to-end fraud detection system with training, experiment tracking, API serving, containerization, CI/CD, and runtime monitoring.
|
| 13 |
+
|
| 14 |
+
## Highlights
|
| 15 |
+
|
| 16 |
+
- End-to-end ML lifecycle: data validation -> preprocessing -> training -> threshold tuning -> API inference.
|
| 17 |
+
- Imbalanced classification handling with recall-first model ranking.
|
| 18 |
+
- MLflow experiment tracking and artifact logging.
|
| 19 |
+
- FastAPI inference service with single/batch prediction endpoints.
|
| 20 |
+
- Dockerized deployment with health checks and non-root runtime.
|
| 21 |
+
- CI/CD with automated tests, coverage gates, image build, and HF deployment sync.
|
| 22 |
+
- Runtime observability via request IDs, structured logs, and `/metrics`.
|
| 23 |
+
|
| 24 |
+
## Live Deployment
|
| 25 |
+
|
| 26 |
+
- Hugging Face Space: `https://thasvithu-fraud-detection-mlops-api.hf.space`
|
| 27 |
+
- API Docs: `https://thasvithu-fraud-detection-mlops-api.hf.space/docs`
|
| 28 |
+
|
| 29 |
+
## Architecture
|
| 30 |
+
|
| 31 |
+
```mermaid
|
| 32 |
+
flowchart LR
|
| 33 |
+
A[Raw Data<br/>data/raw/creditcard.csv] --> B[Data Validation<br/>src/data_ingestion.py]
|
| 34 |
+
B --> C[Preprocessing<br/>src/preprocessing.py]
|
| 35 |
+
C --> D[Model Training<br/>src/train.py]
|
| 36 |
+
D --> E[Evaluation + Threshold Tuning<br/>src/evaluate.py]
|
| 37 |
+
E --> F[Artifacts<br/>models/*.pkl<br/>artifacts/*.json]
|
| 38 |
+
F --> G[Inference Service<br/>api/service.py]
|
| 39 |
+
G --> H[FastAPI App<br/>api/app.py]
|
| 40 |
+
H --> I["/predict"]
|
| 41 |
+
H --> J["/predict/batch"]
|
| 42 |
+
H --> K["/health"]
|
| 43 |
+
H --> L["/metrics"]
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## ML Training Workflow
|
| 47 |
+
|
| 48 |
+
```mermaid
|
| 49 |
+
flowchart TD
|
| 50 |
+
T1[Load Config<br/>configs/train.yaml] --> T2[Validate Dataset]
|
| 51 |
+
T2 --> T3[Split + Scale + Imbalance Handling]
|
| 52 |
+
T3 --> T4[Train Candidate Models]
|
| 53 |
+
T4 --> T5[Compute Metrics]
|
| 54 |
+
T5 --> T6[Log Runs to MLflow]
|
| 55 |
+
T6 --> T7[Rank by recall -> precision -> roc_auc]
|
| 56 |
+
T7 --> T8[Select Best Model]
|
| 57 |
+
T8 --> T9[Threshold Sweep + Selection]
|
| 58 |
+
T9 --> T10[Save model + preprocessor + reports]
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Inference Request Flow
|
| 62 |
+
|
| 63 |
+
```mermaid
|
| 64 |
+
sequenceDiagram
|
| 65 |
+
autonumber
|
| 66 |
+
participant Client
|
| 67 |
+
participant API as FastAPI
|
| 68 |
+
participant Svc as InferenceService
|
| 69 |
+
participant Art as Artifacts
|
| 70 |
+
|
| 71 |
+
Client->>API: POST /predict (transaction payload)
|
| 72 |
+
API->>Svc: load_inference_service() [cached]
|
| 73 |
+
Svc->>Art: model.pkl + preprocessor.pkl + threshold reports
|
| 74 |
+
Svc-->>API: prediction + probability + risk level
|
| 75 |
+
API-->>Client: JSON response (+ request headers)
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
## CI/CD and Deployment Workflows
|
| 79 |
+
|
| 80 |
+
```mermaid
|
| 81 |
+
flowchart LR
|
| 82 |
+
P[Push / PR] --> C1[ci.yml]
|
| 83 |
+
C1 --> C2[Test + Coverage Gate]
|
| 84 |
+
C2 --> C3[Build Docker Image]
|
| 85 |
+
C3 --> C4[Optional Deploy Webhook]
|
| 86 |
+
|
| 87 |
+
M[Push main] --> H1[deploy-hf-space.yml]
|
| 88 |
+
H1 --> H2[Snapshot Sync to HF Space]
|
| 89 |
+
|
| 90 |
+
S[Schedule Mon/Wed/Fri] --> K1[keepalive-hf-space.yml]
|
| 91 |
+
K1 --> K2[Ping /health and /metrics]
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## Project Structure
|
| 95 |
+
|
| 96 |
+
```text
|
| 97 |
+
fraud-detection-mlops-pipeline/
|
| 98 |
+
├── api/
|
| 99 |
+
│ ├── app.py
|
| 100 |
+
│ ├── schemas.py
|
| 101 |
+
│ └── service.py
|
| 102 |
+
├── src/
|
| 103 |
+
│ ├── data_ingestion.py
|
| 104 |
+
│ ├── preprocessing.py
|
| 105 |
+
│ ├── train.py
|
| 106 |
+
│ ├── evaluate.py
|
| 107 |
+
│ ├── predict.py
|
| 108 |
+
│ └── register_model.py
|
| 109 |
+
├── configs/
|
| 110 |
+
│ ├── train.yaml
|
| 111 |
+
│ └── logging.yaml
|
| 112 |
+
├── data/
|
| 113 |
+
│ ├── raw/
|
| 114 |
+
│ └── processed/
|
| 115 |
+
├── models/
|
| 116 |
+
├── artifacts/
|
| 117 |
+
├── tests/
|
| 118 |
+
├── .github/workflows/
|
| 119 |
+
│ ├── ci.yml
|
| 120 |
+
│ ├── deploy-hf-space.yml
|
| 121 |
+
│ └── keepalive-hf-space.yml
|
| 122 |
+
├── Dockerfile
|
| 123 |
+
├── docker-compose.yml
|
| 124 |
+
├── requirements.txt
|
| 125 |
+
└── pytest.ini
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
## Tech Stack
|
| 129 |
+
|
| 130 |
+
- Python 3.11
|
| 131 |
+
- Pandas, NumPy, scikit-learn, imbalanced-learn, XGBoost
|
| 132 |
+
- MLflow
|
| 133 |
+
- FastAPI + Pydantic
|
| 134 |
+
- Docker + Docker Compose
|
| 135 |
+
- GitHub Actions
|
| 136 |
+
- Hugging Face Spaces (Docker SDK)
|
| 137 |
+
|
| 138 |
+
## API Endpoints
|
| 139 |
+
|
| 140 |
+
- `GET /health`: Service and model readiness
|
| 141 |
+
- `GET /metrics`: Runtime operational counters
|
| 142 |
+
- `POST /predict`: Single transaction prediction
|
| 143 |
+
- `POST /predict/batch`: Batch transaction predictions
|
| 144 |
+
- `GET /docs`: Swagger UI
|
| 145 |
+
|
| 146 |
+
### Example: Single Prediction
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
BASE="https://thasvithu-fraud-detection-mlops-api.hf.space"
|
| 150 |
+
|
| 151 |
+
curl -X POST "$BASE/predict" \
|
| 152 |
+
-H "Content-Type: application/json" \
|
| 153 |
+
-d '{
|
| 154 |
+
"Time": 0,
|
| 155 |
+
"Amount": 149.62,
|
| 156 |
+
"V1": -1.359807, "V2": -0.072781, "V3": 2.536347, "V4": 1.378155,
|
| 157 |
+
"V5": -0.338321, "V6": 0.462388, "V7": 0.239599, "V8": 0.098698,
|
| 158 |
+
"V9": 0.363787, "V10": 0.090794, "V11": -0.551600, "V12": -0.617801,
|
| 159 |
+
"V13": -0.991390, "V14": -0.311169, "V15": 1.468177, "V16": -0.470401,
|
| 160 |
+
"V17": 0.207971, "V18": 0.025791, "V19": 0.403993, "V20": 0.251412,
|
| 161 |
+
"V21": -0.018307, "V22": 0.277838, "V23": -0.110474, "V24": 0.066928,
|
| 162 |
+
"V25": 0.128539, "V26": -0.189115, "V27": 0.133558, "V28": -0.021053
|
| 163 |
+
}'
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
## Local Setup
|
| 167 |
+
|
| 168 |
+
### Prerequisites
|
| 169 |
+
|
| 170 |
+
- Python 3.11+
|
| 171 |
+
- `uv`
|
| 172 |
+
- Docker (optional, for container run)
|
| 173 |
+
|
| 174 |
+
### Install
|
| 175 |
+
|
| 176 |
+
```bash
|
| 177 |
+
uv pip install -r requirements.txt
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
### Train
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
uv run python -m src.train
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### Test
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
uv run pytest
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
### Run API
|
| 193 |
+
|
| 194 |
+
```bash
|
| 195 |
+
uv run uvicorn api.app:app --reload --host 0.0.0.0 --port 8000
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
## Docker Usage
|
| 199 |
+
|
| 200 |
+
### Build
|
| 201 |
+
|
| 202 |
+
```bash
|
| 203 |
+
docker build -t fraud-detection-api:latest .
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
### Run
|
| 207 |
+
|
| 208 |
+
```bash
|
| 209 |
+
docker run --rm -p 8000:8000 fraud-detection-api:latest
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Compose
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
docker compose up --build
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
## Quality Gates
|
| 219 |
+
|
| 220 |
+
- Test coverage enforced via `pytest.ini`
|
| 221 |
+
- Minimum coverage: `>= 80%` across `src` + `api`
|
| 222 |
+
- Current status: passing (see GitHub Actions)
|
| 223 |
+
|
| 224 |
+
## Monitoring and Operations
|
| 225 |
+
|
| 226 |
+
Runtime metrics exposed by `/metrics`:
|
| 227 |
+
- `total_requests`
|
| 228 |
+
- `error_count`
|
| 229 |
+
- `error_rate`
|
| 230 |
+
- `total_predictions`
|
| 231 |
+
- `fraud_predictions`
|
| 232 |
+
- `fraud_prediction_rate`
|
| 233 |
+
- `avg_latency_ms`
|
| 234 |
+
|
| 235 |
+
Request-level observability:
|
| 236 |
+
- `X-Request-ID`
|
| 237 |
+
- `X-Process-Time-Ms`
|
| 238 |
+
- Structured JSON logs for request and prediction events
|
| 239 |
+
|
| 240 |
+
## GitHub Actions Workflows
|
| 241 |
+
|
| 242 |
+
- `ci.yml`: test + coverage + image build (+ optional webhook deploy)
|
| 243 |
+
- `deploy-hf-space.yml`: sync `main` to Hugging Face Space
|
| 244 |
+
- `keepalive-hf-space.yml`: scheduled pings to reduce Space inactivity sleep
|
| 245 |
+
|
| 246 |
+
## Required GitHub Secrets
|
| 247 |
+
|
| 248 |
+
For Hugging Face deploy:
|
| 249 |
+
- `HF_TOKEN`
|
| 250 |
+
- `HF_SPACE_REPO` (format: `username/space-name`)
|
| 251 |
+
|
| 252 |
+
For HF keepalive:
|
| 253 |
+
- `HF_SPACE_URL`
|
| 254 |
+
|
| 255 |
+
Optional webhook deploy:
|
| 256 |
+
- `DEPLOY_WEBHOOK_URL`
|
| 257 |
+
|
| 258 |
+
## Milestone Status
|
| 259 |
+
|
| 260 |
+
All planned phases (0-9) are complete:
|
| 261 |
+
- Foundation
|
| 262 |
+
- Data validation
|
| 263 |
+
- Preprocessing
|
| 264 |
+
- Training + MLflow tracking
|
| 265 |
+
- Evaluation + threshold tuning
|
| 266 |
+
- FastAPI inference service
|
| 267 |
+
- Testing + quality gates
|
| 268 |
+
- Containerization
|
| 269 |
+
- CI/CD automation
|
| 270 |
+
- Monitoring and operations
|
| 271 |
+
|
| 272 |
+
## License
|
| 273 |
+
|
| 274 |
+
MIT (see `LICENSE`)
|
api/__init__.py
ADDED
|
File without changes
|
api/app.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import time
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from threading import Lock
|
| 8 |
+
from typing import Annotated
|
| 9 |
+
from uuid import uuid4
|
| 10 |
+
|
| 11 |
+
from fastapi import Depends, FastAPI, HTTPException, Request
|
| 12 |
+
from fastapi.responses import JSONResponse
|
| 13 |
+
|
| 14 |
+
from api.schemas import (
|
| 15 |
+
BatchPredictionRequest,
|
| 16 |
+
BatchPredictionResponse,
|
| 17 |
+
HealthResponse,
|
| 18 |
+
MetricsResponse,
|
| 19 |
+
PredictionResponse,
|
| 20 |
+
Transaction,
|
| 21 |
+
)
|
| 22 |
+
from api.service import InferenceService, load_inference_service
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger("api")
|
| 25 |
+
if not logger.handlers:
|
| 26 |
+
logging.basicConfig(level=logging.INFO)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
|
| 30 |
+
class MonitoringState:
|
| 31 |
+
total_requests: int = 0
|
| 32 |
+
error_count: int = 0
|
| 33 |
+
total_predictions: int = 0
|
| 34 |
+
fraud_predictions: int = 0
|
| 35 |
+
total_latency_ms: float = 0.0
|
| 36 |
+
_lock: Lock = field(default_factory=Lock)
|
| 37 |
+
|
| 38 |
+
def record_request(self, *, latency_ms: float, status_code: int) -> None:
|
| 39 |
+
with self._lock:
|
| 40 |
+
self.total_requests += 1
|
| 41 |
+
self.total_latency_ms += latency_ms
|
| 42 |
+
if status_code >= 400:
|
| 43 |
+
self.error_count += 1
|
| 44 |
+
|
| 45 |
+
def record_predictions(self, predictions: list[dict[str, object]]) -> None:
|
| 46 |
+
fraud_count = sum(1 for p in predictions if bool(p.get("is_fraud")))
|
| 47 |
+
with self._lock:
|
| 48 |
+
self.total_predictions += len(predictions)
|
| 49 |
+
self.fraud_predictions += fraud_count
|
| 50 |
+
|
| 51 |
+
def snapshot(self) -> dict[str, float | int]:
|
| 52 |
+
with self._lock:
|
| 53 |
+
avg_latency = self.total_latency_ms / self.total_requests if self.total_requests else 0.0
|
| 54 |
+
error_rate = self.error_count / self.total_requests if self.total_requests else 0.0
|
| 55 |
+
fraud_rate = (
|
| 56 |
+
self.fraud_predictions / self.total_predictions if self.total_predictions else 0.0
|
| 57 |
+
)
|
| 58 |
+
return {
|
| 59 |
+
"total_requests": self.total_requests,
|
| 60 |
+
"error_count": self.error_count,
|
| 61 |
+
"error_rate": float(error_rate),
|
| 62 |
+
"total_predictions": self.total_predictions,
|
| 63 |
+
"fraud_predictions": self.fraud_predictions,
|
| 64 |
+
"fraud_prediction_rate": float(fraud_rate),
|
| 65 |
+
"avg_latency_ms": float(avg_latency),
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
app = FastAPI(title="Fraud Detection API", version="0.3.0")
|
| 70 |
+
monitoring_state = MonitoringState()
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@app.middleware("http")
|
| 74 |
+
async def add_observability(request: Request, call_next):
|
| 75 |
+
request_id = request.headers.get("X-Request-ID", str(uuid4()))
|
| 76 |
+
start = time.perf_counter()
|
| 77 |
+
|
| 78 |
+
status_code = 500
|
| 79 |
+
try:
|
| 80 |
+
response = await call_next(request)
|
| 81 |
+
status_code = response.status_code
|
| 82 |
+
except Exception:
|
| 83 |
+
latency_ms = (time.perf_counter() - start) * 1000
|
| 84 |
+
monitoring_state.record_request(latency_ms=latency_ms, status_code=status_code)
|
| 85 |
+
logger.exception(
|
| 86 |
+
json.dumps(
|
| 87 |
+
{
|
| 88 |
+
"event": "request_error",
|
| 89 |
+
"request_id": request_id,
|
| 90 |
+
"path": request.url.path,
|
| 91 |
+
"method": request.method,
|
| 92 |
+
"latency_ms": round(latency_ms, 2),
|
| 93 |
+
}
|
| 94 |
+
)
|
| 95 |
+
)
|
| 96 |
+
raise
|
| 97 |
+
|
| 98 |
+
latency_ms = (time.perf_counter() - start) * 1000
|
| 99 |
+
monitoring_state.record_request(latency_ms=latency_ms, status_code=status_code)
|
| 100 |
+
|
| 101 |
+
response.headers["X-Process-Time-Ms"] = f"{latency_ms:.2f}"
|
| 102 |
+
response.headers["X-Request-ID"] = request_id
|
| 103 |
+
|
| 104 |
+
logger.info(
|
| 105 |
+
json.dumps(
|
| 106 |
+
{
|
| 107 |
+
"event": "request_complete",
|
| 108 |
+
"request_id": request_id,
|
| 109 |
+
"path": request.url.path,
|
| 110 |
+
"method": request.method,
|
| 111 |
+
"status_code": status_code,
|
| 112 |
+
"latency_ms": round(latency_ms, 2),
|
| 113 |
+
}
|
| 114 |
+
)
|
| 115 |
+
)
|
| 116 |
+
return response
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def get_inference_service() -> InferenceService:
|
| 120 |
+
try:
|
| 121 |
+
return load_inference_service()
|
| 122 |
+
except FileNotFoundError as exc:
|
| 123 |
+
raise HTTPException(status_code=503, detail=str(exc)) from exc
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
ServiceDep = Annotated[InferenceService, Depends(get_inference_service)]
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@app.exception_handler(ValueError)
|
| 130 |
+
async def value_error_handler(_: Request, exc: ValueError) -> JSONResponse:
|
| 131 |
+
return JSONResponse(status_code=400, content={"detail": str(exc)})
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@app.get("/health", response_model=HealthResponse)
|
| 135 |
+
def health(service: ServiceDep) -> HealthResponse:
|
| 136 |
+
return HealthResponse(
|
| 137 |
+
status="ok",
|
| 138 |
+
model_loaded=True,
|
| 139 |
+
model_path=str(service.model_path),
|
| 140 |
+
preprocessor_path=str(service.preprocessor_path),
|
| 141 |
+
threshold=service.threshold,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
@app.get("/metrics", response_model=MetricsResponse)
|
| 146 |
+
def metrics() -> MetricsResponse:
|
| 147 |
+
return MetricsResponse(**monitoring_state.snapshot())
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
@app.post("/predict", response_model=PredictionResponse)
|
| 151 |
+
def predict(transaction: Transaction, service: ServiceDep) -> PredictionResponse:
|
| 152 |
+
output = service.predict_records([transaction.model_dump()])[0]
|
| 153 |
+
monitoring_state.record_predictions([output])
|
| 154 |
+
logger.info(
|
| 155 |
+
json.dumps(
|
| 156 |
+
{
|
| 157 |
+
"event": "prediction",
|
| 158 |
+
"prediction_count": 1,
|
| 159 |
+
"fraud_predictions": int(output["is_fraud"]),
|
| 160 |
+
"avg_probability": round(float(output["fraud_probability"]), 6),
|
| 161 |
+
"threshold": float(output["threshold"]),
|
| 162 |
+
}
|
| 163 |
+
)
|
| 164 |
+
)
|
| 165 |
+
return PredictionResponse(**output)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
@app.post("/predict/batch", response_model=BatchPredictionResponse)
|
| 169 |
+
def predict_batch(request: BatchPredictionRequest, service: ServiceDep) -> BatchPredictionResponse:
|
| 170 |
+
predictions = service.predict_records([record.model_dump() for record in request.transactions])
|
| 171 |
+
monitoring_state.record_predictions(predictions)
|
| 172 |
+
|
| 173 |
+
fraud_count = sum(1 for row in predictions if row["is_fraud"])
|
| 174 |
+
avg_probability = sum(float(row["fraud_probability"]) for row in predictions) / len(predictions)
|
| 175 |
+
logger.info(
|
| 176 |
+
json.dumps(
|
| 177 |
+
{
|
| 178 |
+
"event": "prediction_batch",
|
| 179 |
+
"prediction_count": len(predictions),
|
| 180 |
+
"fraud_predictions": fraud_count,
|
| 181 |
+
"avg_probability": round(avg_probability, 6),
|
| 182 |
+
"threshold": float(predictions[0]["threshold"]),
|
| 183 |
+
}
|
| 184 |
+
)
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
return BatchPredictionResponse(predictions=[PredictionResponse(**row) for row in predictions])
|
api/schemas.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic request/response schemas for the inference API."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Transaction(BaseModel):
|
| 9 |
+
model_config = ConfigDict(extra="forbid")
|
| 10 |
+
|
| 11 |
+
Time: float
|
| 12 |
+
V1: float
|
| 13 |
+
V2: float
|
| 14 |
+
V3: float
|
| 15 |
+
V4: float
|
| 16 |
+
V5: float
|
| 17 |
+
V6: float
|
| 18 |
+
V7: float
|
| 19 |
+
V8: float
|
| 20 |
+
V9: float
|
| 21 |
+
V10: float
|
| 22 |
+
V11: float
|
| 23 |
+
V12: float
|
| 24 |
+
V13: float
|
| 25 |
+
V14: float
|
| 26 |
+
V15: float
|
| 27 |
+
V16: float
|
| 28 |
+
V17: float
|
| 29 |
+
V18: float
|
| 30 |
+
V19: float
|
| 31 |
+
V20: float
|
| 32 |
+
V21: float
|
| 33 |
+
V22: float
|
| 34 |
+
V23: float
|
| 35 |
+
V24: float
|
| 36 |
+
V25: float
|
| 37 |
+
V26: float
|
| 38 |
+
V27: float
|
| 39 |
+
V28: float
|
| 40 |
+
Amount: float = Field(ge=0)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class PredictionResponse(BaseModel):
|
| 44 |
+
is_fraud: bool
|
| 45 |
+
fraud_probability: float
|
| 46 |
+
risk_level: str
|
| 47 |
+
threshold: float
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class BatchPredictionRequest(BaseModel):
|
| 51 |
+
model_config = ConfigDict(extra="forbid")
|
| 52 |
+
|
| 53 |
+
transactions: list[Transaction] = Field(min_length=1)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class BatchPredictionResponse(BaseModel):
|
| 57 |
+
predictions: list[PredictionResponse]
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class HealthResponse(BaseModel):
|
| 61 |
+
status: str
|
| 62 |
+
model_loaded: bool
|
| 63 |
+
model_path: str
|
| 64 |
+
preprocessor_path: str
|
| 65 |
+
threshold: float
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class MetricsResponse(BaseModel):
|
| 69 |
+
total_requests: int
|
| 70 |
+
error_count: int
|
| 71 |
+
error_rate: float
|
| 72 |
+
total_predictions: int
|
| 73 |
+
fraud_predictions: int
|
| 74 |
+
fraud_prediction_rate: float
|
| 75 |
+
avg_latency_ms: float
|
api/service.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model loading and prediction service helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from functools import lru_cache
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
import joblib
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import yaml
|
| 14 |
+
|
| 15 |
+
from src.data_ingestion import EXPECTED_COLUMNS
|
| 16 |
+
|
| 17 |
+
DEFAULT_MODEL_PATH = Path("models/model.pkl")
|
| 18 |
+
DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
|
| 19 |
+
DEFAULT_TRAINING_REPORT_PATH = Path("artifacts/model_training_report.json")
|
| 20 |
+
DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
|
| 21 |
+
DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
|
| 22 |
+
FEATURE_COLUMNS = [column for column in EXPECTED_COLUMNS if column != "Class"]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class InferenceService:
|
| 27 |
+
"""Encapsulate model/preprocessor runtime and prediction logic."""
|
| 28 |
+
|
| 29 |
+
model: Any
|
| 30 |
+
preprocessor: Any
|
| 31 |
+
threshold: float
|
| 32 |
+
model_path: Path
|
| 33 |
+
preprocessor_path: Path
|
| 34 |
+
feature_columns: list[str]
|
| 35 |
+
|
| 36 |
+
def predict_records(self, records: list[dict[str, float]]) -> list[dict[str, Any]]:
|
| 37 |
+
"""Predict fraud labels/probabilities for input transaction records."""
|
| 38 |
+
frame = pd.DataFrame(records)
|
| 39 |
+
frame = frame[self.feature_columns]
|
| 40 |
+
|
| 41 |
+
transformed = self.preprocessor.transform(frame)
|
| 42 |
+
probabilities = self.model.predict_proba(transformed)[:, 1]
|
| 43 |
+
|
| 44 |
+
outputs: list[dict[str, Any]] = []
|
| 45 |
+
for prob in probabilities:
|
| 46 |
+
probability = float(prob)
|
| 47 |
+
outputs.append(
|
| 48 |
+
{
|
| 49 |
+
"is_fraud": bool(probability >= self.threshold),
|
| 50 |
+
"fraud_probability": probability,
|
| 51 |
+
"risk_level": _risk_level(probability),
|
| 52 |
+
"threshold": float(self.threshold),
|
| 53 |
+
}
|
| 54 |
+
)
|
| 55 |
+
return outputs
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _risk_level(probability: float) -> str:
|
| 59 |
+
if probability >= 0.7:
|
| 60 |
+
return "high"
|
| 61 |
+
if probability >= 0.3:
|
| 62 |
+
return "medium"
|
| 63 |
+
return "low"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _threshold_from_training_report(training_report_path: Path) -> float | None:
|
| 67 |
+
if not training_report_path.exists():
|
| 68 |
+
return None
|
| 69 |
+
payload = json.loads(training_report_path.read_text(encoding="utf-8"))
|
| 70 |
+
best = payload.get("best_model", {})
|
| 71 |
+
threshold = best.get("selected_threshold")
|
| 72 |
+
return float(threshold) if threshold is not None else None
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _threshold_from_model_report(model_report_path: Path) -> float | None:
|
| 76 |
+
if not model_report_path.exists():
|
| 77 |
+
return None
|
| 78 |
+
payload = json.loads(model_report_path.read_text(encoding="utf-8"))
|
| 79 |
+
selection = payload.get("threshold_selection", {})
|
| 80 |
+
threshold = selection.get("selected_threshold")
|
| 81 |
+
return float(threshold) if threshold is not None else None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _threshold_from_config(config_path: Path) -> float | None:
|
| 85 |
+
if not config_path.exists():
|
| 86 |
+
return None
|
| 87 |
+
config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
| 88 |
+
threshold_cfg = config.get("threshold", {})
|
| 89 |
+
threshold = threshold_cfg.get("decision_threshold")
|
| 90 |
+
return float(threshold) if threshold is not None else None
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def resolve_threshold(
|
| 94 |
+
*,
|
| 95 |
+
training_report_path: Path = DEFAULT_TRAINING_REPORT_PATH,
|
| 96 |
+
model_report_path: Path = DEFAULT_MODEL_REPORT_PATH,
|
| 97 |
+
config_path: Path = DEFAULT_CONFIG_PATH,
|
| 98 |
+
) -> float:
|
| 99 |
+
"""Resolve runtime threshold from artifacts, then fallback config/default."""
|
| 100 |
+
value = _threshold_from_training_report(training_report_path)
|
| 101 |
+
if value is not None:
|
| 102 |
+
return value
|
| 103 |
+
value = _threshold_from_model_report(model_report_path)
|
| 104 |
+
if value is not None:
|
| 105 |
+
return value
|
| 106 |
+
value = _threshold_from_config(config_path)
|
| 107 |
+
if value is not None:
|
| 108 |
+
return value
|
| 109 |
+
return 0.5
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@lru_cache(maxsize=1)
|
| 113 |
+
def load_inference_service(
|
| 114 |
+
*,
|
| 115 |
+
model_path: str = str(DEFAULT_MODEL_PATH),
|
| 116 |
+
preprocessor_path: str = str(DEFAULT_PREPROCESSOR_PATH),
|
| 117 |
+
training_report_path: str = str(DEFAULT_TRAINING_REPORT_PATH),
|
| 118 |
+
model_report_path: str = str(DEFAULT_MODEL_REPORT_PATH),
|
| 119 |
+
config_path: str = str(DEFAULT_CONFIG_PATH),
|
| 120 |
+
) -> InferenceService:
|
| 121 |
+
"""Load model + preprocessor + threshold and cache service singleton."""
|
| 122 |
+
model_file = Path(model_path)
|
| 123 |
+
preprocessor_file = Path(preprocessor_path)
|
| 124 |
+
|
| 125 |
+
if not model_file.exists():
|
| 126 |
+
raise FileNotFoundError(f"Model artifact not found: {model_file}")
|
| 127 |
+
if not preprocessor_file.exists():
|
| 128 |
+
raise FileNotFoundError(f"Preprocessor artifact not found: {preprocessor_file}")
|
| 129 |
+
|
| 130 |
+
model = joblib.load(model_file)
|
| 131 |
+
preprocessor = joblib.load(preprocessor_file)
|
| 132 |
+
threshold = resolve_threshold(
|
| 133 |
+
training_report_path=Path(training_report_path),
|
| 134 |
+
model_report_path=Path(model_report_path),
|
| 135 |
+
config_path=Path(config_path),
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
feature_names_in = getattr(preprocessor, "feature_names_in_", FEATURE_COLUMNS)
|
| 139 |
+
feature_columns = list(feature_names_in)
|
| 140 |
+
|
| 141 |
+
return InferenceService(
|
| 142 |
+
model=model,
|
| 143 |
+
preprocessor=preprocessor,
|
| 144 |
+
threshold=threshold,
|
| 145 |
+
model_path=model_file,
|
| 146 |
+
preprocessor_path=preprocessor_file,
|
| 147 |
+
feature_columns=feature_columns,
|
| 148 |
+
)
|
artifacts/data_validation.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"is_valid": true,
|
| 3 |
+
"errors": [],
|
| 4 |
+
"warnings": [],
|
| 5 |
+
"statistics": {
|
| 6 |
+
"row_count": 284807,
|
| 7 |
+
"column_count": 31,
|
| 8 |
+
"missing_values_total": 0,
|
| 9 |
+
"duplicate_rows": 1081,
|
| 10 |
+
"class_counts": {
|
| 11 |
+
"0": 284315,
|
| 12 |
+
"1": 492
|
| 13 |
+
},
|
| 14 |
+
"fraud_ratio": 0.001727485630620034
|
| 15 |
+
}
|
| 16 |
+
}
|
artifacts/metrics_logistic_regression.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"precision": 0.06097560975609756,
|
| 3 |
+
"recall": 0.9183673469387755,
|
| 4 |
+
"f1": 0.11435832274459974,
|
| 5 |
+
"roc_auc": 0.9721687370080279,
|
| 6 |
+
"pr_auc": 0.7159122424484009,
|
| 7 |
+
"confusion_matrix": [
|
| 8 |
+
[
|
| 9 |
+
55478,
|
| 10 |
+
1386
|
| 11 |
+
],
|
| 12 |
+
[
|
| 13 |
+
8,
|
| 14 |
+
90
|
| 15 |
+
]
|
| 16 |
+
]
|
| 17 |
+
}
|
artifacts/metrics_xgboost.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"precision": 0.9186046511627907,
|
| 3 |
+
"recall": 0.8061224489795918,
|
| 4 |
+
"f1": 0.8586956521739131,
|
| 5 |
+
"roc_auc": 0.9775147361983623,
|
| 6 |
+
"pr_auc": 0.87487299490182,
|
| 7 |
+
"confusion_matrix": [
|
| 8 |
+
[
|
| 9 |
+
56857,
|
| 10 |
+
7
|
| 11 |
+
],
|
| 12 |
+
[
|
| 13 |
+
19,
|
| 14 |
+
79
|
| 15 |
+
]
|
| 16 |
+
]
|
| 17 |
+
}
|
artifacts/model_report.json
ADDED
|
@@ -0,0 +1,1834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp_utc": "2026-02-18T04:09:02.995799+00:00",
|
| 3 |
+
"best_model_name": "logistic_regression",
|
| 4 |
+
"default_threshold_metrics": {
|
| 5 |
+
"precision": 0.06097560975609756,
|
| 6 |
+
"recall": 0.9183673469387755,
|
| 7 |
+
"f1": 0.11435832274459974,
|
| 8 |
+
"roc_auc": 0.9721687370080279,
|
| 9 |
+
"pr_auc": 0.7159122424484009,
|
| 10 |
+
"confusion_matrix": [
|
| 11 |
+
[
|
| 12 |
+
55478,
|
| 13 |
+
1386
|
| 14 |
+
],
|
| 15 |
+
[
|
| 16 |
+
8,
|
| 17 |
+
90
|
| 18 |
+
]
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
"threshold_selection": {
|
| 22 |
+
"selection_reason": "meets_min_recall",
|
| 23 |
+
"min_recall_target": 0.9,
|
| 24 |
+
"selected_threshold": 0.74,
|
| 25 |
+
"selected_metrics": {
|
| 26 |
+
"precision": 0.13650306748466257,
|
| 27 |
+
"recall": 0.9081632653061225,
|
| 28 |
+
"f1": 0.23733333333333334,
|
| 29 |
+
"roc_auc": 0.9721687370080279,
|
| 30 |
+
"pr_auc": 0.7159122424484009,
|
| 31 |
+
"confusion_matrix": [
|
| 32 |
+
[
|
| 33 |
+
56301,
|
| 34 |
+
563
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
9,
|
| 38 |
+
89
|
| 39 |
+
]
|
| 40 |
+
],
|
| 41 |
+
"threshold": 0.74
|
| 42 |
+
},
|
| 43 |
+
"threshold_grid_size": 99,
|
| 44 |
+
"thresholds_evaluated": [
|
| 45 |
+
{
|
| 46 |
+
"precision": 0.0024050381830804323,
|
| 47 |
+
"recall": 0.9897959183673469,
|
| 48 |
+
"f1": 0.004798417017066535,
|
| 49 |
+
"roc_auc": 0.9721687370080279,
|
| 50 |
+
"pr_auc": 0.7159122424484009,
|
| 51 |
+
"confusion_matrix": [
|
| 52 |
+
[
|
| 53 |
+
16629,
|
| 54 |
+
40235
|
| 55 |
+
],
|
| 56 |
+
[
|
| 57 |
+
1,
|
| 58 |
+
97
|
| 59 |
+
]
|
| 60 |
+
],
|
| 61 |
+
"threshold": 0.01
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"precision": 0.0030859288009416853,
|
| 65 |
+
"recall": 0.9897959183673469,
|
| 66 |
+
"f1": 0.006152675145095303,
|
| 67 |
+
"roc_auc": 0.9721687370080279,
|
| 68 |
+
"pr_auc": 0.7159122424484009,
|
| 69 |
+
"confusion_matrix": [
|
| 70 |
+
[
|
| 71 |
+
25528,
|
| 72 |
+
31336
|
| 73 |
+
],
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
97
|
| 77 |
+
]
|
| 78 |
+
],
|
| 79 |
+
"threshold": 0.02
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"precision": 0.0037338104313328927,
|
| 83 |
+
"recall": 0.9795918367346939,
|
| 84 |
+
"f1": 0.007439265372544461,
|
| 85 |
+
"roc_auc": 0.9721687370080279,
|
| 86 |
+
"pr_auc": 0.7159122424484009,
|
| 87 |
+
"confusion_matrix": [
|
| 88 |
+
[
|
| 89 |
+
31249,
|
| 90 |
+
25615
|
| 91 |
+
],
|
| 92 |
+
[
|
| 93 |
+
2,
|
| 94 |
+
96
|
| 95 |
+
]
|
| 96 |
+
],
|
| 97 |
+
"threshold": 0.03
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"precision": 0.004303044174868391,
|
| 101 |
+
"recall": 0.9591836734693877,
|
| 102 |
+
"f1": 0.00856765255434535,
|
| 103 |
+
"roc_auc": 0.9721687370080279,
|
| 104 |
+
"pr_auc": 0.7159122424484009,
|
| 105 |
+
"confusion_matrix": [
|
| 106 |
+
[
|
| 107 |
+
35113,
|
| 108 |
+
21751
|
| 109 |
+
],
|
| 110 |
+
[
|
| 111 |
+
4,
|
| 112 |
+
94
|
| 113 |
+
]
|
| 114 |
+
],
|
| 115 |
+
"threshold": 0.04
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"precision": 0.004967499867885642,
|
| 119 |
+
"recall": 0.9591836734693877,
|
| 120 |
+
"f1": 0.009883812628147836,
|
| 121 |
+
"roc_auc": 0.9721687370080279,
|
| 122 |
+
"pr_auc": 0.7159122424484009,
|
| 123 |
+
"confusion_matrix": [
|
| 124 |
+
[
|
| 125 |
+
38035,
|
| 126 |
+
18829
|
| 127 |
+
],
|
| 128 |
+
[
|
| 129 |
+
4,
|
| 130 |
+
94
|
| 131 |
+
]
|
| 132 |
+
],
|
| 133 |
+
"threshold": 0.05
|
| 134 |
+
},
|
| 135 |
+
{
|
| 136 |
+
"precision": 0.005584932564909988,
|
| 137 |
+
"recall": 0.9591836734693877,
|
| 138 |
+
"f1": 0.011105204087660228,
|
| 139 |
+
"roc_auc": 0.9721687370080279,
|
| 140 |
+
"pr_auc": 0.7159122424484009,
|
| 141 |
+
"confusion_matrix": [
|
| 142 |
+
[
|
| 143 |
+
40127,
|
| 144 |
+
16737
|
| 145 |
+
],
|
| 146 |
+
[
|
| 147 |
+
4,
|
| 148 |
+
94
|
| 149 |
+
]
|
| 150 |
+
],
|
| 151 |
+
"threshold": 0.060000000000000005
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"precision": 0.006171610591280112,
|
| 155 |
+
"recall": 0.9489795918367347,
|
| 156 |
+
"f1": 0.01226346673699479,
|
| 157 |
+
"roc_auc": 0.9721687370080279,
|
| 158 |
+
"pr_auc": 0.7159122424484009,
|
| 159 |
+
"confusion_matrix": [
|
| 160 |
+
[
|
| 161 |
+
41888,
|
| 162 |
+
14976
|
| 163 |
+
],
|
| 164 |
+
[
|
| 165 |
+
5,
|
| 166 |
+
93
|
| 167 |
+
]
|
| 168 |
+
],
|
| 169 |
+
"threshold": 0.06999999999999999
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"precision": 0.006826189078097475,
|
| 173 |
+
"recall": 0.9489795918367347,
|
| 174 |
+
"f1": 0.01355487538259729,
|
| 175 |
+
"roc_auc": 0.9721687370080279,
|
| 176 |
+
"pr_auc": 0.7159122424484009,
|
| 177 |
+
"confusion_matrix": [
|
| 178 |
+
[
|
| 179 |
+
43333,
|
| 180 |
+
13531
|
| 181 |
+
],
|
| 182 |
+
[
|
| 183 |
+
5,
|
| 184 |
+
93
|
| 185 |
+
]
|
| 186 |
+
],
|
| 187 |
+
"threshold": 0.08
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"precision": 0.007441190590494479,
|
| 191 |
+
"recall": 0.9489795918367347,
|
| 192 |
+
"f1": 0.014766592569069545,
|
| 193 |
+
"roc_auc": 0.9721687370080279,
|
| 194 |
+
"pr_auc": 0.7159122424484009,
|
| 195 |
+
"confusion_matrix": [
|
| 196 |
+
[
|
| 197 |
+
44459,
|
| 198 |
+
12405
|
| 199 |
+
],
|
| 200 |
+
[
|
| 201 |
+
5,
|
| 202 |
+
93
|
| 203 |
+
]
|
| 204 |
+
],
|
| 205 |
+
"threshold": 0.09
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"precision": 0.008117308195862791,
|
| 209 |
+
"recall": 0.9489795918367347,
|
| 210 |
+
"f1": 0.01609692773691043,
|
| 211 |
+
"roc_auc": 0.9721687370080279,
|
| 212 |
+
"pr_auc": 0.7159122424484009,
|
| 213 |
+
"confusion_matrix": [
|
| 214 |
+
[
|
| 215 |
+
45500,
|
| 216 |
+
11364
|
| 217 |
+
],
|
| 218 |
+
[
|
| 219 |
+
5,
|
| 220 |
+
93
|
| 221 |
+
]
|
| 222 |
+
],
|
| 223 |
+
"threshold": 0.09999999999999999
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"precision": 0.008798486281929991,
|
| 227 |
+
"recall": 0.9489795918367347,
|
| 228 |
+
"f1": 0.017435320584926885,
|
| 229 |
+
"roc_auc": 0.9721687370080279,
|
| 230 |
+
"pr_auc": 0.7159122424484009,
|
| 231 |
+
"confusion_matrix": [
|
| 232 |
+
[
|
| 233 |
+
46387,
|
| 234 |
+
10477
|
| 235 |
+
],
|
| 236 |
+
[
|
| 237 |
+
5,
|
| 238 |
+
93
|
| 239 |
+
]
|
| 240 |
+
],
|
| 241 |
+
"threshold": 0.11
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"precision": 0.009562982005141388,
|
| 245 |
+
"recall": 0.9489795918367347,
|
| 246 |
+
"f1": 0.018935152193830806,
|
| 247 |
+
"roc_auc": 0.9721687370080279,
|
| 248 |
+
"pr_auc": 0.7159122424484009,
|
| 249 |
+
"confusion_matrix": [
|
| 250 |
+
[
|
| 251 |
+
47232,
|
| 252 |
+
9632
|
| 253 |
+
],
|
| 254 |
+
[
|
| 255 |
+
5,
|
| 256 |
+
93
|
| 257 |
+
]
|
| 258 |
+
],
|
| 259 |
+
"threshold": 0.12
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"precision": 0.01033103754721173,
|
| 263 |
+
"recall": 0.9489795918367347,
|
| 264 |
+
"f1": 0.02043956043956044,
|
| 265 |
+
"roc_auc": 0.9721687370080279,
|
| 266 |
+
"pr_auc": 0.7159122424484009,
|
| 267 |
+
"confusion_matrix": [
|
| 268 |
+
[
|
| 269 |
+
47955,
|
| 270 |
+
8909
|
| 271 |
+
],
|
| 272 |
+
[
|
| 273 |
+
5,
|
| 274 |
+
93
|
| 275 |
+
]
|
| 276 |
+
],
|
| 277 |
+
"threshold": 0.13
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"precision": 0.011143062544931704,
|
| 281 |
+
"recall": 0.9489795918367347,
|
| 282 |
+
"f1": 0.022027475130270015,
|
| 283 |
+
"roc_auc": 0.9721687370080279,
|
| 284 |
+
"pr_auc": 0.7159122424484009,
|
| 285 |
+
"confusion_matrix": [
|
| 286 |
+
[
|
| 287 |
+
48611,
|
| 288 |
+
8253
|
| 289 |
+
],
|
| 290 |
+
[
|
| 291 |
+
5,
|
| 292 |
+
93
|
| 293 |
+
]
|
| 294 |
+
],
|
| 295 |
+
"threshold": 0.14
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"precision": 0.011935318275154004,
|
| 299 |
+
"recall": 0.9489795918367347,
|
| 300 |
+
"f1": 0.023574144486692015,
|
| 301 |
+
"roc_auc": 0.9721687370080279,
|
| 302 |
+
"pr_auc": 0.7159122424484009,
|
| 303 |
+
"confusion_matrix": [
|
| 304 |
+
[
|
| 305 |
+
49165,
|
| 306 |
+
7699
|
| 307 |
+
],
|
| 308 |
+
[
|
| 309 |
+
5,
|
| 310 |
+
93
|
| 311 |
+
]
|
| 312 |
+
],
|
| 313 |
+
"threshold": 0.15000000000000002
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"precision": 0.012781748213304012,
|
| 317 |
+
"recall": 0.9489795918367347,
|
| 318 |
+
"f1": 0.025223759153783564,
|
| 319 |
+
"roc_auc": 0.9721687370080279,
|
| 320 |
+
"pr_auc": 0.7159122424484009,
|
| 321 |
+
"confusion_matrix": [
|
| 322 |
+
[
|
| 323 |
+
49681,
|
| 324 |
+
7183
|
| 325 |
+
],
|
| 326 |
+
[
|
| 327 |
+
5,
|
| 328 |
+
93
|
| 329 |
+
]
|
| 330 |
+
],
|
| 331 |
+
"threshold": 0.16
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"precision": 0.013650374284456186,
|
| 335 |
+
"recall": 0.9489795918367347,
|
| 336 |
+
"f1": 0.02691361597453335,
|
| 337 |
+
"roc_auc": 0.9721687370080279,
|
| 338 |
+
"pr_auc": 0.7159122424484009,
|
| 339 |
+
"confusion_matrix": [
|
| 340 |
+
[
|
| 341 |
+
50144,
|
| 342 |
+
6720
|
| 343 |
+
],
|
| 344 |
+
[
|
| 345 |
+
5,
|
| 346 |
+
93
|
| 347 |
+
]
|
| 348 |
+
],
|
| 349 |
+
"threshold": 0.17
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"precision": 0.014563106796116505,
|
| 353 |
+
"recall": 0.9489795918367347,
|
| 354 |
+
"f1": 0.028685996298581123,
|
| 355 |
+
"roc_auc": 0.9721687370080279,
|
| 356 |
+
"pr_auc": 0.7159122424484009,
|
| 357 |
+
"confusion_matrix": [
|
| 358 |
+
[
|
| 359 |
+
50571,
|
| 360 |
+
6293
|
| 361 |
+
],
|
| 362 |
+
[
|
| 363 |
+
5,
|
| 364 |
+
93
|
| 365 |
+
]
|
| 366 |
+
],
|
| 367 |
+
"threshold": 0.18000000000000002
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"precision": 0.015567458988952126,
|
| 371 |
+
"recall": 0.9489795918367347,
|
| 372 |
+
"f1": 0.030632411067193676,
|
| 373 |
+
"roc_auc": 0.9721687370080279,
|
| 374 |
+
"pr_auc": 0.7159122424484009,
|
| 375 |
+
"confusion_matrix": [
|
| 376 |
+
[
|
| 377 |
+
50983,
|
| 378 |
+
5881
|
| 379 |
+
],
|
| 380 |
+
[
|
| 381 |
+
5,
|
| 382 |
+
93
|
| 383 |
+
]
|
| 384 |
+
],
|
| 385 |
+
"threshold": 0.19
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"precision": 0.016358463726884778,
|
| 389 |
+
"recall": 0.9387755102040817,
|
| 390 |
+
"f1": 0.03215658860538273,
|
| 391 |
+
"roc_auc": 0.9721687370080279,
|
| 392 |
+
"pr_auc": 0.7159122424484009,
|
| 393 |
+
"confusion_matrix": [
|
| 394 |
+
[
|
| 395 |
+
51332,
|
| 396 |
+
5532
|
| 397 |
+
],
|
| 398 |
+
[
|
| 399 |
+
6,
|
| 400 |
+
92
|
| 401 |
+
]
|
| 402 |
+
],
|
| 403 |
+
"threshold": 0.2
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"precision": 0.017355215996981702,
|
| 407 |
+
"recall": 0.9387755102040817,
|
| 408 |
+
"f1": 0.03408038525652899,
|
| 409 |
+
"roc_auc": 0.9721687370080279,
|
| 410 |
+
"pr_auc": 0.7159122424484009,
|
| 411 |
+
"confusion_matrix": [
|
| 412 |
+
[
|
| 413 |
+
51655,
|
| 414 |
+
5209
|
| 415 |
+
],
|
| 416 |
+
[
|
| 417 |
+
6,
|
| 418 |
+
92
|
| 419 |
+
]
|
| 420 |
+
],
|
| 421 |
+
"threshold": 0.21000000000000002
|
| 422 |
+
},
|
| 423 |
+
{
|
| 424 |
+
"precision": 0.018236472945891785,
|
| 425 |
+
"recall": 0.9285714285714286,
|
| 426 |
+
"f1": 0.035770440251572326,
|
| 427 |
+
"roc_auc": 0.9721687370080279,
|
| 428 |
+
"pr_auc": 0.7159122424484009,
|
| 429 |
+
"confusion_matrix": [
|
| 430 |
+
[
|
| 431 |
+
51965,
|
| 432 |
+
4899
|
| 433 |
+
],
|
| 434 |
+
[
|
| 435 |
+
7,
|
| 436 |
+
91
|
| 437 |
+
]
|
| 438 |
+
],
|
| 439 |
+
"threshold": 0.22
|
| 440 |
+
},
|
| 441 |
+
{
|
| 442 |
+
"precision": 0.01904761904761905,
|
| 443 |
+
"recall": 0.9183673469387755,
|
| 444 |
+
"f1": 0.037321169396641096,
|
| 445 |
+
"roc_auc": 0.9721687370080279,
|
| 446 |
+
"pr_auc": 0.7159122424484009,
|
| 447 |
+
"confusion_matrix": [
|
| 448 |
+
[
|
| 449 |
+
52229,
|
| 450 |
+
4635
|
| 451 |
+
],
|
| 452 |
+
[
|
| 453 |
+
8,
|
| 454 |
+
90
|
| 455 |
+
]
|
| 456 |
+
],
|
| 457 |
+
"threshold": 0.23
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"precision": 0.020049008687903765,
|
| 461 |
+
"recall": 0.9183673469387755,
|
| 462 |
+
"f1": 0.03924133420536298,
|
| 463 |
+
"roc_auc": 0.9721687370080279,
|
| 464 |
+
"pr_auc": 0.7159122424484009,
|
| 465 |
+
"confusion_matrix": [
|
| 466 |
+
[
|
| 467 |
+
52465,
|
| 468 |
+
4399
|
| 469 |
+
],
|
| 470 |
+
[
|
| 471 |
+
8,
|
| 472 |
+
90
|
| 473 |
+
]
|
| 474 |
+
],
|
| 475 |
+
"threshold": 0.24000000000000002
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"precision": 0.021216407355021217,
|
| 479 |
+
"recall": 0.9183673469387755,
|
| 480 |
+
"f1": 0.041474654377880185,
|
| 481 |
+
"roc_auc": 0.9721687370080279,
|
| 482 |
+
"pr_auc": 0.7159122424484009,
|
| 483 |
+
"confusion_matrix": [
|
| 484 |
+
[
|
| 485 |
+
52712,
|
| 486 |
+
4152
|
| 487 |
+
],
|
| 488 |
+
[
|
| 489 |
+
8,
|
| 490 |
+
90
|
| 491 |
+
]
|
| 492 |
+
],
|
| 493 |
+
"threshold": 0.25
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"precision": 0.0224159402241594,
|
| 497 |
+
"recall": 0.9183673469387755,
|
| 498 |
+
"f1": 0.0437636761487965,
|
| 499 |
+
"roc_auc": 0.9721687370080279,
|
| 500 |
+
"pr_auc": 0.7159122424484009,
|
| 501 |
+
"confusion_matrix": [
|
| 502 |
+
[
|
| 503 |
+
52939,
|
| 504 |
+
3925
|
| 505 |
+
],
|
| 506 |
+
[
|
| 507 |
+
8,
|
| 508 |
+
90
|
| 509 |
+
]
|
| 510 |
+
],
|
| 511 |
+
"threshold": 0.26
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"precision": 0.023578726748755566,
|
| 515 |
+
"recall": 0.9183673469387755,
|
| 516 |
+
"f1": 0.04597701149425287,
|
| 517 |
+
"roc_auc": 0.9721687370080279,
|
| 518 |
+
"pr_auc": 0.7159122424484009,
|
| 519 |
+
"confusion_matrix": [
|
| 520 |
+
[
|
| 521 |
+
53137,
|
| 522 |
+
3727
|
| 523 |
+
],
|
| 524 |
+
[
|
| 525 |
+
8,
|
| 526 |
+
90
|
| 527 |
+
]
|
| 528 |
+
],
|
| 529 |
+
"threshold": 0.27
|
| 530 |
+
},
|
| 531 |
+
{
|
| 532 |
+
"precision": 0.024725274725274724,
|
| 533 |
+
"recall": 0.9183673469387755,
|
| 534 |
+
"f1": 0.048154093097913325,
|
| 535 |
+
"roc_auc": 0.9721687370080279,
|
| 536 |
+
"pr_auc": 0.7159122424484009,
|
| 537 |
+
"confusion_matrix": [
|
| 538 |
+
[
|
| 539 |
+
53314,
|
| 540 |
+
3550
|
| 541 |
+
],
|
| 542 |
+
[
|
| 543 |
+
8,
|
| 544 |
+
90
|
| 545 |
+
]
|
| 546 |
+
],
|
| 547 |
+
"threshold": 0.28
|
| 548 |
+
},
|
| 549 |
+
{
|
| 550 |
+
"precision": 0.02601156069364162,
|
| 551 |
+
"recall": 0.9183673469387755,
|
| 552 |
+
"f1": 0.050590219224283306,
|
| 553 |
+
"roc_auc": 0.9721687370080279,
|
| 554 |
+
"pr_auc": 0.7159122424484009,
|
| 555 |
+
"confusion_matrix": [
|
| 556 |
+
[
|
| 557 |
+
53494,
|
| 558 |
+
3370
|
| 559 |
+
],
|
| 560 |
+
[
|
| 561 |
+
8,
|
| 562 |
+
90
|
| 563 |
+
]
|
| 564 |
+
],
|
| 565 |
+
"threshold": 0.29000000000000004
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"precision": 0.0272975432211101,
|
| 569 |
+
"recall": 0.9183673469387755,
|
| 570 |
+
"f1": 0.053019145802650956,
|
| 571 |
+
"roc_auc": 0.9721687370080279,
|
| 572 |
+
"pr_auc": 0.7159122424484009,
|
| 573 |
+
"confusion_matrix": [
|
| 574 |
+
[
|
| 575 |
+
53657,
|
| 576 |
+
3207
|
| 577 |
+
],
|
| 578 |
+
[
|
| 579 |
+
8,
|
| 580 |
+
90
|
| 581 |
+
]
|
| 582 |
+
],
|
| 583 |
+
"threshold": 0.3
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"precision": 0.028598665395614873,
|
| 587 |
+
"recall": 0.9183673469387755,
|
| 588 |
+
"f1": 0.05546995377503852,
|
| 589 |
+
"roc_auc": 0.9721687370080279,
|
| 590 |
+
"pr_auc": 0.7159122424484009,
|
| 591 |
+
"confusion_matrix": [
|
| 592 |
+
[
|
| 593 |
+
53807,
|
| 594 |
+
3057
|
| 595 |
+
],
|
| 596 |
+
[
|
| 597 |
+
8,
|
| 598 |
+
90
|
| 599 |
+
]
|
| 600 |
+
],
|
| 601 |
+
"threshold": 0.31
|
| 602 |
+
},
|
| 603 |
+
{
|
| 604 |
+
"precision": 0.030010003334444816,
|
| 605 |
+
"recall": 0.9183673469387755,
|
| 606 |
+
"f1": 0.05812076202776881,
|
| 607 |
+
"roc_auc": 0.9721687370080279,
|
| 608 |
+
"pr_auc": 0.7159122424484009,
|
| 609 |
+
"confusion_matrix": [
|
| 610 |
+
[
|
| 611 |
+
53955,
|
| 612 |
+
2909
|
| 613 |
+
],
|
| 614 |
+
[
|
| 615 |
+
8,
|
| 616 |
+
90
|
| 617 |
+
]
|
| 618 |
+
],
|
| 619 |
+
"threshold": 0.32
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"precision": 0.031315240083507306,
|
| 623 |
+
"recall": 0.9183673469387755,
|
| 624 |
+
"f1": 0.06056527590847914,
|
| 625 |
+
"roc_auc": 0.9721687370080279,
|
| 626 |
+
"pr_auc": 0.7159122424484009,
|
| 627 |
+
"confusion_matrix": [
|
| 628 |
+
[
|
| 629 |
+
54080,
|
| 630 |
+
2784
|
| 631 |
+
],
|
| 632 |
+
[
|
| 633 |
+
8,
|
| 634 |
+
90
|
| 635 |
+
]
|
| 636 |
+
],
|
| 637 |
+
"threshold": 0.33
|
| 638 |
+
},
|
| 639 |
+
{
|
| 640 |
+
"precision": 0.03278688524590164,
|
| 641 |
+
"recall": 0.9183673469387755,
|
| 642 |
+
"f1": 0.06331340133661625,
|
| 643 |
+
"roc_auc": 0.9721687370080279,
|
| 644 |
+
"pr_auc": 0.7159122424484009,
|
| 645 |
+
"confusion_matrix": [
|
| 646 |
+
[
|
| 647 |
+
54209,
|
| 648 |
+
2655
|
| 649 |
+
],
|
| 650 |
+
[
|
| 651 |
+
8,
|
| 652 |
+
90
|
| 653 |
+
]
|
| 654 |
+
],
|
| 655 |
+
"threshold": 0.34
|
| 656 |
+
},
|
| 657 |
+
{
|
| 658 |
+
"precision": 0.03425961172440046,
|
| 659 |
+
"recall": 0.9183673469387755,
|
| 660 |
+
"f1": 0.06605504587155964,
|
| 661 |
+
"roc_auc": 0.9721687370080279,
|
| 662 |
+
"pr_auc": 0.7159122424484009,
|
| 663 |
+
"confusion_matrix": [
|
| 664 |
+
[
|
| 665 |
+
54327,
|
| 666 |
+
2537
|
| 667 |
+
],
|
| 668 |
+
[
|
| 669 |
+
8,
|
| 670 |
+
90
|
| 671 |
+
]
|
| 672 |
+
],
|
| 673 |
+
"threshold": 0.35000000000000003
|
| 674 |
+
},
|
| 675 |
+
{
|
| 676 |
+
"precision": 0.03587086488640893,
|
| 677 |
+
"recall": 0.9183673469387755,
|
| 678 |
+
"f1": 0.06904487917146145,
|
| 679 |
+
"roc_auc": 0.9721687370080279,
|
| 680 |
+
"pr_auc": 0.7159122424484009,
|
| 681 |
+
"confusion_matrix": [
|
| 682 |
+
[
|
| 683 |
+
54445,
|
| 684 |
+
2419
|
| 685 |
+
],
|
| 686 |
+
[
|
| 687 |
+
8,
|
| 688 |
+
90
|
| 689 |
+
]
|
| 690 |
+
],
|
| 691 |
+
"threshold": 0.36000000000000004
|
| 692 |
+
},
|
| 693 |
+
{
|
| 694 |
+
"precision": 0.037282518641259324,
|
| 695 |
+
"recall": 0.9183673469387755,
|
| 696 |
+
"f1": 0.07165605095541401,
|
| 697 |
+
"roc_auc": 0.9721687370080279,
|
| 698 |
+
"pr_auc": 0.7159122424484009,
|
| 699 |
+
"confusion_matrix": [
|
| 700 |
+
[
|
| 701 |
+
54540,
|
| 702 |
+
2324
|
| 703 |
+
],
|
| 704 |
+
[
|
| 705 |
+
8,
|
| 706 |
+
90
|
| 707 |
+
]
|
| 708 |
+
],
|
| 709 |
+
"threshold": 0.37
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"precision": 0.038860103626943004,
|
| 713 |
+
"recall": 0.9183673469387755,
|
| 714 |
+
"f1": 0.07456503728251865,
|
| 715 |
+
"roc_auc": 0.9721687370080279,
|
| 716 |
+
"pr_auc": 0.7159122424484009,
|
| 717 |
+
"confusion_matrix": [
|
| 718 |
+
[
|
| 719 |
+
54638,
|
| 720 |
+
2226
|
| 721 |
+
],
|
| 722 |
+
[
|
| 723 |
+
8,
|
| 724 |
+
90
|
| 725 |
+
]
|
| 726 |
+
],
|
| 727 |
+
"threshold": 0.38
|
| 728 |
+
},
|
| 729 |
+
{
|
| 730 |
+
"precision": 0.04025044722719141,
|
| 731 |
+
"recall": 0.9183673469387755,
|
| 732 |
+
"f1": 0.07712082262210797,
|
| 733 |
+
"roc_auc": 0.9721687370080279,
|
| 734 |
+
"pr_auc": 0.7159122424484009,
|
| 735 |
+
"confusion_matrix": [
|
| 736 |
+
[
|
| 737 |
+
54718,
|
| 738 |
+
2146
|
| 739 |
+
],
|
| 740 |
+
[
|
| 741 |
+
8,
|
| 742 |
+
90
|
| 743 |
+
]
|
| 744 |
+
],
|
| 745 |
+
"threshold": 0.39
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"precision": 0.04205607476635514,
|
| 749 |
+
"recall": 0.9183673469387755,
|
| 750 |
+
"f1": 0.08042895442359249,
|
| 751 |
+
"roc_auc": 0.9721687370080279,
|
| 752 |
+
"pr_auc": 0.7159122424484009,
|
| 753 |
+
"confusion_matrix": [
|
| 754 |
+
[
|
| 755 |
+
54814,
|
| 756 |
+
2050
|
| 757 |
+
],
|
| 758 |
+
[
|
| 759 |
+
8,
|
| 760 |
+
90
|
| 761 |
+
]
|
| 762 |
+
],
|
| 763 |
+
"threshold": 0.4
|
| 764 |
+
},
|
| 765 |
+
{
|
| 766 |
+
"precision": 0.043923865300146414,
|
| 767 |
+
"recall": 0.9183673469387755,
|
| 768 |
+
"f1": 0.08383791336748952,
|
| 769 |
+
"roc_auc": 0.9721687370080279,
|
| 770 |
+
"pr_auc": 0.7159122424484009,
|
| 771 |
+
"confusion_matrix": [
|
| 772 |
+
[
|
| 773 |
+
54905,
|
| 774 |
+
1959
|
| 775 |
+
],
|
| 776 |
+
[
|
| 777 |
+
8,
|
| 778 |
+
90
|
| 779 |
+
]
|
| 780 |
+
],
|
| 781 |
+
"threshold": 0.41000000000000003
|
| 782 |
+
},
|
| 783 |
+
{
|
| 784 |
+
"precision": 0.045754956786985254,
|
| 785 |
+
"recall": 0.9183673469387755,
|
| 786 |
+
"f1": 0.08716707021791767,
|
| 787 |
+
"roc_auc": 0.9721687370080279,
|
| 788 |
+
"pr_auc": 0.7159122424484009,
|
| 789 |
+
"confusion_matrix": [
|
| 790 |
+
[
|
| 791 |
+
54987,
|
| 792 |
+
1877
|
| 793 |
+
],
|
| 794 |
+
[
|
| 795 |
+
8,
|
| 796 |
+
90
|
| 797 |
+
]
|
| 798 |
+
],
|
| 799 |
+
"threshold": 0.42000000000000004
|
| 800 |
+
},
|
| 801 |
+
{
|
| 802 |
+
"precision": 0.04736842105263158,
|
| 803 |
+
"recall": 0.9183673469387755,
|
| 804 |
+
"f1": 0.09009009009009009,
|
| 805 |
+
"roc_auc": 0.9721687370080279,
|
| 806 |
+
"pr_auc": 0.7159122424484009,
|
| 807 |
+
"confusion_matrix": [
|
| 808 |
+
[
|
| 809 |
+
55054,
|
| 810 |
+
1810
|
| 811 |
+
],
|
| 812 |
+
[
|
| 813 |
+
8,
|
| 814 |
+
90
|
| 815 |
+
]
|
| 816 |
+
],
|
| 817 |
+
"threshold": 0.43
|
| 818 |
+
},
|
| 819 |
+
{
|
| 820 |
+
"precision": 0.049099836333878884,
|
| 821 |
+
"recall": 0.9183673469387755,
|
| 822 |
+
"f1": 0.09321595028482652,
|
| 823 |
+
"roc_auc": 0.9721687370080279,
|
| 824 |
+
"pr_auc": 0.7159122424484009,
|
| 825 |
+
"confusion_matrix": [
|
| 826 |
+
[
|
| 827 |
+
55121,
|
| 828 |
+
1743
|
| 829 |
+
],
|
| 830 |
+
[
|
| 831 |
+
8,
|
| 832 |
+
90
|
| 833 |
+
]
|
| 834 |
+
],
|
| 835 |
+
"threshold": 0.44
|
| 836 |
+
},
|
| 837 |
+
{
|
| 838 |
+
"precision": 0.050818746470920384,
|
| 839 |
+
"recall": 0.9183673469387755,
|
| 840 |
+
"f1": 0.09630818619582665,
|
| 841 |
+
"roc_auc": 0.9721687370080279,
|
| 842 |
+
"pr_auc": 0.7159122424484009,
|
| 843 |
+
"confusion_matrix": [
|
| 844 |
+
[
|
| 845 |
+
55183,
|
| 846 |
+
1681
|
| 847 |
+
],
|
| 848 |
+
[
|
| 849 |
+
8,
|
| 850 |
+
90
|
| 851 |
+
]
|
| 852 |
+
],
|
| 853 |
+
"threshold": 0.45
|
| 854 |
+
},
|
| 855 |
+
{
|
| 856 |
+
"precision": 0.052508751458576426,
|
| 857 |
+
"recall": 0.9183673469387755,
|
| 858 |
+
"f1": 0.09933774834437085,
|
| 859 |
+
"roc_auc": 0.9721687370080279,
|
| 860 |
+
"pr_auc": 0.7159122424484009,
|
| 861 |
+
"confusion_matrix": [
|
| 862 |
+
[
|
| 863 |
+
55240,
|
| 864 |
+
1624
|
| 865 |
+
],
|
| 866 |
+
[
|
| 867 |
+
8,
|
| 868 |
+
90
|
| 869 |
+
]
|
| 870 |
+
],
|
| 871 |
+
"threshold": 0.46
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"precision": 0.054678007290400975,
|
| 875 |
+
"recall": 0.9183673469387755,
|
| 876 |
+
"f1": 0.10321100917431193,
|
| 877 |
+
"roc_auc": 0.9721687370080279,
|
| 878 |
+
"pr_auc": 0.7159122424484009,
|
| 879 |
+
"confusion_matrix": [
|
| 880 |
+
[
|
| 881 |
+
55308,
|
| 882 |
+
1556
|
| 883 |
+
],
|
| 884 |
+
[
|
| 885 |
+
8,
|
| 886 |
+
90
|
| 887 |
+
]
|
| 888 |
+
],
|
| 889 |
+
"threshold": 0.47000000000000003
|
| 890 |
+
},
|
| 891 |
+
{
|
| 892 |
+
"precision": 0.056568196103079824,
|
| 893 |
+
"recall": 0.9183673469387755,
|
| 894 |
+
"f1": 0.10657193605683836,
|
| 895 |
+
"roc_auc": 0.9721687370080279,
|
| 896 |
+
"pr_auc": 0.7159122424484009,
|
| 897 |
+
"confusion_matrix": [
|
| 898 |
+
[
|
| 899 |
+
55363,
|
| 900 |
+
1501
|
| 901 |
+
],
|
| 902 |
+
[
|
| 903 |
+
8,
|
| 904 |
+
90
|
| 905 |
+
]
|
| 906 |
+
],
|
| 907 |
+
"threshold": 0.48000000000000004
|
| 908 |
+
},
|
| 909 |
+
{
|
| 910 |
+
"precision": 0.05870841487279843,
|
| 911 |
+
"recall": 0.9183673469387755,
|
| 912 |
+
"f1": 0.11036174126302882,
|
| 913 |
+
"roc_auc": 0.9721687370080279,
|
| 914 |
+
"pr_auc": 0.7159122424484009,
|
| 915 |
+
"confusion_matrix": [
|
| 916 |
+
[
|
| 917 |
+
55421,
|
| 918 |
+
1443
|
| 919 |
+
],
|
| 920 |
+
[
|
| 921 |
+
8,
|
| 922 |
+
90
|
| 923 |
+
]
|
| 924 |
+
],
|
| 925 |
+
"threshold": 0.49
|
| 926 |
+
},
|
| 927 |
+
{
|
| 928 |
+
"precision": 0.06097560975609756,
|
| 929 |
+
"recall": 0.9183673469387755,
|
| 930 |
+
"f1": 0.11435832274459974,
|
| 931 |
+
"roc_auc": 0.9721687370080279,
|
| 932 |
+
"pr_auc": 0.7159122424484009,
|
| 933 |
+
"confusion_matrix": [
|
| 934 |
+
[
|
| 935 |
+
55478,
|
| 936 |
+
1386
|
| 937 |
+
],
|
| 938 |
+
[
|
| 939 |
+
8,
|
| 940 |
+
90
|
| 941 |
+
]
|
| 942 |
+
],
|
| 943 |
+
"threshold": 0.5
|
| 944 |
+
},
|
| 945 |
+
{
|
| 946 |
+
"precision": 0.06382978723404255,
|
| 947 |
+
"recall": 0.9183673469387755,
|
| 948 |
+
"f1": 0.11936339522546419,
|
| 949 |
+
"roc_auc": 0.9721687370080279,
|
| 950 |
+
"pr_auc": 0.7159122424484009,
|
| 951 |
+
"confusion_matrix": [
|
| 952 |
+
[
|
| 953 |
+
55544,
|
| 954 |
+
1320
|
| 955 |
+
],
|
| 956 |
+
[
|
| 957 |
+
8,
|
| 958 |
+
90
|
| 959 |
+
]
|
| 960 |
+
],
|
| 961 |
+
"threshold": 0.51
|
| 962 |
+
},
|
| 963 |
+
{
|
| 964 |
+
"precision": 0.06642066420664207,
|
| 965 |
+
"recall": 0.9183673469387755,
|
| 966 |
+
"f1": 0.12388162422573985,
|
| 967 |
+
"roc_auc": 0.9721687370080279,
|
| 968 |
+
"pr_auc": 0.7159122424484009,
|
| 969 |
+
"confusion_matrix": [
|
| 970 |
+
[
|
| 971 |
+
55599,
|
| 972 |
+
1265
|
| 973 |
+
],
|
| 974 |
+
[
|
| 975 |
+
8,
|
| 976 |
+
90
|
| 977 |
+
]
|
| 978 |
+
],
|
| 979 |
+
"threshold": 0.52
|
| 980 |
+
},
|
| 981 |
+
{
|
| 982 |
+
"precision": 0.06813020439061317,
|
| 983 |
+
"recall": 0.9183673469387755,
|
| 984 |
+
"f1": 0.12684989429175475,
|
| 985 |
+
"roc_auc": 0.9721687370080279,
|
| 986 |
+
"pr_auc": 0.7159122424484009,
|
| 987 |
+
"confusion_matrix": [
|
| 988 |
+
[
|
| 989 |
+
55633,
|
| 990 |
+
1231
|
| 991 |
+
],
|
| 992 |
+
[
|
| 993 |
+
8,
|
| 994 |
+
90
|
| 995 |
+
]
|
| 996 |
+
],
|
| 997 |
+
"threshold": 0.53
|
| 998 |
+
},
|
| 999 |
+
{
|
| 1000 |
+
"precision": 0.0706436420722135,
|
| 1001 |
+
"recall": 0.9183673469387755,
|
| 1002 |
+
"f1": 0.13119533527696792,
|
| 1003 |
+
"roc_auc": 0.9721687370080279,
|
| 1004 |
+
"pr_auc": 0.7159122424484009,
|
| 1005 |
+
"confusion_matrix": [
|
| 1006 |
+
[
|
| 1007 |
+
55680,
|
| 1008 |
+
1184
|
| 1009 |
+
],
|
| 1010 |
+
[
|
| 1011 |
+
8,
|
| 1012 |
+
90
|
| 1013 |
+
]
|
| 1014 |
+
],
|
| 1015 |
+
"threshold": 0.54
|
| 1016 |
+
},
|
| 1017 |
+
{
|
| 1018 |
+
"precision": 0.07317073170731707,
|
| 1019 |
+
"recall": 0.9183673469387755,
|
| 1020 |
+
"f1": 0.1355421686746988,
|
| 1021 |
+
"roc_auc": 0.9721687370080279,
|
| 1022 |
+
"pr_auc": 0.7159122424484009,
|
| 1023 |
+
"confusion_matrix": [
|
| 1024 |
+
[
|
| 1025 |
+
55724,
|
| 1026 |
+
1140
|
| 1027 |
+
],
|
| 1028 |
+
[
|
| 1029 |
+
8,
|
| 1030 |
+
90
|
| 1031 |
+
]
|
| 1032 |
+
],
|
| 1033 |
+
"threshold": 0.55
|
| 1034 |
+
},
|
| 1035 |
+
{
|
| 1036 |
+
"precision": 0.0760777683854607,
|
| 1037 |
+
"recall": 0.9183673469387755,
|
| 1038 |
+
"f1": 0.1405152224824356,
|
| 1039 |
+
"roc_auc": 0.9721687370080279,
|
| 1040 |
+
"pr_auc": 0.7159122424484009,
|
| 1041 |
+
"confusion_matrix": [
|
| 1042 |
+
[
|
| 1043 |
+
55771,
|
| 1044 |
+
1093
|
| 1045 |
+
],
|
| 1046 |
+
[
|
| 1047 |
+
8,
|
| 1048 |
+
90
|
| 1049 |
+
]
|
| 1050 |
+
],
|
| 1051 |
+
"threshold": 0.56
|
| 1052 |
+
},
|
| 1053 |
+
{
|
| 1054 |
+
"precision": 0.07853403141361257,
|
| 1055 |
+
"recall": 0.9183673469387755,
|
| 1056 |
+
"f1": 0.14469453376205788,
|
| 1057 |
+
"roc_auc": 0.9721687370080279,
|
| 1058 |
+
"pr_auc": 0.7159122424484009,
|
| 1059 |
+
"confusion_matrix": [
|
| 1060 |
+
[
|
| 1061 |
+
55808,
|
| 1062 |
+
1056
|
| 1063 |
+
],
|
| 1064 |
+
[
|
| 1065 |
+
8,
|
| 1066 |
+
90
|
| 1067 |
+
]
|
| 1068 |
+
],
|
| 1069 |
+
"threshold": 0.5700000000000001
|
| 1070 |
+
},
|
| 1071 |
+
{
|
| 1072 |
+
"precision": 0.0820419325432999,
|
| 1073 |
+
"recall": 0.9183673469387755,
|
| 1074 |
+
"f1": 0.1506276150627615,
|
| 1075 |
+
"roc_auc": 0.9721687370080279,
|
| 1076 |
+
"pr_auc": 0.7159122424484009,
|
| 1077 |
+
"confusion_matrix": [
|
| 1078 |
+
[
|
| 1079 |
+
55857,
|
| 1080 |
+
1007
|
| 1081 |
+
],
|
| 1082 |
+
[
|
| 1083 |
+
8,
|
| 1084 |
+
90
|
| 1085 |
+
]
|
| 1086 |
+
],
|
| 1087 |
+
"threshold": 0.5800000000000001
|
| 1088 |
+
},
|
| 1089 |
+
{
|
| 1090 |
+
"precision": 0.08458646616541353,
|
| 1091 |
+
"recall": 0.9183673469387755,
|
| 1092 |
+
"f1": 0.1549053356282272,
|
| 1093 |
+
"roc_auc": 0.9721687370080279,
|
| 1094 |
+
"pr_auc": 0.7159122424484009,
|
| 1095 |
+
"confusion_matrix": [
|
| 1096 |
+
[
|
| 1097 |
+
55890,
|
| 1098 |
+
974
|
| 1099 |
+
],
|
| 1100 |
+
[
|
| 1101 |
+
8,
|
| 1102 |
+
90
|
| 1103 |
+
]
|
| 1104 |
+
],
|
| 1105 |
+
"threshold": 0.59
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"precision": 0.0866601752677702,
|
| 1109 |
+
"recall": 0.9081632653061225,
|
| 1110 |
+
"f1": 0.1582222222222222,
|
| 1111 |
+
"roc_auc": 0.9721687370080279,
|
| 1112 |
+
"pr_auc": 0.7159122424484009,
|
| 1113 |
+
"confusion_matrix": [
|
| 1114 |
+
[
|
| 1115 |
+
55926,
|
| 1116 |
+
938
|
| 1117 |
+
],
|
| 1118 |
+
[
|
| 1119 |
+
9,
|
| 1120 |
+
89
|
| 1121 |
+
]
|
| 1122 |
+
],
|
| 1123 |
+
"threshold": 0.6
|
| 1124 |
+
},
|
| 1125 |
+
{
|
| 1126 |
+
"precision": 0.09035532994923857,
|
| 1127 |
+
"recall": 0.9081632653061225,
|
| 1128 |
+
"f1": 0.16435826408125578,
|
| 1129 |
+
"roc_auc": 0.9721687370080279,
|
| 1130 |
+
"pr_auc": 0.7159122424484009,
|
| 1131 |
+
"confusion_matrix": [
|
| 1132 |
+
[
|
| 1133 |
+
55968,
|
| 1134 |
+
896
|
| 1135 |
+
],
|
| 1136 |
+
[
|
| 1137 |
+
9,
|
| 1138 |
+
89
|
| 1139 |
+
]
|
| 1140 |
+
],
|
| 1141 |
+
"threshold": 0.61
|
| 1142 |
+
},
|
| 1143 |
+
{
|
| 1144 |
+
"precision": 0.09290187891440502,
|
| 1145 |
+
"recall": 0.9081632653061225,
|
| 1146 |
+
"f1": 0.16856060606060605,
|
| 1147 |
+
"roc_auc": 0.9721687370080279,
|
| 1148 |
+
"pr_auc": 0.7159122424484009,
|
| 1149 |
+
"confusion_matrix": [
|
| 1150 |
+
[
|
| 1151 |
+
55995,
|
| 1152 |
+
869
|
| 1153 |
+
],
|
| 1154 |
+
[
|
| 1155 |
+
9,
|
| 1156 |
+
89
|
| 1157 |
+
]
|
| 1158 |
+
],
|
| 1159 |
+
"threshold": 0.62
|
| 1160 |
+
},
|
| 1161 |
+
{
|
| 1162 |
+
"precision": 0.09611231101511879,
|
| 1163 |
+
"recall": 0.9081632653061225,
|
| 1164 |
+
"f1": 0.173828125,
|
| 1165 |
+
"roc_auc": 0.9721687370080279,
|
| 1166 |
+
"pr_auc": 0.7159122424484009,
|
| 1167 |
+
"confusion_matrix": [
|
| 1168 |
+
[
|
| 1169 |
+
56027,
|
| 1170 |
+
837
|
| 1171 |
+
],
|
| 1172 |
+
[
|
| 1173 |
+
9,
|
| 1174 |
+
89
|
| 1175 |
+
]
|
| 1176 |
+
],
|
| 1177 |
+
"threshold": 0.63
|
| 1178 |
+
},
|
| 1179 |
+
{
|
| 1180 |
+
"precision": 0.09866962305986696,
|
| 1181 |
+
"recall": 0.9081632653061225,
|
| 1182 |
+
"f1": 0.178,
|
| 1183 |
+
"roc_auc": 0.9721687370080279,
|
| 1184 |
+
"pr_auc": 0.7159122424484009,
|
| 1185 |
+
"confusion_matrix": [
|
| 1186 |
+
[
|
| 1187 |
+
56051,
|
| 1188 |
+
813
|
| 1189 |
+
],
|
| 1190 |
+
[
|
| 1191 |
+
9,
|
| 1192 |
+
89
|
| 1193 |
+
]
|
| 1194 |
+
],
|
| 1195 |
+
"threshold": 0.64
|
| 1196 |
+
},
|
| 1197 |
+
{
|
| 1198 |
+
"precision": 0.10194730813287514,
|
| 1199 |
+
"recall": 0.9081632653061225,
|
| 1200 |
+
"f1": 0.18331616889804325,
|
| 1201 |
+
"roc_auc": 0.9721687370080279,
|
| 1202 |
+
"pr_auc": 0.7159122424484009,
|
| 1203 |
+
"confusion_matrix": [
|
| 1204 |
+
[
|
| 1205 |
+
56080,
|
| 1206 |
+
784
|
| 1207 |
+
],
|
| 1208 |
+
[
|
| 1209 |
+
9,
|
| 1210 |
+
89
|
| 1211 |
+
]
|
| 1212 |
+
],
|
| 1213 |
+
"threshold": 0.65
|
| 1214 |
+
},
|
| 1215 |
+
{
|
| 1216 |
+
"precision": 0.10620525059665871,
|
| 1217 |
+
"recall": 0.9081632653061225,
|
| 1218 |
+
"f1": 0.19017094017094016,
|
| 1219 |
+
"roc_auc": 0.9721687370080279,
|
| 1220 |
+
"pr_auc": 0.7159122424484009,
|
| 1221 |
+
"confusion_matrix": [
|
| 1222 |
+
[
|
| 1223 |
+
56115,
|
| 1224 |
+
749
|
| 1225 |
+
],
|
| 1226 |
+
[
|
| 1227 |
+
9,
|
| 1228 |
+
89
|
| 1229 |
+
]
|
| 1230 |
+
],
|
| 1231 |
+
"threshold": 0.66
|
| 1232 |
+
},
|
| 1233 |
+
{
|
| 1234 |
+
"precision": 0.11014851485148515,
|
| 1235 |
+
"recall": 0.9081632653061225,
|
| 1236 |
+
"f1": 0.19646799116997793,
|
| 1237 |
+
"roc_auc": 0.9721687370080279,
|
| 1238 |
+
"pr_auc": 0.7159122424484009,
|
| 1239 |
+
"confusion_matrix": [
|
| 1240 |
+
[
|
| 1241 |
+
56145,
|
| 1242 |
+
719
|
| 1243 |
+
],
|
| 1244 |
+
[
|
| 1245 |
+
9,
|
| 1246 |
+
89
|
| 1247 |
+
]
|
| 1248 |
+
],
|
| 1249 |
+
"threshold": 0.67
|
| 1250 |
+
},
|
| 1251 |
+
{
|
| 1252 |
+
"precision": 0.11424903722721438,
|
| 1253 |
+
"recall": 0.9081632653061225,
|
| 1254 |
+
"f1": 0.20296465222348917,
|
| 1255 |
+
"roc_auc": 0.9721687370080279,
|
| 1256 |
+
"pr_auc": 0.7159122424484009,
|
| 1257 |
+
"confusion_matrix": [
|
| 1258 |
+
[
|
| 1259 |
+
56174,
|
| 1260 |
+
690
|
| 1261 |
+
],
|
| 1262 |
+
[
|
| 1263 |
+
9,
|
| 1264 |
+
89
|
| 1265 |
+
]
|
| 1266 |
+
],
|
| 1267 |
+
"threshold": 0.68
|
| 1268 |
+
},
|
| 1269 |
+
{
|
| 1270 |
+
"precision": 0.11772486772486772,
|
| 1271 |
+
"recall": 0.9081632653061225,
|
| 1272 |
+
"f1": 0.20843091334894615,
|
| 1273 |
+
"roc_auc": 0.9721687370080279,
|
| 1274 |
+
"pr_auc": 0.7159122424484009,
|
| 1275 |
+
"confusion_matrix": [
|
| 1276 |
+
[
|
| 1277 |
+
56197,
|
| 1278 |
+
667
|
| 1279 |
+
],
|
| 1280 |
+
[
|
| 1281 |
+
9,
|
| 1282 |
+
89
|
| 1283 |
+
]
|
| 1284 |
+
],
|
| 1285 |
+
"threshold": 0.6900000000000001
|
| 1286 |
+
},
|
| 1287 |
+
{
|
| 1288 |
+
"precision": 0.12141882673942701,
|
| 1289 |
+
"recall": 0.9081632653061225,
|
| 1290 |
+
"f1": 0.21419975932611313,
|
| 1291 |
+
"roc_auc": 0.9721687370080279,
|
| 1292 |
+
"pr_auc": 0.7159122424484009,
|
| 1293 |
+
"confusion_matrix": [
|
| 1294 |
+
[
|
| 1295 |
+
56220,
|
| 1296 |
+
644
|
| 1297 |
+
],
|
| 1298 |
+
[
|
| 1299 |
+
9,
|
| 1300 |
+
89
|
| 1301 |
+
]
|
| 1302 |
+
],
|
| 1303 |
+
"threshold": 0.7000000000000001
|
| 1304 |
+
},
|
| 1305 |
+
{
|
| 1306 |
+
"precision": 0.12588401697312587,
|
| 1307 |
+
"recall": 0.9081632653061225,
|
| 1308 |
+
"f1": 0.22111801242236026,
|
| 1309 |
+
"roc_auc": 0.9721687370080279,
|
| 1310 |
+
"pr_auc": 0.7159122424484009,
|
| 1311 |
+
"confusion_matrix": [
|
| 1312 |
+
[
|
| 1313 |
+
56246,
|
| 1314 |
+
618
|
| 1315 |
+
],
|
| 1316 |
+
[
|
| 1317 |
+
9,
|
| 1318 |
+
89
|
| 1319 |
+
]
|
| 1320 |
+
],
|
| 1321 |
+
"threshold": 0.7100000000000001
|
| 1322 |
+
},
|
| 1323 |
+
{
|
| 1324 |
+
"precision": 0.12936046511627908,
|
| 1325 |
+
"recall": 0.9081632653061225,
|
| 1326 |
+
"f1": 0.22646310432569974,
|
| 1327 |
+
"roc_auc": 0.9721687370080279,
|
| 1328 |
+
"pr_auc": 0.7159122424484009,
|
| 1329 |
+
"confusion_matrix": [
|
| 1330 |
+
[
|
| 1331 |
+
56265,
|
| 1332 |
+
599
|
| 1333 |
+
],
|
| 1334 |
+
[
|
| 1335 |
+
9,
|
| 1336 |
+
89
|
| 1337 |
+
]
|
| 1338 |
+
],
|
| 1339 |
+
"threshold": 0.72
|
| 1340 |
+
},
|
| 1341 |
+
{
|
| 1342 |
+
"precision": 0.13343328335832083,
|
| 1343 |
+
"recall": 0.9081632653061225,
|
| 1344 |
+
"f1": 0.2326797385620915,
|
| 1345 |
+
"roc_auc": 0.9721687370080279,
|
| 1346 |
+
"pr_auc": 0.7159122424484009,
|
| 1347 |
+
"confusion_matrix": [
|
| 1348 |
+
[
|
| 1349 |
+
56286,
|
| 1350 |
+
578
|
| 1351 |
+
],
|
| 1352 |
+
[
|
| 1353 |
+
9,
|
| 1354 |
+
89
|
| 1355 |
+
]
|
| 1356 |
+
],
|
| 1357 |
+
"threshold": 0.73
|
| 1358 |
+
},
|
| 1359 |
+
{
|
| 1360 |
+
"precision": 0.13650306748466257,
|
| 1361 |
+
"recall": 0.9081632653061225,
|
| 1362 |
+
"f1": 0.23733333333333334,
|
| 1363 |
+
"roc_auc": 0.9721687370080279,
|
| 1364 |
+
"pr_auc": 0.7159122424484009,
|
| 1365 |
+
"confusion_matrix": [
|
| 1366 |
+
[
|
| 1367 |
+
56301,
|
| 1368 |
+
563
|
| 1369 |
+
],
|
| 1370 |
+
[
|
| 1371 |
+
9,
|
| 1372 |
+
89
|
| 1373 |
+
]
|
| 1374 |
+
],
|
| 1375 |
+
"threshold": 0.74
|
| 1376 |
+
},
|
| 1377 |
+
{
|
| 1378 |
+
"precision": 0.14012738853503184,
|
| 1379 |
+
"recall": 0.8979591836734694,
|
| 1380 |
+
"f1": 0.24242424242424243,
|
| 1381 |
+
"roc_auc": 0.9721687370080279,
|
| 1382 |
+
"pr_auc": 0.7159122424484009,
|
| 1383 |
+
"confusion_matrix": [
|
| 1384 |
+
[
|
| 1385 |
+
56324,
|
| 1386 |
+
540
|
| 1387 |
+
],
|
| 1388 |
+
[
|
| 1389 |
+
10,
|
| 1390 |
+
88
|
| 1391 |
+
]
|
| 1392 |
+
],
|
| 1393 |
+
"threshold": 0.75
|
| 1394 |
+
},
|
| 1395 |
+
{
|
| 1396 |
+
"precision": 0.14402618657937807,
|
| 1397 |
+
"recall": 0.8979591836734694,
|
| 1398 |
+
"f1": 0.24823695345557123,
|
| 1399 |
+
"roc_auc": 0.9721687370080279,
|
| 1400 |
+
"pr_auc": 0.7159122424484009,
|
| 1401 |
+
"confusion_matrix": [
|
| 1402 |
+
[
|
| 1403 |
+
56341,
|
| 1404 |
+
523
|
| 1405 |
+
],
|
| 1406 |
+
[
|
| 1407 |
+
10,
|
| 1408 |
+
88
|
| 1409 |
+
]
|
| 1410 |
+
],
|
| 1411 |
+
"threshold": 0.76
|
| 1412 |
+
},
|
| 1413 |
+
{
|
| 1414 |
+
"precision": 0.14864864864864866,
|
| 1415 |
+
"recall": 0.8979591836734694,
|
| 1416 |
+
"f1": 0.25507246376811593,
|
| 1417 |
+
"roc_auc": 0.9721687370080279,
|
| 1418 |
+
"pr_auc": 0.7159122424484009,
|
| 1419 |
+
"confusion_matrix": [
|
| 1420 |
+
[
|
| 1421 |
+
56360,
|
| 1422 |
+
504
|
| 1423 |
+
],
|
| 1424 |
+
[
|
| 1425 |
+
10,
|
| 1426 |
+
88
|
| 1427 |
+
]
|
| 1428 |
+
],
|
| 1429 |
+
"threshold": 0.77
|
| 1430 |
+
},
|
| 1431 |
+
{
|
| 1432 |
+
"precision": 0.15198618307426598,
|
| 1433 |
+
"recall": 0.8979591836734694,
|
| 1434 |
+
"f1": 0.25997045790251105,
|
| 1435 |
+
"roc_auc": 0.9721687370080279,
|
| 1436 |
+
"pr_auc": 0.7159122424484009,
|
| 1437 |
+
"confusion_matrix": [
|
| 1438 |
+
[
|
| 1439 |
+
56373,
|
| 1440 |
+
491
|
| 1441 |
+
],
|
| 1442 |
+
[
|
| 1443 |
+
10,
|
| 1444 |
+
88
|
| 1445 |
+
]
|
| 1446 |
+
],
|
| 1447 |
+
"threshold": 0.78
|
| 1448 |
+
},
|
| 1449 |
+
{
|
| 1450 |
+
"precision": 0.15630550621669628,
|
| 1451 |
+
"recall": 0.8979591836734694,
|
| 1452 |
+
"f1": 0.26626323751891073,
|
| 1453 |
+
"roc_auc": 0.9721687370080279,
|
| 1454 |
+
"pr_auc": 0.7159122424484009,
|
| 1455 |
+
"confusion_matrix": [
|
| 1456 |
+
[
|
| 1457 |
+
56389,
|
| 1458 |
+
475
|
| 1459 |
+
],
|
| 1460 |
+
[
|
| 1461 |
+
10,
|
| 1462 |
+
88
|
| 1463 |
+
]
|
| 1464 |
+
],
|
| 1465 |
+
"threshold": 0.79
|
| 1466 |
+
},
|
| 1467 |
+
{
|
| 1468 |
+
"precision": 0.16087751371115175,
|
| 1469 |
+
"recall": 0.8979591836734694,
|
| 1470 |
+
"f1": 0.27286821705426356,
|
| 1471 |
+
"roc_auc": 0.9721687370080279,
|
| 1472 |
+
"pr_auc": 0.7159122424484009,
|
| 1473 |
+
"confusion_matrix": [
|
| 1474 |
+
[
|
| 1475 |
+
56405,
|
| 1476 |
+
459
|
| 1477 |
+
],
|
| 1478 |
+
[
|
| 1479 |
+
10,
|
| 1480 |
+
88
|
| 1481 |
+
]
|
| 1482 |
+
],
|
| 1483 |
+
"threshold": 0.8
|
| 1484 |
+
},
|
| 1485 |
+
{
|
| 1486 |
+
"precision": 0.1638418079096045,
|
| 1487 |
+
"recall": 0.8877551020408163,
|
| 1488 |
+
"f1": 0.2766295707472178,
|
| 1489 |
+
"roc_auc": 0.9721687370080279,
|
| 1490 |
+
"pr_auc": 0.7159122424484009,
|
| 1491 |
+
"confusion_matrix": [
|
| 1492 |
+
[
|
| 1493 |
+
56420,
|
| 1494 |
+
444
|
| 1495 |
+
],
|
| 1496 |
+
[
|
| 1497 |
+
11,
|
| 1498 |
+
87
|
| 1499 |
+
]
|
| 1500 |
+
],
|
| 1501 |
+
"threshold": 0.81
|
| 1502 |
+
},
|
| 1503 |
+
{
|
| 1504 |
+
"precision": 0.17058823529411765,
|
| 1505 |
+
"recall": 0.8877551020408163,
|
| 1506 |
+
"f1": 0.28618421052631576,
|
| 1507 |
+
"roc_auc": 0.9721687370080279,
|
| 1508 |
+
"pr_auc": 0.7159122424484009,
|
| 1509 |
+
"confusion_matrix": [
|
| 1510 |
+
[
|
| 1511 |
+
56441,
|
| 1512 |
+
423
|
| 1513 |
+
],
|
| 1514 |
+
[
|
| 1515 |
+
11,
|
| 1516 |
+
87
|
| 1517 |
+
]
|
| 1518 |
+
],
|
| 1519 |
+
"threshold": 0.8200000000000001
|
| 1520 |
+
},
|
| 1521 |
+
{
|
| 1522 |
+
"precision": 0.174,
|
| 1523 |
+
"recall": 0.8877551020408163,
|
| 1524 |
+
"f1": 0.2909698996655518,
|
| 1525 |
+
"roc_auc": 0.9721687370080279,
|
| 1526 |
+
"pr_auc": 0.7159122424484009,
|
| 1527 |
+
"confusion_matrix": [
|
| 1528 |
+
[
|
| 1529 |
+
56451,
|
| 1530 |
+
413
|
| 1531 |
+
],
|
| 1532 |
+
[
|
| 1533 |
+
11,
|
| 1534 |
+
87
|
| 1535 |
+
]
|
| 1536 |
+
],
|
| 1537 |
+
"threshold": 0.8300000000000001
|
| 1538 |
+
},
|
| 1539 |
+
{
|
| 1540 |
+
"precision": 0.1797520661157025,
|
| 1541 |
+
"recall": 0.8877551020408163,
|
| 1542 |
+
"f1": 0.29896907216494845,
|
| 1543 |
+
"roc_auc": 0.9721687370080279,
|
| 1544 |
+
"pr_auc": 0.7159122424484009,
|
| 1545 |
+
"confusion_matrix": [
|
| 1546 |
+
[
|
| 1547 |
+
56467,
|
| 1548 |
+
397
|
| 1549 |
+
],
|
| 1550 |
+
[
|
| 1551 |
+
11,
|
| 1552 |
+
87
|
| 1553 |
+
]
|
| 1554 |
+
],
|
| 1555 |
+
"threshold": 0.8400000000000001
|
| 1556 |
+
},
|
| 1557 |
+
{
|
| 1558 |
+
"precision": 0.18471337579617833,
|
| 1559 |
+
"recall": 0.8877551020408163,
|
| 1560 |
+
"f1": 0.30579964850615116,
|
| 1561 |
+
"roc_auc": 0.9721687370080279,
|
| 1562 |
+
"pr_auc": 0.7159122424484009,
|
| 1563 |
+
"confusion_matrix": [
|
| 1564 |
+
[
|
| 1565 |
+
56480,
|
| 1566 |
+
384
|
| 1567 |
+
],
|
| 1568 |
+
[
|
| 1569 |
+
11,
|
| 1570 |
+
87
|
| 1571 |
+
]
|
| 1572 |
+
],
|
| 1573 |
+
"threshold": 0.85
|
| 1574 |
+
},
|
| 1575 |
+
{
|
| 1576 |
+
"precision": 0.19506726457399104,
|
| 1577 |
+
"recall": 0.8877551020408163,
|
| 1578 |
+
"f1": 0.31985294117647056,
|
| 1579 |
+
"roc_auc": 0.9721687370080279,
|
| 1580 |
+
"pr_auc": 0.7159122424484009,
|
| 1581 |
+
"confusion_matrix": [
|
| 1582 |
+
[
|
| 1583 |
+
56505,
|
| 1584 |
+
359
|
| 1585 |
+
],
|
| 1586 |
+
[
|
| 1587 |
+
11,
|
| 1588 |
+
87
|
| 1589 |
+
]
|
| 1590 |
+
],
|
| 1591 |
+
"threshold": 0.86
|
| 1592 |
+
},
|
| 1593 |
+
{
|
| 1594 |
+
"precision": 0.20374707259953162,
|
| 1595 |
+
"recall": 0.8877551020408163,
|
| 1596 |
+
"f1": 0.3314285714285714,
|
| 1597 |
+
"roc_auc": 0.9721687370080279,
|
| 1598 |
+
"pr_auc": 0.7159122424484009,
|
| 1599 |
+
"confusion_matrix": [
|
| 1600 |
+
[
|
| 1601 |
+
56524,
|
| 1602 |
+
340
|
| 1603 |
+
],
|
| 1604 |
+
[
|
| 1605 |
+
11,
|
| 1606 |
+
87
|
| 1607 |
+
]
|
| 1608 |
+
],
|
| 1609 |
+
"threshold": 0.87
|
| 1610 |
+
},
|
| 1611 |
+
{
|
| 1612 |
+
"precision": 0.21375921375921375,
|
| 1613 |
+
"recall": 0.8877551020408163,
|
| 1614 |
+
"f1": 0.3445544554455445,
|
| 1615 |
+
"roc_auc": 0.9721687370080279,
|
| 1616 |
+
"pr_auc": 0.7159122424484009,
|
| 1617 |
+
"confusion_matrix": [
|
| 1618 |
+
[
|
| 1619 |
+
56544,
|
| 1620 |
+
320
|
| 1621 |
+
],
|
| 1622 |
+
[
|
| 1623 |
+
11,
|
| 1624 |
+
87
|
| 1625 |
+
]
|
| 1626 |
+
],
|
| 1627 |
+
"threshold": 0.88
|
| 1628 |
+
},
|
| 1629 |
+
{
|
| 1630 |
+
"precision": 0.2265625,
|
| 1631 |
+
"recall": 0.8877551020408163,
|
| 1632 |
+
"f1": 0.36099585062240663,
|
| 1633 |
+
"roc_auc": 0.9721687370080279,
|
| 1634 |
+
"pr_auc": 0.7159122424484009,
|
| 1635 |
+
"confusion_matrix": [
|
| 1636 |
+
[
|
| 1637 |
+
56567,
|
| 1638 |
+
297
|
| 1639 |
+
],
|
| 1640 |
+
[
|
| 1641 |
+
11,
|
| 1642 |
+
87
|
| 1643 |
+
]
|
| 1644 |
+
],
|
| 1645 |
+
"threshold": 0.89
|
| 1646 |
+
},
|
| 1647 |
+
{
|
| 1648 |
+
"precision": 0.24507042253521127,
|
| 1649 |
+
"recall": 0.8877551020408163,
|
| 1650 |
+
"f1": 0.3841059602649007,
|
| 1651 |
+
"roc_auc": 0.9721687370080279,
|
| 1652 |
+
"pr_auc": 0.7159122424484009,
|
| 1653 |
+
"confusion_matrix": [
|
| 1654 |
+
[
|
| 1655 |
+
56596,
|
| 1656 |
+
268
|
| 1657 |
+
],
|
| 1658 |
+
[
|
| 1659 |
+
11,
|
| 1660 |
+
87
|
| 1661 |
+
]
|
| 1662 |
+
],
|
| 1663 |
+
"threshold": 0.9
|
| 1664 |
+
},
|
| 1665 |
+
{
|
| 1666 |
+
"precision": 0.2636363636363636,
|
| 1667 |
+
"recall": 0.8877551020408163,
|
| 1668 |
+
"f1": 0.40654205607476634,
|
| 1669 |
+
"roc_auc": 0.9721687370080279,
|
| 1670 |
+
"pr_auc": 0.7159122424484009,
|
| 1671 |
+
"confusion_matrix": [
|
| 1672 |
+
[
|
| 1673 |
+
56621,
|
| 1674 |
+
243
|
| 1675 |
+
],
|
| 1676 |
+
[
|
| 1677 |
+
11,
|
| 1678 |
+
87
|
| 1679 |
+
]
|
| 1680 |
+
],
|
| 1681 |
+
"threshold": 0.91
|
| 1682 |
+
},
|
| 1683 |
+
{
|
| 1684 |
+
"precision": 0.28618421052631576,
|
| 1685 |
+
"recall": 0.8877551020408163,
|
| 1686 |
+
"f1": 0.43283582089552236,
|
| 1687 |
+
"roc_auc": 0.9721687370080279,
|
| 1688 |
+
"pr_auc": 0.7159122424484009,
|
| 1689 |
+
"confusion_matrix": [
|
| 1690 |
+
[
|
| 1691 |
+
56647,
|
| 1692 |
+
217
|
| 1693 |
+
],
|
| 1694 |
+
[
|
| 1695 |
+
11,
|
| 1696 |
+
87
|
| 1697 |
+
]
|
| 1698 |
+
],
|
| 1699 |
+
"threshold": 0.92
|
| 1700 |
+
},
|
| 1701 |
+
{
|
| 1702 |
+
"precision": 0.3246268656716418,
|
| 1703 |
+
"recall": 0.8877551020408163,
|
| 1704 |
+
"f1": 0.47540983606557374,
|
| 1705 |
+
"roc_auc": 0.9721687370080279,
|
| 1706 |
+
"pr_auc": 0.7159122424484009,
|
| 1707 |
+
"confusion_matrix": [
|
| 1708 |
+
[
|
| 1709 |
+
56683,
|
| 1710 |
+
181
|
| 1711 |
+
],
|
| 1712 |
+
[
|
| 1713 |
+
11,
|
| 1714 |
+
87
|
| 1715 |
+
]
|
| 1716 |
+
],
|
| 1717 |
+
"threshold": 0.93
|
| 1718 |
+
},
|
| 1719 |
+
{
|
| 1720 |
+
"precision": 0.35080645161290325,
|
| 1721 |
+
"recall": 0.8877551020408163,
|
| 1722 |
+
"f1": 0.5028901734104047,
|
| 1723 |
+
"roc_auc": 0.9721687370080279,
|
| 1724 |
+
"pr_auc": 0.7159122424484009,
|
| 1725 |
+
"confusion_matrix": [
|
| 1726 |
+
[
|
| 1727 |
+
56703,
|
| 1728 |
+
161
|
| 1729 |
+
],
|
| 1730 |
+
[
|
| 1731 |
+
11,
|
| 1732 |
+
87
|
| 1733 |
+
]
|
| 1734 |
+
],
|
| 1735 |
+
"threshold": 0.9400000000000001
|
| 1736 |
+
},
|
| 1737 |
+
{
|
| 1738 |
+
"precision": 0.3918918918918919,
|
| 1739 |
+
"recall": 0.8877551020408163,
|
| 1740 |
+
"f1": 0.54375,
|
| 1741 |
+
"roc_auc": 0.9721687370080279,
|
| 1742 |
+
"pr_auc": 0.7159122424484009,
|
| 1743 |
+
"confusion_matrix": [
|
| 1744 |
+
[
|
| 1745 |
+
56729,
|
| 1746 |
+
135
|
| 1747 |
+
],
|
| 1748 |
+
[
|
| 1749 |
+
11,
|
| 1750 |
+
87
|
| 1751 |
+
]
|
| 1752 |
+
],
|
| 1753 |
+
"threshold": 0.9500000000000001
|
| 1754 |
+
},
|
| 1755 |
+
{
|
| 1756 |
+
"precision": 0.44387755102040816,
|
| 1757 |
+
"recall": 0.8877551020408163,
|
| 1758 |
+
"f1": 0.5918367346938775,
|
| 1759 |
+
"roc_auc": 0.9721687370080279,
|
| 1760 |
+
"pr_auc": 0.7159122424484009,
|
| 1761 |
+
"confusion_matrix": [
|
| 1762 |
+
[
|
| 1763 |
+
56755,
|
| 1764 |
+
109
|
| 1765 |
+
],
|
| 1766 |
+
[
|
| 1767 |
+
11,
|
| 1768 |
+
87
|
| 1769 |
+
]
|
| 1770 |
+
],
|
| 1771 |
+
"threshold": 0.9600000000000001
|
| 1772 |
+
},
|
| 1773 |
+
{
|
| 1774 |
+
"precision": 0.47802197802197804,
|
| 1775 |
+
"recall": 0.8877551020408163,
|
| 1776 |
+
"f1": 0.6214285714285714,
|
| 1777 |
+
"roc_auc": 0.9721687370080279,
|
| 1778 |
+
"pr_auc": 0.7159122424484009,
|
| 1779 |
+
"confusion_matrix": [
|
| 1780 |
+
[
|
| 1781 |
+
56769,
|
| 1782 |
+
95
|
| 1783 |
+
],
|
| 1784 |
+
[
|
| 1785 |
+
11,
|
| 1786 |
+
87
|
| 1787 |
+
]
|
| 1788 |
+
],
|
| 1789 |
+
"threshold": 0.97
|
| 1790 |
+
},
|
| 1791 |
+
{
|
| 1792 |
+
"precision": 0.5151515151515151,
|
| 1793 |
+
"recall": 0.8673469387755102,
|
| 1794 |
+
"f1": 0.6463878326996197,
|
| 1795 |
+
"roc_auc": 0.9721687370080279,
|
| 1796 |
+
"pr_auc": 0.7159122424484009,
|
| 1797 |
+
"confusion_matrix": [
|
| 1798 |
+
[
|
| 1799 |
+
56784,
|
| 1800 |
+
80
|
| 1801 |
+
],
|
| 1802 |
+
[
|
| 1803 |
+
13,
|
| 1804 |
+
85
|
| 1805 |
+
]
|
| 1806 |
+
],
|
| 1807 |
+
"threshold": 0.98
|
| 1808 |
+
},
|
| 1809 |
+
{
|
| 1810 |
+
"precision": 0.5763888888888888,
|
| 1811 |
+
"recall": 0.8469387755102041,
|
| 1812 |
+
"f1": 0.6859504132231405,
|
| 1813 |
+
"roc_auc": 0.9721687370080279,
|
| 1814 |
+
"pr_auc": 0.7159122424484009,
|
| 1815 |
+
"confusion_matrix": [
|
| 1816 |
+
[
|
| 1817 |
+
56803,
|
| 1818 |
+
61
|
| 1819 |
+
],
|
| 1820 |
+
[
|
| 1821 |
+
15,
|
| 1822 |
+
83
|
| 1823 |
+
]
|
| 1824 |
+
],
|
| 1825 |
+
"threshold": 0.99
|
| 1826 |
+
}
|
| 1827 |
+
]
|
| 1828 |
+
},
|
| 1829 |
+
"evaluation_summary": {
|
| 1830 |
+
"test_rows": 56962,
|
| 1831 |
+
"min_recall_target": 0.9,
|
| 1832 |
+
"selection_reason": "meets_min_recall"
|
| 1833 |
+
}
|
| 1834 |
+
}
|
artifacts/model_training_report.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp_utc": "2026-02-18T04:09:02.997602+00:00",
|
| 3 |
+
"experiment_name": "fraud-detection-baseline",
|
| 4 |
+
"tracking_uri": "file:./mlruns",
|
| 5 |
+
"data_path": "data/raw/creditcard.csv",
|
| 6 |
+
"preprocessor_path": "models/preprocessor.pkl",
|
| 7 |
+
"model_path": "models/model.pkl",
|
| 8 |
+
"model_report_path": "artifacts/model_report.json",
|
| 9 |
+
"best_model": {
|
| 10 |
+
"model_name": "logistic_regression",
|
| 11 |
+
"run_id": "f953d6a1c2d944338f8fc210408267a9",
|
| 12 |
+
"metrics": {
|
| 13 |
+
"precision": 0.06097560975609756,
|
| 14 |
+
"recall": 0.9183673469387755,
|
| 15 |
+
"f1": 0.11435832274459974,
|
| 16 |
+
"roc_auc": 0.9721687370080279,
|
| 17 |
+
"pr_auc": 0.7159122424484009,
|
| 18 |
+
"confusion_matrix": [
|
| 19 |
+
[
|
| 20 |
+
55478,
|
| 21 |
+
1386
|
| 22 |
+
],
|
| 23 |
+
[
|
| 24 |
+
8,
|
| 25 |
+
90
|
| 26 |
+
]
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"selected_threshold": 0.74,
|
| 30 |
+
"threshold_metrics": {
|
| 31 |
+
"precision": 0.13650306748466257,
|
| 32 |
+
"recall": 0.9081632653061225,
|
| 33 |
+
"f1": 0.23733333333333334,
|
| 34 |
+
"roc_auc": 0.9721687370080279,
|
| 35 |
+
"pr_auc": 0.7159122424484009,
|
| 36 |
+
"confusion_matrix": [
|
| 37 |
+
[
|
| 38 |
+
56301,
|
| 39 |
+
563
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
9,
|
| 43 |
+
89
|
| 44 |
+
]
|
| 45 |
+
],
|
| 46 |
+
"threshold": 0.74
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"all_results": [
|
| 50 |
+
{
|
| 51 |
+
"model_name": "logistic_regression",
|
| 52 |
+
"run_id": "f953d6a1c2d944338f8fc210408267a9",
|
| 53 |
+
"metrics": {
|
| 54 |
+
"precision": 0.06097560975609756,
|
| 55 |
+
"recall": 0.9183673469387755,
|
| 56 |
+
"f1": 0.11435832274459974,
|
| 57 |
+
"roc_auc": 0.9721687370080279,
|
| 58 |
+
"pr_auc": 0.7159122424484009,
|
| 59 |
+
"confusion_matrix": [
|
| 60 |
+
[
|
| 61 |
+
55478,
|
| 62 |
+
1386
|
| 63 |
+
],
|
| 64 |
+
[
|
| 65 |
+
8,
|
| 66 |
+
90
|
| 67 |
+
]
|
| 68 |
+
]
|
| 69 |
+
}
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"model_name": "xgboost",
|
| 73 |
+
"run_id": "0ad9425817db4958a142b29f816108f4",
|
| 74 |
+
"metrics": {
|
| 75 |
+
"precision": 0.9186046511627907,
|
| 76 |
+
"recall": 0.8061224489795918,
|
| 77 |
+
"f1": 0.8586956521739131,
|
| 78 |
+
"roc_auc": 0.9775147361983623,
|
| 79 |
+
"pr_auc": 0.87487299490182,
|
| 80 |
+
"confusion_matrix": [
|
| 81 |
+
[
|
| 82 |
+
56857,
|
| 83 |
+
7
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
19,
|
| 87 |
+
79
|
| 88 |
+
]
|
| 89 |
+
]
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
],
|
| 93 |
+
"skipped_models": []
|
| 94 |
+
}
|
configs/logging.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: 1
|
| 2 |
+
formatters:
|
| 3 |
+
standard:
|
| 4 |
+
format: '%(asctime)s | %(levelname)s | %(name)s | %(message)s'
|
| 5 |
+
handlers:
|
| 6 |
+
console:
|
| 7 |
+
class: logging.StreamHandler
|
| 8 |
+
formatter: standard
|
| 9 |
+
level: INFO
|
| 10 |
+
root:
|
| 11 |
+
handlers: [console]
|
| 12 |
+
level: INFO
|
configs/train.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experiment:
|
| 2 |
+
name: fraud-detection-baseline
|
| 3 |
+
|
| 4 |
+
training:
|
| 5 |
+
test_size: 0.2
|
| 6 |
+
random_state: 42
|
| 7 |
+
imbalance_method: class_weight
|
| 8 |
+
models:
|
| 9 |
+
- logistic_regression
|
| 10 |
+
- xgboost
|
| 11 |
+
|
| 12 |
+
mlflow:
|
| 13 |
+
tracking_uri: file:./mlruns
|
| 14 |
+
|
| 15 |
+
threshold:
|
| 16 |
+
decision_threshold: 0.5
|
| 17 |
+
min_recall_target: 0.9
|
| 18 |
+
min_threshold: 0.01
|
| 19 |
+
max_threshold: 0.99
|
| 20 |
+
grid_size: 99
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
api:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
image: fraud-detection-api:latest
|
| 7 |
+
container_name: fraud-detection-api
|
| 8 |
+
restart: unless-stopped
|
| 9 |
+
ports:
|
| 10 |
+
- "8000:8000"
|
| 11 |
+
environment:
|
| 12 |
+
- PYTHONUNBUFFERED=1
|
| 13 |
+
healthcheck:
|
| 14 |
+
test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://127.0.0.1:8000/health'); sys.exit(0)"]
|
| 15 |
+
interval: 30s
|
| 16 |
+
timeout: 5s
|
| 17 |
+
retries: 3
|
| 18 |
+
start_period: 20s
|
models/logistic_regression.pkl
ADDED
|
Binary file (1.54 kB). View file
|
|
|
models/model.pkl
ADDED
|
Binary file (1.54 kB). View file
|
|
|
models/preprocessor.pkl
ADDED
|
Binary file (2.68 kB). View file
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "fraud-detection-mlops-pipeline"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Credit-card fraud detection MLOps pipeline: data validation, model training/evaluation, and a FastAPI inference service."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"fastapi>=0.110,<0.116",
|
| 9 |
+
"httpx>=0.27,<0.29",
|
| 10 |
+
"imbalanced-learn>=0.12,<0.15",
|
| 11 |
+
"mlflow>=2.11,<3.0",
|
| 12 |
+
"numpy>=1.26,<3.0",
|
| 13 |
+
"pandas>=2.1,<2.4",
|
| 14 |
+
"pydantic>=2.6,<3.0",
|
| 15 |
+
"pytest>=8.0,<9.0",
|
| 16 |
+
"pytest-cov>=5.0,<7.0",
|
| 17 |
+
"python-dotenv>=1.0,<2.0",
|
| 18 |
+
"pyyaml>=6.0,<7.0",
|
| 19 |
+
"scikit-learn>=1.4,<1.8",
|
| 20 |
+
"uvicorn[standard]>=0.29,<0.36",
|
| 21 |
+
"xgboost>=2.0,<3.0",
|
| 22 |
+
]
|
pytest.ini
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
addopts = -q --cov=src --cov=api --cov-report=term-missing --cov-fail-under=80
|
| 3 |
+
testpaths = tests
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy>=1.26,<3.0
|
| 2 |
+
pandas>=2.1,<2.4
|
| 3 |
+
scikit-learn>=1.4,<1.8
|
| 4 |
+
imbalanced-learn>=0.12,<0.15
|
| 5 |
+
xgboost>=2.0,<3.0
|
| 6 |
+
mlflow>=2.11,<3.0
|
| 7 |
+
fastapi>=0.110,<0.116
|
| 8 |
+
uvicorn[standard]>=0.29,<0.36
|
| 9 |
+
pydantic>=2.6,<3.0
|
| 10 |
+
python-dotenv>=1.0,<2.0
|
| 11 |
+
pyyaml>=6.0,<7.0
|
| 12 |
+
pytest>=8.0,<9.0
|
| 13 |
+
pytest-cov>=5.0,<7.0
|
| 14 |
+
httpx>=0.27,<0.29
|
src/__init__.py
ADDED
|
File without changes
|
src/data_ingestion.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data ingestion and validation utilities for the fraud dataset."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
EXPECTED_ROW_COUNT = 284_807
|
| 13 |
+
EXPECTED_COLUMNS = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount", "Class"]
|
| 14 |
+
EXPECTED_CLASS_VALUES = {0, 1}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_data(file_path: str | Path) -> pd.DataFrame:
    """Read the raw fraud dataset CSV into a DataFrame.

    Args:
        file_path: Location of the CSV file on disk.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the file extension is not ``.csv``.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"Dataset not found: {source}")
    # Case-insensitive extension check so ".CSV" is accepted too.
    if source.suffix.lower() != ".csv":
        raise ValueError(f"Expected a CSV file, got: {source.suffix}")
    return pd.read_csv(source)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_data_statistics(df: pd.DataFrame) -> dict[str, Any]:
    """Summarize dataset shape, quality, and class balance.

    Used both for validation reports and for downstream monitoring; the
    class-related fields stay empty/None when no ``Class`` column exists.
    """
    counts_by_class: dict[str, int] = {}
    fraud_ratio: float | None = None

    if "Class" in df.columns:
        # dropna=False so NaN labels are counted rather than hidden.
        value_counts = df["Class"].value_counts(dropna=False)
        counts_by_class = {str(label): int(n) for label, n in value_counts.items()}
        if len(df):
            # Mean of a boolean mask == share of fraudulent rows.
            fraud_ratio = float((df["Class"] == 1).mean())

    n_rows, n_cols = df.shape
    return {
        "row_count": int(n_rows),
        "column_count": int(n_cols),
        "missing_values_total": int(df.isna().sum().sum()),
        "duplicate_rows": int(df.duplicated().sum()),
        "class_counts": counts_by_class,
        "fraud_ratio": fraud_ratio,
    }
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def validate_data(df: pd.DataFrame, expected_rows: int = EXPECTED_ROW_COUNT) -> dict[str, Any]:
    """Validate dataset schema and quality; return a structured report.

    Hard schema problems (missing columns, invalid class labels, missing
    target) become *errors*; softer quality issues (row-count drift,
    missing values, single-class data) become *warnings*.
    """
    errors: list[str] = []
    warnings: list[str] = []

    actual_columns = list(df.columns)
    expected_set = set(EXPECTED_COLUMNS)
    missing_columns = [c for c in EXPECTED_COLUMNS if c not in actual_columns]
    unexpected_columns = [c for c in actual_columns if c not in expected_set]

    if missing_columns:
        errors.append(f"Missing required columns: {missing_columns}")
    if unexpected_columns:
        warnings.append(f"Unexpected columns present: {unexpected_columns}")

    stats = get_data_statistics(df)

    if expected_rows and stats["row_count"] != expected_rows:
        warnings.append(
            f"Row count differs from expected {expected_rows}: got {stats['row_count']}"
        )

    if stats["missing_values_total"] > 0:
        warnings.append(f"Dataset contains {stats['missing_values_total']} missing values")

    if "Class" not in df.columns:
        errors.append("Class column not found")
    else:
        class_values = set(df["Class"].dropna().unique().tolist())
        invalid_class_values = sorted(class_values - EXPECTED_CLASS_VALUES)
        if invalid_class_values:
            errors.append(f"Class contains invalid values: {invalid_class_values}")
        if len(class_values) == 1:
            warnings.append("Class column has only one class present")

    return {
        "is_valid": not errors,
        "errors": errors,
        "warnings": warnings,
        "statistics": stats,
    }
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def save_validation_report(report: dict[str, Any], output_path: str | Path) -> Path:
    """Persist *report* as pretty-printed JSON, creating parent dirs.

    Returns the resolved output path for convenience.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(report, indent=2)
    destination.write_text(payload, encoding="utf-8")
    return destination
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def run_data_validation(
    file_path: str | Path = "data/raw/creditcard.csv",
    report_path: str | Path = "artifacts/data_validation.json",
) -> dict[str, Any]:
    """Validate the dataset end-to-end and persist the report.

    The report is written even when validation fails, so the failure
    details survive in the artifact; schema errors then raise.

    Raises:
        ValueError: if the validation report contains errors.
    """
    frame = load_data(file_path)
    report = validate_data(frame)
    save_validation_report(report, report_path)
    if report["is_valid"]:
        return report
    raise ValueError(f"Data validation failed: {report['errors']}")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _build_parser() -> argparse.ArgumentParser:
|
| 108 |
+
parser = argparse.ArgumentParser(description="Validate fraud dataset schema and quality.")
|
| 109 |
+
parser.add_argument(
|
| 110 |
+
"--data-path",
|
| 111 |
+
default="data/raw/creditcard.csv",
|
| 112 |
+
help="Path to the raw CSV dataset.",
|
| 113 |
+
)
|
| 114 |
+
parser.add_argument(
|
| 115 |
+
"--report-path",
|
| 116 |
+
default="artifacts/data_validation.json",
|
| 117 |
+
help="Path to write the validation report JSON.",
|
| 118 |
+
)
|
| 119 |
+
return parser
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main() -> None:
    """CLI entry point: validate the dataset and print summary statistics."""
    parser = _build_parser()
    args = parser.parse_args()
    report = run_data_validation(args.data_path, args.report_path)
    # run_data_validation raises on failure, so reaching here means success.
    print("Data validation passed.")
    print(json.dumps(report["statistics"], indent=2))
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
|
| 130 |
+
main()
|
src/evaluate.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model evaluation utilities."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from sklearn.metrics import (
|
| 9 |
+
average_precision_score,
|
| 10 |
+
confusion_matrix,
|
| 11 |
+
f1_score,
|
| 12 |
+
precision_score,
|
| 13 |
+
recall_score,
|
| 14 |
+
roc_auc_score,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _safe_roc_auc(y_true, y_pred_proba) -> float:
    """ROC-AUC, or NaN when undefined (e.g. only one class in y_true)."""
    try:
        score = roc_auc_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _safe_pr_auc(y_true, y_pred_proba) -> float:
    """Average precision (PR-AUC), or NaN when the score is undefined."""
    try:
        score = average_precision_score(y_true, y_pred_proba)
    except ValueError:
        return float("nan")
    return float(score)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def calculate_metrics(y_true, y_pred, y_pred_proba) -> dict[str, Any]:
    """Compute the classification metrics used to compare candidate models.

    zero_division=0 keeps precision/recall/f1 well-defined when a class
    is never predicted; AUC scores fall back to NaN via the safe helpers.
    """
    precision = float(precision_score(y_true, y_pred, zero_division=0))
    recall = float(recall_score(y_true, y_pred, zero_division=0))
    f1 = float(f1_score(y_true, y_pred, zero_division=0))
    # Key order is preserved deliberately: reports are JSON-serialized.
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": _safe_roc_auc(y_true, y_pred_proba),
        "pr_auc": _safe_pr_auc(y_true, y_pred_proba),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def rank_models(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Order model results best-first by recall, then precision, then roc_auc."""

    def sort_key(entry: dict[str, Any]) -> tuple[float, float, float]:
        metrics = entry["metrics"]
        return (metrics["recall"], metrics["precision"], metrics["roc_auc"])

    return sorted(results, key=sort_key, reverse=True)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def calculate_metrics_at_threshold(
    y_true,
    y_pred_proba,
    *,
    threshold: float,
) -> dict[str, Any]:
    """Binarize probabilities at *threshold* and compute standard metrics.

    The applied threshold is recorded in the returned dict so grids of
    evaluations stay self-describing.
    """
    predictions = (np.asarray(y_pred_proba) >= threshold).astype(int)
    result = calculate_metrics(y_true, predictions, y_pred_proba)
    result["threshold"] = float(threshold)
    return result
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def evaluate_thresholds(
    y_true,
    y_pred_proba,
    *,
    thresholds: list[float] | None = None,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> list[dict[str, Any]]:
    """Compute thresholded metrics over an explicit list or a linear grid.

    When *thresholds* is None a uniform grid of *grid_size* points between
    *min_threshold* and *max_threshold* is used.
    """
    grid = thresholds
    if grid is None:
        grid = np.linspace(min_threshold, max_threshold, grid_size).tolist()

    results: list[dict[str, Any]] = []
    for candidate in grid:
        results.append(
            calculate_metrics_at_threshold(y_true, y_pred_proba, threshold=candidate)
        )
    return results
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def select_best_threshold(
    y_true,
    y_pred_proba,
    *,
    min_recall: float = 0.90,
    min_threshold: float = 0.01,
    max_threshold: float = 0.99,
    grid_size: int = 99,
) -> dict[str, Any]:
    """Pick the threshold maximizing precision subject to a recall floor.

    Thresholds meeting *min_recall* are preferred; if none do, the full
    grid is searched instead and the selection reason records the fallback.
    Ties break on f1, then recall.
    """
    evaluations = evaluate_thresholds(
        y_true,
        y_pred_proba,
        min_threshold=min_threshold,
        max_threshold=max_threshold,
        grid_size=grid_size,
    )

    feasible = [m for m in evaluations if m["recall"] >= min_recall]
    if feasible:
        candidates = feasible
        selection_reason = "meets_min_recall"
    else:
        candidates = evaluations
        selection_reason = "fallback_max_recall"

    # max with a tuple key is equivalent to sorting descending and taking
    # the first element (both return the earliest maximal candidate).
    best = max(candidates, key=lambda m: (m["precision"], m["f1"], m["recall"]))

    return {
        "selection_reason": selection_reason,
        "min_recall_target": float(min_recall),
        "selected_threshold": float(best["threshold"]),
        "selected_metrics": best,
        "threshold_grid_size": int(grid_size),
        "thresholds_evaluated": evaluations,
    }
|
src/predict.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Batch/single prediction helper functions."""
|
src/preprocessing.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training/inference preprocessing pipeline utilities."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
import joblib
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from imblearn.over_sampling import SMOTE
|
| 12 |
+
from sklearn.compose import ColumnTransformer
|
| 13 |
+
from sklearn.model_selection import train_test_split
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.utils.class_weight import compute_class_weight
|
| 16 |
+
|
| 17 |
+
TARGET_COLUMN = "Class"
|
| 18 |
+
SCALE_COLUMNS = ["Time", "Amount"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def split_data(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split *df* into stratified train/test feature and label sets.

    Raises:
        ValueError: if the target column is absent or ``test_size`` is not
            strictly between 0 and 1.
    """
    if target_column not in df.columns:
        raise ValueError(f"Missing target column: {target_column}")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be between 0 and 1")

    labels = df[target_column]
    features = df.drop(columns=[target_column])

    # Stratify on the labels so the rare fraud class keeps the same
    # proportion in both splits.
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def scale_features(
    df: pd.DataFrame,
    *,
    columns: list[str] | None = None,
    scaler: StandardScaler | None = None,
) -> tuple[pd.DataFrame, StandardScaler]:
    """Scale selected columns, returning a transformed copy and the scaler.

    When *scaler* is ``None`` a fresh ``StandardScaler`` is fitted on *df*.
    When a previously fitted scaler is supplied it is reused as-is
    (``transform`` only), so inference data is scaled with the training
    statistics instead of being silently refitted.

    Raises:
        ValueError: if any requested column is missing from *df*.
    """
    scale_columns = columns or SCALE_COLUMNS
    missing = [column for column in scale_columns if column not in df.columns]
    if missing:
        raise ValueError(f"Columns not found for scaling: {missing}")

    result = df.copy()
    if scaler is None:
        scaler = StandardScaler()
        result[scale_columns] = scaler.fit_transform(df[scale_columns])
    else:
        # Bug fix: the previous implementation called fit_transform even on a
        # caller-provided scaler, refitting it on the new data and leaking
        # its statistics. Reuse the fitted parameters instead.
        result[scale_columns] = scaler.transform(df[scale_columns])
    return result, scaler
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def build_preprocessor(
    feature_columns: list[str],
    *,
    scale_columns: list[str] | None = None,
) -> ColumnTransformer:
    """Create a pandas-output ColumnTransformer standardizing selected columns.

    All non-scaled columns are passed through untouched so training and
    inference apply identical transforms.

    Raises:
        ValueError: if a scale column is not present in *feature_columns*.
    """
    to_scale = scale_columns or SCALE_COLUMNS
    absent = [name for name in to_scale if name not in feature_columns]
    if absent:
        raise ValueError(f"Scale columns missing from features: {absent}")

    transformer = ColumnTransformer(
        transformers=[("scale", StandardScaler(), to_scale)],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    # Emit DataFrames (not ndarrays) so downstream code keeps column names.
    transformer.set_output(transform="pandas")
    return transformer
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def transform_features(
    preprocessor: ColumnTransformer,
    X: pd.DataFrame,
) -> pd.DataFrame:
    """Apply a fitted preprocessor and always return a DataFrame.

    Some transformer configurations yield ndarrays; those are wrapped back
    into a DataFrame using the transformer's output feature names.
    """
    out = preprocessor.transform(X)
    if isinstance(out, pd.DataFrame):
        return out
    return pd.DataFrame(out, columns=preprocessor.get_feature_names_out())
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def handle_imbalance(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    *,
    method: str = "class_weight",
    random_state: int = 42,
    sampling_strategy: float = 0.5,
) -> tuple[pd.DataFrame, pd.Series, dict[str, Any]]:
    """Apply the selected class-imbalance strategy to the training split.

    Returns the (possibly resampled) features and labels plus a metadata dict
    describing the method and any computed class weights.

    Raises:
        ValueError: for any method outside {none, class_weight, smote}.
    """
    strategy = method.lower()
    if strategy not in {"none", "class_weight", "smote"}:
        raise ValueError("method must be one of: none, class_weight, smote")

    if strategy == "none":
        return X_train, y_train, {"method": "none", "class_weight": None}

    if strategy == "class_weight":
        # Compute balanced weights rather than resampling; the weights are
        # handed to the estimator via metadata.
        classes = np.array(sorted(y_train.unique().tolist()))
        weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
        weight_map = {int(label): float(w) for label, w in zip(classes, weights)}
        return X_train, y_train, {"method": "class_weight", "class_weight": weight_map}

    # SMOTE path: synthesize minority samples up to the sampling strategy.
    sampler = SMOTE(random_state=random_state, sampling_strategy=sampling_strategy)
    resampled_X, resampled_y = sampler.fit_resample(X_train, y_train)
    return (
        pd.DataFrame(resampled_X, columns=X_train.columns),
        pd.Series(resampled_y, name=y_train.name),
        {"method": "smote", "class_weight": None},
    )
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def save_preprocessor(preprocessor: ColumnTransformer, output_path: str | Path) -> Path:
    """Write the fitted preprocessor to *output_path*, creating parent dirs."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(preprocessor, target)
    return target
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def load_preprocessor(preprocessor_path: str | Path) -> ColumnTransformer:
    """Read a previously persisted preprocessor back from disk."""
    artifact = Path(preprocessor_path)
    return joblib.load(artifact)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def preprocess_for_training(
    df: pd.DataFrame,
    *,
    target_column: str = TARGET_COLUMN,
    test_size: float = 0.2,
    random_state: int = 42,
    imbalance_method: str = "class_weight",
    preprocessor_path: str | Path = "models/preprocessor.pkl",
) -> dict[str, Any]:
    """Split, fit and apply the preprocessor, balance, and persist.

    Pipeline: stratified split -> fit ColumnTransformer on the train split
    only (avoids test leakage) -> transform both splits -> imbalance handling
    on the train split -> persist the fitted preprocessor.

    Returns a dict with transformed splits, the fitted preprocessor, and
    imbalance metadata.
    """
    X_train_raw, X_test_raw, y_train, y_test = split_data(
        df,
        target_column=target_column,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit on training data only, then apply the same transform to both splits.
    preprocessor = build_preprocessor(feature_columns=X_train_raw.columns.tolist())
    preprocessor.fit(X_train_raw)
    X_train = transform_features(preprocessor, X_train_raw)
    X_test = transform_features(preprocessor, X_test_raw)

    X_train_final, y_train_final, imbalance_metadata = handle_imbalance(
        X_train,
        y_train,
        method=imbalance_method,
        random_state=random_state,
    )

    save_preprocessor(preprocessor, preprocessor_path)

    return {
        "X_train": X_train_final,
        "X_test": X_test,
        "y_train": y_train_final,
        "y_test": y_test,
        "preprocessor": preprocessor,
        "imbalance_metadata": imbalance_metadata,
    }
|
src/register_model.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Model registration helpers (local/MLflow registry)."""
|
src/train.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training entrypoint for fraud detection models with MLflow tracking."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
import joblib
|
| 12 |
+
import mlflow
|
| 13 |
+
import pandas as pd
|
| 14 |
+
import yaml
|
| 15 |
+
from sklearn.linear_model import LogisticRegression
|
| 16 |
+
|
| 17 |
+
from src.data_ingestion import load_data, run_data_validation
|
| 18 |
+
from src.evaluate import calculate_metrics, rank_models, select_best_threshold
|
| 19 |
+
from src.preprocessing import preprocess_for_training
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from xgboost import XGBClassifier
|
| 23 |
+
except Exception: # pragma: no cover - handled at runtime
|
| 24 |
+
XGBClassifier = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
DEFAULT_CONFIG_PATH = Path("configs/train.yaml")
|
| 28 |
+
DEFAULT_DATA_PATH = Path("data/raw/creditcard.csv")
|
| 29 |
+
DEFAULT_MODEL_PATH = Path("models/model.pkl")
|
| 30 |
+
DEFAULT_PREPROCESSOR_PATH = Path("models/preprocessor.pkl")
|
| 31 |
+
DEFAULT_REPORT_PATH = Path("artifacts/model_training_report.json")
|
| 32 |
+
DEFAULT_MODEL_REPORT_PATH = Path("artifacts/model_report.json")
|
| 33 |
+
DEFAULT_VALIDATION_REPORT_PATH = Path("artifacts/data_validation.json")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def load_training_config(config_path: str | Path = DEFAULT_CONFIG_PATH) -> dict[str, Any]:
    """Read the YAML training config, guaranteeing top-level sections exist.

    An empty file parses to None, which is coerced to {}; the three sections
    the pipeline reads are defaulted so callers can index them safely.
    """
    raw_text = Path(config_path).read_text(encoding="utf-8")
    config = yaml.safe_load(raw_text) or {}
    for section in ("experiment", "training", "mlflow"):
        config.setdefault(section, {})
    return config
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def create_model(model_name: str, random_state: int) -> Any:
    """Instantiate the configured classifier for *model_name*.

    Raises:
        RuntimeError: if xgboost was requested but is not importable.
        ValueError: for any unrecognised model name.
    """
    if model_name == "logistic_regression":
        return LogisticRegression(
            class_weight="balanced",
            solver="lbfgs",
            max_iter=500,
            random_state=random_state,
        )

    if model_name == "xgboost":
        if XGBClassifier is None:
            raise RuntimeError("xgboost is not available in the environment")
        xgb_params = {
            "n_estimators": 300,
            "max_depth": 5,
            "learning_rate": 0.05,
            "subsample": 0.9,
            "colsample_bytree": 0.9,
            "eval_metric": "logloss",
            "random_state": random_state,
            "n_jobs": 2,
        }
        return XGBClassifier(**xgb_params)

    raise ValueError(f"Unsupported model: {model_name}")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def train_single_model(
    model_name: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    random_state: int,
) -> tuple[Any, dict[str, Any]]:
    """Fit one model on the training split and score it on the test split.

    Returns the fitted estimator and its metric dict (computed from hard
    predictions plus positive-class probabilities).
    """
    estimator = create_model(model_name, random_state=random_state)
    estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    positive_probabilities = estimator.predict_proba(X_test)[:, 1]
    return estimator, calculate_metrics(y_test, predictions, positive_probabilities)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def log_run_to_mlflow(
    *,
    experiment_name: str,
    model_name: str,
    params: dict[str, Any],
    metrics: dict[str, Any],
    preprocessor_path: Path,
    model_temp_path: Path,
    artifact_dir: Path,
) -> str:
    """Log one training run (params, metrics, artifacts) to MLflow.

    Returns:
        The MLflow run id of the created run.
    """
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params)
        # Bug fix: the previous isinstance(v, float) filter silently dropped
        # integer-valued metrics (e.g. raw counts). Accept any real number
        # but exclude bools, which are a subclass of int in Python.
        metric_values = {
            key: value
            for key, value in metrics.items()
            if isinstance(value, (int, float)) and not isinstance(value, bool)
        }
        mlflow.log_metrics(metric_values)

        # Structured artifacts for debugging and reproducibility.
        metrics_path = artifact_dir / f"metrics_{model_name}.json"
        metrics_path.parent.mkdir(parents=True, exist_ok=True)
        metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

        mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")
        mlflow.log_artifact(str(model_temp_path), artifact_path="model")
        mlflow.log_artifact(str(metrics_path), artifact_path="metrics")

    return run.info.run_id
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def save_model(model: Any, output_path: str | Path = DEFAULT_MODEL_PATH) -> Path:
    """Serialize *model* to *output_path*, creating parent directories."""
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, destination)
    return destination
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def run_training_pipeline(
    *,
    config_path: str | Path = DEFAULT_CONFIG_PATH,
    data_path: str | Path = DEFAULT_DATA_PATH,
    model_path: str | Path = DEFAULT_MODEL_PATH,
    preprocessor_path: str | Path = DEFAULT_PREPROCESSOR_PATH,
    report_path: str | Path = DEFAULT_REPORT_PATH,
    model_report_path: str | Path = DEFAULT_MODEL_REPORT_PATH,
    validation_report_path: str | Path = DEFAULT_VALIDATION_REPORT_PATH,
) -> dict[str, Any]:
    """Execute end-to-end training and experiment tracking pipeline.

    Steps: load config -> validate data (fails fast on schema errors) ->
    preprocess -> train every configured model (logging each run to MLflow) ->
    rank models -> tune the decision threshold for the best model -> write
    the evaluation and training reports and persist the winning model.

    Returns:
        The training report dict that is also written to *report_path*.

    Raises:
        RuntimeError: if every configured model fails to train.
    """
    config = load_training_config(config_path)

    experiment_name = config["experiment"].get("name", "fraud-detection-baseline")
    tracking_uri = config["mlflow"].get("tracking_uri", "file:./mlruns")
    mlflow.set_tracking_uri(tracking_uri)

    training_cfg = config["training"]
    random_state = int(training_cfg.get("random_state", 42))
    test_size = float(training_cfg.get("test_size", 0.2))
    imbalance_method = str(training_cfg.get("imbalance_method", "class_weight"))
    # Prefer the plural "models" list; fall back to a single "model" entry.
    models = training_cfg.get("models") or [training_cfg.get("model", "logistic_regression")]
    threshold_cfg = config.get("threshold", {})
    min_recall_target = float(threshold_cfg.get("min_recall_target", 0.90))
    threshold_grid_size = int(threshold_cfg.get("grid_size", 99))
    threshold_min = float(threshold_cfg.get("min_threshold", 0.01))
    threshold_max = float(threshold_cfg.get("max_threshold", 0.99))

    # Validation runs before training and raises on an invalid dataset.
    run_data_validation(file_path=data_path, report_path=validation_report_path)
    raw_df = load_data(data_path)
    prep = preprocess_for_training(
        raw_df,
        test_size=test_size,
        random_state=random_state,
        imbalance_method=imbalance_method,
        preprocessor_path=preprocessor_path,
    )

    results: list[dict[str, Any]] = []
    skipped_models: list[dict[str, str]] = []
    artifact_dir = Path(report_path).parent
    artifact_dir.mkdir(parents=True, exist_ok=True)
    preprocessor_path_obj = Path(preprocessor_path)
    for model_name in models:
        try:
            model, metrics = train_single_model(
                model_name=model_name,
                X_train=prep["X_train"],
                y_train=prep["y_train"],
                X_test=prep["X_test"],
                y_test=prep["y_test"],
                random_state=random_state,
            )
        # RuntimeError signals an unavailable backend (e.g. missing xgboost);
        # record and continue with the remaining models.
        except RuntimeError as exc:
            skipped_models.append({"model_name": model_name, "reason": str(exc)})
            continue

        # Each candidate is saved under its own name before MLflow logging.
        temp_model_path = Path(model_path).parent / f"{model_name}.pkl"
        save_model(model, temp_model_path)

        run_id = log_run_to_mlflow(
            experiment_name=experiment_name,
            model_name=model_name,
            params={
                "model_name": model_name,
                "test_size": test_size,
                "random_state": random_state,
                "imbalance_method": imbalance_method,
            },
            metrics=metrics,
            preprocessor_path=preprocessor_path_obj,
            model_temp_path=temp_model_path,
            artifact_dir=artifact_dir,
        )

        results.append({"model_name": model_name, "model": model, "metrics": metrics, "run_id": run_id})

    if not results:
        raise RuntimeError("No models were successfully trained.")

    # Pick the best model, then tune its decision threshold on test
    # probabilities under the configured recall constraint.
    ranked = rank_models(results)
    best = ranked[0]
    y_test_proba_best = best["model"].predict_proba(prep["X_test"])[:, 1]
    threshold_selection = select_best_threshold(
        prep["y_test"],
        y_test_proba_best,
        min_recall=min_recall_target,
        min_threshold=threshold_min,
        max_threshold=threshold_max,
        grid_size=threshold_grid_size,
    )

    # Evaluation-focused report (consumed by model review tooling).
    model_report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "best_model_name": best["model_name"],
        "default_threshold_metrics": best["metrics"],
        "threshold_selection": threshold_selection,
        "evaluation_summary": {
            "test_rows": int(len(prep["y_test"])),
            "min_recall_target": min_recall_target,
            "selection_reason": threshold_selection["selection_reason"],
        },
    }
    model_report_path_obj = Path(model_report_path)
    model_report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    model_report_path_obj.write_text(json.dumps(model_report, indent=2), encoding="utf-8")

    # The winning model is re-saved to the canonical serving path.
    final_model_path = save_model(best["model"], model_path)

    # Training-run report: artifact locations plus every ranked result.
    report = {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "experiment_name": experiment_name,
        "tracking_uri": tracking_uri,
        "data_path": str(data_path),
        "preprocessor_path": str(preprocessor_path),
        "model_path": str(final_model_path),
        "model_report_path": str(model_report_path_obj),
        "best_model": {
            "model_name": best["model_name"],
            "run_id": best["run_id"],
            "metrics": best["metrics"],
            "selected_threshold": threshold_selection["selected_threshold"],
            "threshold_metrics": threshold_selection["selected_metrics"],
        },
        "all_results": [
            {"model_name": entry["model_name"], "run_id": entry["run_id"], "metrics": entry["metrics"]}
            for entry in ranked
        ],
        "skipped_models": skipped_models,
    }

    report_path_obj = Path(report_path)
    report_path_obj.parent.mkdir(parents=True, exist_ok=True)
    report_path_obj.write_text(json.dumps(report, indent=2), encoding="utf-8")

    return report
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the CLI argument parser for the training entrypoint."""
    parser = argparse.ArgumentParser(description="Train fraud model and log to MLflow.")
    # (flag, default path, help text) — all options are simple string paths.
    options = [
        ("--config", DEFAULT_CONFIG_PATH, "Training config YAML path."),
        ("--data-path", DEFAULT_DATA_PATH, "Dataset CSV path."),
        ("--model-path", DEFAULT_MODEL_PATH, "Output model artifact path."),
        ("--preprocessor-path", DEFAULT_PREPROCESSOR_PATH, "Output preprocessor artifact path."),
        ("--report-path", DEFAULT_REPORT_PATH, "Training report JSON path."),
        ("--model-report-path", DEFAULT_MODEL_REPORT_PATH, "Model evaluation report JSON path."),
    ]
    for flag, default, help_text in options:
        parser.add_argument(flag, default=str(default), help=help_text)
    return parser
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def main() -> None:
    """CLI entrypoint: run the full training pipeline and print a summary."""
    args = _build_parser().parse_args()
    report = run_training_pipeline(
        config_path=args.config,
        data_path=args.data_path,
        model_path=args.model_path,
        preprocessor_path=args.preprocessor_path,
        report_path=args.report_path,
        model_report_path=args.model_report_path,
    )

    # Surface the winning model and tuned threshold on stdout for operators.
    best = report["best_model"]
    print("Training completed.")
    print(f"Best model: {best['model_name']}")
    print(f"Selected threshold: {best['selected_threshold']:.4f}")
    print(json.dumps(best["threshold_metrics"], indent=2))


if __name__ == "__main__":
    main()
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

import sys
from pathlib import Path

# Make the repository root importable regardless of pytest's invocation
# directory, so tests can use `src.*` absolute imports.
ROOT = Path(__file__).resolve().parents[1]
_root_str = str(ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
from fastapi.testclient import TestClient
|
| 7 |
+
|
| 8 |
+
from api.app import app, get_inference_service
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DummyService:
    """Stand-in for the real inference service used by API tests.

    Flags any transaction whose Amount exceeds 200 as fraudulent with a
    fixed probability of 0.9; everything else gets 0.1.
    """

    threshold = 0.74
    model_path = Path("models/model.pkl")
    preprocessor_path = Path("models/preprocessor.pkl")

    def predict_records(self, records):
        results = []
        for item in records:
            probability = 0.9 if float(item["Amount"]) > 200 else 0.1
            results.append(
                {
                    "is_fraud": probability >= self.threshold,
                    "fraud_probability": probability,
                    "risk_level": "high" if probability >= 0.7 else "low",
                    "threshold": self.threshold,
                }
            )
        return results
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _transaction(amount: float = 10.0) -> dict[str, float]:
|
| 33 |
+
payload = {"Time": 0.0, "Amount": amount}
|
| 34 |
+
for i in range(1, 29):
|
| 35 |
+
payload[f"V{i}"] = 0.0
|
| 36 |
+
return payload
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_health_endpoint() -> None:
    """/health returns ok and reports the model as loaded."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    client = TestClient(app)
    try:
        response = client.get("/health")

        assert response.status_code == 200
        body = response.json()
        assert body["status"] == "ok"
        assert body["model_loaded"] is True
    finally:
        # Bug fix: clear() previously ran after the asserts, so a failing
        # assertion leaked the dummy override into subsequent tests.
        app.dependency_overrides.clear()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_predict_endpoint_valid_payload() -> None:
    """A high-amount transaction is flagged as fraud with a request id header."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    client = TestClient(app)
    try:
        response = client.post("/predict", json=_transaction(amount=350.0))

        assert response.status_code == 200
        body = response.json()
        assert body["is_fraud"] is True
        assert body["risk_level"] == "high"
        assert response.headers.get("X-Request-ID")
    finally:
        # Ensure the override is removed even when an assertion fails.
        app.dependency_overrides.clear()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_predict_endpoint_invalid_payload() -> None:
    """A payload missing a required feature is rejected with 422."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    client = TestClient(app)
    try:
        payload = _transaction()
        payload.pop("V28")
        response = client.post("/predict", json=payload)

        assert response.status_code == 422
    finally:
        # Ensure the override is removed even when an assertion fails.
        app.dependency_overrides.clear()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_batch_prediction_endpoint() -> None:
    """Batch endpoint returns one prediction per submitted transaction."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    client = TestClient(app)
    try:
        response = client.post(
            "/predict/batch",
            json={"transactions": [_transaction(20.0), _transaction(300.0)]},
        )

        assert response.status_code == 200
        body = response.json()
        assert len(body["predictions"]) == 2
        assert body["predictions"][0]["is_fraud"] is False
        assert body["predictions"][1]["is_fraud"] is True
    finally:
        # Ensure the override is removed even when an assertion fails.
        app.dependency_overrides.clear()
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_metrics_endpoint_tracks_predictions_and_requests() -> None:
    """/metrics counters advance after a prediction and stay within bounds."""
    app.dependency_overrides[get_inference_service] = lambda: DummyService()
    client = TestClient(app)
    try:
        before = client.get("/metrics")
        assert before.status_code == 200
        before_body = before.json()

        predict_response = client.post("/predict", json=_transaction(amount=350.0))
        assert predict_response.status_code == 200

        after = client.get("/metrics")
        assert after.status_code == 200
        after_body = after.json()

        # +2 requests: the /predict call plus this /metrics call itself.
        assert after_body["total_requests"] >= before_body["total_requests"] + 2
        assert after_body["total_predictions"] >= before_body["total_predictions"] + 1
        assert 0.0 <= after_body["error_rate"] <= 1.0
        assert 0.0 <= after_body["fraud_prediction_rate"] <= 1.0
    finally:
        # Ensure the override is removed even when an assertion fails.
        app.dependency_overrides.clear()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def test_health_returns_503_when_service_unavailable() -> None:
    """/health surfaces a 503 when the inference service cannot load."""

    def _raise():
        raise HTTPException(status_code=503, detail="Model artifact not found")

    app.dependency_overrides[get_inference_service] = _raise
    client = TestClient(app)
    try:
        response = client.get("/health")

        assert response.status_code == 503
        assert "Model artifact not found" in response.json()["detail"]
    finally:
        # Ensure the override is removed even when an assertion fails.
        app.dependency_overrides.clear()
|
tests/test_data_ingestion.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import pytest
|
| 7 |
+
|
| 8 |
+
from src.data_ingestion import (
|
| 9 |
+
EXPECTED_COLUMNS,
|
| 10 |
+
load_data,
|
| 11 |
+
run_data_validation,
|
| 12 |
+
validate_data,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _valid_df() -> pd.DataFrame:
    """One-row frame matching the expected schema with a legitimate class."""
    record = dict.fromkeys(EXPECTED_COLUMNS, 0.0)
    record["Class"] = 0
    return pd.DataFrame([record])
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_load_data_reads_csv(tmp_path) -> None:
    """load_data round-trips a CSV with the expected schema intact."""
    source = _valid_df()
    csv_path = tmp_path / "creditcard.csv"
    source.to_csv(csv_path, index=False)

    loaded = load_data(csv_path)

    assert list(loaded.columns) == EXPECTED_COLUMNS
    assert loaded.shape == (1, len(EXPECTED_COLUMNS))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_validate_data_invalid_when_required_column_missing() -> None:
|
| 34 |
+
df = _valid_df().drop(columns=["Amount"])
|
| 35 |
+
|
| 36 |
+
report = validate_data(df)
|
| 37 |
+
|
| 38 |
+
assert report["is_valid"] is False
|
| 39 |
+
assert any("Missing required columns" in error for error in report["errors"])
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_validate_data_invalid_when_class_has_invalid_values() -> None:
|
| 43 |
+
df = _valid_df()
|
| 44 |
+
df.loc[0, "Class"] = 3
|
| 45 |
+
|
| 46 |
+
report = validate_data(df)
|
| 47 |
+
|
| 48 |
+
assert report["is_valid"] is False
|
| 49 |
+
assert any("Class contains invalid values" in error for error in report["errors"])
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_run_data_validation_writes_report_and_fails_fast(tmp_path) -> None:
|
| 53 |
+
invalid_df = _valid_df().drop(columns=["Class"])
|
| 54 |
+
data_path = tmp_path / "creditcard.csv"
|
| 55 |
+
report_path = tmp_path / "data_validation.json"
|
| 56 |
+
invalid_df.to_csv(data_path, index=False)
|
| 57 |
+
|
| 58 |
+
with pytest.raises(ValueError):
|
| 59 |
+
run_data_validation(data_path, report_path)
|
| 60 |
+
|
| 61 |
+
assert report_path.exists()
|
| 62 |
+
report = json.loads(report_path.read_text(encoding="utf-8"))
|
| 63 |
+
assert report["is_valid"] is False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_run_data_validation_passes_for_valid_schema(tmp_path) -> None:
|
| 67 |
+
valid_df = _valid_df()
|
| 68 |
+
data_path = tmp_path / "creditcard.csv"
|
| 69 |
+
report_path = tmp_path / "data_validation.json"
|
| 70 |
+
valid_df.to_csv(data_path, index=False)
|
| 71 |
+
|
| 72 |
+
report = run_data_validation(data_path, report_path)
|
| 73 |
+
|
| 74 |
+
assert report["is_valid"] is True
|
| 75 |
+
assert report_path.exists()
|
tests/test_evaluate.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from src.evaluate import (
|
| 6 |
+
calculate_metrics_at_threshold,
|
| 7 |
+
evaluate_thresholds,
|
| 8 |
+
select_best_threshold,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_calculate_metrics_at_threshold_contains_threshold() -> None:
    """The metrics dict echoes the threshold and keeps rates inside [0, 1]."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    result = calculate_metrics_at_threshold(labels, scores, threshold=0.5)

    assert result["threshold"] == 0.5
    assert 0.0 <= result["recall"] <= 1.0
    assert 0.0 <= result["precision"] <= 1.0
+
|
| 22 |
+
|
| 23 |
+
def test_evaluate_thresholds_returns_expected_grid_size() -> None:
    """A 9-point grid over [0.1, 0.9] produces 9 rows, the first at 0.1."""
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.6, 0.9])

    rows = evaluate_thresholds(labels, scores, min_threshold=0.1, max_threshold=0.9, grid_size=9)

    assert len(rows) == 9
    assert rows[0]["threshold"] == 0.1
| 32 |
+
|
| 33 |
+
def test_select_best_threshold_prefers_precision_under_recall_constraint() -> None:
    """Threshold selection honors the recall floor and stays within the search grid."""
    labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    scores = np.array([0.02, 0.15, 0.20, 0.30, 0.55, 0.65, 0.80, 0.95])

    result = select_best_threshold(
        labels,
        scores,
        min_recall=0.75,
        min_threshold=0.1,
        max_threshold=0.9,
        grid_size=17,
    )

    assert result["selected_metrics"]["recall"] >= 0.75
    assert 0.1 <= result["selected_threshold"] <= 0.9
    assert result["selection_reason"] in {"meets_min_recall", "fallback_max_recall"}
|
tests/test_preprocessing.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from src.preprocessing import (
|
| 8 |
+
SCALE_COLUMNS,
|
| 9 |
+
build_preprocessor,
|
| 10 |
+
handle_imbalance,
|
| 11 |
+
load_preprocessor,
|
| 12 |
+
preprocess_for_training,
|
| 13 |
+
save_preprocessor,
|
| 14 |
+
scale_features,
|
| 15 |
+
split_data,
|
| 16 |
+
transform_features,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture
def sample_df() -> pd.DataFrame:
    """Seeded 200-row creditcard-style frame with 20 fraud labels shuffled in."""
    rng = np.random.default_rng(42)
    rows = 200
    fraud_count = 20

    columns: dict[str, np.ndarray] = {
        "Time": rng.normal(loc=5000, scale=1000, size=rows),
        "Amount": rng.normal(loc=120, scale=50, size=rows),
    }
    for idx in range(1, 29):
        columns[f"V{idx}"] = rng.normal(size=rows)

    labels = np.array([0] * (rows - fraud_count) + [1] * fraud_count)
    rng.shuffle(labels)
    columns["Class"] = labels

    return pd.DataFrame(columns)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_split_data_is_stratified(sample_df: pd.DataFrame) -> None:
    """An 80/20 split keeps the fraud ratio of both halves near the base rate."""
    X_train, X_test, y_train, y_test = split_data(sample_df, test_size=0.2, random_state=42)

    overall = sample_df["Class"].mean()

    assert X_train.shape[0] == 160
    assert X_test.shape[0] == 40
    assert abs(y_train.mean() - overall) < 0.02
    assert abs(y_test.mean() - overall) < 0.02
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_scale_features_transforms_only_selected_columns(sample_df: pd.DataFrame) -> None:
    """Scaling zero-centers SCALE_COLUMNS while leaving V-features untouched."""
    features = sample_df.drop(columns=["Class"])
    scaled, fitted_scaler = scale_features(features)

    assert fitted_scaler is not None
    for name in SCALE_COLUMNS:
        assert abs(float(scaled[name].mean())) < 1e-6

    assert np.allclose(features["V1"].values, scaled["V1"].values)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_handle_imbalance_smote_increases_minority_class(sample_df: pd.DataFrame) -> None:
    """SMOTE oversampling grows the fraud class and keeps X/y row counts aligned."""
    X_train, _, y_train, _ = split_data(sample_df, test_size=0.2, random_state=42)
    pipeline = build_preprocessor(X_train.columns.tolist())
    pipeline.fit(X_train)
    transformed = transform_features(pipeline, X_train)

    before = y_train.value_counts().to_dict()
    X_resampled, y_resampled, info = handle_imbalance(
        transformed, y_train, method="smote", sampling_strategy=0.8
    )
    after = y_resampled.value_counts().to_dict()

    assert info["method"] == "smote"
    assert after[1] > before[1]
    assert X_resampled.shape[0] == y_resampled.shape[0]
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_preprocessor_save_load_roundtrip(sample_df: pd.DataFrame, tmp_path) -> None:
    """A saved preprocessor reloads and reproduces the original column layout."""
    X_train, _, _, _ = split_data(sample_df, test_size=0.2, random_state=42)
    pipeline = build_preprocessor(X_train.columns.tolist())
    pipeline.fit(X_train)

    artifact = tmp_path / "preprocessor.pkl"
    save_preprocessor(pipeline, artifact)
    restored = load_preprocessor(artifact)

    sample = transform_features(restored, X_train.head(5))
    assert list(sample.columns) == X_train.columns.tolist()
    assert sample.shape == (5, X_train.shape[1])
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_preprocess_for_training_creates_artifact(sample_df: pd.DataFrame, tmp_path) -> None:
    """End-to-end preprocessing persists the artifact and reports class weights."""
    artifact = tmp_path / "preprocessor.pkl"

    outcome = preprocess_for_training(
        sample_df,
        test_size=0.2,
        random_state=42,
        imbalance_method="class_weight",
        preprocessor_path=artifact,
    )

    assert artifact.exists()
    assert outcome["X_train"].shape[1] == 30
    assert outcome["X_test"].shape[1] == 30
    assert outcome["imbalance_metadata"]["method"] == "class_weight"
    assert outcome["imbalance_metadata"]["class_weight"] is not None
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def test_handle_imbalance_rejects_unknown_method(sample_df: pd.DataFrame) -> None:
    """An unrecognized resampling method name raises ValueError."""
    X_train, _, y_train, _ = split_data(sample_df)
    pipeline = build_preprocessor(X_train.columns.tolist())
    pipeline.fit(X_train)
    transformed = transform_features(pipeline, X_train)

    with pytest.raises(ValueError):
        handle_imbalance(transformed, y_train, method="unknown")
|
tests/test_service.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import joblib
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from api.service import InferenceService, load_inference_service, resolve_threshold
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DummyPreprocessor:
    """Identity stand-in for the fitted preprocessing pipeline."""

    # Mirrors the column order the real preprocessor exposes after fitting.
    feature_names_in_ = np.array(["Time"] + [f"V{i}" for i in range(1, 29)] + ["Amount"])

    def transform(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Return the input frame unchanged (no scaling, no reordering)."""
        return frame
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DummyModel:
    """Deterministic classifier stub keyed off the Amount column."""

    def predict_proba(self, frame: pd.DataFrame) -> np.ndarray:
        """Return [P(legit), P(fraud)] rows: high / medium / low fraud odds by amount band."""

        def row(amount: float) -> list[float]:
            # Bands: >=300 high fraud, >=100 borderline, else clearly legit.
            if amount >= 300:
                return [0.1, 0.9]
            if amount >= 100:
                return [0.55, 0.45]
            return [0.95, 0.05]

        return np.array([row(amount) for amount in frame["Amount"].tolist()])
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _record(amount: float) -> dict[str, float]:
|
| 34 |
+
payload = {"Time": 0.0, "Amount": amount}
|
| 35 |
+
for i in range(1, 29):
|
| 36 |
+
payload[f"V{i}"] = 0.0
|
| 37 |
+
return payload
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_inference_service_predict_records_risk_levels() -> None:
    """Predictions map the dummy model's probabilities onto low/medium/high risk."""
    columns = ["Time", *[f"V{i}" for i in range(1, 29)], "Amount"]
    service = InferenceService(
        model=DummyModel(),
        preprocessor=DummyPreprocessor(),
        threshold=0.5,
        model_path=Path("models/model.pkl"),
        preprocessor_path=Path("models/preprocessor.pkl"),
        feature_columns=columns,
    )

    results = service.predict_records([_record(20), _record(120), _record(320)])

    assert results[0]["risk_level"] == "low"
    assert results[1]["risk_level"] == "medium"
    assert results[2]["risk_level"] == "high"
    assert results[2]["is_fraud"] is True
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_resolve_threshold_precedence(tmp_path) -> None:
    """The training report's threshold outranks the model report and config values."""
    training_report = tmp_path / "model_training_report.json"
    model_report = tmp_path / "model_report.json"
    config_file = tmp_path / "train.yaml"

    # Lowest-priority source: the YAML config.
    config_file.write_text("threshold:\n decision_threshold: 0.51\n", encoding="utf-8")
    # Middle priority: the model report's threshold_selection.
    model_report.write_text(
        json.dumps({"threshold_selection": {"selected_threshold": 0.63}}), encoding="utf-8"
    )
    # Highest priority: the training report's best_model entry.
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.74}}), encoding="utf-8"
    )

    chosen = resolve_threshold(
        training_report_path=training_report,
        model_report_path=model_report,
        config_path=config_file,
    )

    assert chosen == 0.74
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_load_inference_service_reads_artifacts_and_threshold(tmp_path) -> None:
    """Service loading picks up pickled artifacts plus the training-report threshold."""
    # The loader is cached; clear it so this test sees fresh artifacts.
    load_inference_service.cache_clear()

    model_file = tmp_path / "model.pkl"
    preprocessor_file = tmp_path / "preprocessor.pkl"
    training_report = tmp_path / "model_training_report.json"

    joblib.dump(DummyModel(), model_file)
    joblib.dump(DummyPreprocessor(), preprocessor_file)
    training_report.write_text(
        json.dumps({"best_model": {"selected_threshold": 0.66}}), encoding="utf-8"
    )

    service = load_inference_service(
        model_path=str(model_file),
        preprocessor_path=str(preprocessor_file),
        training_report_path=str(training_report),
        model_report_path=str(tmp_path / "missing_model_report.json"),
        config_path=str(tmp_path / "missing_config.yaml"),
    )

    assert service.threshold == 0.66
    results = service.predict_records([_record(300.0)])
    assert results[0]["is_fraud"] is True
|
tests/test_smoke.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def test_smoke() -> None:
    """Sanity check that the test harness collects and runs this module."""
    assert 1 == 1
|
tests/test_training.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
from src.evaluate import rank_models
|
| 10 |
+
from src.train import run_training_pipeline, train_single_model
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _synthetic_df(rows: int = 160) -> pd.DataFrame:
|
| 14 |
+
rng = np.random.default_rng(7)
|
| 15 |
+
data: dict[str, np.ndarray] = {
|
| 16 |
+
"Time": rng.normal(loc=1000, scale=250, size=rows),
|
| 17 |
+
"Amount": rng.normal(loc=80, scale=20, size=rows),
|
| 18 |
+
}
|
| 19 |
+
for i in range(1, 29):
|
| 20 |
+
data[f"V{i}"] = rng.normal(size=rows)
|
| 21 |
+
|
| 22 |
+
y = np.zeros(rows, dtype=int)
|
| 23 |
+
fraud_indices = rng.choice(rows, size=max(8, rows // 20), replace=False)
|
| 24 |
+
y[fraud_indices] = 1
|
| 25 |
+
|
| 26 |
+
# Inject weak signal for separability.
|
| 27 |
+
data["Amount"][fraud_indices] += 40
|
| 28 |
+
data["V3"][fraud_indices] += 1.5
|
| 29 |
+
data["Class"] = y
|
| 30 |
+
return pd.DataFrame(data)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_rank_models_orders_by_recall_then_precision() -> None:
    """Ranking sorts by recall first, breaking ties with precision."""
    candidates = [
        {"model_name": "a", "metrics": {"recall": 0.8, "precision": 0.9, "roc_auc": 0.9}},
        {"model_name": "b", "metrics": {"recall": 0.9, "precision": 0.7, "roc_auc": 0.95}},
        {"model_name": "c", "metrics": {"recall": 0.9, "precision": 0.8, "roc_auc": 0.85}},
    ]

    ranked = rank_models(candidates)

    assert [entry["model_name"] for entry in ranked] == ["c", "b", "a"]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_train_single_model_returns_expected_metrics() -> None:
    """Training logistic regression returns the complete metric dictionary."""
    df = _synthetic_df(200)
    features = df.drop(columns=["Class"])
    target = df["Class"]

    # A deterministic head/tail split is enough for a unit test.
    _, metrics = train_single_model(
        model_name="logistic_regression",
        X_train=features.iloc[:160],
        y_train=target.iloc[:160],
        X_test=features.iloc[160:],
        y_test=target.iloc[160:],
        random_state=42,
    )

    assert set(metrics.keys()) == {"precision", "recall", "f1", "roc_auc", "pr_auc", "confusion_matrix"}
    assert 0.0 <= metrics["recall"] <= 1.0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_run_training_pipeline_creates_report_and_model(tmp_path) -> None:
    """The full pipeline persists every artifact and selects the configured model."""
    df = _synthetic_df(240)
    data_path = tmp_path / "creditcard.csv"
    config_path = tmp_path / "train.yaml"
    model_path = tmp_path / "best_model.pkl"
    preprocessor_path = tmp_path / "preprocessor.pkl"
    report_path = tmp_path / "training_report.json"
    model_report_path = tmp_path / "model_report.json"
    validation_report_path = tmp_path / "data_validation.json"

    df.to_csv(data_path, index=False)

    # Minimal config: one model, class-weight balancing, file-based MLflow store.
    config = {
        "experiment": {"name": "test-experiment"},
        "training": {
            "test_size": 0.2,
            "random_state": 42,
            "imbalance_method": "class_weight",
            "models": ["logistic_regression"],
        },
        "mlflow": {"tracking_uri": f"file:{tmp_path / 'mlruns'}"},
    }
    config_path.write_text(yaml.safe_dump(config), encoding="utf-8")

    report = run_training_pipeline(
        config_path=config_path,
        data_path=data_path,
        model_path=model_path,
        preprocessor_path=preprocessor_path,
        report_path=report_path,
        model_report_path=model_report_path,
        validation_report_path=validation_report_path,
    )

    for artifact in (
        model_path,
        preprocessor_path,
        report_path,
        model_report_path,
        validation_report_path,
    ):
        assert artifact.exists()
    assert report["best_model"]["model_name"] == "logistic_regression"
    assert 0.0 < report["best_model"]["selected_threshold"] < 1.0

    stored = json.loads(report_path.read_text(encoding="utf-8"))
    assert stored["best_model"]["run_id"]