Martinacap02 committed on
Commit
f7d11f7
·
0 Parent(s):

Init deploy branch for HF Space

Files changed (50)
  1. .dockerignore +2 -0
  2. .dvc/.gitignore +3 -0
  3. .dvc/config +6 -0
  4. .dvcignore +3 -0
  5. .github/workflows/pynblint.yml +67 -0
  6. .github/workflows/pytestAndGX.yml +60 -0
  7. .github/workflows/ruff-linter.yml +108 -0
  8. .gitignore +191 -0
  9. Dockerfile +41 -0
  10. Makefile +87 -0
  11. README.md +239 -0
  12. data/.gitignore +3 -0
  13. data/README.md +176 -0
  14. docs/.gitkeep +0 -0
  15. docs/CardioTrack_ML_Canvas.md +98 -0
  16. docs/Risk_Classification.md +101 -0
  17. dvc.lock +754 -0
  18. dvc.yaml +86 -0
  19. metrics/test/all/.gitignore +3 -0
  20. metrics/test/female/.gitignore +3 -0
  21. metrics/test/male/.gitignore +3 -0
  22. metrics/test/nosex/.gitignore +3 -0
  23. models/.gitignore +3 -0
  24. models/README.md +110 -0
  25. models/all/.gitignore +3 -0
  26. models/female/.gitignore +3 -0
  27. models/male/.gitignore +3 -0
  28. models/nosex/.gitignore +3 -0
  29. notebooks/.gitkeep +0 -0
  30. notebooks/1.0-mc-initial-data-exploration.ipynb +0 -0
  31. predicting_outcomes_in_heart_failure/__init__.py +1 -0
  32. predicting_outcomes_in_heart_failure/app/__init__.py +0 -0
  33. predicting_outcomes_in_heart_failure/app/main.py +183 -0
  34. predicting_outcomes_in_heart_failure/app/routers/cards.py +52 -0
  35. predicting_outcomes_in_heart_failure/app/routers/general.py +19 -0
  36. predicting_outcomes_in_heart_failure/app/routers/model_info.py +95 -0
  37. predicting_outcomes_in_heart_failure/app/routers/prediction.py +135 -0
  38. predicting_outcomes_in_heart_failure/app/schema.py +28 -0
  39. predicting_outcomes_in_heart_failure/app/utils.py +59 -0
  40. predicting_outcomes_in_heart_failure/app/wrapper.py +210 -0
  41. predicting_outcomes_in_heart_failure/config.py +129 -0
  42. predicting_outcomes_in_heart_failure/data/dataset.py +22 -0
  43. predicting_outcomes_in_heart_failure/data/preprocess.py +116 -0
  44. predicting_outcomes_in_heart_failure/data/split_data.py +114 -0
  45. predicting_outcomes_in_heart_failure/modeling/__init__.py +0 -0
  46. predicting_outcomes_in_heart_failure/modeling/evaluate.py +182 -0
  47. predicting_outcomes_in_heart_failure/modeling/explainability.py +202 -0
  48. predicting_outcomes_in_heart_failure/modeling/predict.py +135 -0
  49. predicting_outcomes_in_heart_failure/modeling/train.py +261 -0
  50. pyproject.toml +80 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ venv/
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /config.local
+ /tmp
+ /cache
.dvc/config ADDED
@@ -0,0 +1,6 @@
+ [core]
+     remote = origin
+
+ [remote "origin"]
+     url = s3://dvc
+     endpointurl = https://dagshub.com/se4ai2526-uniba/CardioTrack.s3
.dvcignore ADDED
@@ -0,0 +1,3 @@
+ # Add patterns of files dvc should ignore, which could improve
+ # the performance. Learn more at
+ # https://dvc.org/doc/user-guide/dvcignore
.github/workflows/pynblint.yml ADDED
@@ -0,0 +1,67 @@
+ name: Lint Notebooks (Pynblint)
+
+ on:
+   pull_request:
+     paths:
+       - 'notebooks/**/*.ipynb'
+
+ permissions:
+   contents: read
+
+ jobs:
+   pynblint:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Install uv (for uvx)
+         uses: astral-sh/setup-uv@v3
+
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml','uv.lock') }}
+           restore-keys: |
+             uv-${{ runner.os }}-
+
+       - name: Run pynblint on notebooks directory
+         id: pynblint
+         run: |
+           set -e
+           echo "Running pynblint via uvx (isolated & pinned)..."
+           uvx --from pynblint \
+             --with "click<8.2" \
+             --with "typer<0.12" \
+             --with "lxml[html_clean]" \
+             pynblint notebooks/ > pynblint_report.txt 2>&1 || true
+           cat pynblint_report.txt
+
+       - name: Check for violations
+         run: |
+           if grep -qiE "Traceback|ImportError|ModuleNotFoundError" pynblint_report.txt; then
+             echo "❌ pynblint error."
+             cat pynblint_report.txt
+             exit 1
+           elif grep -q "LINTING RESULTS" pynblint_report.txt; then
+             echo "⚠️ Pynblint found violations in notebooks"
+             echo "violations=true" >> $GITHUB_ENV
+             cat pynblint_report.txt
+             exit 1
+           else
+             echo "✅ No violations found"
+             echo "violations=false" >> $GITHUB_ENV
+           fi
+
+       - name: Upload pynblint report
+         if: always()
+         uses: actions/upload-artifact@v4
+         with:
+           name: pynblint-report
+           path: pynblint_report.txt
+           retention-days: 30
.github/workflows/pytestAndGX.yml ADDED
@@ -0,0 +1,60 @@
+ name: Pytest and GX Validation
+ on:
+   pull_request:
+     branches-ignore:
+       - main
+
+ permissions:
+   contents: read
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       # Install uv and activate cache
+       - uses: astral-sh/setup-uv@v3
+
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+       # Install all dependencies
+       - name: Sync dependencies
+         run: uv sync
+
+       # Install DVC
+       - name: Install DVC
+         run: |
+           uv pip install "dvc-s3" "boto3>=1.36.0" "botocore>=1.36.0"
+
+       - name: Configure DVC credentials
+         run: |
+           uv run dvc remote modify origin --local access_key_id ${{ secrets.DAGSHUB_TOKEN }}
+           uv run dvc remote modify origin --local secret_access_key ${{ secrets.DAGSHUB_TOKEN }}
+
+       - name: Download data and models from DagsHub
+         run: uv run dvc pull
+
+       # Run pytest tests
+       - name: Run pytest tests
+         run: |
+           set -euo pipefail
+           echo "Running pytest tests..."
+           uv run pytest tests/ -v --tb=short
+
+       # Run GX validation scripts
+       - name: Run GX validation scripts
+         run: |
+           set -euo pipefail
+           echo "Running GX validation scripts..."
+           uv run python tests/test_heart_data/raw_test.py
+           uv run python tests/test_heart_data/processed_test.py
.github/workflows/ruff-linter.yml ADDED
@@ -0,0 +1,108 @@
+ name: Lint (Ruff)
+ on:
+   pull_request:
+
+ permissions:
+   contents: write
+
+ jobs:
+   ruff-check:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           ref: ${{ github.head_ref }}
+
+       # Install uv and activate cache
+       - uses: astral-sh/setup-uv@v3
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+       # Install dev deps
+       - name: Sync dev deps
+         run: uv sync --dev
+
+       # Find changed .py files
+       - name: Ruff on changed files (format then check)
+         id: ruff_check_changed_files
+         run: |
+           set -euo pipefail
+           BASE_REF="${{ github.base_ref }}"
+           git fetch --no-tags origin "$BASE_REF" --prune
+
+           git diff --name-only --diff-filter=ACMRT "origin/$BASE_REF...HEAD" > /tmp/changed_all.txt
+
+           CHANGED=$(grep -E '\.py$' /tmp/changed_all.txt || true)
+           if [ -z "$CHANGED" ]; then
+             echo "No modified .py files: skipping Ruff."
+             echo "has_py_changes=false" >> $GITHUB_OUTPUT
+             exit 0
+           fi
+
+           echo "Modified Python files:"
+           echo "$CHANGED" | sed 's/^/ - /'
+
+           echo "$CHANGED" > /tmp/changed_py.txt
+           echo "has_py_changes=true" >> $GITHUB_OUTPUT
+
+       # Autofix
+       - name: Ruff autofix on changed files
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true'
+         id: ruff_autofix
+         run: |
+           set -euo pipefail
+
+           CHANGED=$(cat /tmp/changed_py.txt)
+
+           echo "Ruff autofix on these files:"
+           echo "$CHANGED" | sed 's/^/ - /'
+
+           # Format
+           uv run ruff format $CHANGED || true
+
+           # Lint with autofix
+           uv run ruff check --fix $CHANGED || true
+
+           if uv run ruff check $CHANGED 2>&1 | tee /tmp/ruff_check_result.txt; then
+             echo "No remaining issues"
+             echo "has_remaining_issues=false" >> $GITHUB_OUTPUT
+           else
+             echo "Found remaining issues that cannot be auto-fixed"
+             echo "has_remaining_issues=true" >> $GITHUB_OUTPUT
+           fi
+
+       # Commit on pull request
+       - name: Commit and push changes
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true'
+         run: |
+           set -euo pipefail
+
+           if [ -z "$(git status --porcelain)" ]; then
+             echo "No changes to commit after Ruff autofix."
+             exit 0
+           fi
+
+           git config user.name "github-actions[bot]"
+           git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+           git add .
+           git commit -m "chore: apply ruff check and format auto-fix"
+           git push origin HEAD:${{ github.head_ref }}
+
+       # Fail if there are remaining issues
+       - name: Fail if remaining issues
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true' && steps.ruff_autofix.outputs.has_remaining_issues == 'true'
+         run: |
+           echo "Found errors that cannot be auto-fixed:"
+           cat /tmp/ruff_check_result.txt
+           exit 1
.gitignore ADDED
@@ -0,0 +1,191 @@
+ # Data
+ /data/raw/heart.csv
+
+ # Mac OS-specific storage files
+ .DS_Store
+
+ # vim
+ *.swp
+ *.swo
+
+ ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # MkDocs documentation
+ docs/site/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # pixi.lock should be committed to version control for reproducibility
+ # .pixi/ contains the environments and should not be committed
+ .pixi/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
Dockerfile ADDED
@@ -0,0 +1,41 @@
+ FROM python:3.11.9-slim-bookworm
+
+ # avoid creating unnecessary .pyc, buffers, and pip caches
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PIP_NO_CACHE_DIR=1
+ ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+
+ # install curl and certificates needed to install uv
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     curl ca-certificates \
+     && rm -rf /var/lib/apt/lists/*
+
+ # create a non-root user for added security
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /cardioTrack
+
+ # install uv as user
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ # copy the project files and do uv sync
+ COPY --chown=user pyproject.toml uv.lock ./
+ RUN uv sync --locked --no-install-project
+
+ # copy the rest of the files needed for inference
+ COPY --chown=user predicting_outcomes_in_heart_failure ./predicting_outcomes_in_heart_failure
+ COPY --chown=user models/nosex/random_forest.joblib ./models/nosex/random_forest.joblib
+ COPY --chown=user reports/nosex/random_forest/cv_parameters.json ./reports/nosex/random_forest/cv_parameters.json
+ COPY --chown=user data/interim/preprocess_artifacts/scaler.joblib ./data/interim/preprocess_artifacts/scaler.joblib
+ COPY --chown=user metrics/test/nosex/random_forest.json ./metrics/test/nosex/random_forest.json
+ COPY --chown=user README.md ./README.md
+ COPY --chown=user models/README.md ./models/README.md
+ COPY --chown=user data/README.md ./data/README.md
+
+ EXPOSE 7860
+
+ CMD ["uv", "run", "uvicorn", "predicting_outcomes_in_heart_failure.app.main:app", "--host", "0.0.0.0", "--port", "7860"]
Makefile ADDED
@@ -0,0 +1,87 @@
+ #################################################################################
+ # GLOBALS                                                                       #
+ #################################################################################
+
+ PROJECT_NAME = CardioTrack
+ PYTHON_VERSION = 3.11
+ PYTHON_INTERPRETER = python
+
+ #################################################################################
+ # COMMANDS                                                                      #
+ #################################################################################
+
+ ## Install Python dependencies
+ .PHONY: requirements
+ requirements:
+ 	uv sync
+
+ ## Delete all compiled Python files
+ .PHONY: clean
+ clean:
+ 	find . -type f -name "*.py[co]" -delete
+ 	find . -type d -name "__pycache__" -delete
+
+ ## Lint using ruff (use `make format` to do formatting)
+ .PHONY: lint
+ lint:
+ 	ruff format --check
+ 	ruff check
+
+ ## Format source code with ruff
+ .PHONY: format
+ format:
+ 	ruff check --fix
+ 	ruff format
+
+ ## Run tests
+ .PHONY: test
+ test:
+ 	python -m pytest tests
+
+ ## Set up Python interpreter environment
+ .PHONY: create_environment
+ create_environment:
+ 	uv venv --python $(PYTHON_VERSION)
+ 	@echo ">>> New uv virtual environment created. Activate with:"
+ 	@echo ">>> Windows: .\\\\.venv\\\\Scripts\\\\activate"
+ 	@echo ">>> Unix/macOS: source ./.venv/bin/activate"
+
+ #################################################################################
+ # PROJECT RULES                                                                 #
+ #################################################################################
+
+ ## Make dataset
+ .PHONY: data
+ data: requirements
+ 	$(PYTHON_INTERPRETER) predicting_outcomes_in_heart_failure/dataset.py
+
+ #################################################################################
+ # Self Documenting Commands                                                     #
+ #################################################################################
+
+ .DEFAULT_GOAL := help
+
+ define PRINT_HELP_PYSCRIPT
+ import re, sys; \
+ lines = '\n'.join([line for line in sys.stdin]); \
+ matches = re.findall(r'\n## (.*)\n[\s\S]+?\n([a-zA-Z_-]+):', lines); \
+ print('Available rules:\n'); \
+ print('\n'.join(['{:25}{}'.format(*reversed(match)) for match in matches]))
+ endef
+ export PRINT_HELP_PYSCRIPT
+
+ help:
+ 	@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
README.md ADDED
@@ -0,0 +1,239 @@
+ ---
+ title: CardioTrack API
+ emoji: ❤️
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ app_port: 7860
+ ---
+ # Predicting Outcomes in Heart Failure
+
+ ## Table of Contents
+ 1. [Project Overview](#project-overview)
+ 2. [Project Organization](#project-organization)
+ 3. [DVC Pipeline Defined](#dvc-pipeline-defined)
+ 4. [Milestones Summary](#milestones-summary)
+     - [Milestone 1 - Inception](#milestone-1---inception)
+     - [Milestone 2 - Reproducibility](#milestone-2---reproducibility)
+     - [Milestone 3 - Quality Assurance](#milestone-3---quality-assurance)
+     - [Milestone 4 - API Integration](#milestone-4---api-integration)
+
+ ## Project Overview
+ <a target="_blank" href="https://cookiecutter-data-science.drivendata.org/">
+     <img src="https://img.shields.io/badge/CCDS-Project%20template-328F97?logo=cookiecutter" />
+ </a>
+
+ This project develops a predictive pipeline for patient outcome prediction in heart failure, using a publicly available dataset of clinical records. The goal is to design and evaluate machine learning models within a reproducible workflow that can be integrated into larger systems for clinical decision support. The workflow addresses data heterogeneity, defines consistent preprocessing and feature engineering strategies, and explores alternative modeling approaches with systematic evaluation using clinically relevant metrics. It also emphasizes model transparency and auditability, ensuring that the resulting pipeline can be deployed as a reliable, adaptable software component in healthcare applications. The project aims not only to improve baseline predictive performance but also to demonstrate how data-driven models can be effectively integrated into end-to-end AI-enabled healthcare systems.
+
+ ## Project Organization
+
+ ```
+ ├── LICENSE            <- Open-source license if one is chosen
+ ├── Makefile           <- Makefile with convenience commands like `make data` or `make train`
+ ├── README.md          <- The top-level README for developers using this project.
+ ├── data
+ │   ├── external       <- Data from third party sources.
+ │   ├── interim        <- Intermediate data that has been transformed.
+ │   ├── processed      <- The final, canonical data sets for modeling.
+ │   └── raw            <- The original, immutable data dump.
+
+ ├── docs               <- A default mkdocs project; see www.mkdocs.org for details
+
+ ├── models             <- Trained and serialized models, model predictions, or model summaries
+
+ ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+ │                         the creator's initials, and a short `-` delimited description, e.g.
+ │                         `1.0-jqp-initial-data-exploration`.
+
+ ├── pyproject.toml     <- Project configuration file with package metadata for
+ │                         predicting_outcomes_in_heart_failure and configuration for tools like black
+
+ ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
+
+ ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
+ │   └── figures        <- Generated graphics and figures to be used in reporting
+
+ ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+ │                         generated with `pip freeze > requirements.txt`
+
+ ├── setup.cfg          <- Configuration file for flake8
+
+ └── predicting_outcomes_in_heart_failure   <- Source code for use in this project.
+
+     ├── __init__.py    <- Makes predicting_outcomes_in_heart_failure a Python module
+
+     ├── config.py      <- Store useful variables and configuration
+
+     ├── data
+     │   ├── __init__.py
+     │   ├── dataset.py        <- Scripts to download or generate data
+     │   ├── preprocess.py     <- Data preprocessing code
+     │   └── split_data.py     <- Split dataset into train and test code
+
+     ├── features.py    <- Code to create features for modeling
+
+     ├── modeling
+     │   ├── __init__.py
+     │   ├── predict.py        <- Code to run model inference with trained models
+     │   └── train.py          <- Code to train models
+
+     └── plots.py       <- Code to create visualizations
+ ```
+
+ ## DVC Pipeline Defined
+ ```
+ +---------------+
+ | download_data |
+ +---------------+
+         *
+         *
+         *
+ +---------------+
+ | preprocessing |
+ +---------------+
+         *
+         *
+         *
+  +------------+
+  | split_data |
+  +------------+
+      *      *
+      *      *
+      *      *
+ +----------+  *
+ | training |  *
+ +----------+  *
+      *      *
+      *      *
+      *      *
+  +------------+
+  | evaluation |
+  +------------+
+ ```
+
+ ## Milestones Summary
+
+ ### Milestone 1 - Inception
+ During this milestone, the **CCDS Project Template** was used as the foundation for organizing the project.
+ The main conceptual and structural components of the system were defined, following the template guidelines to ensure consistency and traceability.
+
+ Additionally, a **Machine Learning Canvas** has been added in the [`docs/`](./docs) folder.
+ It outlines the model objectives, the data to be used, and the key methodological aspects planned for the next phases of the project.
+
+ ### Milestone 2 - Reproducibility
+ Milestone 2 introduces **reproducibility**, from **data management** to **model training and evaluation**. This includes a fully automated pipeline, experiment tracking, and model registry integration, ensuring every step can be consistently reproduced and monitored.
+
+ #### Exploratory Data Analysis (EDA)
+ As part of the early steps, we added and refined an **Exploratory Data Analysis** to better understand the dataset, its distribution, and the relationships between variables. This helped define the preprocessing and modeling strategies used later.
+
+ #### DVC Initialization and Pipeline Setup
+ We initialized **DVC** and configured a full pipeline to automate the main steps of the ML workflow:
+ - Automatic data **download**
+ - **Preprocessing**
+ - **Data splitting**
+ - **Training** and **evaluation**
+
+ The pipeline is fully reproducible and version-controlled through DVC.
+
+ #### Model Training and Experiment Tracking
+ We implemented the **training scripts** and integrated **MLflow** for experiment tracking.
+ Three models are trained and evaluated within this workflow:
+ - Decision Tree
+ - Random Forest
+ - Logistic Regression
+
+ Each experiment is logged to MLflow.
+
+ #### Model Registry and Thresholds
+ Models that reach or exceed the predefined **performance thresholds** (as defined in the ML Canvas) are automatically **saved to the model registry**.
+
+ ### Milestone 3 - Quality Assurance
+
+ In this milestone, we introduced a **Quality Assurance** layer to the system.
+
+ #### Static Linters
+ Two static linters were added to improve code style and consistency:
+
+ - **Ruff** for Python files in the `predicting_outcomes_in_heart_failure` and `tests` folders.
+   It checks formatting, syntax, and common anti-patterns, and is integrated into the GitHub workflow via an *action*.
+ - **Pynblint** for Jupyter notebooks, also integrated into the GitHub workflow through a dedicated *action*.
+
+ #### Data Quality
+ We implemented **data quality checks** on both raw and processed data using **Great Expectations**.
+ These validations help to:
+
+ - detect anomalies or invalid values at the data source
+ - prevent the propagation of data issues into downstream processes
+
+ #### Code Quality
+ We added automated **unit and integration tests** using **pytest**, covering the main modules and functionalities of the system.
+
+ #### ML Pipeline Enhancements
+ We applied the following enhancements to the ML pipeline:
+
+ - Refactored preprocessing with gender-based dataset variants.
+ - Added validation (e.g., an error on single-row datasets).
+ - Saved the StandardScaler as a preprocessing artifact.
+ - Updated the split logic and the DVC pipeline.
+ - Training now creates variant-specific MLflow experiments.
+ - Added a RandomOverSampler to address class imbalance.
+ - Updated evaluation and inference to align with the new structure.
+
+ #### Explainability
+ We added an explainability module:
+
+ - Added a SHAP explainability module.
+ - Added tests for the explainability functionality.
+
+ #### Risk Classification
+ We added a **Risk Classification** analysis for the system in accordance with **IMDRF** and **AI Act** regulations.
+ The documentation is available in the [`docs/`](./docs) folder.
+
+ ### Milestone 4 - API Integration
+
+ During Milestone 4, we implemented a fully functional API, together with a dataset card and a model card for the champion model and the dataset it uses.
+ The API is structured into four main routers:
+
+ #### **General Router**
+ - **GET /**
+   Returns a welcome message and confirms that the API is running.
+
+ #### **Prediction Router**
+ - **POST /predictions**
+   Generates a binary prediction (0/1) for a single patient sample.
+
+ - **POST /predict-batch**
+   Accepts a list of patient samples and returns a prediction for each element in the batch.
+
+ - **POST /explanations**
+   Produces SHAP-based explanations for a single input and returns the URL of the generated SHAP waterfall plot.
+
+ #### **Model Info Router**
+ - **GET /model/hyperparameters**
+   Returns the hyperparameters and cross-validation results of the model defined in `MODEL_PATH`.
+
+ - **GET /model/metrics**
+   Returns the test-set metrics stored during the model evaluation stage.
+
+ #### **Cards Router**
+ - **GET /card/{card_type}**
+   Returns the content of a “card” file (dataset card or model card).
+
+ ### **Cards**
+
+ During this milestone, we also created:
+
+ - a **dataset card** describing the dataset used by the champion model
+ - a **model card** documenting the champion model itself
+
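
As a quick illustration of the Prediction Router described in this README, here is a minimal client sketch. It assumes the service is running locally on port 7860 (as exposed by the Dockerfile), that the deployed variant is the `nosex` model (so the payload omits `Sex`), and that the request fields mirror the dataset card; the authoritative request schema lives in `predicting_outcomes_in_heart_failure/app/schema.py` and may differ.

```python
# Minimal client sketch for the CardioTrack API. Assumptions: host/port from the
# Dockerfile, payload fields guessed from the dataset card (Sex omitted because
# the deployed model is the "nosex" variant) -- check app/schema.py for the
# authoritative request schema.
import requests

BASE_URL = "http://localhost:7860"

sample = {  # hypothetical single-patient payload
    "Age": 54,
    "ChestPainType": "ASY",
    "RestingBP": 140,
    "Cholesterol": 239,
    "FastingBS": 0,
    "RestingECG": "Normal",
    "MaxHR": 160,
    "ExerciseAngina": "N",
    "Oldpeak": 1.2,
    "ST_Slope": "Flat",
}

# Single prediction: POST /predictions returns a binary 0/1 outcome.
resp = requests.post(f"{BASE_URL}/predictions", json=sample, timeout=30)
resp.raise_for_status()
print(resp.json())

# Batch prediction: POST /predict-batch accepts a list of samples and
# returns one prediction per element.
resp = requests.post(f"{BASE_URL}/predict-batch", json=[sample, sample], timeout=30)
resp.raise_for_status()
print(resp.json())
```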
data/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /processed
+ /interim
+ /raw
data/README.md ADDED
@@ -0,0 +1,176 @@
+ # Dataset Card
+
+ ## Table of Contents
+ - [Dataset Description](#dataset-description)
+   - [Dataset Summary](#dataset-summary)
+   - [Supported Tasks](#supported-tasks)
+   - [Languages](#languages)
+ - [Dataset Structure](#dataset-structure)
+   - [Data Instances](#data-instances)
+   - [Data Fields](#data-fields)
+ - [Dataset Creation](#dataset-creation)
+   - [Source Data](#source-data)
+   - [Annotations](#annotations)
+   - [Personal and Sensitive Information](#personal-and-sensitive-information)
+ - [Considerations for Using the Data](#considerations-for-using-the-data)
+   - [Social Impact of Dataset](#social-impact-of-dataset)
+   - [Discussion of Biases](#discussion-of-biases)
+ - [Additional Information](#additional-information)
+   - [Dataset Curators](#dataset-curators)
+   - [Citation Information](#citation-information)
+
+ ## Dataset Description
+
+ - **Homepage:** https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
+
+ ### Dataset Summary
+
+ This dataset contains anonymized clinical data used to predict the risk of heart failure.
+ It includes **918 patient records**, **11 clinical features**, and **one target variable**.
+ The original dataset was downloaded from Kaggle and was created by merging five well-known cardiology datasets.
+
+ The version used in this project underwent additional preprocessing steps, including standardization, normalization, categorical encoding, and removal of the Sex feature. The resulting dataset is used for experimentation and model development.
+
+ ### Supported Tasks
+
+ This dataset can be used for a variety of machine learning tasks, including:
+
+ - **Binary Classification**
+
+   Predicting whether a patient has heart disease.
+ - **Risk Scoring / Clinical Risk Stratification**
+
+   Estimating cardiac risk based on clinical variables.
+ - **Explainable AI (XAI)**
+
+   Useful for feature-importance analysis and interpretability.
+
+ ### Languages
+
+ English **(en)**
+
+ ## Dataset Structure
+
+ ### Data Instances
+
+ Each instance represents one patient. Example:
+
+ | Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease |
+ |-----|-----|---------------|-----------|-------------|-----------|------------|-------|----------------|---------|----------|--------------|
+ | 54  | M   | ASY           | 140       | 239         | 0         | Normal     | 160   | N              | 1.2     | Flat     | 1            |
+
+ ### Data Fields
+
+ | Field          | Type     | Description                                        |
+ |----------------|----------|----------------------------------------------------|
+ | Age            | int      | Patient age in years                               |
+ | Sex            | binary   | Patient sex (M = male, F = female)                 |
+ | ChestPainType  | category | Chest pain type (TA, ATA, NAP, ASY)                |
+ | RestingBP      | int      | Resting blood pressure (mm Hg)                     |
+ | Cholesterol    | int      | Serum cholesterol (mg/dL)                          |
+ | FastingBS      | binary   | Fasting blood sugar (1 if >120 mg/dL, 0 otherwise) |
+ | RestingECG     | category | Resting ECG results (Normal, ST, LVH)              |
+ | MaxHR          | int      | Maximum heart rate achieved                        |
+ | ExerciseAngina | binary   | Exercise-induced angina (Y/N)                      |
+ | Oldpeak        | float    | ST depression relative to rest                     |
+ | ST_Slope       | category | Slope of the ST segment (Up, Flat, Down)           |
+ | HeartDisease   | binary   | Target variable (1 = disease, 0 = no disease)      |
+
+ ## Dataset Creation
+
+ ### Source Data
+
+ The preprocessed dataset used in this project originates from the Kaggle dataset *“Heart Failure Prediction Dataset”*.
+
+ The raw dataset was created by merging five widely used cardiology datasets:
+
+ - Cleveland (303 samples)
+ - Hungarian (294 samples)
+ - Switzerland (123 samples)
+ - Long Beach VA (200 samples)
+ - Statlog (270 samples)
+
+ The Kaggle author selected the 11 common features and merged the datasets into a unified collection of **1,190 records**, then removed **272 duplicates**, resulting in **918 unique samples**.
+
+ All initial merging and normalization steps were performed by the dataset author on Kaggle.
+
+ ### Annotations
+
+ No manual annotations were added.
+ The target variable `HeartDisease` is already included in the original dataset.
+
+ ### Personal and Sensitive Information
+
+ Although the dataset contains clinical information (sensitive under GDPR), it is fully anonymized:
+
+ - No personal identifiers (name, address, contact details, IDs).
+ - All sources were already anonymized before publication.
+ - No biometric or genetic data are included.
+
+ Thus, while clinically sensitive, the dataset does **not** pose identifiable privacy risks.
+
+ ## Considerations for Using the Data
+
+ ### Social Impact of Dataset
+
+ The dataset can support research and development of models for cardiac risk prediction and early detection.
+
+ However:
+
+ - Models trained on this dataset **must not be used as standalone diagnostic tools**.
+ - They should **not** be the sole basis for clinical decisions.
+ - Misuse in healthcare contexts may lead to incorrect risk assessment.
+
+ ### Discussion of Biases
+
+ This dataset may contain several sources of bias that can affect model performance and fairness:
+
+ - The data comes from multiple hospitals and countries, each with different patient profiles and clinical protocols. Some groups may be underrepresented.
+ - The source datasets used different diagnostic practices and measurement standards, which may introduce noise or inconsistency in labels and clinical values.
+ - Only 11 features are included, omitting other relevant clinical variables. This can cause proxy bias or oversimplification of cardiac risk.
+ - Some datasets are older and may not reflect current medical practices or population characteristics.
+
+ ## Additional Information
+
+ ### Dataset Curators
+
+ The original dataset was created and published by **[fedesoriano](https://www.kaggle.com/fedesoriano)** on Kaggle.
+
+ The preprocessed dataset was curated by the **CardioTrack** team:
+
+ - [Fabrizio Rosmarino](https://github.com/Fabrizio250)
+ - [Martina Capone](https://github.com/Martycap)
+ - [Donato Boccuzzi](https://github.com/donatooooooo)
+
+ Work carried out as part of the *Software Engineering for AI-Enabled Systems* program at the University of Bari.
+
+ ### Citation Information
+
+ If you use this dataset, please cite:
+
+ **Original Dataset**
+ Soriano, F. (2021). *Heart Failure Prediction Dataset*. Kaggle.
+ https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
+
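
To make the field table in this dataset card concrete, the sketch below loads the raw CSV and checks the documented columns and categorical domains with pandas. It is illustrative only: the path matches the DVC-tracked `data/raw/heart.csv`, the column order is assumed to follow the table above, and these checks are not the project's Great Expectations suite.

```python
# Illustrative sanity checks for the raw dataset, based on the dataset card's
# field table. Not the project's Great Expectations suite -- just a sketch.
import pandas as pd

df = pd.read_csv("data/raw/heart.csv")

expected_columns = [
    "Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS",
    "RestingECG", "MaxHR", "ExerciseAngina", "Oldpeak", "ST_Slope", "HeartDisease",
]
assert list(df.columns) == expected_columns, "unexpected column layout"
assert len(df) == 918, "dataset card documents 918 unique records"

# Categorical domains documented in the field table
assert set(df["ChestPainType"].unique()) <= {"TA", "ATA", "NAP", "ASY"}
assert set(df["RestingECG"].unique()) <= {"Normal", "ST", "LVH"}
assert set(df["ST_Slope"].unique()) <= {"Up", "Flat", "Down"}
assert df["ExerciseAngina"].isin(["Y", "N"]).all()
assert df["HeartDisease"].isin([0, 1]).all()
```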
docs/.gitkeep ADDED
File without changes
docs/CardioTrack_ML_Canvas.md ADDED
@@ -0,0 +1,98 @@
+ # **CARDIO TRACK - MACHINE LEARNING CANVAS**
+
+ **Designed for:** Giulio Mallardi
+ **Designed by:** D. Boccuzzi, M. Capone, F. Rosmarino
+ **Date:** 17/10/2025
+ **Iteration:** 2
+
+ ---
+
+ ## **1. Prediction Task**
+
+ The Cardio Track ML system performs a **binary classification** task based on clinical data from individual patients, with the goal of predicting the presence or absence of heart disease.
+ Specifically, the model analyzes each patient’s clinical features and risk factors to estimate the likelihood of developing heart failure.
+
+ There are two possible prediction outcomes:
+ - **Positive:** the patient shows indicators of heart failure.
+ - **Negative:** no signs of disease are detected.
+
+ ---
+
+ ## **2. Decisions**
+
+ The system’s predictions support **cardiologists** and **public health institutions (ASL)**.
+ For positive cases, cardiologists can order further tests, start monitoring, and define personalized treatments.
+ Aggregated results help public health institutions plan resources, prioritize facilities, and promote prevention and lifestyle improvements for long-term cardiovascular health.
+
+ ---
+
+ ## **3. Value Proposition**
+
+ The main end users are **cardiologists** and **local health authorities (ASL)**.
+ For cardiologists, the system provides a reliable tool to assist in the early diagnosis of heart failure.
+ For health authorities, it enables more efficient management of healthcare resources by optimizing the distribution of diagnostic and therapeutic services.
+
+ Overall, the Cardio Track ML system aims to support **prevention** and **early detection** of heart failure, improving patient outcomes and reducing mortality rates.
+
+ ---
+
+ ## **4. Data Collection**
+
+ Data collection will be a **continuous and evolving process**.
+ Real, high-quality clinical data will be carefully labeled and verified by domain experts, ensuring data quality and consistency.
+ New patient data collected through standardized clinical protocols will periodically update and improve the model, allowing it to adapt and learn over time.
+
+ ---
+
+ ## **5. Data Sources**
+
+ The Cardio Track ML system will rely on a **publicly available dataset** that includes clinical parameters from both healthy individuals and patients diagnosed with heart failure.
+ The reference dataset is the [Heart Failure Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction).
+
+ ---
+
+ ## **6. Impact Simulation**
+
+ Before release, the model will undergo rigorous validation on an independent test set.
+
+ As a baseline, minimum thresholds are defined for the key evaluation metrics: **F1-score**, **recall**, and **accuracy** ≥ 0.80, and **ROC-AUC** ≥ 0.85. These values ensure that the model maintains high discriminative capability while minimizing the risk of undetected clinical cases.
+
+ We will assess potential bias across demographic subgroups to ensure fairness and consistent model performance.
+ Any detected bias will be mitigated through rebalancing techniques or threshold adjustment to guarantee equitable treatment across all patient categories.
+
+ ---
+
+ ## **7. Making Predictions**
+
+ Predictions will be made on demand, triggered whenever new or updated clinical data becomes available in the hospital database.
+ Real-time processing is not required, but timely inference will support the decision-making workflow.
+ All computations will be executed **on-premises**, using the existing hospital IT infrastructure to ensure **data privacy** and **security**.
+
+ ---
+
+ ## **8. Building Models**
+
+ The Cardio Track ML system will use a **single main model** in production.
+ Model updates will occur periodically as new data is integrated, or when a new version demonstrates statistically significant improvements in the key metrics: **F1-score**, **recall**, **accuracy**, and **ROC-AUC**.
+
+ Model explainability will be ensured through feature-importance analysis. Feature impact will be quantified and presented in feature-importance charts, allowing medical experts to interpret and validate the relevance of the clinical factors used in the decision process.
+
+ ---
+
+ ## **9. Features**
+
+ The Heart Failure Prediction Dataset already provides a complete set of clinical features, so there is no need to extract them directly from medical exams or diagnostic reports.
+
+ **Included features:**
+ Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, and ST_Slope.
+
+ These features capture key cardiovascular risk factors such as hypertension, diabetes, hyperlipidemia, obesity, and other pre-existing heart conditions, making the dataset suitable for early heart failure diagnosis.
+
+ ---
+
+ ## **10. Monitoring**
+
+ After deployment, system performance will be continuously **monitored** to detect potential drifts or degradations over time.
+ Key metrics include **F1-score**, **recall**, **accuracy**, and **ROC-AUC**, reviewed at regular intervals.
+
+ Clinician feedback will also be collected to assess **usability** and **clinical relevance**, ensuring continuous model improvement and alignment with real-world medical needs.
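
The canvas thresholds in section 6 pair with the model-registry rule from the README's Milestone 2 (models are registered only when they meet the canvas thresholds). Below is a minimal sketch of that gate; the metrics dict shape is an assumption, and the actual gating logic lives in the project's training and evaluation scripts.

```python
# Minimal sketch of the registry gate implied by the canvas: a candidate model
# qualifies only if every key metric reaches its minimum threshold (section 6).
# The metrics dict shape is assumed; the real implementation lives in the
# training/evaluation scripts.

THRESHOLDS = {
    "f1": 0.80,
    "recall": 0.80,
    "accuracy": 0.80,
    "roc_auc": 0.85,
}

def meets_canvas_thresholds(metrics: dict[str, float]) -> bool:
    """Return True when every key metric reaches its minimum threshold."""
    return all(metrics.get(name, 0.0) >= minimum for name, minimum in THRESHOLDS.items())

# Example: this hypothetical candidate passes all four gates.
candidate = {"f1": 0.84, "recall": 0.82, "accuracy": 0.86, "roc_auc": 0.91}
assert meets_canvas_thresholds(candidate)
```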
docs/Risk_Classification.md ADDED
@@ -0,0 +1,101 @@
+ # **Risk Classification**
+
+ ## **1. Purpose**
+
+ This document describes the risk classification of a Software as a Medical Device (SaMD) designed to identify the presence or absence of signs of heart failure through a **binary classification based on clinical data**.
+ The classification is developed using the **IMDRF SaMD Risk Categorization** framework and additional European regulatory references (AI Act, MDR).
+
+ ---
+
+ ## **2. Intended Use**
+
+ The system performs a **binary classification** task based on clinical data from individual patients, aiming to predict the presence or absence of heart disease. Specifically, the model analyzes each patient’s clinical features and risk factors to identify the potential presence of heart failure.
+
+ The model outputs two possible classification results:
+
+ * **Positive:** the patient shows indicators compatible with heart failure.
+ * **Negative:** no signs of the condition are detected.
+
+ ### **2.1 Clinical Role**
+
+ * The software output is intended as a **Clinical Decision Support (CDS)** tool.
+ * The intended user is a **qualified medical professional**.
+ * The software **does not perform diagnosis**, **does not make autonomous therapeutic decisions**, and **is not intended for use in emergency settings**.
+ * The information provided supports, but does not replace, clinical judgement.
+
+ ---
+
+ ## **3. IMDRF SaMD Risk Categorization**
+
+ The IMDRF framework evaluates two key dimensions:
+
+ 1. **The significance of the information provided by the software**
+ 2. **The severity of the clinical condition addressed**
+
+ ### **3.1 Significance of the Information – *Treat/Diagnose***
+
+ **Rationale:**
+
+ * The software provides a **binary risk classification** that may influence clinical decisions such as follow-up, diagnostic investigation, or changes in patient management.
+ * The output goes beyond merely describing clinical status (the “inform” level), contributing instead to medical decision-making.
+ * As the system supports decisions relevant to diagnosis and treatment, it falls within the **Treat/Diagnose** category of the IMDRF framework.
+
+ ### **3.2 Severity of the Clinical Condition – *Serious***
+
+ **Rationale:**
+
+ * Heart failure is a serious medical condition with potentially significant complications.
+ * The system is not intended for emergency use, does not initiate immediate life-saving actions, and operates within routine or preventive clinical care.
+ * The presence of a medical professional mitigates the risk of immediate harm due to software errors.
+ * In the intended use context, the condition is therefore appropriately classified as **Serious**, not “Critical”.
+
+ ### **3.3 IMDRF Classification Result**
+
+ | Significance   | Condition | IMDRF Category |
+ | -------------- | --------- | -------------- |
+ | Treat/Diagnose | Serious   | **III**        |
+
+ ---
+
+ ## **4. AI Act – Probable Classification as High-Risk AI**
+
+ According to **Regulation (EU) 2024/1689 (the AI Act)**, artificial intelligence systems used as medical devices or as components of medical devices regulated under the MDR/IVDR are included among **High-Risk AI Systems**, as listed in Annex III.
+
+ For the system under consideration:
+
+ * it meets the MDR definition of **SaMD**;
+ * it supports clinically relevant decisions;
+ * it may influence patient management concerning a serious medical condition.
+
+ Therefore, the system can be **reasonably considered a High-Risk AI System** under the AI Act.
+ This is not a definitive classification (the formal designation will depend on MDR processes and the final technical documentation), but it represents a consistent regulatory interpretation based on the software’s intended purpose and domain.
+
+ ---
+
+ ## **5. Conclusion**
+
+ The software is classified as:
+
+ * **SaMD, IMDRF Category III**, based on:
+
+   * information of the **Treat/Diagnose** type;
+   * management of a condition categorized as **Serious**.
+
+ This classification does not represent the final MDR class but provides a robust basis for risk assessment and regulatory positioning, including the likely classification as **High-Risk AI** under the AI Act.
+
+ ---
+
+ ## **6. MDR (EU) – Additional Note**
+
+ IMDRF categorization does not directly determine the MDR class but offers a helpful conceptual framework.
+
+ The European MDR classification will be established through:
+
+ * **MDR 2017/745**, Annex VIII
+ * **Rule 11**, specific to medical device software
+ * **MDCG 2019-11**, interpretative guidance
+
+ Based on the system’s functionality (clinical classification supporting diagnosis/prognosis), an assignment of at least **Class IIa** is likely.
+ However, the final class will depend on the risk evaluation and the risk-control measures implemented.
dvc.lock ADDED
@@ -0,0 +1,754 @@
+ schema: '2.0'
+ stages:
+   download_data:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/dataset.py
+     deps:
+     - path: predicting_outcomes_in_heart_failure/data/dataset.py
+       hash: md5
+       md5: 2896ae3ebb48acbbfa118415494c70ef
+       size: 559
+     outs:
+     - path: data/raw/heart.csv
+       hash: md5
+       md5: ab21f2524241ed14b321bcaf40c8b86e
+       size: 35921
+   preprocessing:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/preprocess.py
+     deps:
+     - path: data/raw/heart.csv
+       hash: md5
+       md5: ab21f2524241ed14b321bcaf40c8b86e
+       size: 35921
+     - path: predicting_outcomes_in_heart_failure/data/preprocess.py
+       hash: md5
+       md5: a3586b10fae9eed2183fb516b2298273
+       size: 4328
+     outs:
+     - path: data/interim/preprocess_artifacts/scaler.joblib
+       hash: md5
+       md5: 224537cb262510335c5515d6156952d6
+       size: 1023
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+   split_data:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/split_data.py
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 068ee60f2ab294944d3392ef6aa033e5
+       size: 2262
+     outs:
+     - path: data/processed
+       hash: md5
+       md5: 7809f68c8d582ea4f49135f432bab71f.dir
+       size: 149721
+       nfiles: 2
+   training:
+     cmd: uv run predicting_outcomes_in_heart_failure/modeling/train.py
+     deps:
+     - path: data/processed/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: 6e81f5b43ee6b2698cf3d582a9712c46
+       size: 4811
+     outs:
+     - path: models/decision_tree.joblib
+       hash: md5
+       md5: 2277d8f53277419bf3594d2b2dd6c8b5
+       size: 5561
+     - path: models/logreg.joblib
+       hash: md5
+       md5: bf500ad9a04a83594316bd2571584dc4
+       size: 1519
+     - path: models/random_forest.joblib
+       hash: md5
+       md5: da8670002f66b3fcb7e27ea32c33f14d
+       size: 9919545
+     - path: reports/decision_tree
+       hash: md5
+       md5: 60beca0dca76c93ee85a1874eb68a1e9.dir
+       size: 838814
+       nfiles: 2
+     - path: reports/logreg
+       hash: md5
+       md5: a6b3653e15d6d1b1472d1faeb11bd3cb.dir
+       size: 41389
+       nfiles: 2
+     - path: reports/random_forest
+       hash: md5
+       md5: 06127fd0761a967d67577c88fc428426.dir
+       size: 175536
+       nfiles: 2
+   evaluation:
+     cmd: uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py
+     deps:
+     - path: data/processed/test.csv
+       hash: md5
+       md5: d06bcd540eff4a5c8e6dd668a2b148ed
+       size: 45174
+     - path: models
+       hash: md5
+       md5: f8d5e9622b5c447a4ed7b66ad4608927.dir
+       size: 9926687
+       nfiles: 5
+     - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
+       hash: md5
+       md5: ee20dc257b9ac063d61a815401341836
+       size: 3256
+   split_data@all:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       all\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/all
+       hash: md5
+       md5: 7809f68c8d582ea4f49135f432bab71f.dir
+       size: 149721
+       nfiles: 2
+   split_data@female:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       female\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/female
+       hash: md5
+       md5: d873e58476d480ce41edfc7806cbde86.dir
+       size: 31872
+       nfiles: 2
+   split_data@male:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       male\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/male
+       hash: md5
+       md5: 248b95ea1cff032fcea0092636795342.dir
+       size: 118331
+       nfiles: 2
+   split_data@nosex:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       nosex\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/nosex
+       hash: md5
+       md5: d6f6cbb3681cea6a652efef8141ef546.dir
+       size: 147879
+       nfiles: 2
+   training@0:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model logreg\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/logreg.joblib
+       hash: md5
+       md5: 44595213dc6024d0c3f043d080145c29
+       size: 1519
+     - path: reports/all/logreg
+       hash: md5
+       md5: c3345c2b9f17869574dd78e20e9c5829.dir
+       size: 11970
+       nfiles: 2
+   training@1:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model random_forest\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/random_forest.joblib
+       hash: md5
+       md5: 93d7fcec110a443bd85ef6579c2a5c2e
+       size: 3441465
+     - path: reports/all/random_forest
+       hash: md5
+       md5: b929c6714dd61f762452554919b9b5e8.dir
+       size: 19446
+       nfiles: 2
+   training@2:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model decision_tree\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/decision_tree.joblib
+       hash: md5
+       md5: aedace81fc3fca71a5cbdb80f9cb5b28
+       size: 2521
+     - path: reports/all/decision_tree
+       hash: md5
+       md5: ee869eabad22f17701ddd17ddb2f5474.dir
+       size: 7025256
+       nfiles: 2
+   training@3:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       female --model logreg\n"
+     deps:
+     - path: data/processed/female/train.csv
+       hash: md5
+       md5: b37819cf2f8e1a53a7e715a46299dab0
+       size: 22207
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
316
+ size: 7931
317
+ outs:
318
+ - path: models/female/logreg.joblib
319
+ hash: md5
320
+ md5: 3b8764d1ac710c8a250074bf2d8be1ea
321
+ size: 1519
322
+ - path: reports/female/logreg
323
+ hash: md5
324
+ md5: 4b14fe2da9a58fc5ed3e0f7587efe7a7.dir
325
+ size: 8706
326
+ nfiles: 2
327
+ training@4:
328
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
329
+ female --model random_forest\n"
330
+ deps:
331
+ - path: data/processed/female/train.csv
332
+ hash: md5
333
+ md5: b37819cf2f8e1a53a7e715a46299dab0
334
+ size: 22207
335
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
336
+ hash: md5
337
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
338
+ size: 7931
339
+ outs:
340
+ - path: models/female/random_forest.joblib
341
+ hash: md5
342
+ md5: 343a3a957a3a505be860c350cebd2af8
343
+ size: 880185
344
+ - path: reports/female/random_forest
345
+ hash: md5
346
+ md5: 23c999394e9140be73cb6db2ee039d2f.dir
347
+ size: 13839
348
+ nfiles: 2
349
+ training@5:
350
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
351
+ female --model decision_tree\n"
352
+ deps:
353
+ - path: data/processed/female/train.csv
354
+ hash: md5
355
+ md5: b37819cf2f8e1a53a7e715a46299dab0
356
+ size: 22207
357
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
358
+ hash: md5
359
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
360
+ size: 7931
361
+ outs:
362
+ - path: models/female/decision_tree.joblib
363
+ hash: md5
364
+ md5: 432ddc9ffd54955ca752fc6b7da9a312
365
+ size: 7161
366
+ - path: reports/female/decision_tree
367
+ hash: md5
368
+ md5: 8675acf1093c63f8758331c9aef72484.dir
369
+ size: 5042622
370
+ nfiles: 2
371
+ training@6:
372
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
373
+ male --model logreg\n"
374
+ deps:
375
+ - path: data/processed/male/train.csv
376
+ hash: md5
377
+ md5: 377755a75869ae39a64b39120c09aa5b
378
+ size: 82611
379
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
380
+ hash: md5
381
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
382
+ size: 7931
383
+ outs:
384
+ - path: models/male/logreg.joblib
385
+ hash: md5
386
+ md5: 703f955be56bea513500f9bdf1b541b3
387
+ size: 1535
388
+ - path: reports/male/logreg
389
+ hash: md5
390
+ md5: fc5c9fda539597ab95b47e3cef544ae4.dir
391
+ size: 11191
392
+ nfiles: 2
393
+ training@7:
394
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
395
+ male --model random_forest\n"
396
+ deps:
397
+ - path: data/processed/male/train.csv
398
+ hash: md5
399
+ md5: 377755a75869ae39a64b39120c09aa5b
400
+ size: 82611
401
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
402
+ hash: md5
403
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
404
+ size: 7931
405
+ outs:
406
+ - path: models/male/random_forest.joblib
407
+ hash: md5
408
+ md5: a0b69f9da0287b8f8cc06f65ee0aa994
409
+ size: 2747049
410
+ - path: reports/male/random_forest
411
+ hash: md5
412
+ md5: 89011934a73df9bc579839a96f4e24eb.dir
413
+ size: 17913
414
+ nfiles: 2
415
+ training@8:
416
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
417
+ male --model decision_tree\n"
418
+ deps:
419
+ - path: data/processed/male/train.csv
420
+ hash: md5
421
+ md5: 377755a75869ae39a64b39120c09aa5b
422
+ size: 82611
423
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
424
+ hash: md5
425
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
426
+ size: 7931
427
+ outs:
428
+ - path: models/male/decision_tree.joblib
429
+ hash: md5
430
+ md5: 440f308bbd3226e7fbcee4f16dbefe65
431
+ size: 15177
432
+ - path: reports/male/decision_tree
433
+ hash: md5
434
+ md5: da1c41a5584c71b6a016d97c9a2d45a1.dir
435
+ size: 6493113
436
+ nfiles: 2
437
+ training@9:
438
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
439
+ nosex --model logreg\n"
440
+ deps:
441
+ - path: data/processed/nosex/train.csv
442
+ hash: md5
443
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
444
+ size: 103261
445
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
446
+ hash: md5
447
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
448
+ size: 7931
449
+ outs:
450
+ - path: models/nosex/logreg.joblib
451
+ hash: md5
452
+ md5: 43e4b952e67bc6956c8fe3bd66e6fb39
453
+ size: 1503
454
+ - path: reports/nosex/logreg
455
+ hash: md5
456
+ md5: ce5b23988c9364c9ecbd288ca1cfb77b.dir
457
+ size: 11959
458
+ nfiles: 2
459
+ training@10:
460
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
461
+ nosex --model random_forest\n"
462
+ deps:
463
+ - path: data/processed/nosex/train.csv
464
+ hash: md5
465
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
466
+ size: 103261
467
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
468
+ hash: md5
469
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
470
+ size: 7931
471
+ outs:
472
+ - path: models/nosex/random_forest.joblib
473
+ hash: md5
474
+ md5: 5e94c1ecb14e0e6174af94b5f928daac
475
+ size: 13471369
476
+ - path: reports/nosex/random_forest
477
+ hash: md5
478
+ md5: 11dfa343b1ba9daa394b071ee340292a.dir
479
+ size: 19280
480
+ nfiles: 2
481
+ training@11:
482
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
483
+ nosex --model decision_tree\n"
484
+ deps:
485
+ - path: data/processed/nosex/train.csv
486
+ hash: md5
487
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
488
+ size: 103261
489
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
490
+ hash: md5
491
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
492
+ size: 7931
493
+ outs:
494
+ - path: models/nosex/decision_tree.joblib
495
+ hash: md5
496
+ md5: 72129c6d52da6da3b6108c8f8490950d
497
+ size: 2985
498
+ - path: reports/nosex/decision_tree
499
+ hash: md5
500
+ md5: 9b1e7f06634b537c9291dea1ac0d98d7.dir
501
+ size: 7002873
502
+ nfiles: 2
503
+ evaluation@0:
504
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
505
+ all --model logreg\n"
506
+ deps:
507
+ - path: data/processed/all/test.csv
508
+ hash: md5
509
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
510
+ size: 45174
511
+ - path: models/all/logreg.joblib
512
+ hash: md5
513
+ md5: 44595213dc6024d0c3f043d080145c29
514
+ size: 1519
515
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
516
+ hash: md5
517
+ md5: 90ecfc732599b4427b3d585d27a47b60
518
+ size: 6262
519
+ outs:
520
+ - path: metrics/test/all/logreg.json
521
+ hash: md5
522
+ md5: 17425d74cc062c83f054b6e7559ff7fd
523
+ size: 255
524
+ evaluation@1:
525
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
526
+ all --model random_forest\n"
527
+ deps:
528
+ - path: data/processed/all/test.csv
529
+ hash: md5
530
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
531
+ size: 45174
532
+ - path: models/all/random_forest.joblib
533
+ hash: md5
534
+ md5: 93d7fcec110a443bd85ef6579c2a5c2e
535
+ size: 3441465
536
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
537
+ hash: md5
538
+ md5: 90ecfc732599b4427b3d585d27a47b60
539
+ size: 6262
540
+ outs:
541
+ - path: metrics/test/all/random_forest.json
542
+ hash: md5
543
+ md5: 88b271ac93b466730d4edc6bba0f2eb6
544
+ size: 261
545
+ evaluation@2:
546
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
547
+ all --model decision_tree\n"
548
+ deps:
549
+ - path: data/processed/all/test.csv
550
+ hash: md5
551
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
552
+ size: 45174
553
+ - path: models/all/decision_tree.joblib
554
+ hash: md5
555
+ md5: aedace81fc3fca71a5cbdb80f9cb5b28
556
+ size: 2521
557
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
558
+ hash: md5
559
+ md5: 90ecfc732599b4427b3d585d27a47b60
560
+ size: 6262
561
+ outs:
562
+ - path: metrics/test/all/decision_tree.json
563
+ hash: md5
564
+ md5: 3a493b39b23186e90b649df8eeeacb47
565
+ size: 262
566
+ evaluation@3:
567
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
568
+ female --model logreg\n"
569
+ deps:
570
+ - path: data/processed/female/test.csv
571
+ hash: md5
572
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
573
+ size: 9665
574
+ - path: models/female/logreg.joblib
575
+ hash: md5
576
+ md5: 3b8764d1ac710c8a250074bf2d8be1ea
577
+ size: 1519
578
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
579
+ hash: md5
580
+ md5: 90ecfc732599b4427b3d585d27a47b60
581
+ size: 6262
582
+ outs:
583
+ - path: metrics/test/female/logreg.json
584
+ hash: md5
585
+ md5: 0aeb472460b0188feb2a11770cb2c96f
586
+ size: 258
587
+ evaluation@4:
588
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
589
+ female --model random_forest\n"
590
+ deps:
591
+ - path: data/processed/female/test.csv
592
+ hash: md5
593
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
594
+ size: 9665
595
+ - path: models/female/random_forest.joblib
596
+ hash: md5
597
+ md5: 343a3a957a3a505be860c350cebd2af8
598
+ size: 880185
599
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
600
+ hash: md5
601
+ md5: 90ecfc732599b4427b3d585d27a47b60
602
+ size: 6262
603
+ outs:
604
+ - path: metrics/test/female/random_forest.json
605
+ hash: md5
606
+ md5: abe473afdd356ec9d3915749b1b1ce98
607
+ size: 265
608
+ evaluation@5:
609
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
610
+ female --model decision_tree\n"
611
+ deps:
612
+ - path: data/processed/female/test.csv
613
+ hash: md5
614
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
615
+ size: 9665
616
+ - path: models/female/decision_tree.joblib
617
+ hash: md5
618
+ md5: 432ddc9ffd54955ca752fc6b7da9a312
619
+ size: 7161
620
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
621
+ hash: md5
622
+ md5: 90ecfc732599b4427b3d585d27a47b60
623
+ size: 6262
624
+ outs:
625
+ - path: metrics/test/female/decision_tree.json
626
+ hash: md5
627
+ md5: b8bb78497457625c190ab0e391c98c15
628
+ size: 263
629
+ evaluation@6:
630
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
631
+ male --model logreg\n"
632
+ deps:
633
+ - path: data/processed/male/test.csv
634
+ hash: md5
635
+ md5: 7c8ccfdb9557e357265780a1f504cee3
636
+ size: 35720
637
+ - path: models/male/logreg.joblib
638
+ hash: md5
639
+ md5: 703f955be56bea513500f9bdf1b541b3
640
+ size: 1535
641
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
642
+ hash: md5
643
+ md5: 90ecfc732599b4427b3d585d27a47b60
644
+ size: 6262
645
+ outs:
646
+ - path: metrics/test/male/logreg.json
647
+ hash: md5
648
+ md5: 3ea9b47f01e8f1b52c1a27b46623aa9e
649
+ size: 256
650
+ evaluation@7:
651
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
652
+ male --model random_forest\n"
653
+ deps:
654
+ - path: data/processed/male/test.csv
655
+ hash: md5
656
+ md5: 7c8ccfdb9557e357265780a1f504cee3
657
+ size: 35720
658
+ - path: models/male/random_forest.joblib
659
+ hash: md5
660
+ md5: a0b69f9da0287b8f8cc06f65ee0aa994
661
+ size: 2747049
662
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
663
+ hash: md5
664
+ md5: 90ecfc732599b4427b3d585d27a47b60
665
+ size: 6262
666
+ outs:
667
+ - path: metrics/test/male/random_forest.json
668
+ hash: md5
669
+ md5: d25cd10633b8514e23aeb95c31cc7fb4
670
+ size: 263
671
+ evaluation@8:
672
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
673
+ male --model decision_tree\n"
674
+ deps:
675
+ - path: data/processed/male/test.csv
676
+ hash: md5
677
+ md5: 7c8ccfdb9557e357265780a1f504cee3
678
+ size: 35720
679
+ - path: models/male/decision_tree.joblib
680
+ hash: md5
681
+ md5: 440f308bbd3226e7fbcee4f16dbefe65
682
+ size: 15177
683
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
684
+ hash: md5
685
+ md5: 90ecfc732599b4427b3d585d27a47b60
686
+ size: 6262
687
+ outs:
688
+ - path: metrics/test/male/decision_tree.json
689
+ hash: md5
690
+ md5: 4646c2963e50087a6b237c7b557e0628
691
+ size: 263
692
+ evaluation@9:
693
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
694
+ nosex --model logreg\n"
695
+ deps:
696
+ - path: data/processed/nosex/test.csv
697
+ hash: md5
698
+ md5: d79d369fb709d0f0eb9d3c9096488118
699
+ size: 44618
700
+ - path: models/nosex/logreg.joblib
701
+ hash: md5
702
+ md5: 43e4b952e67bc6956c8fe3bd66e6fb39
703
+ size: 1503
704
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
705
+ hash: md5
706
+ md5: 90ecfc732599b4427b3d585d27a47b60
707
+ size: 6262
708
+ outs:
709
+ - path: metrics/test/nosex/logreg.json
710
+ hash: md5
711
+ md5: 570687d930d6363d89908ea20838c8d6
712
+ size: 257
713
+ evaluation@10:
714
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
715
+ nosex --model random_forest\n"
716
+ deps:
717
+ - path: data/processed/nosex/test.csv
718
+ hash: md5
719
+ md5: d79d369fb709d0f0eb9d3c9096488118
720
+ size: 44618
721
+ - path: models/nosex/random_forest.joblib
722
+ hash: md5
723
+ md5: 5e94c1ecb14e0e6174af94b5f928daac
724
+ size: 13471369
725
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
726
+ hash: md5
727
+ md5: 90ecfc732599b4427b3d585d27a47b60
728
+ size: 6262
729
+ outs:
730
+ - path: metrics/test/nosex/random_forest.json
731
+ hash: md5
732
+ md5: 6a135f593534616b51858c7f6f251d36
733
+ size: 264
734
+ evaluation@11:
735
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
736
+ nosex --model decision_tree\n"
737
+ deps:
738
+ - path: data/processed/nosex/test.csv
739
+ hash: md5
740
+ md5: d79d369fb709d0f0eb9d3c9096488118
741
+ size: 44618
742
+ - path: models/nosex/decision_tree.joblib
743
+ hash: md5
744
+ md5: 72129c6d52da6da3b6108c8f8490950d
745
+ size: 2985
746
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
747
+ hash: md5
748
+ md5: 90ecfc732599b4427b3d585d27a47b60
749
+ size: 6262
750
+ outs:
751
+ - path: metrics/test/nosex/decision_tree.json
752
+ hash: md5
753
+ md5: dca85bcdc4b67a21cee91b31f45d6225
754
+ size: 264
dvc.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stages:
2
+ download_data:
3
+ cmd: uv run predicting_outcomes_in_heart_failure/data/dataset.py
4
+ deps:
5
+ - predicting_outcomes_in_heart_failure/data/dataset.py
6
+ outs:
7
+ - data/raw/heart.csv
8
+
9
+ preprocessing:
10
+ cmd: uv run predicting_outcomes_in_heart_failure/data/preprocess.py
11
+ deps:
12
+ - predicting_outcomes_in_heart_failure/data/preprocess.py
13
+ - data/raw/heart.csv
14
+ outs:
15
+ - data/interim/preprocessed.csv
16
+ - data/interim/preprocessed_female_only.csv
17
+ - data/interim/preprocessed_male_only.csv
18
+ - data/interim/preprocessed_no_sex_column.csv
19
+ - data/interim/preprocess_artifacts/scaler.joblib
20
+
21
+ split_data:
22
+ foreach: [all, female, male, nosex]
23
+ do:
24
+ cmd: >
25
+ uv run predicting_outcomes_in_heart_failure/data/split_data.py
26
+ --variant ${item}
27
+ deps:
28
+ - predicting_outcomes_in_heart_failure/data/split_data.py
29
+ - data/interim/preprocessed.csv
30
+ - data/interim/preprocessed_female_only.csv
31
+ - data/interim/preprocessed_male_only.csv
32
+ - data/interim/preprocessed_no_sex_column.csv
33
+ outs:
34
+ - data/processed/${item}
35
+
36
+ training:
37
+ foreach:
38
+ - { variant: all, model: logreg }
39
+ - { variant: all, model: random_forest }
40
+ - { variant: all, model: decision_tree }
41
+ - { variant: female, model: logreg }
42
+ - { variant: female, model: random_forest }
43
+ - { variant: female, model: decision_tree }
44
+ - { variant: male, model: logreg }
45
+ - { variant: male, model: random_forest }
46
+ - { variant: male, model: decision_tree }
47
+ - { variant: nosex, model: logreg }
48
+ - { variant: nosex, model: random_forest }
49
+ - { variant: nosex, model: decision_tree }
50
+ do:
51
+ cmd: >
52
+ uv run predicting_outcomes_in_heart_failure/modeling/train.py
53
+ --variant ${item.variant}
54
+ --model ${item.model}
55
+ deps:
56
+ - predicting_outcomes_in_heart_failure/modeling/train.py
57
+ - data/processed/${item.variant}/train.csv
58
+ outs:
59
+ - models/${item.variant}/${item.model}.joblib
60
+ - reports/${item.variant}/${item.model}
61
+
62
+ evaluation:
63
+ foreach:
64
+ - { variant: all, model: logreg }
65
+ - { variant: all, model: random_forest }
66
+ - { variant: all, model: decision_tree }
67
+ - { variant: female, model: logreg }
68
+ - { variant: female, model: random_forest }
69
+ - { variant: female, model: decision_tree }
70
+ - { variant: male, model: logreg }
71
+ - { variant: male, model: random_forest }
72
+ - { variant: male, model: decision_tree }
73
+ - { variant: nosex, model: logreg }
74
+ - { variant: nosex, model: random_forest }
75
+ - { variant: nosex, model: decision_tree }
76
+ do:
77
+ cmd: >
78
+ uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py
79
+ --variant ${item.variant}
80
+ --model ${item.model}
81
+ deps:
82
+ - predicting_outcomes_in_heart_failure/modeling/evaluate.py
83
+ - models/${item.variant}/${item.model}.joblib
84
+ - data/processed/${item.variant}/test.csv
85
+ outs:
86
+ - metrics/test/${item.variant}/${item.model}.json
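+
+ # Note (illustrative): DVC expands each `foreach` entry into its own stage
+ # instance. Simple lists are keyed by value (e.g. `split_data@all`), while
+ # the dict matrices above are keyed by index (e.g. `training@0` is
+ # variant=all/model=logreg), matching the entries recorded in dvc.lock.
+ # A single instance can be reproduced on its own, e.g. `dvc repro training@0`.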
metrics/test/all/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/female/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/male/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/nosex/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
models/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /decision_tree.joblib
2
+ /random_forest.joblib
3
+ /logreg.joblib
models/README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card
2
+
3
+ ## Table of Contents
4
+
5
+ - [Model Details](#model-details)
6
+ - [Training Information](#training-information)
7
+ - [Intended Use](#intended-use)
8
+ - [Primary Intended Uses](#primary-intended-uses)
9
+ - [Primary Intended Users](#primary-intended-users)
10
+ - [Out-of-scope Use Cases](#out-of-scope-use-cases)
11
+ - [Factors](#factors)
12
+ - [Relevant Factors](#relevant-factors)
13
+ - [Evaluation Factors](#evaluation-factors)
14
+
15
+ - [Metrics](#metrics)
16
+ - [Model Performance](#model-performance)
17
+ - [Variation Approaches](#variation-approaches)
18
+
19
+ - [Evaluation Data](#evaluation-data)
20
+ - [Datasets](#datasets)
21
+ - [Motivation](#motivation)
22
+ - [Preprocessing](#preprocessing)
23
+
24
+ - [Training Data](#training-data)
25
+ - [Datasets](#datasets-1)
26
+ - [Preprocessing](#preprocessing-1)
27
+
28
+ - [Ethical Considerations](#ethical-considerations)
29
+
30
+ - [Caveats and Recommendations](#caveats-and-recommendations)
31
+
32
+ ## Model Details
33
+ - Developed by: D. Boccuzzi, M. Capone, F. Rosmarino
34
+ - Model Date: November 11th, 2025
35
+ - Model Version: 6 - nosex
36
+ - Model Type: RandomForestClassifier
37
+ ### Training information
38
+ - Best hyperparameters, tuned with 5-fold cross-validation:
39
+ - `max_depth` 12
40
+ - `n_estimators` 800
41
+ - `class_weight` balanced
42
+ - Applied approaches:
43
+ During training, an oversampling technique was applied to balance the dataset and reduce bias toward the majority class. This ensured that the model learned equally from positive and negative cases, improving prediction performance for the minority class.
44
+ - Training started at: 11:26:59 2025-11-12
45
+ - Training ended at: 11:34:29 2025-11-12
46
+
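+ A minimal sketch of this tuning setup (illustrative, not the project's exact `train.py`; the grid mirrors `CONFIG_RF` in `config.py`, and the file layout follows `dvc.yaml`):
+
+ ```python
+ import pandas as pd
+ from imblearn.over_sampling import RandomOverSampler
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import GridSearchCV
+
+ train = pd.read_csv("data/processed/nosex/train.csv")
+ X_train, y_train = train.drop(columns=["HeartDisease"]), train["HeartDisease"]
+
+ # Balance the classes before tuning, as described above.
+ X_bal, y_bal = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
+
+ # 5-fold CV over the grid that produced the reported best parameters.
+ grid = GridSearchCV(
+     RandomForestClassifier(random_state=42),
+     param_grid={
+         "n_estimators": [200, 400, 800],
+         "max_depth": [None, 6, 12],
+         "class_weight": [None, "balanced"],
+     },
+     cv=5,
+     scoring="f1",  # assumption: the project tracks several scorers
+ )
+ grid.fit(X_bal, y_bal)  # best found: max_depth=12, n_estimators=800, class_weight="balanced"
+ ```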
47
+ ## Intended Use
48
+ ### Primary intended uses
49
+ The CardioTrack ML system is designed to support early detection of heart failure by analyzing clinical features and identifying patients who may be at risk. Its purpose is to assist cardiologists in deciding when further diagnostic tests, monitoring, or preventive treatments are needed. The system is also intended for local public health authorities, who can use aggregated predictions to plan healthcare resources and implement prevention strategies within the population.
50
+ ### Primary intended users
51
+ The primary users of the model are cardiologists and other qualified medical professionals who rely on clinical decision support tools. They are responsible for interpreting the model’s predictions in conjunction with the patient’s medical history and additional clinical information. Public health authorities may also use aggregated, non-individual results to support long-term planning and policy development.
52
+ ### Out-of-scope use cases
53
+ The model should not be used without access to complete and reliable clinical features, and it is not suitable for real-time emergency triage or for predictive tasks not directly related to heart failure.
54
+
55
+ ## Factors
56
+ ### Relevant factors
57
+ Model performance may vary depending on patient characteristics that influence heart disease risk, as reflected in the contributions of individual clinical features. Age remains a relevant factor because it strongly correlates with cardiovascular conditions. In addition, features such as ST_Slope, ChestPainType, MaxHR, and ExerciseAngina have the largest impact on individual predictions, as highlighted by the SHAP module for explainable AI (XAI). These features capture meaningful physiological and clinical differences among patients and explain why the model predicts a higher or lower risk for specific individuals. Instrumentation and environmental factors are not relevant because the model operates on structured clinical data rather than on signals or images affected by measurement devices or environmental conditions.
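+ A minimal sketch of how these per-patient SHAP contributions can be inspected (illustrative; the class-dimension indexing may differ across `shap` versions):
+
+ ```python
+ import joblib
+ import pandas as pd
+ import shap
+
+ model = joblib.load("models/nosex/random_forest.joblib")
+ X = pd.read_csv("data/processed/nosex/test.csv").drop(columns=["HeartDisease"])
+
+ explainer = shap.TreeExplainer(model)
+ explanation = explainer(X)                  # SHAP values for every patient
+ shap.plots.waterfall(explanation[0, :, 1])  # one patient, positive class
+ ```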
58
+ ### Evaluation Factors
59
+ The evaluation focuses on the key clinical features that the model relies on most heavily. The relevant factors were chosen because they are both present in the dataset and have the largest impact on the model’s outputs, allowing clear interpretation of how predictions are made.
60
+
61
+ ## Metrics
62
+ ### Model Performance
63
+ - `F1 Score` 0.8990
64
+ - `Recall` 0.9019
65
+ - `Accuracy` 0.8876
66
+ - `ROC-AUC` 0.9399
67
+ ### Variation approaches
68
+ The reported metrics were computed using the best model selected during cross-validation for hyperparameter tuning and evaluated on a completely independent test set. This setup was chosen because it provides a cleaner estimate of real-world performance, reduces the risk of overfitting to validation folds, and ensures that the results reflect the model’s generalization ability.
69
+
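+ A minimal sketch of how these metrics can be recomputed on the held-out test set (illustrative; assumes the test CSV keeps the `HeartDisease` target column):
+
+ ```python
+ import joblib
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
+
+ test = pd.read_csv("data/processed/nosex/test.csv")
+ X_test, y_test = test.drop(columns=["HeartDisease"]), test["HeartDisease"]
+ model = joblib.load("models/nosex/random_forest.joblib")
+
+ y_pred = model.predict(X_test)
+ y_score = model.predict_proba(X_test)[:, 1]  # positive-class probability for ROC-AUC
+
+ print({
+     "f1": f1_score(y_test, y_pred),
+     "recall": recall_score(y_test, y_pred),
+     "accuracy": accuracy_score(y_test, y_pred),
+     "roc_auc": roc_auc_score(y_test, y_score),
+ })
+ ```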
70
+ ## Evaluation Data
71
+ ### Datasets
72
+ The evaluation was performed using 276 of the 918 observations (30%) in Kaggle's [Heart Failure Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction), which contains clinical data from both healthy individuals and patients diagnosed with heart failure.
73
+ ### Motivation
74
+ This dataset was chosen because it provides a comprehensive set of relevant clinical features that capture key cardiovascular risk factors, enabling the model to perform early detection of heart failure in individual patients. Its publicly available nature ensures transparency.
75
+ ### Preprocessing
76
+ Before evaluation, the data was preprocessed as follows:
77
+
78
+ - **Cleaning of invalid values**
79
+ Rows with impossible clinical values (e.g., `RestingBP = 0`) were removed.
80
+ Zero cholesterol values were treated as missing and replaced using a central-tendency statistic.
81
+
82
+ - **Encoding of categorical variables**
83
+ Binary categories were converted to numerical format, while multi-class fields (`ChestPainType`, `RestingECG`, `ST_Slope`) were one-hot encoded.
84
+
85
+ - **Scaling of numerical features**
86
+ Continuous variables were standardized to have mean 0 and unit variance.
87
+
88
+ - **Removal of the `Sex` feature**
89
+ The Sex feature was removed to reduce potential fairness concerns and because it was not required for the planned experiments.
90
+
91
+ - The processed dataset is versioned on DagsHub at the following [link](https://dagshub.com/se4ai2526-uniba/CardioTrack)
92
+
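+ A minimal sketch of these steps (illustrative; the column names match the dataset, but the exact cleaning calls and the median imputation are assumptions based on the description above):
+
+ ```python
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+
+ df = pd.read_csv("data/raw/heart.csv")
+
+ # Cleaning: drop impossible values; impute zero cholesterol with the median.
+ df = df[df["RestingBP"] > 0].copy()
+ median_chol = df.loc[df["Cholesterol"] > 0, "Cholesterol"].median()
+ df.loc[df["Cholesterol"] == 0, "Cholesterol"] = median_chol
+
+ # Encoding: binary fields to 0/1; multi-class fields one-hot encoded.
+ df["ExerciseAngina"] = (df["ExerciseAngina"] == "Y").astype(int)
+ df = pd.get_dummies(df, columns=["ChestPainType", "RestingECG", "ST_Slope"])
+
+ # Scaling: standardize continuous variables; drop Sex for this variant.
+ num_cols = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
+ df[num_cols] = StandardScaler().fit_transform(df[num_cols])
+ df = df.drop(columns=["Sex"])
+ ```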
93
+ ## Training Data
94
+ ### Datasets
95
+ The training data mirrors the evaluation dataset, using the remaining 642 of the 918 observations (70%) from the same Kaggle Heart Failure Prediction Dataset.
96
+ ### Preprocessing
97
+ The training data underwent the same preprocessing steps as the evaluation data. Additionally, the RandomOverSampler technique was applied to balance the classes, ensuring that the model learned equally from positive (heart failure) and negative cases.
98
+
99
+ ## Ethical Considerations
100
+ The CardioTrack ML system is intended to support clinical decision-making, not to replace professional judgment. Ethical considerations include:
101
+ - **Privacy and security**: All patient data is processed on-premises, in accordance with hospital IT protocols, protecting sensitive health information.
102
+ - **Transparency**: Feature-importance and SHAP visualizations allow clinicians to interpret predictions.
103
+ - **Clinical responsibility**: Diagnosis must be combined with patient history, exams, and expert judgment. Misuse in isolation could lead to incorrect interventions.
104
+
105
+ ## Caveats and Recommendations
106
+ - **Inference time**: The model’s inference time is about 0.2 seconds, but it can vary with the computing power of the machine where inference is run.
107
+ - **Limitations**: The model is trained on a specific public dataset and may not capture rare cardiovascular conditions or population-specific variations.
108
+ - **Data quality**: Accurate predictions depend on complete and correctly measured clinical features. Erroneous data can reduce performance. Missing data is not allowed.
109
+ - **Not for emergency triage**: Predictions are intended for early detection and planning, not for immediate emergency decision-making.
110
+ - **Periodic retraining**: To maintain accuracy, the model should be updated with newly collected clinical data to account for shifts in patient population or disease prevalence.
models/all/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/female/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/male/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/nosex/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
notebooks/.gitkeep ADDED
File without changes
notebooks/1.0-mc-initial-data-exploration.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
predicting_outcomes_in_heart_failure/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from predicting_outcomes_in_heart_failure import config # noqa: F401
predicting_outcomes_in_heart_failure/app/__init__.py ADDED
File without changes
predicting_outcomes_in_heart_failure/app/main.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+
3
+ from fastapi import FastAPI
4
+ from fastapi.staticfiles import StaticFiles
5
+ import gradio as gr
6
+ import joblib
7
+ from loguru import logger
8
+
9
+ from predicting_outcomes_in_heart_failure.app.routers import cards, model_info, prediction
10
+ from predicting_outcomes_in_heart_failure.app.utils import load_page, update_patient_index_choices
11
+ from predicting_outcomes_in_heart_failure.app.wrapper import Wrapper
12
+ from predicting_outcomes_in_heart_failure.config import FIGURES_DIR, MODEL_PATH
13
+
14
+
15
+ @asynccontextmanager
16
+ async def lifespan(app: FastAPI):
17
+ """Context manager to handle application lifespan events."""
18
+ if not MODEL_PATH.exists():
19
+ logger.error(f"Model file not found at: {MODEL_PATH}")
20
+ raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}")
21
+
22
+ logger.info(f"Loading default model from {MODEL_PATH} ...")
23
+ app.state.model = joblib.load(MODEL_PATH)
24
+ logger.success(f"Default model loaded from {MODEL_PATH}")
25
+
26
+ try:
27
+ yield
28
+ finally:
29
+ app.state.model = None
30
+ logger.info("Default model cleared on application shutdown")
31
+
32
+
33
+ app = FastAPI(
34
+ title="CardioTrack's Model Space - Heart Failure Prediction",
35
+ version="0.01",
36
+ lifespan=lifespan,
37
+ )
38
+
39
+
40
+ if not FIGURES_DIR.exists():
41
+ logger.warning(f"Figures directory {FIGURES_DIR} does not exist. Creating it.")
42
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
43
+ app.mount("/figures", StaticFiles(directory=str(FIGURES_DIR)), name="figures")
44
+ app.include_router(prediction.router)
45
+ app.include_router(model_info.router)
46
+ app.include_router(cards.router)
47
+
48
+
49
+ # UI Definition
50
+ with gr.Blocks(title="CardioTrack") as io:
51
+ gr.Markdown(
52
+ """
53
+ # 🫀 CardioTrack's Model Space - Heart Failure Diagnosis
54
+ Choose an area to access the platform's features.
55
+ """
56
+ )
57
+
58
+ with gr.Tabs():
59
+ with gr.TabItem("Single Diagnosis"):
60
+ gr.Markdown("### Enter patient data for the diagnosis")
61
+
62
+ with gr.Row():
63
+ with gr.Column():
64
+ age = gr.Slider(minimum=20, maximum=100, step=1, label="Age", value=60)
65
+ resting_bp = gr.Slider(
66
+ minimum=80,
67
+ maximum=200,
68
+ step=1,
69
+ label="Resting Blood Pressure (mm Hg)",
70
+ value=120,
71
+ )
72
+ cholesterol = gr.Slider(
73
+ minimum=0, maximum=600, step=1, label="Cholesterol (mg/dL)", value=200
74
+ )
75
+ max_hr = gr.Slider(
76
+ minimum=60, maximum=220, step=1, label="Max Heart Rate", value=150
77
+ )
78
+ oldpeak = gr.Slider(
79
+ minimum=-3.0,
80
+ maximum=7.0,
81
+ step=0.1,
82
+ label="Oldpeak (ST Depression)",
83
+ value=1.0,
84
+ )
85
+
86
+ with gr.Column():
87
+ chest_pain_type = gr.Dropdown(
88
+ choices=["TA", "ATA", "NAP", "ASY"], label="Chest Pain Type", value="ASY"
89
+ )
90
+ fasting_bs = gr.Dropdown(
91
+ choices=[0, 1],
92
+ label="Fasting Blood Sugar (0: <=120 mg/dL, 1: >120 mg/dL)",
93
+ value=0,
94
+ )
95
+ resting_ecg = gr.Dropdown(
96
+ choices=["Normal", "ST", "LVH"], label="Resting ECG", value="Normal"
97
+ )
98
+ exercise_angina = gr.Dropdown(
99
+ choices=["Y", "N"], label="Exercise Angina", value="N"
100
+ )
101
+ st_slope = gr.Dropdown(
102
+ choices=["Up", "Flat", "Down"], label="ST Slope", value="Flat"
103
+ )
104
+
105
+ predict_btn = gr.Button("Analyze", variant="primary")
106
+ single_output = gr.Markdown(label="Result")
107
+
108
+ explanation_img = gr.Image(label="Explanation", type="filepath", visible=True)
109
+
110
+ predict_btn.click(
111
+ fn=Wrapper.prediction_with_explanation,
112
+ inputs=[
113
+ age,
114
+ chest_pain_type,
115
+ resting_bp,
116
+ cholesterol,
117
+ fasting_bs,
118
+ resting_ecg,
119
+ max_hr,
120
+ exercise_angina,
121
+ oldpeak,
122
+ st_slope,
123
+ ],
124
+ outputs=[single_output, explanation_img],
125
+ )
126
+
127
+ with gr.TabItem("Group Diagnosis"):
128
+ gr.Markdown("### Upload a CSV file for analize multiple subjects")
129
+ gr.Markdown(
130
+ "The CSV should contain columns: Age, ChestPainType, RestingBP, Cholesterol,"
131
+ + "FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope"
132
+ )
133
+
134
+ file_input = gr.File(label="Upload CSV", file_types=[".csv"])
135
+ batch_predict_btn = gr.Button("Analyze Group", variant="primary")
136
+ batch_output = gr.Dataframe(label="Results")
137
+
138
+ batch_predict_btn.click(
139
+ fn=Wrapper.batch_prediction, inputs=file_input, outputs=batch_output
140
+ )
141
+
142
+ gr.Markdown("### Explain a specific patient from the group")
143
+
144
+ patient_index = gr.Dropdown(
145
+ label="Patient index (0-based)",
146
+ choices=[],
147
+ interactive=True,
148
+ )
149
+
150
+ batch_output.change(
151
+ fn=update_patient_index_choices,
152
+ inputs=batch_output,
153
+ outputs=patient_index,
154
+ )
155
+
156
+ batch_explain_btn = gr.Button("Explain selected patient", variant="secondary")
157
+
158
+ batch_explanation_img = gr.Image(
159
+ label="Explanation",
160
+ type="filepath",
161
+ )
162
+
163
+ batch_explain_btn.click(
164
+ fn=Wrapper.batch_explanation,
165
+ inputs=[file_input, patient_index],
166
+ outputs=batch_explanation_img,
167
+ )
168
+
169
+ with gr.TabItem("ModelCard"):
170
+ io = load_page(io, Wrapper.get_model_card)
171
+
172
+ with gr.TabItem("DatasetCard"):
173
+ io = load_page(io, Wrapper.get_dataset_card)
174
+
175
+ with gr.TabItem("Hyperparameters"):
176
+ gr.Markdown("## Model Hyperparameters")
177
+ io = load_page(io, Wrapper.get_hyperparameters)
178
+
179
+ with gr.TabItem("Evaluation Metrics"):
180
+ gr.Markdown("## Model Performance Metrics")
181
+ io = load_page(io, Wrapper.get_metrics)
182
+
183
+ app = gr.mount_gradio_app(app, io, path="/")
predicting_outcomes_in_heart_failure/app/routers/cards.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+
3
+ from fastapi import APIRouter, HTTPException, Request
4
+ from loguru import logger
5
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
6
+ from predicting_outcomes_in_heart_failure.config import CARD_PATHS
7
+
8
+ router = APIRouter(tags=["Cards"])
9
+
10
+
11
+ @router.get("/cards/{card_type}")
12
+ @construct_response
13
+ def card(request: Request, card_type: str):
14
+ """Return card information.
15
+ card_type = dataset_card / model_card
16
+ """
17
+ logger.info(f"Received /cards/{card_type} request")
18
+
19
+ # Normalize the card_type to handle possible variants
20
+ card_type = card_type.lower().replace("-", "_")
21
+
22
+ path = CARD_PATHS.get(card_type)
23
+ if path is None:
24
+ logger.warning(f"Unsupported card_type requested: {card_type}")
25
+ raise HTTPException(
26
+ status_code=HTTPStatus.NOT_FOUND,
27
+ detail=f"Card type '{card_type}' not supported."
28
+ + f" Valid types: {', '.join(CARD_PATHS.keys())}",
29
+ )
30
+
31
+ try:
32
+ with open(path, encoding="utf-8") as f:
33
+ card_content = f.read()
34
+
35
+ logger.success(f"{path} loaded successfully")
36
+
37
+ return {
38
+ "message": HTTPStatus.OK.phrase,
39
+ "status-code": HTTPStatus.OK.value,
40
+ "data": {
41
+ "card_type": card_type,
42
+ "path": str(path),
43
+ "card_lines": card_content.split("\n"),
44
+ },
45
+ }
46
+
47
+ except Exception as e:
48
+ logger.exception(f"Failed to load card content from {path}: {e}")
49
+ raise HTTPException(
50
+ status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
51
+ detail=f"Error reading card file: {e}",
52
+ ) from e
predicting_outcomes_in_heart_failure/app/routers/general.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+
3
+ from fastapi import APIRouter, Request
4
+ from loguru import logger
5
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
6
+
7
+ router = APIRouter(tags=["General"])
8
+
9
+
10
+ @router.get("/")
11
+ @construct_response
12
+ def index(request: Request):
13
+ """Root endpoint."""
14
+ logger.info("General requested")
15
+ return {
16
+ "message": HTTPStatus.OK.phrase,
17
+ "status-code": HTTPStatus.OK,
18
+ "data": {"message": "Welcome to Heart Failure Predictor!"},
19
+ }
predicting_outcomes_in_heart_failure/app/routers/model_info.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+ import json
3
+ from typing import Any
4
+
5
+ from fastapi import APIRouter, Request
6
+ from loguru import logger
7
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
8
+ from predicting_outcomes_in_heart_failure.config import (
9
+ MODEL_PATH,
10
+ REPORTS_DIR,
11
+ TEST_METRICS_DIR,
12
+ )
13
+
14
+ router = APIRouter(tags=["Model"])
15
+
16
+
17
+ @router.get("/model/hyperparameters")
18
+ @construct_response
19
+ def get_model_hyperparameters(request: Request):
20
+ variant = MODEL_PATH.parent.name
21
+ model_name = MODEL_PATH.stem
22
+ hyperparams_path = REPORTS_DIR / variant / model_name / "cv_parameters.json"
23
+ logger.info(
24
+ f"Looking for hyperparameters file at {hyperparams_path} "
25
+ f"(model={model_name}, variant={variant})"
26
+ )
27
+
28
+ if not hyperparams_path.exists():
29
+ logger.warning("Hyperparameters file not found")
30
+ return {
31
+ "message": HTTPStatus.NOT_FOUND.phrase,
32
+ "status-code": HTTPStatus.NOT_FOUND,
33
+ "data": {
34
+ "detail": "Hyperparameters file not found. Run the training pipeline.",
35
+ "model_name": model_name,
36
+ "variant": variant,
37
+ "expected_path": str(hyperparams_path),
38
+ },
39
+ }
40
+
41
+ with hyperparams_path.open("r", encoding="utf-8") as f:
42
+ hyperparams_data = json.load(f)
43
+
44
+ data: dict[str, Any] = {
45
+ "model_path": str(MODEL_PATH),
46
+ "hyperparameters": hyperparams_data,
47
+ }
48
+
49
+ return {
50
+ "message": HTTPStatus.OK.phrase,
51
+ "status-code": HTTPStatus.OK,
52
+ "data": data,
53
+ }
54
+
55
+
56
+ @router.get("/model/metrics")
57
+ @construct_response
58
+ def get_model_metrics(request: Request):
59
+ variant = MODEL_PATH.parent.name
60
+ model_name = MODEL_PATH.stem
61
+ metrics_path = TEST_METRICS_DIR / variant / f"{model_name}.json"
62
+ logger.info(
63
+ f"Looking for metrics file at {metrics_path} (model={model_name}, variant={variant})"
64
+ )
65
+
66
+ if not metrics_path.exists():
67
+ logger.warning("Metrics file not found")
68
+ return {
69
+ "message": HTTPStatus.NOT_FOUND.phrase,
70
+ "status-code": HTTPStatus.NOT_FOUND,
71
+ "data": {
72
+ "detail": (
73
+ "Metrics file not found. Run the evaluation pipeline for this model first."
74
+ ),
75
+ "model_name": model_name,
76
+ "variant": variant,
77
+ "expected_path": str(metrics_path),
78
+ },
79
+ }
80
+
81
+ with metrics_path.open("r", encoding="utf-8") as f:
82
+ metrics_data = json.load(f)
83
+
84
+ data: dict[str, Any] = {
85
+ "model_path": str(MODEL_PATH),
86
+ "model_name": model_name,
87
+ "variant": variant,
88
+ "metrics": metrics_data.get("metrics", metrics_data),
89
+ }
90
+
91
+ return {
92
+ "message": HTTPStatus.OK.phrase,
93
+ "status-code": HTTPStatus.OK,
94
+ "data": data,
95
+ }
predicting_outcomes_in_heart_failure/app/routers/prediction.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+ from typing import Any
3
+
4
+ from fastapi import APIRouter, Request
5
+ from loguru import logger
6
+ import pandas as pd
7
+ from predicting_outcomes_in_heart_failure.app.schema import HeartSample
8
+ from predicting_outcomes_in_heart_failure.app.utils import (
9
+ construct_response,
10
+ get_model_from_state,
11
+ )
12
+ from predicting_outcomes_in_heart_failure.config import FIGURES_DIR, MODEL_PATH
13
+ from predicting_outcomes_in_heart_failure.modeling.explainability import (
14
+ explain_prediction,
15
+ save_shap_waterfall_plot,
16
+ )
17
+ from predicting_outcomes_in_heart_failure.modeling.predict import preprocessing
18
+
19
+ router = APIRouter()
20
+
21
+
22
+ @router.post("/predictions", tags=["Prediction"])
23
+ @construct_response
24
+ def predict(request: Request, payload: HeartSample):
25
+ model = get_model_from_state(request)
26
+ if model is None:
27
+ return {
28
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
29
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
30
+ "data": {"detail": "Model is not loaded."},
31
+ }
32
+
33
+ X_raw = payload.to_dataframe()
34
+ X = preprocessing(X_raw)
35
+ y_pred = int(model.predict(X)[0])
36
+
37
+ data: dict[str, Any] = {
38
+ "input": payload.model_dump(),
39
+ "prediction": y_pred,
40
+ }
41
+
42
+ logger.success("Prediction completed successfully for /predictions")
43
+ return {
44
+ "message": HTTPStatus.OK.phrase,
45
+ "status-code": HTTPStatus.OK,
46
+ "data": data,
47
+ }
48
+
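+
+ # Illustrative client call for the endpoint above (host/port are assumptions):
+ # httpx.post("http://localhost:8000/predictions", json={
+ #     "Age": 61, "ChestPainType": "ASY", "RestingBP": 140, "Cholesterol": 289,
+ #     "FastingBS": 0, "RestingECG": "Normal", "MaxHR": 120,
+ #     "ExerciseAngina": "Y", "Oldpeak": 1.5, "ST_Slope": "Flat",
+ # })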
49
+
50
+ @router.post("/batch-predictions", tags=["Prediction"])
51
+ @construct_response
52
+ def predict_batch(request: Request, payload: list[HeartSample]):
53
+ model = get_model_from_state(request)
54
+ if model is None:
55
+ return {
56
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
57
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
58
+ "data": {"detail": "Model is not loaded."},
59
+ }
60
+
61
+ X_raw_list = [sample.to_dataframe() for sample in payload]
62
+ X_raw = pd.concat(X_raw_list, ignore_index=True)
63
+ X = preprocessing(X_raw)
64
+
65
+ y_pred = [int(y) for y in model.predict(X)]
66
+
67
+ results: list[dict[str, Any]] = []
68
+ for idx, (sample, pred) in enumerate(zip(payload, y_pred, strict=True)):
69
+ results.append(
70
+ {
71
+ "index": idx,
72
+ "input": sample.model_dump(),
73
+ "prediction": pred,
74
+ }
75
+ )
76
+
77
+ data: dict[str, Any] = {
78
+ "results": results,
79
+ "batch_size": len(results),
80
+ }
81
+
82
+ return {
83
+ "message": HTTPStatus.OK.phrase,
84
+ "status-code": HTTPStatus.OK,
85
+ "data": data,
86
+ }
87
+
88
+
89
+ @router.post("/explanations", tags=["Explainability"])
90
+ @construct_response
91
+ def explain(request: Request, payload: HeartSample):
92
+ model = get_model_from_state(request)
93
+ if model is None:
94
+ return {
95
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
96
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
97
+ "data": {"detail": "Model is not loaded."},
98
+ }
99
+
100
+ X_raw = payload.to_dataframe()
101
+ X = preprocessing(X_raw)
102
+
103
+ data: dict[str, Any] = {"input": payload.model_dump()}
104
+ model_type = MODEL_PATH.stem
105
+
106
+ try:
107
+ logger.info("Computing explanation for default model prediction...")
108
+ explanations = explain_prediction(model, X, model_type=model_type, top_k=5)
109
+ if explanations:
110
+ data["explanations"] = explanations
111
+ logger.success("Explanation computed successfully for default model.")
112
+ else:
113
+ logger.warning("No explanation available for default model.")
114
+ except Exception as e:
115
+ logger.exception(f"Failed to compute explanation: {e}")
116
+
117
+ try:
118
+ plot_path = FIGURES_DIR / f"shap_waterfall_default_{model_type}.png"
119
+ saved_path = save_shap_waterfall_plot(
120
+ model=model,
121
+ X=X,
122
+ model_type=model_type,
123
+ output_path=plot_path,
124
+ )
125
+ if saved_path is not None:
126
+ data["explanation_plot_url"] = f"/figures/{saved_path.name}"
127
+ except Exception as e:
128
+ logger.exception(f"Failed to generate explanation plot: {e}")
129
+
130
+ logger.success("Explanation completed successfully for /explanations")
131
+ return {
132
+ "message": HTTPStatus.OK.phrase,
133
+ "status-code": HTTPStatus.OK,
134
+ "data": data,
135
+ }
predicting_outcomes_in_heart_failure/app/schema.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from pydantic import BaseModel, field_validator
8
+
9
+
10
+ class HeartSample(BaseModel):
11
+ Age: int
12
+ ChestPainType: Literal["TA", "ATA", "NAP", "ASY"]
13
+ RestingBP: int
14
+ Cholesterol: int
15
+ FastingBS: int
16
+ RestingECG: Literal["Normal", "ST", "LVH"]
17
+ MaxHR: int
18
+ ExerciseAngina: Literal["Y", "N"]
19
+ Oldpeak: float
20
+ ST_Slope: Literal["Up", "Flat", "Down"]
21
+
22
+ @field_validator("Oldpeak")
23
+ @classmethod
24
+ def round_oldpeak(cls, v: float) -> float:
25
+ return float(np.round(v, 2))
26
+
27
+ def to_dataframe(self) -> pd.DataFrame:
28
+ return pd.DataFrame([self.model_dump()])
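+
+ # Illustrative usage (not part of the app's call path):
+ # sample = HeartSample(Age=61, ChestPainType="ASY", RestingBP=140, Cholesterol=289,
+ #                      FastingBS=0, RestingECG="Normal", MaxHR=120,
+ #                      ExerciseAngina="Y", Oldpeak=1.5, ST_Slope="Flat")
+ # sample.to_dataframe()  # -> single-row DataFrame with columns named after the fields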
predicting_outcomes_in_heart_failure/app/utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from functools import wraps
3
+
4
+ from fastapi import Request
5
+ import gradio as gr
6
+ from loguru import logger
7
+
8
+
9
+ def construct_response(f):
10
+ """Construct a JSON response for an endpoint's results."""
11
+
12
+ @wraps(f)
13
+ def wrap(request: Request, *args, **kwargs):
14
+ result = f(request, *args, **kwargs)
15
+ response = {
16
+ "message": result["message"],
17
+ "method": request.method,
18
+ "status-code": result["status-code"],
19
+ "timestamp": datetime.now().isoformat(),
20
+ "url": request.url._url,
21
+ }
22
+ if "data" in result:
23
+ response["data"] = result["data"]
24
+ return response
25
+
26
+ return wrap
27
+
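+ # Illustrative contract: a decorated endpoint returns a dict with "message" and
+ # "status-code" (plus optional "data"); the wrapper adds the request "method",
+ # a "timestamp", and the request "url" before FastAPI serializes it.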
28
+
29
+ def get_model_from_state(request: Request):
30
+ """Retrieve the model from the app state."""
31
+ model = getattr(request.app.state, "model", None)
32
+ if model is None:
33
+ logger.error("Model not loaded in app.state.model")
34
+ return model
35
+
36
+
37
+ def load_page(io, fn):
38
+ content = gr.Markdown("Loading...")
39
+
40
+ io.load(fn=fn, inputs=None, outputs=content)
41
+ return io
42
+
43
+
44
+ def update_patient_index_choices(df):
45
+ """Populate the dropdown with valid patient indices from the batch results."""
+
48
+ if df is None:
49
+ return gr.update(choices=[], value=None)
50
+
51
+ try:
52
+ indices = list(df["Patients's index"].astype(int))
53
+ except Exception:
54
+ return gr.update(choices=[], value=None)
55
+
56
+ if not indices:
57
+ return gr.update(choices=[], value=None)
58
+
59
+ return gr.update(choices=indices, value=indices[0])
predicting_outcomes_in_heart_failure/app/wrapper.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from loguru import logger
3
+ import pandas as pd
4
+
5
+ from predicting_outcomes_in_heart_failure.config import API_URL, FIGURES_DIR
6
+
7
+
8
+ async def _fetch_api(endpoint: str):
9
+ async with httpx.AsyncClient() as client:
10
+ try:
11
+ response = await client.get(f"{API_URL}/{endpoint}")
12
+ response.raise_for_status()
13
+ return response.json()
14
+ except Exception as e:
15
+ logger.error(f"Error fetching {endpoint}: {e}")
16
+ return {"error": str(e)}
17
+
18
+
19
+ class Wrapper:
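+ # Note: the methods below are used as plain async callbacks (accessed on the
+ # class itself, never on an instance), so they take no `self` parameter.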
20
+ async def prediction_with_explanation(
21
+ age,
22
+ chest_pain_type,
23
+ resting_bp,
24
+ cholesterol,
25
+ fasting_bs,
26
+ resting_ecg,
27
+ max_hr,
28
+ exercise_angina,
29
+ oldpeak,
30
+ st_slope,
31
+ ):
32
+ payload = {
33
+ "Age": age,
34
+ "ChestPainType": chest_pain_type,
35
+ "RestingBP": resting_bp,
36
+ "Cholesterol": cholesterol,
37
+ "FastingBS": fasting_bs,
38
+ "RestingECG": resting_ecg,
39
+ "MaxHR": max_hr,
40
+ "ExerciseAngina": exercise_angina,
41
+ "Oldpeak": round(oldpeak, 2),
42
+ "ST_Slope": st_slope,
43
+ }
44
+
45
+ async with httpx.AsyncClient() as client:
46
+ try:
47
+ pred_resp = await client.post(f"{API_URL}/predictions", json=payload)
48
+ pred_resp.raise_for_status()
49
+ pred_json = pred_resp.json()
50
+
51
+ prediction_value = pred_json["data"]["prediction"]
52
+ status = "🆘 Risk Detected" if prediction_value == 1 else "✅ No Risk Detected"
53
+ status_text = f"# Patient's status: {status}"
54
+ except Exception as e:
55
+ logger.error(f"Error making prediction: {e}")
56
+ return f"Error during prediction: {str(e)}", ""
57
+
58
+ try:
59
+ expl_resp = await client.post(f"{API_URL}/explanations", json=payload)
60
+ expl_resp.raise_for_status()
61
+ expl_json = expl_resp.json()
62
+
63
+ plot_rel_url = expl_json["data"].get("explanation_plot_url")
64
+ if not plot_rel_url:
65
+ logger.warning("No explanation_plot_url found in /explanations response.")
66
+ return status_text, ""
67
+
68
+ filename = plot_rel_url.split("/")[-1]
69
+ plot_path = FIGURES_DIR / filename
70
+ return status_text, str(plot_path)
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error getting explanation: {e}")
74
+ return status_text, ""
75
+
76
+ async def batch_prediction(file):
77
+ async with httpx.AsyncClient(timeout=30.0) as client:
78
+ try:
79
+ df = pd.read_csv(file)
80
+
81
+ payload = []
82
+ for _, row in df.iterrows():
83
+ sample = {
84
+ "Age": int(row["Age"]),
85
+ "ChestPainType": row["ChestPainType"],
86
+ "RestingBP": int(row["RestingBP"]),
87
+ "Cholesterol": int(row["Cholesterol"]),
88
+ "FastingBS": int(row["FastingBS"]),
89
+ "RestingECG": row["RestingECG"],
90
+ "MaxHR": int(row["MaxHR"]),
91
+ "ExerciseAngina": row["ExerciseAngina"],
92
+ "Oldpeak": round(float(row["Oldpeak"]), 2),
93
+ "ST_Slope": row["ST_Slope"],
94
+ }
95
+ payload.append(sample)
96
+
97
+ response = await client.post(f"{API_URL}/batch-predictions", json=payload)
98
+ response.raise_for_status()
99
+ result = response.json()
100
+
101
+ results = result["data"]["results"]
102
+ df_results = pd.DataFrame(
103
+ [
104
+ {
105
+ "Patients's index": r["index"],
106
+ "Patient's status": "🆘 Risk Detected"
107
+ if r["prediction"] == 1
108
+ else "✅ No Risk Detected",
109
+ }
110
+ for r in results
111
+ ]
112
+ )
113
+
114
+ return df_results
115
+ except Exception as e:
116
+ logger.error(f"Error making batch prediction: {e}")
117
+ return pd.DataFrame({"error": [str(e)]})
118
+
119
+ async def get_model_card():
120
+ data = await _fetch_api("cards/model_card")
121
+
122
+ card_lines = data.get("data").get("card_lines")
123
+ return "\n".join(card_lines)
124
+
125
+ async def get_dataset_card():
126
+ data = await _fetch_api("cards/dataset_card")
127
+
128
+ card_lines = data.get("data").get("card_lines")
129
+ return "\n".join(card_lines)
130
+
131
+ async def get_hyperparameters():
132
+ data = await _fetch_api("model/hyperparameters")
133
+ if "error" in data:
134
+ return f"## Error\n{data['error']}"
135
+
136
+ data = data.get("data", {}).get("hyperparameters", {}).get("cv", {})
137
+
138
+ md = ""
139
+ for key, value in data.items():
140
+ md += f"- **{key}**: {value}\n"
141
+ return md
142
+
143
+ async def get_metrics():
144
+ data = await _fetch_api("model/metrics")
145
+ if "error" in data:
146
+ return f"## Error\n{data['error']}"
147
+
148
+ metrics = data.get("data", {}).get("metrics", {})
149
+ if not metrics:
150
+ return "## No metrics found"
151
+
152
+ md = ""
153
+ for key, value in metrics.items():
154
+ md += f"- **{key}**: {value:.4f}\n"
155
+ return md
156
+
157
+ async def batch_explanation(file, patient_index: int):
158
+ """Return SHAP plot (filepath) for a specific patient in the uploaded CSV."""
159
+ try:
160
+ df = pd.read_csv(file)
161
+ except Exception as e:
162
+ logger.error(f"Error reading CSV for batch explanation: {e}")
163
+ return None
164
+
165
+ try:
166
+ idx = int(patient_index)
167
+ except (TypeError, ValueError):
168
+ logger.error(f"Invalid patient_index: {patient_index}")
169
+ return None
170
+
171
+ if idx < 0 or idx >= len(df):
172
+ logger.error(f"patient_index {idx} out of range (0..{len(df) - 1})")
173
+ return None
174
+
175
+ row = df.iloc[idx]
176
+
177
+ payload = {
178
+ "Age": int(row["Age"]),
179
+ "ChestPainType": row["ChestPainType"],
180
+ "RestingBP": int(row["RestingBP"]),
181
+ "Cholesterol": int(row["Cholesterol"]),
182
+ "FastingBS": int(row["FastingBS"]),
183
+ "RestingECG": row["RestingECG"],
184
+ "MaxHR": int(row["MaxHR"]),
185
+ "ExerciseAngina": row["ExerciseAngina"],
186
+ "Oldpeak": round(float(row["Oldpeak"]), 2),
187
+ "ST_Slope": row["ST_Slope"],
188
+ }
189
+
190
+ async with httpx.AsyncClient() as client:
191
+ try:
192
+ expl_resp = await client.post(f"{API_URL}/explanations", json=payload)
193
+ expl_resp.raise_for_status()
194
+ expl_json = expl_resp.json()
195
+
196
+ plot_rel_url = expl_json["data"].get("explanation_plot_url")
197
+ if not plot_rel_url:
198
+ logger.warning(
199
+ "No explanation_plot_url found in /explanations response (batch)."
200
+ )
201
+ return None
202
+
203
+ filename = plot_rel_url.split("/")[-1]
204
+ plot_path = FIGURES_DIR / filename
205
+
206
+ return str(plot_path)
207
+
208
+ except Exception as e:
209
+ logger.error(f"Error getting batch explanation: {e}")
210
+ return None
predicting_outcomes_in_heart_failure/config.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from dotenv import load_dotenv
4
+ from loguru import logger
5
+
6
+ load_dotenv()
7
+
8
+ # -------------------
9
+ # Experiment settings
10
+ # -------------------
11
+ VALID_VARIANTS = ["all", "female", "male", "nosex"]
12
+ VALID_MODELS = ["logreg", "random_forest", "decision_tree"]
13
+ EXPERIMENT_NAME = "Heart_Failure_Prediction"
14
+ DATASET_NAME = "fedesoriano/heart-failure-prediction"
15
+ TARGET_COL = "HeartDisease"
16
+ RANDOM_STATE = 42
17
+ TEST_SIZE = 0.30
18
+ N_SPLITS = 5
19
+ SCORING = {
20
+ "accuracy": "accuracy",
21
+ "f1": "f1",
22
+ "recall": "recall",
23
+ "roc_auc": "roc_auc",
24
+ }
25
+
26
+ NUM_COLS_DEFAULT = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
27
+ CAT_COLS_DEFAULT = [
28
+ "Sex",
29
+ "ChestPainType",
30
+ "FastingBS",
31
+ "RestingECG",
32
+ "ExerciseAngina",
33
+ "ST_Slope",
34
+ ]
35
+ MULTI_CAT = ["ChestPainType", "RestingECG", "ST_Slope"]
36
+
37
+ INPUT_COLUMNS = [
38
+ "Age",
39
+ "RestingBP",
40
+ "Cholesterol",
41
+ "FastingBS",
42
+ "MaxHR",
43
+ "ExerciseAngina",
44
+ "Oldpeak",
45
+ "ChestPainType_ASY",
46
+ "ChestPainType_ATA",
47
+ "ChestPainType_NAP",
48
+ "ChestPainType_TA",
49
+ "RestingECG_LVH",
50
+ "RestingECG_Normal",
51
+ "RestingECG_ST",
52
+ "ST_Slope_Down",
53
+ "ST_Slope_Flat",
54
+ "ST_Slope_Up",
55
+ ]
56
+ # -----------------------------------
57
+ # Model hyperparameter configurations
58
+ # -----------------------------------
59
+ CONFIG_RF = {
60
+ "n_estimators": [200, 400, 800],
61
+ "max_depth": [None, 6, 12],
62
+ "class_weight": [None, "balanced"],
63
+ }
64
+ CONFIG_DT = {
65
+ "criterion": ["gini", "entropy", "log_loss"],
66
+ "max_depth": [None, 3, 5, 7, 9, 12],
67
+ "min_samples_split": [2, 5, 10, 20],
68
+ "min_samples_leaf": [1, 2, 4, 8],
69
+ "max_features": [None, "sqrt", "log2"],
70
+ "class_weight": [None, "balanced"],
71
+ "ccp_alpha": [0.0, 0.001, 0.01],
72
+ }
73
+ CONFIG_LR = {"C": [0.01, 0.1, 1, 10], "penalty": ["l2"], "class_weight": [None, "balanced"]}
74
+
75
+ # ----------------------------
76
+ # Repository info
77
+ # ----------------------------
78
+ REPO_OWNER = "se4ai2526-uniba"
79
+ REPO_NAME = "CardioTrack"
80
+
81
+ # ----------------------------
82
+ # Great Expectations
83
+ # ----------------------------
84
+ SOURCE_NAME = "heart_data_source"
85
+ ASSET_NAME = "heart_failure"
86
+ SUITE_NAME = "heart_failure_data_quality"
87
+
88
+ # ----------------------------
89
+ # Paths
90
+ # ----------------------------
91
+ PROJ_ROOT = Path(__file__).resolve().parents[1]
92
+ logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
93
+
94
+ DATA_DIR = PROJ_ROOT / "data"
95
+ INTERIM_DATA_DIR = DATA_DIR / "interim"
96
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
97
+ RAW_DATA_DIR = DATA_DIR / "raw"
98
+ EXTERNAL_DATA_DIR = DATA_DIR / "external"
99
+
100
+ RAW_PATH = RAW_DATA_DIR / "heart.csv"
101
+ PREPROCESSED_CSV = INTERIM_DATA_DIR / "preprocessed.csv"
102
+ TRAIN_CSV = PROCESSED_DATA_DIR / "train.csv"
103
+ TEST_CSV = PROCESSED_DATA_DIR / "test.csv"
104
+
105
+ MODELS_DIR = PROJ_ROOT / "models"
106
+ REPORTS_DIR = PROJ_ROOT / "reports"
107
+ FIGURES_DIR = REPORTS_DIR / "figures"
108
+
109
+ METRICS_DIR = PROJ_ROOT / "metrics"
110
+ TEST_METRICS_DIR = METRICS_DIR / "test"
111
+
112
+ NOSEX_CSV = INTERIM_DATA_DIR / "preprocessed_no_sex_column.csv"
113
+ MALE_CSV = INTERIM_DATA_DIR / "preprocessed_male_only.csv"
114
+ FEMALE_CSV = INTERIM_DATA_DIR / "preprocessed_female_only.csv"
115
+
116
+ PREPROCESS_ARTIFACTS_DIR = INTERIM_DATA_DIR / "preprocess_artifacts"
117
+ SCALER_PATH = PREPROCESS_ARTIFACTS_DIR / "scaler.joblib"
118
+
119
+ MODEL_PATH = Path("models/nosex/random_forest.joblib")
120
+
121
+ CARD_PATHS = {
122
+ "dataset_card": DATA_DIR / "README.md",
123
+ "model_card": MODELS_DIR / "README.md",
124
+ }
125
+
126
+ # ----------------------------
127
+ # API
128
+ # ----------------------------
129
+ API_URL = "http://localhost:7860"
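Everything downstream imports its paths and constants from this module, so a short usage sketch may help (illustrative only; importing the module logs `PROJ_ROOT` as a side effect):

```python
# Illustrative usage of the central config module.
from predicting_outcomes_in_heart_failure.config import (
    INPUT_COLUMNS,
    MODEL_PATH,
    PROCESSED_DATA_DIR,
)

print(PROCESSED_DATA_DIR / "nosex" / "train.csv")  # per-variant split written by split_data.py
print(MODEL_PATH)                                  # default model used for inference
print(len(INPUT_COLUMNS))                          # 17 engineered feature columns
```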
predicting_outcomes_in_heart_failure/data/dataset.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+
4
+ import kagglehub
5
+ from loguru import logger
6
+ from predicting_outcomes_in_heart_failure.config import DATASET_NAME, RAW_DATA_DIR
7
+ import typer
8
+
9
+ app = typer.Typer()
10
+
11
+
12
+ @app.command()
13
+ def main():
14
+ logger.info("Downloading dataset from Kaggle...")
15
+ os.makedirs(RAW_DATA_DIR, exist_ok=True)
16
+ path = kagglehub.dataset_download(DATASET_NAME)
17
+ shutil.copytree(path, RAW_DATA_DIR, dirs_exist_ok=True)
18
+ logger.success("Dataset downloaded successfully to {RAW_DATA_DIR}.")
19
+
20
+
21
+ if __name__ == "__main__":
22
+ app()
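The downloader is exposed as a Typer CLI; a sketch of both invocation styles (assuming Kaggle credentials are configured for kagglehub):

```python
# From the shell (assumes kagglehub can authenticate to Kaggle):
#   python -m predicting_outcomes_in_heart_failure.data.dataset
# Or programmatically, since @app.command() leaves `main` directly callable:
from predicting_outcomes_in_heart_failure.data.dataset import main

main()  # copies fedesoriano/heart-failure-prediction into data/raw/
```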
predicting_outcomes_in_heart_failure/data/preprocess.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from loguru import logger
3
+ import pandas as pd
4
+ from predicting_outcomes_in_heart_failure.config import (
5
+ FEMALE_CSV,
6
+ INTERIM_DATA_DIR,
7
+ MALE_CSV,
8
+ NOSEX_CSV,
9
+ NUM_COLS_DEFAULT,
10
+ PREPROCESS_ARTIFACTS_DIR,
11
+ PREPROCESSED_CSV,
12
+ RAW_PATH,
13
+ SCALER_PATH,
14
+ TARGET_COL,
15
+ )
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ def save_scaler_artifact(scaler: StandardScaler):
20
+ """Save only the fitted scaler used during preprocessing for inference."""
21
+ PREPROCESS_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
22
+ joblib.dump(scaler, SCALER_PATH)
23
+ logger.success(f"Saved StandardScaler to {SCALER_PATH}")
24
+
25
+
26
+ def generate_gender_splits(df: pd.DataFrame):
27
+ """Create and save gender-based CSV splits (female, male, nosex)."""
28
+ if "Sex" in df.columns:
29
+ df_female = df[df["Sex"] == 0].copy()
30
+ df_female.to_csv(FEMALE_CSV, index=False)
31
+ logger.success(f"Saved female-only dataset: {FEMALE_CSV} (rows={len(df_female)})")
32
+
33
+ if "Sex" in df.columns:
34
+ df_male = df[df["Sex"] == 1].copy()
35
+ df_male.to_csv(MALE_CSV, index=False)
36
+ logger.success(f"Saved male-only dataset: {MALE_CSV} (rows={len(df_male)})")
37
+
38
+ df_nosex = df.drop(columns=["Sex"], errors="ignore").copy()
39
+ df_nosex.to_csv(NOSEX_CSV, index=False)
40
+ logger.success(f"Saved dataset without 'Sex': {NOSEX_CSV} (rows={len(df_nosex)})")
41
+
42
+
43
+ def preprocessing():
44
+ """Run the full preprocessing pipeline on the raw heart dataset."""
45
+ logger.info("Starting preprocessing pipeline...")
46
+
47
+ if not RAW_PATH.exists():
48
+ logger.error(f"Missing {RAW_PATH}. Put heart.csv under data/raw/ or adjust RAW_PATH.")
49
+ raise FileNotFoundError(f"Missing {RAW_PATH}.")
50
+
51
+ df = pd.read_csv(RAW_PATH)
52
+ logger.info(f"Loaded dataset: {RAW_PATH} (rows={len(df)}, cols={df.shape[1]})")
53
+
54
+ if len(df) < 2:
55
+ raise ValueError("Preprocessing requires at least 2 rows, got only 1.")
56
+
57
+ # Ensure target is integer
58
+ df[TARGET_COL] = df[TARGET_COL].astype(int)
59
+
60
+ # Remove invalid RestingBP rows
61
+ if "RestingBP" in df.columns:
62
+ before = len(df)
63
+ df = df[df["RestingBP"] != 0].reset_index(drop=True)
64
+ removed = before - len(df)
65
+ if removed > 0:
66
+ logger.warning(f"Removed {removed} rows with RestingBP == 0")
67
+
68
+ # Impute missing/zero Cholesterol
69
+ if "Cholesterol" in df.columns:
70
+ zero_mask = df["Cholesterol"] == 0
71
+ if zero_mask.any():
72
+ median_chol = df.loc[~zero_mask, "Cholesterol"].median()
73
+ df.loc[zero_mask, "Cholesterol"] = median_chol
74
+ logger.info(f"Imputed {zero_mask.sum()} Cholesterol==0 with median={median_chol}")
75
+
76
+ # Encode binary categorical features
77
+ if "Sex" in df.columns:
78
+ df["Sex"] = df["Sex"].map({"M": 1, "F": 0}).astype(int)
79
+ logger.debug("Encoded 'Sex' as binary.")
80
+
81
+ if "ExerciseAngina" in df.columns:
82
+ df["ExerciseAngina"] = df["ExerciseAngina"].map({"Y": 1, "N": 0}).astype(int)
83
+ logger.debug("Encoded 'ExerciseAngina' as binary.")
84
+
85
+ # One-hot encode multi-category features
86
+ multi_cat = [c for c in ["ChestPainType", "RestingECG", "ST_Slope"] if c in df.columns]
87
+ df = pd.get_dummies(df, columns=multi_cat, drop_first=False)
88
+ logger.debug(f"One-hot encoded columns: {multi_cat}")
89
+
90
+ # Scale numerical columns
91
+ num_cols = [c for c in NUM_COLS_DEFAULT if c in df.columns and c != TARGET_COL]
92
+ scaler = StandardScaler()
93
+ df[num_cols] = scaler.fit_transform(df[num_cols])
94
+ logger.info(f"Scaled numerical features: {num_cols}")
95
+
96
+ # Save processed dataset
97
+ df.to_csv(PREPROCESSED_CSV, index=False)
98
+ logger.success(
99
+ f"Saved preprocessed dataset: {PREPROCESSED_CSV} (rows={len(df)}, cols={df.shape[1]})"
100
+ )
101
+
102
+ # Log class distribution
103
+ count_0 = (df[TARGET_COL] == 0).sum()
104
+ count_1 = (df[TARGET_COL] == 1).sum()
105
+ logger.info(f"Target balance — 0: {count_0} | 1: {count_1}")
106
+
107
+ save_scaler_artifact(scaler)
108
+
109
+ logger.success("Preprocessing completed successfully.")
110
+ return df
111
+
112
+
113
+ if __name__ == "__main__":
114
+ INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)
115
+ df_processed = preprocessing()
116
+ generate_gender_splits(df_processed)
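To make the encoding steps concrete, here is a self-contained toy run of the same transforms on synthetic values (a sketch mirroring `preprocessing()`, not the project entry point):

```python
# Toy reproduction of the preprocessing steps: zero-Cholesterol imputation,
# binary encoding, one-hot encoding, and scaling. Values are synthetic.
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame(
    {
        "Age": [54, 61],
        "Sex": ["M", "F"],
        "ChestPainType": ["ASY", "ATA"],
        "Cholesterol": [239, 0],  # zero is treated as missing
    }
)

# Impute Cholesterol == 0 with the median of the non-zero values
zero = df["Cholesterol"] == 0
df.loc[zero, "Cholesterol"] = df.loc[~zero, "Cholesterol"].median()

# Binary-encode Sex, one-hot encode ChestPainType, scale numeric columns
df["Sex"] = df["Sex"].map({"M": 1, "F": 0}).astype(int)
df = pd.get_dummies(df, columns=["ChestPainType"], drop_first=False)
df[["Age", "Cholesterol"]] = StandardScaler().fit_transform(df[["Age", "Cholesterol"]])
print(df)
```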
predicting_outcomes_in_heart_failure/data/split_data.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from loguru import logger
5
+ import pandas as pd
6
+ from predicting_outcomes_in_heart_failure.config import (
7
+ FEMALE_CSV,
8
+ MALE_CSV,
9
+ NOSEX_CSV,
10
+ PREPROCESSED_CSV,
11
+ PROCESSED_DATA_DIR,
12
+ RANDOM_STATE,
13
+ TARGET_COL,
14
+ TEST_SIZE,
15
+ )
16
+ from sklearn.model_selection import train_test_split
17
+
18
+ VARIANTS = {
19
+ "all": PREPROCESSED_CSV,
20
+ "female": FEMALE_CSV,
21
+ "male": MALE_CSV,
22
+ "nosex": NOSEX_CSV,
23
+ }
24
+
25
+
26
+ def _safe_train_test_split(X, y, test_size, random_state):
27
+ """Perform a stratified train/test split with fallback if not possible."""
28
+ stratify_y = y if y.nunique() > 1 else None
29
+ try:
30
+ X_tr, X_te, y_tr, y_te = train_test_split(
31
+ X,
32
+ y,
33
+ test_size=test_size,
34
+ stratify=stratify_y,
35
+ random_state=random_state,
36
+ shuffle=True,
37
+ )
38
+ if stratify_y is None:
39
+ logger.warning("Target has only one class — performing non-stratified split.")
40
+ else:
41
+ logger.debug("Stratified split executed successfully.")
42
+ return X_tr, X_te, y_tr, y_te
43
+ except ValueError as e:
44
+ logger.warning(f"Stratified split failed ({e}). Falling back to non-stratified split.")
45
+ return train_test_split(
46
+ X,
47
+ y,
48
+ test_size=test_size,
49
+ stratify=None,
50
+ random_state=random_state,
51
+ shuffle=True,
52
+ )
53
+
54
+
55
+ def split_one(csv_path: Path, variant: str):
56
+ """Split a specific variant (all/female/male/nosex) into train/test sets."""
57
+ if not csv_path.exists():
58
+ logger.warning(f"[{variant}] Missing CSV file: {csv_path} — skipping.")
59
+ return
60
+
61
+ df = pd.read_csv(csv_path)
62
+ logger.info(f"[{variant}] Loaded {csv_path} (rows={len(df)}, cols={df.shape[1]})")
63
+
64
+ if TARGET_COL not in df.columns:
65
+ raise ValueError(f"[{variant}] Target column '{TARGET_COL}' not found in {csv_path}")
66
+
67
+ X = df.drop(columns=[TARGET_COL])
68
+ y = df[TARGET_COL].astype(int)
69
+
70
+ X_train, X_test, y_train, y_test = _safe_train_test_split(X, y, TEST_SIZE, RANDOM_STATE)
71
+
72
+ train_df = X_train.copy()
73
+ train_df[TARGET_COL] = y_train.values
74
+ test_df = X_test.copy()
75
+ test_df[TARGET_COL] = y_test.values
76
+
77
+ out_dir = PROCESSED_DATA_DIR / variant
78
+ out_dir.mkdir(parents=True, exist_ok=True)
79
+ train_p = out_dir / "train.csv"
80
+ test_p = out_dir / "test.csv"
81
+
82
+ train_df.to_csv(train_p, index=False)
83
+ test_df.to_csv(test_p, index=False)
84
+
85
+ logger.success(f"[{variant}] Saved TRAIN -> {train_p} (rows={len(train_df)})")
86
+ logger.success(f"[{variant}] Saved TEST -> {test_p} (rows={len(test_df)})")
87
+
88
+ train_counts = train_df[TARGET_COL].value_counts().to_dict()
89
+ test_counts = test_df[TARGET_COL].value_counts().to_dict()
90
+ logger.info(f"[{variant}] Class distribution — TRAIN: {train_counts} | TEST: {test_counts}")
91
+
92
+
93
+ def main():
94
+ parser = argparse.ArgumentParser()
95
+ parser.add_argument(
96
+ "--variant",
97
+ type=str,
98
+ choices=list(VARIANTS.keys()),
99
+ required=True,
100
+ help="Data variant to split: all, female, male, or nosex.",
101
+ )
102
+ args = parser.parse_args()
103
+
104
+ variant = args.variant
105
+ csv_path = VARIANTS[variant]
106
+
107
+ logger.info(f"Starting splitting for variant='{variant}'")
108
+ PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
109
+ split_one(csv_path, variant)
110
+ logger.success(f"Splitting completed for variant='{variant}'")
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
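Each variant is split independently into the layout that `train.py` and `evaluate.py` expect. A sketch of running one split and checking its outputs (paths follow `config.py`):

```python
# Shell invocation (one variant per call):
#   python -m predicting_outcomes_in_heart_failure.data.split_data --variant nosex
# Quick check that the 70/30 split landed where downstream stages expect it:
from predicting_outcomes_in_heart_failure.config import PROCESSED_DATA_DIR

for name in ("train.csv", "test.csv"):
    path = PROCESSED_DATA_DIR / "nosex" / name
    print(path, "->", "ok" if path.exists() else "missing")
```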
predicting_outcomes_in_heart_failure/modeling/__init__.py ADDED
File without changes
predicting_outcomes_in_heart_failure/modeling/evaluate.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import dagshub
6
+ import joblib
7
+ from loguru import logger
8
+ import mlflow
9
+ from mlflow.models.signature import infer_signature
10
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
11
+
12
+ from predicting_outcomes_in_heart_failure.config import (
13
+ DATASET_NAME,
14
+ EXPERIMENT_NAME,
15
+ MODELS_DIR,
16
+ PROCESSED_DATA_DIR,
17
+ REPO_NAME,
18
+ REPO_OWNER,
19
+ TARGET_COL,
20
+ TEST_METRICS_DIR,
21
+ VALID_MODELS,
22
+ VALID_VARIANTS,
23
+ )
24
+ from predicting_outcomes_in_heart_failure.modeling.train import load_split
25
+
26
+
27
+ def compute_metrics(model, X_test, y_test):
28
+ """Compute evaluation metrics (F1, recall, accuracy, ROC-AUC); return (metrics, y_pred)."""
29
+ y_pred = model.predict(X_test)
30
+ results = {
31
+ "test_f1": f1_score(y_test, y_pred, zero_division=0),
32
+ "test_recall": recall_score(y_test, y_pred, zero_division=0),
33
+ "test_accuracy": accuracy_score(y_test, y_pred),
34
+ }
35
+ if hasattr(model, "predict_proba"):
36
+ try:
37
+ y_prob = model.predict_proba(X_test)[:, 1]
38
+ results["test_roc_auc"] = roc_auc_score(y_test, y_prob)
39
+ except Exception as e:
40
+ logger.warning(f"ROC AUC not computed: {e}")
41
+ return results, y_pred
42
+
43
+
44
+ def evaluate_variant(variant: str, model_name: str | None = None):
45
+ """Evaluate trained models for a given variant, optionally by model."""
46
+ logger.info(f"=== Evaluation started (variant={variant}, model={model_name or 'ALL'}) ===")
47
+
48
+ test_path = PROCESSED_DATA_DIR / variant / "test.csv"
49
+ test_df = load_split(test_path)
50
+
51
+ X_test = test_df.drop(columns=[TARGET_COL])
52
+ y_test = test_df[TARGET_COL].astype(int)
53
+
54
+ models_dir_variant = MODELS_DIR / variant
55
+ if not models_dir_variant.exists():
56
+ logger.warning(
57
+ f"[{variant}] Models directory does not exist: {models_dir_variant} — skipping."
58
+ )
59
+ return
60
+
61
+ experiment_name = f"{EXPERIMENT_NAME}_{variant}"
62
+ experiment = mlflow.get_experiment_by_name(experiment_name)
63
+ if experiment is None:
64
+ logger.error(f"Experiment '{experiment_name}' not found.")
65
+ return
66
+
67
+ model_files = []
68
+ if model_name is not None:
69
+ model_files = [f"{model_name}.joblib"]
70
+ else:
71
+ model_files = [f for f in os.listdir(models_dir_variant) if f.endswith(".joblib")]
72
+
73
+ for file in model_files:
74
+ if not file.endswith(".joblib"):
75
+ continue
76
+
77
+ current_model_name = file.split(".joblib")[0]
78
+ run_name = f"{current_model_name}_{variant}"
79
+ logger.info(
80
+ f"[{variant} | {current_model_name}] Looking for training run '{run_name}' in MLflow."
81
+ )
82
+
83
+ runs = mlflow.search_runs(
84
+ experiment_ids=[experiment.experiment_id],
85
+ filter_string=f"tags.mlflow.runName = '{run_name}'",
86
+ order_by=["start_time DESC"],
87
+ max_results=1,
88
+ )
89
+
90
+ if runs.empty:
91
+ logger.warning(
92
+ f"[{variant} | {current_model_name}]No matching MLflow run found — skipping."
93
+ )
94
+ continue
95
+
96
+ tracked_id = runs.loc[0, "run_id"]
97
+
98
+ with mlflow.start_run(run_id=tracked_id):
99
+ rawdata = mlflow.data.from_pandas(test_df, name=f"{DATASET_NAME}_{variant}_test")
100
+ mlflow.log_input(rawdata, context="testing")
101
+
102
+ model_path = models_dir_variant / file
103
+ model = joblib.load(model_path)
104
+
105
+ metrics, _ = compute_metrics(model, X_test, y_test)
106
+ mlflow.log_metrics(metrics)
107
+
108
+ logger.info(f"[{variant} | {current_model_name}] Test set metrics:")
109
+ for k in ["test_f1", "test_recall", "test_accuracy", "test_roc_auc"]:
110
+ if k in metrics:
111
+ logger.info(f" - {k}: {metrics[k]:.4f}")
112
+
113
+ metrics_dir = TEST_METRICS_DIR / variant
114
+ metrics_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ metrics_path = metrics_dir / f"{current_model_name}.json"
117
+
118
+ to_save = {
119
+ "variant": variant,
120
+ "model_name": current_model_name,
121
+ "metrics": metrics,
122
+ }
123
+
124
+ with open(metrics_path, "w", encoding="utf-8") as f:
125
+ json.dump(to_save, f, indent=4)
126
+
127
+ logger.info(
128
+ f"[{variant} | {current_model_name}] Saved test metrics locally → {metrics_path}"
129
+ )
130
+
131
+ if (
132
+ metrics.get("test_f1", 0.0) >= 0.80
133
+ and metrics.get("test_recall", 0.0) >= 0.80
134
+ and metrics.get("test_accuracy", 0.0) >= 0.80
135
+ and metrics.get("test_roc_auc", 0.0) >= 0.85
136
+ ):
137
+ signature = infer_signature(X_test, model.predict(X_test))
138
+ registered_name = f"{current_model_name}_{variant}"
139
+ mlflow.sklearn.log_model(
140
+ sk_model=model,
141
+ artifact_path="Model_Info",
142
+ signature=signature,
143
+ input_example=X_test,
144
+ registered_model_name=registered_name,
145
+ )
146
+ logger.success(
147
+ f"[{variant} | {current_model_name}] "
148
+ f"Model promoted and registered as '{registered_name}'."
149
+ )
150
+
151
+ logger.success(
152
+ f"=== Evaluation completed (variant={variant}, model={model_name or 'ALL'}) ==="
153
+ )
154
+
155
+
156
+ def main():
157
+ parser = argparse.ArgumentParser()
158
+ parser.add_argument(
159
+ "--variant",
160
+ type=str,
161
+ choices=VALID_VARIANTS,
162
+ required=True,
163
+ help="Data variant to use: all, female, male, or nosex.",
164
+ )
165
+ parser.add_argument(
166
+ "--model",
167
+ type=str,
168
+ choices=VALID_MODELS,
169
+ required=False,
170
+ help=(
171
+ "Specific model to evaluate (logreg, random_forest, decision_tree)."
172
+ " If omitted, evaluate all models."
173
+ ),
174
+ )
175
+ args = parser.parse_args()
176
+
177
+ dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
178
+ evaluate_variant(args.variant, args.model)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
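Evaluation looks up each model's training run on the remote MLflow tracker, so DagsHub credentials must be available in the environment. A usage sketch, including a peek at the local metrics JSON the script writes (the exact file shown assumes the nosex random forest has been evaluated):

```python
# Shell invocations:
#   python -m predicting_outcomes_in_heart_failure.modeling.evaluate --variant nosex
#   python -m predicting_outcomes_in_heart_failure.modeling.evaluate --variant all --model logreg
import json

from predicting_outcomes_in_heart_failure.config import TEST_METRICS_DIR

# Metrics are also written locally, one JSON file per (variant, model) pair
with open(TEST_METRICS_DIR / "nosex" / "random_forest.json", encoding="utf-8") as f:
    print(json.load(f)["metrics"])
```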
predicting_outcomes_in_heart_failure/modeling/explainability.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import matplotlib
7
+
8
+ matplotlib.use("Agg")
9
+
10
+ from loguru import logger
11
+ import matplotlib.pyplot as plt
12
+ import numpy as np
13
+ import pandas as pd
14
+ import shap
15
+
16
+
17
+ def explain_prediction(
18
+ model: Any,
19
+ X: pd.DataFrame,
20
+ model_type: str,
21
+ top_k: int = 5,
22
+ ):
23
+ """
24
+ Build an explanation for a single sample (the first row of X).
25
+ """
26
+
27
+ if X.empty:
28
+ logger.warning("Received empty DataFrame for explanation; returning empty list.")
29
+ return []
30
+
31
+ model_type = model_type.lower()
32
+ x = X.iloc[[0]]
33
+ feature_names = x.columns.tolist()
34
+
35
+ # ---------------------------------------------------------------------
36
+ # 1) Logistic Regression → use coefficients
37
+ # ---------------------------------------------------------------------
38
+ if model_type in ("logreg", "logistic_regression"):
39
+ logger.info("Using coefficient-based explanation for Logistic Regression.")
40
+
41
+ if not hasattr(model, "coef_"):
42
+ logger.error(
43
+ "Model has no coef_ attribute;cannot build coefficient-based explanation."
44
+ )
45
+ return []
46
+
47
+ coef = np.asarray(model.coef_[0]).reshape(-1)
48
+ if coef.shape[0] != len(feature_names):
49
+ logger.warning(
50
+ f"Coefficient vector length ({coef.shape[0]}) does not match "
51
+ f"number of features ({len(feature_names)}). "
52
+ "Truncating to minimum length."
53
+ )
54
+
55
+ n = min(len(feature_names), coef.shape[0])
56
+ explanations = [
57
+ {
58
+ "feature": feature_names[i],
59
+ "value": float(coef[i]),
60
+ "abs_value": float(abs(coef[i])),
61
+ }
62
+ for i in range(n)
63
+ ]
64
+
65
+ explanations = sorted(explanations, key=lambda d: d["abs_value"], reverse=True)[:top_k]
66
+ logger.info(
67
+ f"Built coefficient-based explanation. Returning top {len(explanations)} features."
68
+ )
69
+ return explanations
70
+
71
+ # ---------------------------------------------------------------------
72
+ # 2) Tree-based models → SHAP TreeExplainer
73
+ # ---------------------------------------------------------------------
74
+ if model_type in ("random_forest", "decision_tree"):
75
+ logger.info("Using SHAP TreeExplainer for tree-based model.")
76
+
77
+ if X.empty:
78
+ logger.warning("Received empty DataFrame for SHAP explanation; returning empty list.")
79
+ return []
80
+
81
+ x = X.iloc[[0]]
82
+ feature_names = x.columns.tolist()
83
+
84
+ try:
85
+ explainer = shap.TreeExplainer(model)
86
+ shap_exp = explainer(x)
87
+ values = np.asarray(shap_exp.values)
88
+ logger.debug(f"Raw SHAP values shape: {values.shape!r}")
89
+ except Exception as e:
90
+ logger.error(f"SHAP TreeExplainer failed: {e}")
91
+ logger.warning("SHAP explanation not available for this model.")
92
+ return []
93
+
94
+ if values.ndim == 2:
95
+ shap_vec = values[0]
96
+
97
+ elif values.ndim == 3:
98
+ n_samples, dim2, dim3 = values.shape
99
+
100
+ if dim2 == x.shape[1]:
101
+ n_outputs = dim3
102
+ class_index = 1 if n_outputs > 1 else 0
103
+ shap_vec = values[0, :, class_index]
104
+
105
+ elif dim3 == x.shape[1]:
106
+ n_outputs = dim2
107
+ class_index = 1 if n_outputs > 1 else 0
108
+ shap_vec = values[0, class_index, :]
109
+
110
+ else:
111
+ logger.error(f"Unexpected SHAP shape {values.shape} for {x.shape[1]} features.")
112
+ return []
113
+
114
+ else:
115
+ logger.error(f"Unexpected SHAP values dimension: {values.ndim}")
116
+ return []
117
+
118
+ shap_vec = np.asarray(shap_vec).reshape(-1)
119
+
120
+ if shap_vec.shape[0] != len(feature_names):
121
+ logger.warning(
122
+ f"SHAP vector length ({shap_vec.shape[0]}) "
123
+ f"!= number of features ({len(feature_names)}). "
124
+ "Truncating to minimum length."
125
+ )
126
+
127
+ n = min(len(feature_names), shap_vec.shape[0])
128
+ explanations = [
129
+ {
130
+ "feature": feature_names[i],
131
+ "value": float(shap_vec[i]),
132
+ "abs_value": float(abs(shap_vec[i])),
133
+ }
134
+ for i in range(n)
135
+ ]
136
+
137
+ explanations = sorted(explanations, key=lambda d: d["abs_value"], reverse=True)[:top_k]
138
+
139
+ logger.info(f"Built SHAP-based explanation. Returning top {len(explanations)} features.")
140
+ return explanations
141
+
142
+
143
+ def save_shap_waterfall_plot(
144
+ model: Any,
145
+ X: pd.DataFrame,
146
+ model_type: str,
147
+ output_path: Path,
148
+ ) -> Path | None:
149
+ """
150
+ Save a SHAP waterfall plot for a single sample to the given output path.
151
+ """
152
+ model_type = model_type.lower()
153
+
154
+ if model_type not in ("random_forest", "decision_tree"):
155
+ logger.warning(
156
+ f"Waterfall plot is only supported for tree-based models. "
157
+ f"Got model_type='{model_type}'. Skipping plot generation."
158
+ )
159
+ return None
160
+
161
+ if X.empty:
162
+ logger.warning("Received empty DataFrame for SHAP plot; skipping.")
163
+ return None
164
+
165
+ x = X.iloc[[0]]
166
+ logger.info(f"Generating SHAP waterfall plot for model_type='{model_type}'.")
167
+
168
+ try:
169
+ explainer = shap.TreeExplainer(model)
170
+ shap_exp = explainer(x)
171
+ except Exception as e:
172
+ logger.error(f"Failed to build SHAP explainer for plot: {e}")
173
+ return None
174
+
175
+ try:
176
+ output_path.parent.mkdir(parents=True, exist_ok=True)
177
+
178
+ shap_to_plot = shap_exp
179
+ if np.asarray(shap_exp.values).ndim == 3:
180
+ vals = np.asarray(shap_exp.values)
181
+ if vals.shape[1] == x.shape[1]:
182
+ shap_to_plot = shap_exp[..., 1]
183
+ elif vals.shape[2] == x.shape[1]:
184
+ shap_to_plot = shap_exp[:, 1, :]
185
+ else:
186
+ logger.warning(
187
+ f"Unexpected shape for SHAP values in plot: {vals.shape}. "
188
+ "Falling back to shap_exp[0]."
189
+ )
190
+ shap_to_plot = shap_exp
191
+
192
+ plt.figure()
193
+ shap.plots.waterfall(shap_to_plot[0], show=False)
194
+ plt.tight_layout()
195
+ plt.savefig(output_path, bbox_inches="tight")
196
+ plt.close()
197
+
198
+ logger.success(f"SHAP waterfall plot saved to {output_path}")
199
+ return output_path
200
+ except Exception as e:
201
+ logger.error(f"Failed to save SHAP waterfall plot: {e}")
202
+ return None
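A self-contained sketch of `explain_prediction()` on a tiny fitted forest may clarify the expected output shape: a list of `{feature, value, abs_value}` dicts sorted by absolute attribution. The data below is synthetic and the feature names are illustrative:

```python
# Synthetic demo of explain_prediction() with a small random forest.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from predicting_outcomes_in_heart_failure.modeling.explainability import explain_prediction

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["Age", "MaxHR", "Oldpeak"])
y = (X["Age"] + X["Oldpeak"] > 0).astype(int)

model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)
for item in explain_prediction(model, X.head(1), model_type="random_forest", top_k=3):
    print(f"{item['feature']}: {item['value']:+.4f}")
```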
predicting_outcomes_in_heart_failure/modeling/predict.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ import joblib
6
+ from loguru import logger
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from predicting_outcomes_in_heart_failure.app.schema import HeartSample
11
+ from predicting_outcomes_in_heart_failure.config import (
12
+ FIGURES_DIR,
13
+ INPUT_COLUMNS,
14
+ MODEL_PATH,
15
+ MULTI_CAT,
16
+ NUM_COLS_DEFAULT,
17
+ SCALER_PATH,
18
+ )
19
+ from predicting_outcomes_in_heart_failure.modeling.explainability import (
20
+ explain_prediction,
21
+ save_shap_waterfall_plot,
22
+ )
23
+
24
+
25
+ def preprocessing(sample_df: pd.DataFrame) -> pd.DataFrame:
26
+ """
27
+ Apply the training-time preprocessing: encoding, one-hot expansion, column alignment, scaling.
28
+ """
29
+ logger.info("Applying preprocessing pipeline for inference...")
30
+
31
+ if not (SCALER_PATH.exists() and MODEL_PATH.exists()):
32
+ raise FileNotFoundError("Preprocessing artifacts missing.")
33
+
34
+ scaler = joblib.load(SCALER_PATH)
35
+ input_columns = INPUT_COLUMNS
36
+ multi_cat = MULTI_CAT
37
+ num_cols = NUM_COLS_DEFAULT
38
+
39
+ logger.debug(f"Loaded scaler from {SCALER_PATH}")
40
+ logger.debug(f"Using {len(input_columns)} input columns")
41
+
42
+ if "Sex" in sample_df.columns and "Sex" not in input_columns:
43
+ logger.debug("Dropping column 'Sex' since it's not used by the current model variant.")
44
+ sample_df = sample_df.drop(columns=["Sex"])
45
+
46
+ if "Sex" in sample_df.columns and "Sex" in input_columns:
47
+ sample_df["Sex"] = sample_df["Sex"].map({"M": 1, "F": 0}).astype(int)
48
+ logger.debug("Mapped 'Sex' to binary values (M=1, F=0).")
49
+
50
+ if "ExerciseAngina" in sample_df.columns and "ExerciseAngina" in input_columns:
51
+ sample_df["ExerciseAngina"] = sample_df["ExerciseAngina"].map({"Y": 1, "N": 0}).astype(int)
52
+ logger.debug("Mapped 'ExerciseAngina' to binary values (Y=1, N=0).")
53
+
54
+ present_multi = [c for c in multi_cat if c in sample_df.columns]
55
+ if present_multi:
56
+ logger.debug(f"Performing one-hot encoding on: {present_multi}")
57
+ sample_df = pd.get_dummies(sample_df, columns=present_multi, drop_first=False)
58
+
59
+ for col in input_columns:
60
+ if col not in sample_df.columns:
61
+ sample_df[col] = 0
62
+ sample_df = sample_df.reindex(columns=input_columns, fill_value=0)
63
+ logger.debug("Aligned input columns with training feature order.")
64
+
65
+ cols_to_scale = [c for c in num_cols if c in sample_df.columns]
66
+ sample_df[cols_to_scale] = scaler.transform(sample_df[cols_to_scale])
67
+ logger.debug(f"Scaled numerical columns: {cols_to_scale}")
68
+
69
+ logger.success("Preprocessing completed successfully.")
70
+ return sample_df
71
+
72
+
73
+ def main():
74
+ logger.info("Starting static inference...")
75
+
76
+ sample = HeartSample(
77
+ Age=54,
78
+ ChestPainType="ASY",
79
+ RestingBP=140,
80
+ Cholesterol=239,
81
+ FastingBS=0,
82
+ RestingECG="Normal",
83
+ MaxHR=160,
84
+ ExerciseAngina="N",
85
+ Oldpeak=0.0,
86
+ ST_Slope="Up",
87
+ )
88
+ logger.info("Sample created successfully.")
89
+
90
+ X_raw = sample.to_dataframe()
91
+ logger.debug(f"Raw input features:\n{X_raw}")
92
+ X = preprocessing(X_raw)
93
+
94
+ if not MODEL_PATH.exists():
95
+ raise FileNotFoundError(f"Model not found: {MODEL_PATH}")
96
+ model = joblib.load(MODEL_PATH)
97
+ logger.success(f"Loaded model from {MODEL_PATH}")
98
+
99
+ # Perform prediction
100
+ t0 = time.perf_counter()
101
+ y_pred = model.predict(X)[0]
102
+ inference_time = time.perf_counter() - t0
103
+ y_pred = int(y_pred) if np.issubdtype(type(y_pred), np.integer) else y_pred
104
+ result = {
105
+ "prediction": y_pred,
106
+ "inference_time_seconds": inference_time,
107
+ }
108
+
109
+ # Explainability
110
+ model = joblib.load(MODEL_PATH)
111
+ model_type = MODEL_PATH.stem
112
+ try:
113
+ logger.info("Computing explanation for the prediction...")
114
+ explanations = explain_prediction(model, X, model_type=model_type, top_k=5)
115
+ result["explanations"] = explanations
116
+ logger.success("Explanation computed successfully.")
117
+ except Exception as e:
118
+ logger.error(f"Failed to compute explanation: {e}")
119
+
120
+ try:
121
+ shap_path = FIGURES_DIR / f"shap_waterfall_{model_type}.png"
122
+ saved = save_shap_waterfall_plot(model, X, model_type=model_type, output_path=shap_path)
123
+ if saved is not None:
124
+ result["explanation_plot"] = str(saved)
125
+ except Exception as e:
126
+ logger.error(f"Failed to generate SHAP waterfall plot: {e}")
127
+
128
+ logger.info("Inference completed.")
129
+ logger.success(f"Prediction result: {result}")
130
+
131
+ return result
132
+
133
+
134
+ if __name__ == "__main__":
135
+ main()
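`main()` doubles as a smoke test for the serving path. A sketch of running it (assumes the scaler artifact and the default nosex random-forest model exist locally, e.g. after `dvc pull`):

```python
# Static inference demo; requires SCALER_PATH and MODEL_PATH artifacts on disk.
from predicting_outcomes_in_heart_failure.modeling.predict import main

result = main()
print(result["prediction"], f"{result['inference_time_seconds']:.4f}s")
```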
predicting_outcomes_in_heart_failure/modeling/train.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import dagshub
8
+ from imblearn.over_sampling import RandomOverSampler
9
+ import joblib
10
+ from loguru import logger
11
+ import mlflow
12
+ import pandas as pd
13
+ from sklearn.model_selection import GridSearchCV, StratifiedKFold
14
+
15
+ from predicting_outcomes_in_heart_failure.config import (
16
+ CONFIG_DT,
17
+ CONFIG_LR,
18
+ CONFIG_RF,
19
+ DATASET_NAME,
20
+ EXPERIMENT_NAME,
21
+ MODELS_DIR,
22
+ N_SPLITS,
23
+ PROCESSED_DATA_DIR,
24
+ RANDOM_STATE,
25
+ REPO_NAME,
26
+ REPO_OWNER,
27
+ REPORTS_DIR,
28
+ SCORING,
29
+ TARGET_COL,
30
+ VALID_MODELS,
31
+ VALID_VARIANTS,
32
+ )
33
+
34
+ REFIT = "f1"
35
+
36
+
37
+ def load_split(path: Path) -> pd.DataFrame:
38
+ if not path.exists():
39
+ logger.error(f"Missing split file: {path}. Run split_data.py first.")
40
+ raise FileNotFoundError(path)
41
+ df = pd.read_csv(path)
42
+ logger.info(f"Loaded {path} (rows={len(df)}, cols={df.shape[1]})")
43
+ return df
44
+
45
+
46
+ def apply_random_oversampling(
47
+ X: pd.DataFrame,
48
+ y: pd.Series,
49
+ model_name: str,
50
+ variant: str,
51
+ ):
52
+ """Apply RandomOverSampler to balance classes in the training set."""
53
+ logger.info(f"[{variant} | {model_name}] Applying RandomOverSampler on training data...")
54
+
55
+ # Log original class distribution
56
+ orig_counts = y.value_counts().to_dict()
57
+ logger.info(f"[{variant} | {model_name}] Original class distribution: {orig_counts}")
58
+
59
+ ros = RandomOverSampler(random_state=RANDOM_STATE)
60
+ X_res, y_res = ros.fit_resample(X, y)
61
+
62
+ # Log resampled class distribution
63
+ res_counts = y_res.value_counts().to_dict()
64
+ logger.info(f"[{variant} | {model_name}] Resampled class distribution: {res_counts}")
65
+
66
+ logger.success(f"[{variant} | {model_name}] RandomOverSampler applied successfully.")
67
+ return X_res, y_res
68
+
69
+
70
+ def get_model_and_grid(model_name: str):
71
+ """Return estimator and parameter grid for the selected model."""
72
+ if model_name == "decision_tree":
73
+ from sklearn.tree import DecisionTreeClassifier
74
+
75
+ estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
76
+ param_grid = CONFIG_DT
77
+ return estimator, param_grid
78
+
79
+ elif model_name == "logreg":
80
+ from sklearn.linear_model import LogisticRegression
81
+
82
+ estimator = LogisticRegression(max_iter=500, random_state=RANDOM_STATE)
83
+ param_grid = CONFIG_LR
84
+ return estimator, param_grid
85
+
86
+ elif model_name == "random_forest":
87
+ from sklearn.ensemble import RandomForestClassifier
88
+
89
+ estimator = RandomForestClassifier(random_state=RANDOM_STATE)
90
+ param_grid = CONFIG_RF
91
+ return estimator, param_grid
92
+
93
+ else:
94
+ raise ValueError(f"Unknown model_name: {model_name}")
95
+
96
+
97
+ def run_grid_search(
98
+ estimator,
99
+ param_grid,
100
+ X_train,
101
+ y_train,
102
+ model_name: str,
103
+ variant: str,
104
+ reports_dir: Path,
105
+ ):
106
+ """Run GridSearchCV for the specified model and log CV results."""
107
+ cv = StratifiedKFold(
108
+ n_splits=N_SPLITS,
109
+ shuffle=True,
110
+ random_state=RANDOM_STATE,
111
+ )
112
+ grid = GridSearchCV(
113
+ estimator=estimator,
114
+ param_grid=param_grid,
115
+ scoring=SCORING,
116
+ refit=REFIT,
117
+ cv=cv,
118
+ n_jobs=-1,
119
+ verbose=1,
120
+ return_train_score=True,
121
+ )
122
+
123
+ logger.info(f"[{variant} | {model_name}] Starting GridSearchCV …")
124
+ grid.fit(X_train, y_train)
125
+
126
+ logger.success(f"[{variant} | {model_name}] GridSearchCV completed.")
127
+ logger.info(f"[{variant} | {model_name}] Best params ({REFIT}): {grid.best_params_}")
128
+ logger.info(f"[{variant} | {model_name}] Best CV {REFIT}: {grid.best_score_:.4f}")
129
+
130
+ cv_results_path = reports_dir / "cv_results.csv"
131
+ df = pd.DataFrame(grid.cv_results_)
132
+ df.to_csv(cv_results_path, index=False)
133
+
134
+ mlflow.log_artifact(str(cv_results_path))
135
+ return grid.best_estimator_, grid, grid.best_params_
136
+
137
+
138
+ def save_artifacts(
139
+ model,
140
+ grid,
141
+ X_train,
142
+ model_name: str,
143
+ variant: str,
144
+ model_dir: Path,
145
+ reports_dir: Path,
146
+ ) -> None:
147
+ """Save model, parameters, and metadata to disk and MLflow."""
148
+ model_dir.mkdir(parents=True, exist_ok=True)
149
+ reports_dir.mkdir(parents=True, exist_ok=True)
150
+
151
+ model_path = model_dir / f"{model_name}.joblib"
152
+ joblib.dump(model, model_path)
153
+ logger.success(f"[{variant} | {model_name}] Saved model → {model_path}")
154
+
155
+ out = {
156
+ "model_name": model_name,
157
+ "data_variant": variant,
158
+ "cv": {
159
+ "refit": REFIT,
160
+ "best_score": getattr(grid, "best_score_", None),
161
+ "best_params": getattr(grid, "best_params_", None),
162
+ "scoring": list(SCORING.keys()),
163
+ "n_splits": N_SPLITS,
164
+ "random_state": RANDOM_STATE,
165
+ },
166
+ "features": list(X_train.columns),
167
+ }
168
+
169
+ cv_params_path = reports_dir / "cv_parameters.json"
170
+ with open(cv_params_path, "w", encoding="utf-8") as f:
171
+ json.dump(out, f, indent=4)
172
+
173
+ mlflow.log_artifact(str(cv_params_path))
174
+ logger.success(f"[{variant} | {model_name}] Saved artifacts.")
175
+
176
+
177
+ def train(model_name: str, variant: str):
178
+ """Train a model for a specific dataset variant and log results to MLflow."""
179
+ experiment_name = f"{EXPERIMENT_NAME}_{variant}"
180
+ if not mlflow.get_experiment_by_name(experiment_name):
181
+ mlflow.create_experiment(experiment_name)
182
+ mlflow.set_experiment(experiment_name)
183
+
184
+ train_path = PROCESSED_DATA_DIR / variant / "train.csv"
185
+ run_name = f"{model_name}_{variant}"
186
+
187
+ logger.info(f"=== Training started (model={model_name}, variant={variant}) ===")
188
+
189
+ with mlflow.start_run(run_name=run_name):
190
+ train_df = load_split(train_path)
191
+
192
+ rawdata = mlflow.data.from_pandas(train_df, name=f"{DATASET_NAME}_{variant}")
193
+ mlflow.log_input(rawdata, context="training")
194
+
195
+ X_train = train_df.drop(columns=[TARGET_COL])
196
+ y_train = train_df[TARGET_COL].astype(int)
197
+
198
+ X_train, y_train = apply_random_oversampling(
199
+ X_train,
200
+ y_train,
201
+ model_name=model_name,
202
+ variant=variant,
203
+ )
204
+
205
+ estimator, param_grid = get_model_and_grid(model_name)
206
+ mlflow.set_tag("estimator_name", estimator.__class__.__name__)
207
+ mlflow.set_tag("data_variant", variant)
208
+ mlflow.log_param("data_variant", variant)
209
+
210
+ model_dir = MODELS_DIR / variant
211
+ reports_dir = REPORTS_DIR / variant / model_name
212
+ reports_dir.mkdir(parents=True, exist_ok=True)
213
+
214
+ best_model, grid, params = run_grid_search(
215
+ estimator,
216
+ param_grid,
217
+ X_train,
218
+ y_train,
219
+ model_name=model_name,
220
+ variant=variant,
221
+ reports_dir=reports_dir,
222
+ )
223
+ mlflow.log_params(params)
224
+
225
+ save_artifacts(
226
+ best_model,
227
+ grid,
228
+ X_train,
229
+ model_name=model_name,
230
+ variant=variant,
231
+ model_dir=model_dir,
232
+ reports_dir=reports_dir,
233
+ )
234
+
235
+ logger.success(f"=== Training completed (model={model_name}, variant={variant}) ===")
236
+
237
+
238
+ def main():
239
+ parser = argparse.ArgumentParser()
240
+ parser.add_argument(
241
+ "--variant",
242
+ type=str,
243
+ choices=VALID_VARIANTS,
244
+ required=True,
245
+ help="Data variant to use: all, female, male, or nosex.",
246
+ )
247
+ parser.add_argument(
248
+ "--model",
249
+ type=str,
250
+ choices=VALID_MODELS,
251
+ required=True,
252
+ help="Model to train: logreg, random_forest, or decision_tree.",
253
+ )
254
+ args = parser.parse_args()
255
+
256
+ dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
257
+ train(args.model, args.variant)
258
+
259
+
260
+ if __name__ == "__main__":
261
+ main()
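Training one (model, variant) pair runs GridSearchCV with stratified 5-fold CV and logs everything to MLflow via DagsHub, so credentials are required. A usage sketch, including loading the persisted best estimator afterwards:

```python
# Shell invocation:
#   python -m predicting_outcomes_in_heart_failure.modeling.train --variant nosex --model random_forest
import joblib

from predicting_outcomes_in_heart_failure.config import MODELS_DIR

# After training, the refit best estimator is persisted per variant:
model = joblib.load(MODELS_DIR / "nosex" / "random_forest.joblib")
print(type(model).__name__)
```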
pyproject.toml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["flit_core >=3.2,<4"]
3
+ build-backend = "flit_core.buildapi"
4
+
5
+ [project]
6
+ name = "predicting_outcomes_in_heart_failure"
7
+ version = "0.0.1"
8
+ description = "This project develops a predictive pipeline for patient outcome prediction in heart failure, using a publicly available dataset of clinical records. The goal is to design and evaluate machine learning models within a reproducible workflow that can be integrated into larger systems for clinical decision support. The workflow addresses data heterogeneity, defines consistent preprocessing and feature engineering strategies, and explores alternative modeling approaches with systematic evaluation using clinically relevant metrics. It also emphasizes model transparency and auditability, ensuring that the resulting pipeline can be deployed as a reliable, adaptable software component in healthcare applications. The project aims not only to improve baseline predictive performance but also to demonstrate how data-driven models can be effectively integrated into end-to-end AI-enabled healthcare systems."
9
+ authors = [
10
+ { name = "CardioTrack" },
11
+ ]
12
+
13
+ readme = "README.md"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+
17
+ ]
18
+ dependencies = [
19
+ "asttokens>=3.0.0",
20
+ "dagshub>=0.6.3",
21
+ "gradio>=6.0.2",
22
+ "great-expectations>=1.9.0",
23
+ "httpx>=0.28.1",
24
+ "imbalanced-learn>=0.14.0",
25
+ "ipykernel>=7.1.0",
26
+ "kagglehub>=0.3.13",
27
+ "loguru",
28
+ "matplotlib>=3.10.7",
29
+ "mkdocs",
30
+ "mlflow==2.22.0",
31
+ "numpy>=2.3.4",
32
+ "pandas>=2.3.3",
33
+ "pip",
34
+ "pytest",
35
+ "python-dotenv",
36
+ "ruff",
37
+ "scikit-learn>=1.7.2",
38
+ "seaborn>=0.13.2",
39
+ "shap>=0.50.0",
40
+ "tqdm",
41
+ "typer",
42
+ ]
43
+ requires-python = "~=3.11.0"
44
+
45
+
46
+ [tool.ruff]
47
+ target-version = "py311"
48
+ line-length = 99
49
+ src = [
50
+ "predicting_outcomes_in_heart_failure",
51
+ "tests",
52
+ ]
53
+ include = ["pyproject.toml", "predicting_outcomes_in_heart_failure/**/*.py", "tests/**/*.py",]
54
+ extend-exclude = [
55
+ ".git/", ".venv/", ".ruff_cache/", ".mypy_cache/",
56
+ "data/", "artifacts/", "mlruns/", "notebooks/.ipynb_checkpoints/",
57
+ ]
58
+
59
+
60
+ [tool.ruff.lint]
61
+ # Enable sets of rules: basic style, static errors, import, upgrade, bugbear, simplify
62
+ select = ["E", "F", "I", "UP", "B", "SIM"]
63
+ ignore = ["E203"] # compatibility with slicing and formatter
64
+ per-file-ignores = {"tests/**" = ["S101", "D"], "notebooks/**" = ["E402", "F401", "D"]}
65
+
66
+ [tool.ruff.lint.isort]
67
+ force-sort-within-sections = true
68
+
69
+ [tool.ruff.format]
70
+ quote-style = "double"
71
+ indent-style = "space"
72
+ docstring-code-format = true # format code inside docstrings
73
+
74
+
75
+ [dependency-groups]
76
+ dev = [
77
+ "pynblint>=0.1.6",
78
+ "ruff>=0.14.2",
79
+ ]
80
+