Martinacap02 committed on
Commit
f7d11f7
·
0 Parent(s):

Init deploy branch for HF Space

Files changed (50)
  1. .dockerignore +2 -0
  2. .dvc/.gitignore +3 -0
  3. .dvc/config +6 -0
  4. .dvcignore +3 -0
  5. .github/workflows/pynblint.yml +67 -0
  6. .github/workflows/pytestAndGX.yml +60 -0
  7. .github/workflows/ruff-linter.yml +108 -0
  8. .gitignore +191 -0
  9. Dockerfile +41 -0
  10. Makefile +87 -0
  11. README.md +239 -0
  12. data/.gitignore +3 -0
  13. data/README.md +176 -0
  14. docs/.gitkeep +0 -0
  15. docs/CardioTrack_ML_Canvas.md +98 -0
  16. docs/Risk_Classification.md +101 -0
  17. dvc.lock +754 -0
  18. dvc.yaml +86 -0
  19. metrics/test/all/.gitignore +3 -0
  20. metrics/test/female/.gitignore +3 -0
  21. metrics/test/male/.gitignore +3 -0
  22. metrics/test/nosex/.gitignore +3 -0
  23. models/.gitignore +3 -0
  24. models/README.md +110 -0
  25. models/all/.gitignore +3 -0
  26. models/female/.gitignore +3 -0
  27. models/male/.gitignore +3 -0
  28. models/nosex/.gitignore +3 -0
  29. notebooks/.gitkeep +0 -0
  30. notebooks/1.0-mc-initial-data-exploration.ipynb +0 -0
  31. predicting_outcomes_in_heart_failure/__init__.py +1 -0
  32. predicting_outcomes_in_heart_failure/app/__init__.py +0 -0
  33. predicting_outcomes_in_heart_failure/app/main.py +183 -0
  34. predicting_outcomes_in_heart_failure/app/routers/cards.py +52 -0
  35. predicting_outcomes_in_heart_failure/app/routers/general.py +19 -0
  36. predicting_outcomes_in_heart_failure/app/routers/model_info.py +95 -0
  37. predicting_outcomes_in_heart_failure/app/routers/prediction.py +135 -0
  38. predicting_outcomes_in_heart_failure/app/schema.py +28 -0
  39. predicting_outcomes_in_heart_failure/app/utils.py +59 -0
  40. predicting_outcomes_in_heart_failure/app/wrapper.py +210 -0
  41. predicting_outcomes_in_heart_failure/config.py +129 -0
  42. predicting_outcomes_in_heart_failure/data/dataset.py +22 -0
  43. predicting_outcomes_in_heart_failure/data/preprocess.py +116 -0
  44. predicting_outcomes_in_heart_failure/data/split_data.py +114 -0
  45. predicting_outcomes_in_heart_failure/modeling/__init__.py +0 -0
  46. predicting_outcomes_in_heart_failure/modeling/evaluate.py +182 -0
  47. predicting_outcomes_in_heart_failure/modeling/explainability.py +202 -0
  48. predicting_outcomes_in_heart_failure/modeling/predict.py +135 -0
  49. predicting_outcomes_in_heart_failure/modeling/train.py +261 -0
  50. pyproject.toml +80 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ venv/
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /config.local
+ /tmp
+ /cache
.dvc/config ADDED
@@ -0,0 +1,6 @@
+ [core]
+     remote = origin
+
+ [remote "origin"]
+     url = s3://dvc
+     endpointurl = https://dagshub.com/se4ai2526-uniba/CardioTrack.s3
.dvcignore ADDED
@@ -0,0 +1,3 @@
+ # Add patterns of files dvc should ignore, which could improve
+ # the performance. Learn more at
+ # https://dvc.org/doc/user-guide/dvcignore
.github/workflows/pynblint.yml ADDED
@@ -0,0 +1,67 @@
+ name: Lint Notebooks (Pynblint)
+
+ on:
+   pull_request:
+     paths:
+       - 'notebooks/**/*.ipynb'
+
+ permissions:
+   contents: read
+
+ jobs:
+   pynblint:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Install uv (for uvx)
+         uses: astral-sh/setup-uv@v3
+
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml','uv.lock') }}
+           restore-keys: |
+             uv-${{ runner.os }}-
+
+       - name: Run pynblint on notebooks directory
+         id: pynblint
+         run: |
+           set -e
+           echo "Running pynblint via uvx (isolated & pinned)..."
+           uvx --from pynblint \
+             --with "click<8.2" \
+             --with "typer<0.12" \
+             --with "lxml[html_clean]" \
+             pynblint notebooks/ > pynblint_report.txt 2>&1 || true
+           cat pynblint_report.txt
+
+       - name: Check for violations
+         run: |
+           if grep -qiE "Traceback|ImportError|ModuleNotFoundError" pynblint_report.txt; then
+             echo "❌ pynblint error."
+             cat pynblint_report.txt
+             exit 1
+           elif grep -q "LINTING RESULTS" pynblint_report.txt; then
+             echo "⚠️ Pynblint found violations in notebooks"
+             echo "violations=true" >> $GITHUB_ENV
+             cat pynblint_report.txt
+             exit 1
+           else
+             echo "✅ No violations found"
+             echo "violations=false" >> $GITHUB_ENV
+           fi
+
+       - name: Upload pynblint report
+         if: always()
+         uses: actions/upload-artifact@v4
+         with:
+           name: pynblint-report
+           path: pynblint_report.txt
+           retention-days: 30
.github/workflows/pytestAndGX.yml ADDED
@@ -0,0 +1,60 @@
+ name: Pytest and GX Validation
+ on:
+   pull_request:
+     branches-ignore:
+       - main
+
+ permissions:
+   contents: read
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       # Install uv and activate cache
+       - uses: astral-sh/setup-uv@v3
+
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+       # Install all dependencies
+       - name: Sync dependencies
+         run: uv sync
+
+       # Install DVC
+       - name: Install DVC
+         run: |
+           uv pip install "dvc-s3" "boto3>=1.36.0" "botocore>=1.36.0"
+
+       - name: Configure DVC credentials
+         run: |
+           uv run dvc remote modify origin --local access_key_id ${{ secrets.DAGSHUB_TOKEN }}
+           uv run dvc remote modify origin --local secret_access_key ${{ secrets.DAGSHUB_TOKEN }}
+
+       - name: Download data and models from DagsHub
+         run: uv run dvc pull
+
+       # Run pytest tests
+       - name: Run pytest tests
+         run: |
+           set -euo pipefail
+           echo "Running pytest tests..."
+           uv run pytest tests/ -v --tb=short
+
+       # Run GX validation scripts
+       - name: Run GX validation scripts
+         run: |
+           set -euo pipefail
+           echo "Running GX validation scripts..."
+           uv run python tests/test_heart_data/raw_test.py
+           uv run python tests/test_heart_data/processed_test.py
.github/workflows/ruff-linter.yml ADDED
@@ -0,0 +1,108 @@
+ name: Lint (Ruff)
+ on:
+   pull_request:
+
+ permissions:
+   contents: write
+
+ jobs:
+   ruff-check:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           ref: ${{ github.head_ref }}
+
+       # Install uv and activate cache
+       - uses: astral-sh/setup-uv@v3
+       - name: Cache uv
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/uv
+           key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'uv.lock') }}
+
+       # Install dev deps
+       - name: Sync dev deps
+         run: uv sync --dev
+
+       # Find changed .py files
+       - name: Ruff on changed files (format then check)
+         id: ruff_check_changed_files
+         run: |
+           set -euo pipefail
+           BASE_REF="${{ github.base_ref }}"
+           git fetch --no-tags origin "$BASE_REF" --prune
+
+           git diff --name-only --diff-filter=ACMRT "origin/$BASE_REF...HEAD" > /tmp/changed_all.txt
+
+           CHANGED=$(grep -E '\.py$' /tmp/changed_all.txt || true)
+           if [ -z "$CHANGED" ]; then
+             echo "No modified .py files: skipping Ruff."
+             echo "has_py_changes=false" >> $GITHUB_OUTPUT
+             exit 0
+           fi
+
+           echo "Modified Python files:"
+           echo "$CHANGED" | sed 's/^/ - /'
+
+           echo "$CHANGED" > /tmp/changed_py.txt
+           echo "has_py_changes=true" >> $GITHUB_OUTPUT
+
+       # Autofix
+       - name: Ruff autofix on changed files
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true'
+         id: ruff_autofix
+         run: |
+           set -euo pipefail
+
+           CHANGED=$(cat /tmp/changed_py.txt)
+
+           echo "Ruff autofix on these files:"
+           echo "$CHANGED" | sed 's/^/ - /'
+
+           # Format
+           uv run ruff format $CHANGED || true
+
+           # Lint with autofix
+           uv run ruff check --fix $CHANGED || true
+
+           if uv run ruff check $CHANGED 2>&1 | tee /tmp/ruff_check_result.txt; then
+             echo "No remaining issues"
+             echo "has_remaining_issues=false" >> $GITHUB_OUTPUT
+           else
+             echo "Found remaining issues that cannot be auto-fixed"
+             echo "has_remaining_issues=true" >> $GITHUB_OUTPUT
+           fi
+
+       # Commit on pull request
+       - name: Commit and push changes
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true'
+         run: |
+           set -euo pipefail
+
+           if [ -z "$(git status --porcelain)" ]; then
+             echo "No changes to commit after Ruff autofix."
+             exit 0
+           fi
+
+           git config user.name "github-actions[bot]"
+           git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+           git add .
+           git commit -m "chore: apply ruff check and format auto-fix"
+           git push origin HEAD:${{ github.head_ref }}
+
+       # Fail if there are remaining issues
+       - name: Fail if remaining issues
+         if: steps.ruff_check_changed_files.outputs.has_py_changes == 'true' && steps.ruff_autofix.outputs.has_remaining_issues == 'true'
+         run: |
+           echo "Found errors that cannot be auto-fixed:"
+           cat /tmp/ruff_check_result.txt
+           exit 1
.gitignore ADDED
@@ -0,0 +1,191 @@
+ # Data
+ /data/raw/heart.csv
+
+ # Mac OS-specific storage files
+ .DS_Store
+
+ # vim
+ *.swp
+ *.swo
+
+ ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # MkDocs documentation
+ docs/site/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # pixi.lock should be committed to version control for reproducibility
+ # .pixi/ contains the environments and should not be committed
+ .pixi/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
Dockerfile ADDED
@@ -0,0 +1,41 @@
+ FROM python:3.11.9-slim-bookworm
+
+ # avoid creating unnecessary .pyc, buffers, and pip caches
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PIP_NO_CACHE_DIR=1
+ ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+
+ # install curl and certificates needed to install uv
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     curl ca-certificates \
+     && rm -rf /var/lib/apt/lists/*
+
+ # create a non-root user for added security
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /cardioTrack
+
+ # install uv as user
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ # copy the project files and do uv sync
+ COPY --chown=user pyproject.toml uv.lock ./
+ RUN uv sync --locked --no-install-project
+
+ # copy the rest of the files needed for inference
+ COPY --chown=user predicting_outcomes_in_heart_failure ./predicting_outcomes_in_heart_failure
+ COPY --chown=user models/nosex/random_forest.joblib ./models/nosex/random_forest.joblib
+ COPY --chown=user reports/nosex/random_forest/cv_parameters.json ./reports/nosex/random_forest/cv_parameters.json
+ COPY --chown=user data/interim/preprocess_artifacts/scaler.joblib ./data/interim/preprocess_artifacts/scaler.joblib
+ COPY --chown=user metrics/test/nosex/random_forest.json ./metrics/test/nosex/random_forest.json
+ COPY --chown=user README.md ./README.md
+ COPY --chown=user models/README.md ./models/README.md
+ COPY --chown=user data/README.md ./data/README.md
+
+ EXPOSE 7860
+
+ CMD ["uv", "run", "uvicorn", "predicting_outcomes_in_heart_failure.app.main:app", "--host", "0.0.0.0", "--port", "7860"]
Makefile ADDED
@@ -0,0 +1,87 @@
+ #################################################################################
+ # GLOBALS                                                                       #
+ #################################################################################
+
+ PROJECT_NAME = CardioTrack
+ PYTHON_VERSION = 3.11
+ PYTHON_INTERPRETER = python
+
+ #################################################################################
+ # COMMANDS                                                                      #
+ #################################################################################
+
+ ## Install Python dependencies
+ .PHONY: requirements
+ requirements:
+ 	uv sync
+
+ ## Delete all compiled Python files
+ .PHONY: clean
+ clean:
+ 	find . -type f -name "*.py[co]" -delete
+ 	find . -type d -name "__pycache__" -delete
+
+ ## Lint using ruff (use `make format` to do formatting)
+ .PHONY: lint
+ lint:
+ 	ruff format --check
+ 	ruff check
+
+ ## Format source code with ruff
+ .PHONY: format
+ format:
+ 	ruff check --fix
+ 	ruff format
+
+ ## Run tests
+ .PHONY: test
+ test:
+ 	python -m pytest tests
+
+ ## Set up Python interpreter environment
+ .PHONY: create_environment
+ create_environment:
+ 	uv venv --python $(PYTHON_VERSION)
+ 	@echo ">>> New uv virtual environment created. Activate with:"
+ 	@echo ">>> Windows: .\\\\.venv\\\\Scripts\\\\activate"
+ 	@echo ">>> Unix/macOS: source ./.venv/bin/activate"
+
+ #################################################################################
+ # PROJECT RULES                                                                 #
+ #################################################################################
+
+ ## Make dataset
+ .PHONY: data
+ data: requirements
+ 	$(PYTHON_INTERPRETER) predicting_outcomes_in_heart_failure/dataset.py
+
+ #################################################################################
+ # Self Documenting Commands                                                     #
+ #################################################################################
+
+ .DEFAULT_GOAL := help
+
+ define PRINT_HELP_PYSCRIPT
+ import re, sys; \
+ lines = '\n'.join([line for line in sys.stdin]); \
+ matches = re.findall(r'\n## (.*)\n[\s\S]+?\n([a-zA-Z_-]+):', lines); \
+ print('Available rules:\n'); \
+ print('\n'.join(['{:25}{}'.format(*reversed(match)) for match in matches]))
+ endef
+ export PRINT_HELP_PYSCRIPT
+
+ help:
+ 	@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
README.md ADDED
@@ -0,0 +1,239 @@
+ ---
+ title: CardioTrack API
+ emoji: ❤️
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ app_port: 7860
+ ---
+ # Predicting Outcomes in Heart Failure
+
+ ## Table of Contents
+ 1. [Project Overview](#project-overview)
+ 2. [Project Organization](#project-organization)
+ 3. [DVC Pipeline Defined](#dvc-pipeline-defined)
+ 4. [Milestones Summary](#milestones-summary)
+     - [Milestone 1 - Inception](#milestone-1---inception)
+     - [Milestone 2 - Reproducibility](#milestone-2---reproducibility)
+     - [Milestone 3 - Quality Assurance](#milestone-3---quality-assurance)
+     - [Milestone 4 - API Integration](#milestone-4---api-integration)
+
+ ## Project Overview
+ <a target="_blank" href="https://cookiecutter-data-science.drivendata.org/">
+     <img src="https://img.shields.io/badge/CCDS-Project%20template-328F97?logo=cookiecutter" />
+ </a>
+
+ This project develops a predictive pipeline for patient outcome prediction in heart failure, using a publicly available dataset of clinical records. The goal is to design and evaluate machine learning models within a reproducible workflow that can be integrated into larger systems for clinical decision support. The workflow addresses data heterogeneity, defines consistent preprocessing and feature engineering strategies, and explores alternative modeling approaches with systematic evaluation using clinically relevant metrics. It also emphasizes model transparency and auditability, ensuring that the resulting pipeline can be deployed as a reliable, adaptable software component in healthcare applications. The project aims not only to improve baseline predictive performance but also to demonstrate how data-driven models can be effectively integrated into end-to-end AI-enabled healthcare systems.
+
+ ## Project Organization
+
+ ```
+ ├── LICENSE            <- Open-source license if one is chosen
+ ├── Makefile           <- Makefile with convenience commands like `make data` or `make train`
+ ├── README.md          <- The top-level README for developers using this project.
+ ├── data
+ │   ├── external       <- Data from third party sources.
+ │   ├── interim        <- Intermediate data that has been transformed.
+ │   ├── processed      <- The final, canonical data sets for modeling.
+ │   └── raw            <- The original, immutable data dump.
+
+ ├── docs               <- A default mkdocs project; see www.mkdocs.org for details
+
+ ├── models             <- Trained and serialized models, model predictions, or model summaries
+
+ ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+ │                         the creator's initials, and a short `-` delimited description, e.g.
+ │                         `1.0-jqp-initial-data-exploration`.
+
+ ├── pyproject.toml     <- Project configuration file with package metadata for
+ │                         predicting_outcomes_in_heart_failure and configuration for tools like black
+
+ ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
+
+ ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
+ │   └── figures        <- Generated graphics and figures to be used in reporting
+
+ ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+ │                         generated with `pip freeze > requirements.txt`
+
+ ├── setup.cfg          <- Configuration file for flake8
+
+ └── predicting_outcomes_in_heart_failure   <- Source code for use in this project.
+
+     ├── __init__.py    <- Makes predicting_outcomes_in_heart_failure a Python module
+
+     ├── config.py      <- Store useful variables and configuration
+
+     ├── data
+     │   ├── __init__.py
+     │   ├── dataset.py        <- Scripts to download or generate data
+     │   ├── preprocess.py     <- Data preprocessing code
+     │   └── split_data.py     <- Split dataset into train and test code
+
+     ├── features.py    <- Code to create features for modeling
+
+     ├── modeling
+     │   ├── __init__.py
+     │   ├── predict.py        <- Code to run model inference with trained models
+     │   └── train.py          <- Code to train models
+
+     └── plots.py       <- Code to create visualizations
+ ```
+
+ ## DVC Pipeline Defined
+ ```
+ +---------------+
+ | download_data |
+ +---------------+
+         *
+         *
+         *
+ +---------------+
+ | preprocessing |
+ +---------------+
+         *
+         *
+         *
+  +------------+
+  | split_data |
+  +------------+
+      *      *
+      *      *
+      *      *
+ +----------+  *
+ | training |  *
+ +----------+  *
+      *      *
+      *      *
+      *      *
+  +------------+
+  | evaluation |
+  +------------+
+ ```
+
+ ## Milestones Summary
+
+ ### Milestone 1 - Inception
+ During this milestone, the **CCDS Project Template** was used as the foundation for organizing the project.
+ The main conceptual and structural components of the system were defined, following the template guidelines to ensure consistency and traceability.
+
+ Additionally, a **Machine Learning Canvas** has been added in the [`docs/`](./docs) folder.
+ It outlines the model objectives, the data to be used, and the key methodological aspects planned for the next phases of the project.
+
+ ### Milestone 2 - Reproducibility
+ Milestone 2 introduces **reproducibility**, from **data management** to **model training and evaluation**. This includes a fully automated pipeline, experiment tracking, and model registry integration, ensuring every step can be consistently reproduced and monitored.
+
+ #### Exploratory Data Analysis (EDA)
+ As part of the early steps, we added and refined an **Exploratory Data Analysis** to better understand the dataset, its distribution, and the relationships between variables. This helped define the preprocessing and modeling strategies used later.
+
+ #### DVC Initialization and Pipeline Setup
+ We initialized **DVC** and configured a full pipeline to automate the main steps of the ML workflow:
+ - Automatic data **download**
+ - **Preprocessing**
+ - **Data splitting**
+ - **Training** and **evaluation**
+
+ The pipeline is fully reproducible and version-controlled through DVC.
+
+ #### Model Training and Experiment Tracking
+ We implemented the **training scripts** and integrated **MLflow** for experiment tracking.
+ Three models are trained and evaluated within this workflow:
+ - Decision Tree
+ - Random Forest
+ - Logistic Regression
+
+ Each experiment is logged to MLflow.
+
+ #### Model Registry and Thresholds
+ Models that reach or exceed the predefined **performance thresholds** (as defined in the ML Canvas) are automatically **saved to the model registry**.
+
+ ### Milestone 3 - Quality Assurance
+
+ In this milestone, we introduced a **Quality Assurance** layer to the system.
+
+ #### Static Linters
+ Two static linters were added to improve code style and consistency:
+
+ - **Ruff** for Python files in the `predicting_outcomes_in_heart_failure` and `tests` folders.
+   It checks formatting, syntax, and common anti-patterns, and is integrated into the GitHub workflow via an *action*.
+ - **Pynblint** for Jupyter notebooks, also integrated into the GitHub workflow through a dedicated *action*.
+
+ #### Data Quality
+ We implemented **data quality checks** on both raw and processed data using **Great Expectations**.
+ These validations help to:
+
+ - detect anomalies or invalid values at the data source
+ - prevent the propagation of data issues into downstream processes
+
+ #### Code Quality
+ We added automated **unit and integration tests** using **pytest**, covering the main modules and functionalities of the system.
+
+ #### ML Pipeline Enhancements
+ We applied the following enhancements to the ML pipeline:
+
+ - Refactored preprocessing with gender-based dataset variants.
+ - Added validation (e.g., an error on single-row datasets).
+ - Saved the StandardScaler as a preprocessing artifact.
+ - Updated the split logic and the DVC pipeline.
+ - Training now creates variant-specific MLflow experiments.
+ - Added a RandomOverSampler to address class imbalance.
+ - Updated evaluation and inference to align with the new structure.
+
+ #### Explainability
+ We added an explainability module:
+
+ - Added a SHAP explainability module.
+ - Added tests for the explainability functionality.
+
+ #### Risk Classification
+ We added a **Risk Classification** analysis for the system in accordance with **IMDRF** and **AI Act** regulations.
+ The documentation is available in the [`docs/`](./docs) folder.
+
+ ### Milestone 4 - API Integration
+
+ During Milestone 4, we implemented a fully functional API, together with a dataset card and a model card for the champion model and the dataset it uses.
+ The API is structured into four main routers:
+
+ #### **General Router**
+ - **GET /**
+   Returns a welcome message and confirms that the API is running.
+
+ #### **Prediction Router**
+ - **POST /predictions**
+   Generates a binary prediction (0/1) for a single patient sample.
+
+ - **POST /predict-batch**
+   Accepts a list of patient samples and returns a prediction for each element in the batch.
+
+ - **POST /explanations**
+   Produces SHAP-based explanations for a single input and returns the URL of the generated SHAP waterfall plot.
+
+ #### **Model Info Router**
+ - **GET /model/hyperparameters**
+   Returns the hyperparameters and cross-validation results of the model defined in `MODEL_PATH`.
+
+ - **GET /model/metrics**
+   Returns the test-set metrics stored during the model evaluation stage.
+
+ #### **Cards Router**
+ - **GET /card/{card_type}**
+   Returns the content of a “card” file (dataset card or model card).
+
+ ### **Cards**
+
+ During this milestone, we also created:
+
+ - a **dataset card** describing the dataset used by the champion model
+ - a **model card** documenting the champion model itself
+
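
As a quick illustration of the Prediction Router described in this README, here is a minimal client sketch. It assumes the service is running locally on port 7860 (as exposed by the Dockerfile), that the deployed variant is the `nosex` model (so the payload omits `Sex`), and that the request fields mirror the dataset card; the authoritative request schema lives in `predicting_outcomes_in_heart_failure/app/schema.py` and may differ.

```python
# Minimal client sketch for the CardioTrack API. Assumptions: host/port from the
# Dockerfile, payload fields guessed from the dataset card (Sex omitted because
# the deployed model is the "nosex" variant) -- check app/schema.py for the
# authoritative request schema.
import requests

BASE_URL = "http://localhost:7860"

sample = {  # hypothetical single-patient payload
    "Age": 54,
    "ChestPainType": "ASY",
    "RestingBP": 140,
    "Cholesterol": 239,
    "FastingBS": 0,
    "RestingECG": "Normal",
    "MaxHR": 160,
    "ExerciseAngina": "N",
    "Oldpeak": 1.2,
    "ST_Slope": "Flat",
}

# Single prediction: POST /predictions returns a binary 0/1 outcome.
resp = requests.post(f"{BASE_URL}/predictions", json=sample, timeout=30)
resp.raise_for_status()
print(resp.json())

# Batch prediction: POST /predict-batch accepts a list of samples and
# returns one prediction per element.
resp = requests.post(f"{BASE_URL}/predict-batch", json=[sample, sample], timeout=30)
resp.raise_for_status()
print(resp.json())
```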
data/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /processed
+ /interim
+ /raw
data/README.md ADDED
@@ -0,0 +1,176 @@
+ # Dataset Card
+
+ ## Table of Contents
+ - [Dataset Description](#dataset-description)
+   - [Dataset Summary](#dataset-summary)
+   - [Supported Tasks](#supported-tasks)
+   - [Languages](#languages)
+ - [Dataset Structure](#dataset-structure)
+   - [Data Instances](#data-instances)
+   - [Data Fields](#data-fields)
+ - [Dataset Creation](#dataset-creation)
+   - [Source Data](#source-data)
+   - [Annotations](#annotations)
+   - [Personal and Sensitive Information](#personal-and-sensitive-information)
+ - [Considerations for Using the Data](#considerations-for-using-the-data)
+   - [Social Impact of Dataset](#social-impact-of-dataset)
+   - [Discussion of Biases](#discussion-of-biases)
+ - [Additional Information](#additional-information)
+   - [Dataset Curators](#dataset-curators)
+   - [Citation Information](#citation-information)
+
+ ## Dataset Description
+
+ - **Homepage:** https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
+
+ ### Dataset Summary
+
+ This dataset contains anonymized clinical data used to predict the risk of heart failure.
+ It includes **918 patient records**, **11 clinical features**, and **one target variable**.
+ The original dataset was downloaded from Kaggle and was created by merging five well-known cardiology datasets.
+
+ The version used in this project underwent additional preprocessing steps, including standardization, normalization, categorical encoding, and removal of the Sex feature. The resulting dataset is used for experimentation and model development.
+
+ ### Supported Tasks
+
+ This dataset can be used for a variety of machine learning tasks, including:
+
+ - **Binary Classification**
+
+   Predicting whether a patient has heart disease.
+ - **Risk Scoring / Clinical Risk Stratification**
+
+   Estimating cardiac risk based on clinical variables.
+ - **Explainable AI (XAI)**
+
+   Useful for feature-importance analysis and interpretability.
+
+ ### Languages
+
+ English **(en)**
+
+ ## Dataset Structure
+
+ ### Data Instances
+
+ Each instance represents one patient. Example:
+
+ | Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease |
+ |-----|-----|---------------|-----------|-------------|-----------|------------|-------|----------------|---------|----------|--------------|
+ | 54  | M   | ASY           | 140       | 239         | 0         | Normal     | 160   | N              | 1.2     | Flat     | 1            |
+
+ ### Data Fields
+
+ | Field          | Type     | Description                                        |
+ |----------------|----------|----------------------------------------------------|
+ | Age            | int      | Patient age in years                               |
+ | Sex            | binary   | Patient sex (M = male, F = female)                 |
+ | ChestPainType  | category | Chest pain type (TA, ATA, NAP, ASY)                |
+ | RestingBP      | int      | Resting blood pressure (mm Hg)                     |
+ | Cholesterol    | int      | Serum cholesterol (mg/dL)                          |
+ | FastingBS      | binary   | Fasting blood sugar (1 if >120 mg/dL, 0 otherwise) |
+ | RestingECG     | category | Resting ECG results (Normal, ST, LVH)              |
+ | MaxHR          | int      | Maximum heart rate achieved                        |
+ | ExerciseAngina | binary   | Exercise-induced angina (Y/N)                      |
+ | Oldpeak        | float    | ST depression relative to rest                     |
+ | ST_Slope       | category | Slope of the ST segment (Up, Flat, Down)           |
+ | HeartDisease   | binary   | Target variable (1 = disease, 0 = no disease)      |
+
+ ## Dataset Creation
+
+ ### Source Data
+
+ The preprocessed dataset used in this project originates from the Kaggle dataset *“Heart Failure Prediction Dataset”*.
+
+ The raw dataset was created by merging five widely used cardiology datasets:
+
+ - Cleveland (303 samples)
+ - Hungarian (294 samples)
+ - Switzerland (123 samples)
+ - Long Beach VA (200 samples)
+ - Statlog (270 samples)
+
+ The Kaggle author selected the 11 common features and merged the datasets into a unified collection of **1,190 records**, then removed **272 duplicates**, resulting in **918 unique samples**.
+
+ All initial merging and normalization steps were performed by the dataset author on Kaggle.
+
+ ### Annotations
+
+ No manual annotations were added.
+ The target variable `HeartDisease` is already included in the original dataset.
+
+ ### Personal and Sensitive Information
+
+ Although the dataset contains clinical information (sensitive under GDPR), it is fully anonymized:
+
+ - No personal identifiers (name, address, contact details, IDs).
+ - All sources were already anonymized before publication.
+ - No biometric or genetic data are included.
+
+ Thus, while clinically sensitive, the dataset does **not** pose identifiable privacy risks.
+
+ ## Considerations for Using the Data
+
+ ### Social Impact of Dataset
+
+ The dataset can support research and development of models for cardiac risk prediction and early detection.
+
+ However:
+
+ - Models trained on this dataset **must not be used as standalone diagnostic tools**.
+ - They should **not** be the sole basis for clinical decisions.
+ - Misuse in healthcare contexts may lead to incorrect risk assessment.
+
+ ### Discussion of Biases
+
+ This dataset may contain several sources of bias that can affect model performance and fairness:
+
+ - The data comes from multiple hospitals and countries, each with different patient profiles and clinical protocols. Some groups may be underrepresented.
+ - The source datasets used different diagnostic practices and measurement standards, which may introduce noise or inconsistency in labels and clinical values.
+ - Only 11 features are included, omitting other relevant clinical variables. This can cause proxy bias or oversimplification of cardiac risk.
+ - Some datasets are older and may not reflect current medical practices or population characteristics.
+
+ ## Additional Information
+
+ ### Dataset Curators
+
+ The original dataset was created and published by **[fedesoriano](https://www.kaggle.com/fedesoriano)** on Kaggle.
+
+ The preprocessed dataset was curated by the **CardioTrack** team:
+
+ - [Fabrizio Rosmarino](https://github.com/Fabrizio250)
+ - [Martina Capone](https://github.com/Martycap)
+ - [Donato Boccuzzi](https://github.com/donatooooooo)
+
+ Work carried out as part of the *Software Engineering for AI-Enabled Systems* program at the University of Bari.
+
+ ### Citation Information
+
+ If you use this dataset, please cite:
+
+ **Original Dataset**
+ Soriano, F. (2021). *Heart Failure Prediction Dataset*. Kaggle.
+ https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
+
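
To make the field table in this dataset card concrete, the sketch below loads the raw CSV and checks the documented columns and categorical domains with pandas. It is illustrative only: the path matches the DVC-tracked `data/raw/heart.csv`, the column order is assumed to follow the table above, and these checks are not the project's Great Expectations suite.

```python
# Illustrative sanity checks for the raw dataset, based on the dataset card's
# field table. Not the project's Great Expectations suite -- just a sketch.
import pandas as pd

df = pd.read_csv("data/raw/heart.csv")

expected_columns = [
    "Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS",
    "RestingECG", "MaxHR", "ExerciseAngina", "Oldpeak", "ST_Slope", "HeartDisease",
]
assert list(df.columns) == expected_columns, "unexpected column layout"
assert len(df) == 918, "dataset card documents 918 unique records"

# Categorical domains documented in the field table
assert set(df["ChestPainType"].unique()) <= {"TA", "ATA", "NAP", "ASY"}
assert set(df["RestingECG"].unique()) <= {"Normal", "ST", "LVH"}
assert set(df["ST_Slope"].unique()) <= {"Up", "Flat", "Down"}
assert df["ExerciseAngina"].isin(["Y", "N"]).all()
assert df["HeartDisease"].isin([0, 1]).all()
```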
docs/.gitkeep ADDED
File without changes
docs/CardioTrack_ML_Canvas.md ADDED
@@ -0,0 +1,98 @@
+ # **CARDIO TRACK - MACHINE LEARNING CANVAS**
+
+ **Designed for:** Giulio Mallardi
+ **Designed by:** D. Boccuzzi, M. Capone, F. Rosmarino
+ **Date:** 17/10/2025
+ **Iteration:** 2
+
+ ---
+
+ ## **1. Prediction Task**
+
+ The Cardio Track ML system performs a **binary classification** task based on clinical data from individual patients, with the goal of predicting the presence or absence of heart disease.
+ Specifically, the model analyzes each patient’s clinical features and risk factors to estimate the likelihood of developing heart failure.
+
+ There are two possible prediction outcomes:
+ - **Positive:** the patient shows indicators of heart failure.
+ - **Negative:** no signs of disease are detected.
+
+ ---
+
+ ## **2. Decisions**
+
+ The system’s predictions support **cardiologists** and **public health institutions (ASL)**.
+ For positive cases, cardiologists can order further tests, start monitoring, and define personalized treatments.
+ Aggregated results help public health institutions plan resources, prioritize facilities, and promote prevention and lifestyle improvements for long-term cardiovascular health.
+
+ ---
+
+ ## **3. Value Proposition**
+
+ The main end users are **cardiologists** and **local health authorities (ASL)**.
+ For cardiologists, the system provides a reliable tool to assist in the early diagnosis of heart failure.
+ For health authorities, it enables more efficient management of healthcare resources by optimizing the distribution of diagnostic and therapeutic services.
+
+ Overall, the Cardio Track ML system aims to support **prevention** and **early detection** of heart failure, improving patient outcomes and reducing mortality rates.
+
+ ---
+
+ ## **4. Data Collection**
+
+ Data collection will be a **continuous and evolving process**.
+ Real, high-quality clinical data will be carefully labeled and verified by domain experts, ensuring data quality and consistency.
+ New patient data collected through standardized clinical protocols will periodically update and improve the model, allowing it to adapt and learn over time.
+
+ ---
+
+ ## **5. Data Sources**
+
+ The Cardio Track ML system will rely on a **publicly available dataset** that includes clinical parameters from both healthy individuals and patients diagnosed with heart failure.
+ The reference dataset is the [Heart Failure Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction).
+
+ ---
+
+ ## **6. Impact Simulation**
+
+ Before release, the model will undergo rigorous validation on an independent test set.
+
+ As a baseline, minimum thresholds are defined for the key evaluation metrics: **F1-score**, **recall**, and **accuracy** ≥ 0.80, and **ROC-AUC** ≥ 0.85. These values ensure that the model maintains high discriminative capability while minimizing the risk of undetected clinical cases.
+
+ We will assess potential bias across demographic subgroups to ensure fairness and consistent model performance.
+ Any detected bias will be mitigated through rebalancing techniques or threshold adjustment to guarantee equitable treatment across all patient categories.
+
+ ---
+
+ ## **7. Making Predictions**
+
+ Predictions will be made on demand, triggered whenever new or updated clinical data becomes available in the hospital database.
+ Real-time processing is not required, but timely inference will support the decision-making workflow.
+ All computations will be executed **on-premises**, using the existing hospital IT infrastructure to ensure **data privacy** and **security**.
+
+ ---
+
+ ## **8. Building Models**
+
+ The Cardio Track ML system will use a **single main model** in production.
+ Model updates will occur periodically as new data is integrated, or when a new version demonstrates statistically significant improvements in the key metrics: **F1-score**, **recall**, **accuracy**, and **ROC-AUC**.
+
+ Model explainability will be ensured through feature-importance analysis. Feature impact will be quantified and presented in feature-importance charts, allowing medical experts to interpret and validate the relevance of the clinical factors used in the decision process.
+
+ ---
+
+ ## **9. Features**
+
+ The Heart Failure Prediction Dataset already provides a complete set of clinical features, so there is no need to extract them directly from medical exams or diagnostic reports.
+
+ **Included features:**
+ Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, and ST_Slope.
+
+ These features capture key cardiovascular risk factors such as hypertension, diabetes, hyperlipidemia, obesity, and other pre-existing heart conditions, making the dataset suitable for early heart failure diagnosis.
+
+ ---
+
+ ## **10. Monitoring**
+
+ After deployment, system performance will be continuously **monitored** to detect potential drifts or degradations over time.
+ Key metrics include **F1-score**, **recall**, **accuracy**, and **ROC-AUC**, reviewed at regular intervals.
+
+ Clinician feedback will also be collected to assess **usability** and **clinical relevance**, ensuring continuous model improvement and alignment with real-world medical needs.
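
The canvas thresholds in section 6 pair with the model-registry rule from the README's Milestone 2 (models are registered only when they meet the canvas thresholds). Below is a minimal sketch of that gate; the metrics dict shape is an assumption, and the actual gating logic lives in the project's training and evaluation scripts.

```python
# Minimal sketch of the registry gate implied by the canvas: a candidate model
# qualifies only if every key metric reaches its minimum threshold (section 6).
# The metrics dict shape is assumed; the real implementation lives in the
# training/evaluation scripts.

THRESHOLDS = {
    "f1": 0.80,
    "recall": 0.80,
    "accuracy": 0.80,
    "roc_auc": 0.85,
}

def meets_canvas_thresholds(metrics: dict[str, float]) -> bool:
    """Return True when every key metric reaches its minimum threshold."""
    return all(metrics.get(name, 0.0) >= minimum for name, minimum in THRESHOLDS.items())

# Example: this hypothetical candidate passes all four gates.
candidate = {"f1": 0.84, "recall": 0.82, "accuracy": 0.86, "roc_auc": 0.91}
assert meets_canvas_thresholds(candidate)
```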
docs/Risk_Classification.md ADDED
@@ -0,0 +1,101 @@
+ # **Risk Classification**
+
+ ## **1. Purpose**
+
+ This document describes the risk classification of a Software as a Medical Device (SaMD) designed to identify the presence or absence of signs of heart failure through a **binary classification based on clinical data**.
+ The classification is developed using the **IMDRF SaMD Risk Categorization** framework and additional European regulatory references (AI Act, MDR).
+
+ ---
+
+ ## **2. Intended Use**
+
+ The system performs a **binary classification** task based on clinical data from individual patients, aiming to predict the presence or absence of heart disease. Specifically, the model analyzes each patient’s clinical features and risk factors to identify the potential presence of heart failure.
+
+ The model outputs two possible classification results:
+
+ * **Positive:** the patient shows indicators compatible with heart failure.
+ * **Negative:** no signs of the condition are detected.
+
+ ### **2.1 Clinical Role**
+
+ * The software output is intended as a **Clinical Decision Support (CDS)** tool.
+ * The intended user is a **qualified medical professional**.
+ * The software **does not perform diagnosis**, **does not make autonomous therapeutic decisions**, and **is not intended for use in emergency settings**.
+ * The information provided supports, but does not replace, clinical judgement.
+
+ ---
+
+ ## **3. IMDRF SaMD Risk Categorization**
+
+ The IMDRF framework evaluates two key dimensions:
+
+ 1. **The significance of the information provided by the software**
+ 2. **The severity of the clinical condition addressed**
+
+ ### **3.1 Significance of the Information – *Treat/Diagnose***
+
+ **Rationale:**
+
+ * The software provides a **binary risk classification** that may influence clinical decisions such as follow-up, diagnostic investigation, or changes in patient management.
+ * The output goes beyond merely describing clinical status (the “inform” level), contributing instead to medical decision-making.
+ * As the system supports decisions relevant to diagnosis and treatment, it falls within the **Treat/Diagnose** category of the IMDRF framework.
+
+ ### **3.2 Severity of the Clinical Condition – *Serious***
+
+ **Rationale:**
+
+ * Heart failure is a serious medical condition with potentially significant complications.
+ * The system is not intended for emergency use, does not initiate immediate life-saving actions, and operates within routine or preventive clinical care.
+ * The presence of a medical professional mitigates the risk of immediate harm due to software errors.
+ * In the intended use context, the condition is therefore appropriately classified as **Serious**, not “Critical”.
+
+ ### **3.3 IMDRF Classification Result**
+
+ | Significance   | Condition | IMDRF Category |
+ | -------------- | --------- | -------------- |
+ | Treat/Diagnose | Serious   | **III**        |
+
+ ---
+
+ ## **4. AI Act – Probable Classification as High-Risk AI**
+
+ According to **Regulation (EU) 2024/1689 (the AI Act)**, artificial intelligence systems used as medical devices or as components of medical devices regulated under the MDR/IVDR are included among **High-Risk AI Systems**, as listed in Annex III.
+
+ For the system under consideration:
+
+ * it meets the MDR definition of **SaMD**;
+ * it supports clinically relevant decisions;
+ * it may influence patient management concerning a serious medical condition.
+
+ Therefore, the system can be **reasonably considered a High-Risk AI System** under the AI Act.
+ This is not a definitive classification (the formal designation will depend on MDR processes and the final technical documentation), but it represents a consistent regulatory interpretation based on the software’s intended purpose and domain.
+
+ ---
+
+ ## **5. Conclusion**
+
+ The software is classified as:
+
+ * **SaMD, IMDRF Category III**, based on:
+
+   * information of the **Treat/Diagnose** type;
+   * management of a condition categorized as **Serious**.
+
+ This classification does not represent the final MDR class but provides a robust basis for risk assessment and regulatory positioning, including the likely classification as **High-Risk AI** under the AI Act.
+
+ ---
+
+ ## **6. MDR (EU) – Additional Note**
+
+ IMDRF categorization does not directly determine the MDR class but offers a helpful conceptual framework.
+
+ The European MDR classification will be established through:
+
+ * **MDR 2017/745**, Annex VIII
+ * **Rule 11**, specific to medical device software
+ * **MDCG 2019-11**, interpretative guidance
+
+ Based on the system’s functionality (clinical classification supporting diagnosis/prognosis), an assignment of at least **Class IIa** is likely.
+ However, the final class will depend on the risk evaluation and the risk-control measures implemented.
dvc.lock ADDED
@@ -0,0 +1,754 @@
+ schema: '2.0'
+ stages:
+   download_data:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/dataset.py
+     deps:
+     - path: predicting_outcomes_in_heart_failure/data/dataset.py
+       hash: md5
+       md5: 2896ae3ebb48acbbfa118415494c70ef
+       size: 559
+     outs:
+     - path: data/raw/heart.csv
+       hash: md5
+       md5: ab21f2524241ed14b321bcaf40c8b86e
+       size: 35921
+   preprocessing:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/preprocess.py
+     deps:
+     - path: data/raw/heart.csv
+       hash: md5
+       md5: ab21f2524241ed14b321bcaf40c8b86e
+       size: 35921
+     - path: predicting_outcomes_in_heart_failure/data/preprocess.py
+       hash: md5
+       md5: a3586b10fae9eed2183fb516b2298273
+       size: 4328
+     outs:
+     - path: data/interim/preprocess_artifacts/scaler.joblib
+       hash: md5
+       md5: 224537cb262510335c5515d6156952d6
+       size: 1023
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+   split_data:
+     cmd: uv run predicting_outcomes_in_heart_failure/data/split_data.py
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 068ee60f2ab294944d3392ef6aa033e5
+       size: 2262
+     outs:
+     - path: data/processed
+       hash: md5
+       md5: 7809f68c8d582ea4f49135f432bab71f.dir
+       size: 149721
+       nfiles: 2
+   training:
+     cmd: uv run predicting_outcomes_in_heart_failure/modeling/train.py
+     deps:
+     - path: data/processed/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: 6e81f5b43ee6b2698cf3d582a9712c46
+       size: 4811
+     outs:
+     - path: models/decision_tree.joblib
+       hash: md5
+       md5: 2277d8f53277419bf3594d2b2dd6c8b5
+       size: 5561
+     - path: models/logreg.joblib
+       hash: md5
+       md5: bf500ad9a04a83594316bd2571584dc4
+       size: 1519
+     - path: models/random_forest.joblib
+       hash: md5
+       md5: da8670002f66b3fcb7e27ea32c33f14d
+       size: 9919545
+     - path: reports/decision_tree
+       hash: md5
+       md5: 60beca0dca76c93ee85a1874eb68a1e9.dir
+       size: 838814
+       nfiles: 2
+     - path: reports/logreg
+       hash: md5
+       md5: a6b3653e15d6d1b1472d1faeb11bd3cb.dir
+       size: 41389
+       nfiles: 2
+     - path: reports/random_forest
+       hash: md5
+       md5: 06127fd0761a967d67577c88fc428426.dir
+       size: 175536
+       nfiles: 2
+   evaluation:
+     cmd: uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py
+     deps:
+     - path: data/processed/test.csv
+       hash: md5
+       md5: d06bcd540eff4a5c8e6dd668a2b148ed
+       size: 45174
+     - path: models
+       hash: md5
+       md5: f8d5e9622b5c447a4ed7b66ad4608927.dir
+       size: 9926687
+       nfiles: 5
+     - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
+       hash: md5
+       md5: ee20dc257b9ac063d61a815401341836
+       size: 3256
+   split_data@all:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       all\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/all
+       hash: md5
+       md5: 7809f68c8d582ea4f49135f432bab71f.dir
+       size: 149721
+       nfiles: 2
+   split_data@female:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       female\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/female
+       hash: md5
+       md5: d873e58476d480ce41edfc7806cbde86.dir
+       size: 31872
+       nfiles: 2
+   split_data@male:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       male\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/male
+       hash: md5
+       md5: 248b95ea1cff032fcea0092636795342.dir
+       size: 118331
+       nfiles: 2
+   split_data@nosex:
+     cmd: "uv run predicting_outcomes_in_heart_failure/data/split_data.py --variant
+       nosex\n"
+     deps:
+     - path: data/interim/preprocessed.csv
+       hash: md5
+       md5: aeb0353e39e219cf0b574ab72b08ac26
+       size: 151228
+     - path: data/interim/preprocessed_female_only.csv
+       hash: md5
+       md5: 337b3e1dd4f47911997eded603ab3f4b
+       size: 31966
+     - path: data/interim/preprocessed_male_only.csv
+       hash: md5
+       md5: 9fa26adfc62e4ccd77089627dbe01f5d
+       size: 119503
+     - path: data/interim/preprocessed_no_sex_column.csv
+       hash: md5
+       md5: 95d3064bbfef4c8f09d45fe4e1c915d4
+       size: 149390
+     - path: predicting_outcomes_in_heart_failure/data/split_data.py
+       hash: md5
+       md5: 63484f5b1cbe60115d922b0c68012a66
+       size: 3616
+     outs:
+     - path: data/processed/nosex
+       hash: md5
+       md5: d6f6cbb3681cea6a652efef8141ef546.dir
+       size: 147879
+       nfiles: 2
+   training@0:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model logreg\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/logreg.joblib
+       hash: md5
+       md5: 44595213dc6024d0c3f043d080145c29
+       size: 1519
+     - path: reports/all/logreg
+       hash: md5
+       md5: c3345c2b9f17869574dd78e20e9c5829.dir
+       size: 11970
+       nfiles: 2
+   training@1:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model random_forest\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/random_forest.joblib
+       hash: md5
+       md5: 93d7fcec110a443bd85ef6579c2a5c2e
+       size: 3441465
+     - path: reports/all/random_forest
+       hash: md5
+       md5: b929c6714dd61f762452554919b9b5e8.dir
+       size: 19446
+       nfiles: 2
+   training@2:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       all --model decision_tree\n"
+     deps:
+     - path: data/processed/all/train.csv
+       hash: md5
+       md5: c7af893630cff97ccd3ce364ed1ee6eb
+       size: 104547
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
+       size: 7931
+     outs:
+     - path: models/all/decision_tree.joblib
+       hash: md5
+       md5: aedace81fc3fca71a5cbdb80f9cb5b28
+       size: 2521
+     - path: reports/all/decision_tree
+       hash: md5
+       md5: ee869eabad22f17701ddd17ddb2f5474.dir
+       size: 7025256
+       nfiles: 2
+   training@3:
+     cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
+       female --model logreg\n"
+     deps:
+     - path: data/processed/female/train.csv
+       hash: md5
+       md5: b37819cf2f8e1a53a7e715a46299dab0
+       size: 22207
+     - path: predicting_outcomes_in_heart_failure/modeling/train.py
+       hash: md5
+       md5: e2ae238f0d45e8032e3944a24d0c27e3
316
+ size: 7931
317
+ outs:
318
+ - path: models/female/logreg.joblib
319
+ hash: md5
320
+ md5: 3b8764d1ac710c8a250074bf2d8be1ea
321
+ size: 1519
322
+ - path: reports/female/logreg
323
+ hash: md5
324
+ md5: 4b14fe2da9a58fc5ed3e0f7587efe7a7.dir
325
+ size: 8706
326
+ nfiles: 2
327
+ training@4:
328
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
329
+ female --model random_forest\n"
330
+ deps:
331
+ - path: data/processed/female/train.csv
332
+ hash: md5
333
+ md5: b37819cf2f8e1a53a7e715a46299dab0
334
+ size: 22207
335
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
336
+ hash: md5
337
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
338
+ size: 7931
339
+ outs:
340
+ - path: models/female/random_forest.joblib
341
+ hash: md5
342
+ md5: 343a3a957a3a505be860c350cebd2af8
343
+ size: 880185
344
+ - path: reports/female/random_forest
345
+ hash: md5
346
+ md5: 23c999394e9140be73cb6db2ee039d2f.dir
347
+ size: 13839
348
+ nfiles: 2
349
+ training@5:
350
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
351
+ female --model decision_tree\n"
352
+ deps:
353
+ - path: data/processed/female/train.csv
354
+ hash: md5
355
+ md5: b37819cf2f8e1a53a7e715a46299dab0
356
+ size: 22207
357
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
358
+ hash: md5
359
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
360
+ size: 7931
361
+ outs:
362
+ - path: models/female/decision_tree.joblib
363
+ hash: md5
364
+ md5: 432ddc9ffd54955ca752fc6b7da9a312
365
+ size: 7161
366
+ - path: reports/female/decision_tree
367
+ hash: md5
368
+ md5: 8675acf1093c63f8758331c9aef72484.dir
369
+ size: 5042622
370
+ nfiles: 2
371
+ training@6:
372
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
373
+ male --model logreg\n"
374
+ deps:
375
+ - path: data/processed/male/train.csv
376
+ hash: md5
377
+ md5: 377755a75869ae39a64b39120c09aa5b
378
+ size: 82611
379
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
380
+ hash: md5
381
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
382
+ size: 7931
383
+ outs:
384
+ - path: models/male/logreg.joblib
385
+ hash: md5
386
+ md5: 703f955be56bea513500f9bdf1b541b3
387
+ size: 1535
388
+ - path: reports/male/logreg
389
+ hash: md5
390
+ md5: fc5c9fda539597ab95b47e3cef544ae4.dir
391
+ size: 11191
392
+ nfiles: 2
393
+ training@7:
394
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
395
+ male --model random_forest\n"
396
+ deps:
397
+ - path: data/processed/male/train.csv
398
+ hash: md5
399
+ md5: 377755a75869ae39a64b39120c09aa5b
400
+ size: 82611
401
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
402
+ hash: md5
403
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
404
+ size: 7931
405
+ outs:
406
+ - path: models/male/random_forest.joblib
407
+ hash: md5
408
+ md5: a0b69f9da0287b8f8cc06f65ee0aa994
409
+ size: 2747049
410
+ - path: reports/male/random_forest
411
+ hash: md5
412
+ md5: 89011934a73df9bc579839a96f4e24eb.dir
413
+ size: 17913
414
+ nfiles: 2
415
+ training@8:
416
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
417
+ male --model decision_tree\n"
418
+ deps:
419
+ - path: data/processed/male/train.csv
420
+ hash: md5
421
+ md5: 377755a75869ae39a64b39120c09aa5b
422
+ size: 82611
423
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
424
+ hash: md5
425
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
426
+ size: 7931
427
+ outs:
428
+ - path: models/male/decision_tree.joblib
429
+ hash: md5
430
+ md5: 440f308bbd3226e7fbcee4f16dbefe65
431
+ size: 15177
432
+ - path: reports/male/decision_tree
433
+ hash: md5
434
+ md5: da1c41a5584c71b6a016d97c9a2d45a1.dir
435
+ size: 6493113
436
+ nfiles: 2
437
+ training@9:
438
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
439
+ nosex --model logreg\n"
440
+ deps:
441
+ - path: data/processed/nosex/train.csv
442
+ hash: md5
443
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
444
+ size: 103261
445
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
446
+ hash: md5
447
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
448
+ size: 7931
449
+ outs:
450
+ - path: models/nosex/logreg.joblib
451
+ hash: md5
452
+ md5: 43e4b952e67bc6956c8fe3bd66e6fb39
453
+ size: 1503
454
+ - path: reports/nosex/logreg
455
+ hash: md5
456
+ md5: ce5b23988c9364c9ecbd288ca1cfb77b.dir
457
+ size: 11959
458
+ nfiles: 2
459
+ training@10:
460
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
461
+ nosex --model random_forest\n"
462
+ deps:
463
+ - path: data/processed/nosex/train.csv
464
+ hash: md5
465
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
466
+ size: 103261
467
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
468
+ hash: md5
469
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
470
+ size: 7931
471
+ outs:
472
+ - path: models/nosex/random_forest.joblib
473
+ hash: md5
474
+ md5: 5e94c1ecb14e0e6174af94b5f928daac
475
+ size: 13471369
476
+ - path: reports/nosex/random_forest
477
+ hash: md5
478
+ md5: 11dfa343b1ba9daa394b071ee340292a.dir
479
+ size: 19280
480
+ nfiles: 2
481
+ training@11:
482
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/train.py --variant
483
+ nosex --model decision_tree\n"
484
+ deps:
485
+ - path: data/processed/nosex/train.csv
486
+ hash: md5
487
+ md5: c17c9ac3520d3f8ce46eb97f1c03b664
488
+ size: 103261
489
+ - path: predicting_outcomes_in_heart_failure/modeling/train.py
490
+ hash: md5
491
+ md5: e2ae238f0d45e8032e3944a24d0c27e3
492
+ size: 7931
493
+ outs:
494
+ - path: models/nosex/decision_tree.joblib
495
+ hash: md5
496
+ md5: 72129c6d52da6da3b6108c8f8490950d
497
+ size: 2985
498
+ - path: reports/nosex/decision_tree
499
+ hash: md5
500
+ md5: 9b1e7f06634b537c9291dea1ac0d98d7.dir
501
+ size: 7002873
502
+ nfiles: 2
503
+ evaluation@0:
504
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
505
+ all --model logreg\n"
506
+ deps:
507
+ - path: data/processed/all/test.csv
508
+ hash: md5
509
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
510
+ size: 45174
511
+ - path: models/all/logreg.joblib
512
+ hash: md5
513
+ md5: 44595213dc6024d0c3f043d080145c29
514
+ size: 1519
515
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
516
+ hash: md5
517
+ md5: 90ecfc732599b4427b3d585d27a47b60
518
+ size: 6262
519
+ outs:
520
+ - path: metrics/test/all/logreg.json
521
+ hash: md5
522
+ md5: 17425d74cc062c83f054b6e7559ff7fd
523
+ size: 255
524
+ evaluation@1:
525
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
526
+ all --model random_forest\n"
527
+ deps:
528
+ - path: data/processed/all/test.csv
529
+ hash: md5
530
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
531
+ size: 45174
532
+ - path: models/all/random_forest.joblib
533
+ hash: md5
534
+ md5: 93d7fcec110a443bd85ef6579c2a5c2e
535
+ size: 3441465
536
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
537
+ hash: md5
538
+ md5: 90ecfc732599b4427b3d585d27a47b60
539
+ size: 6262
540
+ outs:
541
+ - path: metrics/test/all/random_forest.json
542
+ hash: md5
543
+ md5: 88b271ac93b466730d4edc6bba0f2eb6
544
+ size: 261
545
+ evaluation@2:
546
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
547
+ all --model decision_tree\n"
548
+ deps:
549
+ - path: data/processed/all/test.csv
550
+ hash: md5
551
+ md5: d06bcd540eff4a5c8e6dd668a2b148ed
552
+ size: 45174
553
+ - path: models/all/decision_tree.joblib
554
+ hash: md5
555
+ md5: aedace81fc3fca71a5cbdb80f9cb5b28
556
+ size: 2521
557
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
558
+ hash: md5
559
+ md5: 90ecfc732599b4427b3d585d27a47b60
560
+ size: 6262
561
+ outs:
562
+ - path: metrics/test/all/decision_tree.json
563
+ hash: md5
564
+ md5: 3a493b39b23186e90b649df8eeeacb47
565
+ size: 262
566
+ evaluation@3:
567
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
568
+ female --model logreg\n"
569
+ deps:
570
+ - path: data/processed/female/test.csv
571
+ hash: md5
572
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
573
+ size: 9665
574
+ - path: models/female/logreg.joblib
575
+ hash: md5
576
+ md5: 3b8764d1ac710c8a250074bf2d8be1ea
577
+ size: 1519
578
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
579
+ hash: md5
580
+ md5: 90ecfc732599b4427b3d585d27a47b60
581
+ size: 6262
582
+ outs:
583
+ - path: metrics/test/female/logreg.json
584
+ hash: md5
585
+ md5: 0aeb472460b0188feb2a11770cb2c96f
586
+ size: 258
587
+ evaluation@4:
588
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
589
+ female --model random_forest\n"
590
+ deps:
591
+ - path: data/processed/female/test.csv
592
+ hash: md5
593
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
594
+ size: 9665
595
+ - path: models/female/random_forest.joblib
596
+ hash: md5
597
+ md5: 343a3a957a3a505be860c350cebd2af8
598
+ size: 880185
599
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
600
+ hash: md5
601
+ md5: 90ecfc732599b4427b3d585d27a47b60
602
+ size: 6262
603
+ outs:
604
+ - path: metrics/test/female/random_forest.json
605
+ hash: md5
606
+ md5: abe473afdd356ec9d3915749b1b1ce98
607
+ size: 265
608
+ evaluation@5:
609
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
610
+ female --model decision_tree\n"
611
+ deps:
612
+ - path: data/processed/female/test.csv
613
+ hash: md5
614
+ md5: 6aa472fd41a51bea05b7d2b105f40d85
615
+ size: 9665
616
+ - path: models/female/decision_tree.joblib
617
+ hash: md5
618
+ md5: 432ddc9ffd54955ca752fc6b7da9a312
619
+ size: 7161
620
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
621
+ hash: md5
622
+ md5: 90ecfc732599b4427b3d585d27a47b60
623
+ size: 6262
624
+ outs:
625
+ - path: metrics/test/female/decision_tree.json
626
+ hash: md5
627
+ md5: b8bb78497457625c190ab0e391c98c15
628
+ size: 263
629
+ evaluation@6:
630
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
631
+ male --model logreg\n"
632
+ deps:
633
+ - path: data/processed/male/test.csv
634
+ hash: md5
635
+ md5: 7c8ccfdb9557e357265780a1f504cee3
636
+ size: 35720
637
+ - path: models/male/logreg.joblib
638
+ hash: md5
639
+ md5: 703f955be56bea513500f9bdf1b541b3
640
+ size: 1535
641
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
642
+ hash: md5
643
+ md5: 90ecfc732599b4427b3d585d27a47b60
644
+ size: 6262
645
+ outs:
646
+ - path: metrics/test/male/logreg.json
647
+ hash: md5
648
+ md5: 3ea9b47f01e8f1b52c1a27b46623aa9e
649
+ size: 256
650
+ evaluation@7:
651
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
652
+ male --model random_forest\n"
653
+ deps:
654
+ - path: data/processed/male/test.csv
655
+ hash: md5
656
+ md5: 7c8ccfdb9557e357265780a1f504cee3
657
+ size: 35720
658
+ - path: models/male/random_forest.joblib
659
+ hash: md5
660
+ md5: a0b69f9da0287b8f8cc06f65ee0aa994
661
+ size: 2747049
662
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
663
+ hash: md5
664
+ md5: 90ecfc732599b4427b3d585d27a47b60
665
+ size: 6262
666
+ outs:
667
+ - path: metrics/test/male/random_forest.json
668
+ hash: md5
669
+ md5: d25cd10633b8514e23aeb95c31cc7fb4
670
+ size: 263
671
+ evaluation@8:
672
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
673
+ male --model decision_tree\n"
674
+ deps:
675
+ - path: data/processed/male/test.csv
676
+ hash: md5
677
+ md5: 7c8ccfdb9557e357265780a1f504cee3
678
+ size: 35720
679
+ - path: models/male/decision_tree.joblib
680
+ hash: md5
681
+ md5: 440f308bbd3226e7fbcee4f16dbefe65
682
+ size: 15177
683
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
684
+ hash: md5
685
+ md5: 90ecfc732599b4427b3d585d27a47b60
686
+ size: 6262
687
+ outs:
688
+ - path: metrics/test/male/decision_tree.json
689
+ hash: md5
690
+ md5: 4646c2963e50087a6b237c7b557e0628
691
+ size: 263
692
+ evaluation@9:
693
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
694
+ nosex --model logreg\n"
695
+ deps:
696
+ - path: data/processed/nosex/test.csv
697
+ hash: md5
698
+ md5: d79d369fb709d0f0eb9d3c9096488118
699
+ size: 44618
700
+ - path: models/nosex/logreg.joblib
701
+ hash: md5
702
+ md5: 43e4b952e67bc6956c8fe3bd66e6fb39
703
+ size: 1503
704
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
705
+ hash: md5
706
+ md5: 90ecfc732599b4427b3d585d27a47b60
707
+ size: 6262
708
+ outs:
709
+ - path: metrics/test/nosex/logreg.json
710
+ hash: md5
711
+ md5: 570687d930d6363d89908ea20838c8d6
712
+ size: 257
713
+ evaluation@10:
714
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
715
+ nosex --model random_forest\n"
716
+ deps:
717
+ - path: data/processed/nosex/test.csv
718
+ hash: md5
719
+ md5: d79d369fb709d0f0eb9d3c9096488118
720
+ size: 44618
721
+ - path: models/nosex/random_forest.joblib
722
+ hash: md5
723
+ md5: 5e94c1ecb14e0e6174af94b5f928daac
724
+ size: 13471369
725
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
726
+ hash: md5
727
+ md5: 90ecfc732599b4427b3d585d27a47b60
728
+ size: 6262
729
+ outs:
730
+ - path: metrics/test/nosex/random_forest.json
731
+ hash: md5
732
+ md5: 6a135f593534616b51858c7f6f251d36
733
+ size: 264
734
+ evaluation@11:
735
+ cmd: "uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py --variant
736
+ nosex --model decision_tree\n"
737
+ deps:
738
+ - path: data/processed/nosex/test.csv
739
+ hash: md5
740
+ md5: d79d369fb709d0f0eb9d3c9096488118
741
+ size: 44618
742
+ - path: models/nosex/decision_tree.joblib
743
+ hash: md5
744
+ md5: 72129c6d52da6da3b6108c8f8490950d
745
+ size: 2985
746
+ - path: predicting_outcomes_in_heart_failure/modeling/evaluate.py
747
+ hash: md5
748
+ md5: 90ecfc732599b4427b3d585d27a47b60
749
+ size: 6262
750
+ outs:
751
+ - path: metrics/test/nosex/decision_tree.json
752
+ hash: md5
753
+ md5: dca85bcdc4b67a21cee91b31f45d6225
754
+ size: 264
dvc.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stages:
2
+ download_data:
3
+ cmd: uv run predicting_outcomes_in_heart_failure/data/dataset.py
4
+ deps:
5
+ - predicting_outcomes_in_heart_failure/data/dataset.py
6
+ outs:
7
+ - data/raw/heart.csv
8
+
9
+ preprocessing:
10
+ cmd: uv run predicting_outcomes_in_heart_failure/data/preprocess.py
11
+ deps:
12
+ - predicting_outcomes_in_heart_failure/data/preprocess.py
13
+ - data/raw/heart.csv
14
+ outs:
15
+ - data/interim/preprocessed.csv
16
+ - data/interim/preprocessed_female_only.csv
17
+ - data/interim/preprocessed_male_only.csv
18
+ - data/interim/preprocessed_no_sex_column.csv
19
+ - data/interim/preprocess_artifacts/scaler.joblib
20
+
21
+ split_data:
22
+ foreach: [all, female, male, nosex]
23
+ do:
24
+ cmd: >
25
+ uv run predicting_outcomes_in_heart_failure/data/split_data.py
26
+ --variant ${item}
27
+ deps:
28
+ - predicting_outcomes_in_heart_failure/data/split_data.py
29
+ - data/interim/preprocessed.csv
30
+ - data/interim/preprocessed_female_only.csv
31
+ - data/interim/preprocessed_male_only.csv
32
+ - data/interim/preprocessed_no_sex_column.csv
33
+ outs:
34
+ - data/processed/${item}
35
+
36
+ training:
37
+ foreach:
38
+ - { variant: all, model: logreg }
39
+ - { variant: all, model: random_forest }
40
+ - { variant: all, model: decision_tree }
41
+ - { variant: female, model: logreg }
42
+ - { variant: female, model: random_forest }
43
+ - { variant: female, model: decision_tree }
44
+ - { variant: male, model: logreg }
45
+ - { variant: male, model: random_forest }
46
+ - { variant: male, model: decision_tree }
47
+ - { variant: nosex, model: logreg }
48
+ - { variant: nosex, model: random_forest }
49
+ - { variant: nosex, model: decision_tree }
50
+ do:
51
+ cmd: >
52
+ uv run predicting_outcomes_in_heart_failure/modeling/train.py
53
+ --variant ${item.variant}
54
+ --model ${item.model}
55
+ deps:
56
+ - predicting_outcomes_in_heart_failure/modeling/train.py
57
+ - data/processed/${item.variant}/train.csv
58
+ outs:
59
+ - models/${item.variant}/${item.model}.joblib
60
+ - reports/${item.variant}/${item.model}
61
+
62
+ evaluation:
63
+ foreach:
64
+ - { variant: all, model: logreg }
65
+ - { variant: all, model: random_forest }
66
+ - { variant: all, model: decision_tree }
67
+ - { variant: female, model: logreg }
68
+ - { variant: female, model: random_forest }
69
+ - { variant: female, model: decision_tree }
70
+ - { variant: male, model: logreg }
71
+ - { variant: male, model: random_forest }
72
+ - { variant: male, model: decision_tree }
73
+ - { variant: nosex, model: logreg }
74
+ - { variant: nosex, model: random_forest }
75
+ - { variant: nosex, model: decision_tree }
76
+ do:
77
+ cmd: >
78
+ uv run predicting_outcomes_in_heart_failure/modeling/evaluate.py
79
+ --variant ${item.variant}
80
+ --model ${item.model}
81
+ deps:
82
+ - predicting_outcomes_in_heart_failure/modeling/evaluate.py
83
+ - models/${item.variant}/${item.model}.joblib
84
+ - data/processed/${item.variant}/test.csv
85
+ outs:
86
+ - metrics/test/${item.variant}/${item.model}.json
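+
+ # Note (illustrative): DVC expands each `foreach` entry into its own stage
+ # instance. Simple lists are keyed by value (e.g. `split_data@all`), while
+ # the dict matrices above are keyed by index (e.g. `training@0` is
+ # variant=all/model=logreg), matching the entries recorded in dvc.lock.
+ # A single instance can be reproduced on its own, e.g. `dvc repro training@0`.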
metrics/test/all/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/female/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/male/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
metrics/test/nosex/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.json
2
+ /random_forest.json
3
+ /decision_tree.json
models/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /decision_tree.joblib
2
+ /random_forest.joblib
3
+ /logreg.joblib
models/README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card
2
+
3
+ ## Table of Contents
4
+
5
+ - [Model Details](#model-details)
6
+ - [Training Information](#training-information)
7
+ - [Intended Use](#intended-use)
8
+ - [Primary Intended Uses](#primary-intended-uses)
9
+ - [Primary Intended Users](#primary-intended-users)
10
+ - [Out-of-scope Use Cases](#out-of-scope-use-cases)
11
+ - [Factors](#factors)
12
+ - [Relevant Factors](#relevant-factors)
13
+ - [Evaluation Factors](#evaluation-factors)
14
+
15
+ - [Metrics](#metrics)
16
+ - [Model Performance](#model-performance)
17
+ - [Variation Approaches](#variation-approaches)
18
+
19
+ - [Evaluation Data](#evaluation-data)
20
+ - [Datasets](#datasets)
21
+ - [Motivation](#motivation)
22
+ - [Preprocessing](#preprocessing)
23
+
24
+ - [Training Data](#training-data)
25
+ - [Datasets](#datasets-1)
26
+ - [Preprocessing](#preprocessing-1)
27
+
28
+ - [Ethical Considerations](#ethical-considerations)
29
+
30
+ - [Caveats and Recommendations](#caveats-and-recommendations)
31
+
32
+ ## Model Details
33
+ - Developed by: D. Boccuzzi, M. Capone, F. Rosmarino
34
+ - Model Date: November 11th, 2025
35
+ - Model Version: 6 - nosex
36
+ - Model Type: RandomForestClassifier
37
+ ### Training information
38
+ - Best hyperparameters, tuned with 5-fold cross-validation:
39
+ - `max_depth` 12
40
+ - `n_estimators` 800
41
+ - `class_weight` balanced
42
+ - Applied approaches:
43
+ During training, an oversampling technique was applied to balance the dataset and reduce bias toward the majority class. This ensured that the model learned equally from positive and negative cases, improving prediction performance for the minority class.
44
+ - Training started at: 11:26:59 2025-11-12
45
+ - Training ended at: 11:34:29 2025-11-12
46
+
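+ A minimal sketch of this tuning setup (illustrative, not the project's exact `train.py`; the grid mirrors `CONFIG_RF` in `config.py`, and the file layout follows `dvc.yaml`):
+
+ ```python
+ import pandas as pd
+ from imblearn.over_sampling import RandomOverSampler
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import GridSearchCV
+
+ train = pd.read_csv("data/processed/nosex/train.csv")
+ X_train, y_train = train.drop(columns=["HeartDisease"]), train["HeartDisease"]
+
+ # Balance the classes before tuning, as described above.
+ X_bal, y_bal = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
+
+ # 5-fold CV over the grid that produced the reported best parameters.
+ grid = GridSearchCV(
+     RandomForestClassifier(random_state=42),
+     param_grid={
+         "n_estimators": [200, 400, 800],
+         "max_depth": [None, 6, 12],
+         "class_weight": [None, "balanced"],
+     },
+     cv=5,
+     scoring="f1",  # assumption: the project tracks several scorers
+ )
+ grid.fit(X_bal, y_bal)  # best found: max_depth=12, n_estimators=800, class_weight="balanced"
+ ```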
47
+ ## Intended Use
48
+ ### Primary intended uses
49
+ The CardioTrack ML system is designed to support early detection of heart failure by analyzing clinical features and identifying patients who may be at risk. Its purpose is to assist cardiologists in deciding when further diagnostic tests, monitoring, or preventive treatments are needed. The system is also intended for local public health authorities, who can use aggregated predictions to plan healthcare resources and implement prevention strategies within the population.
50
+ ### Primary intended users
51
+ The primary users of the model are cardiologists and other qualified medical professionals who rely on clinical decision support tools. They are responsible for interpreting the model’s predictions in conjunction with the patient’s medical history and additional clinical information. Public health authorities may also use aggregated, non-individual results to support long-term planning and policy development.
52
+ ### Out-of-scope use cases
53
+ The model should not be used without access to complete and reliable clinical features, and it is not suitable for real-time emergency triage or for predictive tasks not directly related to heart failure.
54
+
55
+ ## Factors
56
+ ### Relevant factors
57
+ Model performance may vary depending on patient characteristics that influence heart disease risk, as reflected in the contributions of individual clinical features. Age remains a relevant factor because it strongly correlates with cardiovascular conditions. In addition, features such as ST_Slope, ChestPainType, MaxHR, and ExerciseAngina have the largest impact on individual predictions, as highlighted by the SHAP module for explainable AI (XAI). These features capture meaningful physiological and clinical differences among patients and explain why the model predicts a higher or lower risk for specific individuals. Instrumentation and environmental factors are not relevant because the model operates on structured clinical data rather than on signals or images affected by measurement devices or environmental conditions.
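+ A minimal sketch of how these per-patient SHAP contributions can be inspected (illustrative; the class-dimension indexing may differ across `shap` versions):
+
+ ```python
+ import joblib
+ import pandas as pd
+ import shap
+
+ model = joblib.load("models/nosex/random_forest.joblib")
+ X = pd.read_csv("data/processed/nosex/test.csv").drop(columns=["HeartDisease"])
+
+ explainer = shap.TreeExplainer(model)
+ explanation = explainer(X)                  # SHAP values for every patient
+ shap.plots.waterfall(explanation[0, :, 1])  # one patient, positive class
+ ```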
58
+ ### Evaluation Factors
59
+ The evaluation focuses on the key clinical features that the model relies on most heavily. The relevant factors were chosen because they are both present in the dataset and have the largest impact on the model’s outputs, allowing clear interpretation of how predictions are made.
60
+
61
+ ## Metrics
62
+ ### Model Performance
63
+ - `F1 Score` 0.8990
64
+ - `Recall` 0.9019
65
+ - `Accuracy` 0.8876
66
+ - `ROC-AUC` 0.9399
67
+ ### Variation approaches
68
+ The reported metrics were computed using the best model selected during cross-validation for hyperparameter tuning and evaluated on a completely independent test set. This setup was chosen because it provides a cleaner estimate of real-world performance, reduces the risk of overfitting to validation folds, and ensures that the results reflect the model’s generalization ability.
69
+
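+ A minimal sketch of how these metrics can be recomputed on the held-out test set (illustrative; assumes the test CSV keeps the `HeartDisease` target column):
+
+ ```python
+ import joblib
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
+
+ test = pd.read_csv("data/processed/nosex/test.csv")
+ X_test, y_test = test.drop(columns=["HeartDisease"]), test["HeartDisease"]
+ model = joblib.load("models/nosex/random_forest.joblib")
+
+ y_pred = model.predict(X_test)
+ y_score = model.predict_proba(X_test)[:, 1]  # positive-class probability for ROC-AUC
+
+ print({
+     "f1": f1_score(y_test, y_pred),
+     "recall": recall_score(y_test, y_pred),
+     "accuracy": accuracy_score(y_test, y_pred),
+     "roc_auc": roc_auc_score(y_test, y_score),
+ })
+ ```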
70
+ ## Evaluation Data
71
+ ### Datasets
72
+ The evaluation was performed using 276 of the 918 observations (30%) in Kaggle's [Heart Failure Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction), which contains clinical data from both healthy individuals and patients diagnosed with heart failure.
73
+ ### Motivation
74
+ This dataset was chosen because it provides a comprehensive set of relevant clinical features that capture key cardiovascular risk factors, enabling the model to perform early detection of heart failure in individual patients. Its publicly available nature ensures transparency.
75
+ ### Preprocessing
76
+ Before evaluation, the data was preprocessed as follows:
77
+
78
+ - **Cleaning of invalid values**
79
+ Rows with impossible clinical values (e.g., `RestingBP = 0`) were removed.
80
+ Zero cholesterol values were treated as missing and replaced using a central-tendency statistic.
81
+
82
+ - **Encoding of categorical variables**
83
+ Binary categories were converted to numerical format, while multi-class fields (`ChestPainType`, `RestingECG`, `ST_Slope`) were one-hot encoded.
84
+
85
+ - **Scaling of numerical features**
86
+ Continuous variables were standardized to have mean 0 and unit variance.
87
+
88
+ - **Removal of the `Sex` feature**
89
+ The Sex feature was removed to reduce potential fairness concerns and because it was not required for the planned experiments.
90
+
91
+ - The processed dataset is versioned on DagsHub at the following [link](https://dagshub.com/se4ai2526-uniba/CardioTrack)
92
+
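+ A minimal sketch of these steps (illustrative; the column names match the dataset, but the exact cleaning calls and the median imputation are assumptions based on the description above):
+
+ ```python
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+
+ df = pd.read_csv("data/raw/heart.csv")
+
+ # Cleaning: drop impossible values; impute zero cholesterol with the median.
+ df = df[df["RestingBP"] > 0].copy()
+ median_chol = df.loc[df["Cholesterol"] > 0, "Cholesterol"].median()
+ df.loc[df["Cholesterol"] == 0, "Cholesterol"] = median_chol
+
+ # Encoding: binary fields to 0/1; multi-class fields one-hot encoded.
+ df["ExerciseAngina"] = (df["ExerciseAngina"] == "Y").astype(int)
+ df = pd.get_dummies(df, columns=["ChestPainType", "RestingECG", "ST_Slope"])
+
+ # Scaling: standardize continuous variables; drop Sex for this variant.
+ num_cols = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
+ df[num_cols] = StandardScaler().fit_transform(df[num_cols])
+ df = df.drop(columns=["Sex"])
+ ```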
93
+ ## Training Data
94
+ ### Datasets
95
+ The training data mirrors the evaluation dataset, using the remaining 642 of the 918 observations (70%) from the same Kaggle Heart Failure Prediction Dataset.
96
+ ### Preprocessing
97
+ The training data underwent the same preprocessing steps as the evaluation data. Additionally, the RandomOverSampler technique was applied to balance the classes, ensuring that the model learned equally from positive (heart failure) and negative cases.
98
+
99
+ ## Ethical Considerations
100
+ The CardioTrack ML system is intended to support clinical decision-making, not to replace professional judgment. Ethical considerations include:
101
+ - **Privacy and security**: All patient data is processed on-premises, in accordance with hospital IT protocols, protecting sensitive health information.
102
+ - **Transparency**: Feature-importance and SHAP visualizations allow clinicians to interpret predictions.
103
+ - **Clinical responsibility**: Diagnosis must be combined with patient history, exams, and expert judgment. Misuse in isolation could lead to incorrect interventions.
104
+
105
+ ## Caveats and Recommendations
106
+ - **Inference time**: The model’s inference time is about 0.2 seconds, but it can vary with the computing power of the machine where inference is run.
107
+ - **Limitations**: The model is trained on a specific public dataset and may not capture rare cardiovascular conditions or population-specific variations.
108
+ - **Data quality**: Accurate predictions depend on complete and correctly measured clinical features. Erroneous data can reduce performance. Missing data is not allowed.
109
+ - **Not for emergency triage**: Predictions are intended for early detection and planning, not for immediate emergency decision-making.
110
+ - **Periodic retraining**: To maintain accuracy, the model should be updated with newly collected clinical data to account for shifts in patient population or disease prevalence.
models/all/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/female/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/male/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
models/nosex/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /logreg.joblib
2
+ /random_forest.joblib
3
+ /decision_tree.joblib
notebooks/.gitkeep ADDED
File without changes
notebooks/1.0-mc-initial-data-exploration.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
predicting_outcomes_in_heart_failure/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from predicting_outcomes_in_heart_failure import config # noqa: F401
predicting_outcomes_in_heart_failure/app/__init__.py ADDED
File without changes
predicting_outcomes_in_heart_failure/app/main.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+
3
+ from fastapi import FastAPI
4
+ from fastapi.staticfiles import StaticFiles
5
+ import gradio as gr
6
+ import joblib
7
+ from loguru import logger
8
+
9
+ from predicting_outcomes_in_heart_failure.app.routers import cards, model_info, prediction
10
+ from predicting_outcomes_in_heart_failure.app.utils import load_page, update_patient_index_choices
11
+ from predicting_outcomes_in_heart_failure.app.wrapper import Wrapper
12
+ from predicting_outcomes_in_heart_failure.config import FIGURES_DIR, MODEL_PATH
13
+
14
+
15
+ @asynccontextmanager
16
+ async def lifespan(app: FastAPI):
17
+ """Context manager to handle application lifespan events."""
18
+ if not MODEL_PATH.exists():
19
+ logger.error(f"Model file not found at: {MODEL_PATH}")
20
+ raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}")
21
+
22
+ logger.info(f"Loading default model from {MODEL_PATH} ...")
23
+ app.state.model = joblib.load(MODEL_PATH)
24
+ logger.success(f"Default model loaded from {MODEL_PATH}")
25
+
26
+ try:
27
+ yield
28
+ finally:
29
+ app.state.model = None
30
+ logger.info("Default model cleared on application shutdown")
31
+
32
+
33
+ app = FastAPI(
34
+ title="CardioTrack's Model Space - Heart Failure Prediction",
35
+ version="0.01",
36
+ lifespan=lifespan,
37
+ )
38
+
39
+
40
+ if not FIGURES_DIR.exists():
41
+ logger.warning(f"Figures directory {FIGURES_DIR} does not exist. Creating it.")
42
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
43
+ app.mount("/figures", StaticFiles(directory=str(FIGURES_DIR)), name="figures")
44
+ app.include_router(prediction.router)
45
+ app.include_router(model_info.router)
46
+ app.include_router(cards.router)
47
+
48
+
49
+ # UI Definition
50
+ with gr.Blocks(title="CardioTrack") as io:
51
+ gr.Markdown(
52
+ """
53
+ # 🫀 CardioTrack's Model Space - Heart Failure Diagnosis
54
+ Choose an area to access the platform's features.
55
+ """
56
+ )
57
+
58
+ with gr.Tabs():
59
+ with gr.TabItem("Single Diagnosis"):
60
+ gr.Markdown("### Enter patient data for the diagnosis")
61
+
62
+ with gr.Row():
63
+ with gr.Column():
64
+ age = gr.Slider(minimum=20, maximum=100, step=1, label="Age", value=60)
65
+ resting_bp = gr.Slider(
66
+ minimum=80,
67
+ maximum=200,
68
+ step=1,
69
+ label="Resting Blood Pressure (mm Hg)",
70
+ value=120,
71
+ )
72
+ cholesterol = gr.Slider(
73
+ minimum=0, maximum=600, step=1, label="Cholesterol (mg/dL)", value=200
74
+ )
75
+ max_hr = gr.Slider(
76
+ minimum=60, maximum=220, step=1, label="Max Heart Rate", value=150
77
+ )
78
+ oldpeak = gr.Slider(
79
+ minimum=-3.0,
80
+ maximum=7.0,
81
+ step=0.1,
82
+ label="Oldpeak (ST Depression)",
83
+ value=1.0,
84
+ )
85
+
86
+ with gr.Column():
87
+ chest_pain_type = gr.Dropdown(
88
+ choices=["TA", "ATA", "NAP", "ASY"], label="Chest Pain Type", value="ASY"
89
+ )
90
+ fasting_bs = gr.Dropdown(
91
+ choices=[0, 1],
92
+ label="Fasting Blood Sugar (0: <=120 mg/dL, 1: >120 mg/dL)",
93
+ value=0,
94
+ )
95
+ resting_ecg = gr.Dropdown(
96
+ choices=["Normal", "ST", "LVH"], label="Resting ECG", value="Normal"
97
+ )
98
+ exercise_angina = gr.Dropdown(
99
+ choices=["Y", "N"], label="Exercise Angina", value="N"
100
+ )
101
+ st_slope = gr.Dropdown(
102
+ choices=["Up", "Flat", "Down"], label="ST Slope", value="Flat"
103
+ )
104
+
105
+ predict_btn = gr.Button("Analyze", variant="primary")
106
+ single_output = gr.Markdown(label="Result")
107
+
108
+ explanation_img = gr.Image(label="Explanation", type="filepath", visible=True)
109
+
110
+ predict_btn.click(
111
+ fn=Wrapper.prediction_with_explanation,
112
+ inputs=[
113
+ age,
114
+ chest_pain_type,
115
+ resting_bp,
116
+ cholesterol,
117
+ fasting_bs,
118
+ resting_ecg,
119
+ max_hr,
120
+ exercise_angina,
121
+ oldpeak,
122
+ st_slope,
123
+ ],
124
+ outputs=[single_output, explanation_img],
125
+ )
126
+
127
+ with gr.TabItem("Group Diagnosis"):
128
+ gr.Markdown("### Upload a CSV file for analize multiple subjects")
129
+ gr.Markdown(
130
+ "The CSV should contain columns: Age, ChestPainType, RestingBP, Cholesterol,"
131
+ + "FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope"
132
+ )
133
+
134
+ file_input = gr.File(label="Upload CSV", file_types=[".csv"])
135
+ batch_predict_btn = gr.Button("Analyze Group", variant="primary")
136
+ batch_output = gr.Dataframe(label="Results")
137
+
138
+ batch_predict_btn.click(
139
+ fn=Wrapper.batch_prediction, inputs=file_input, outputs=batch_output
140
+ )
141
+
142
+ gr.Markdown("### Explain a specific patient from the group")
143
+
144
+ patient_index = gr.Dropdown(
145
+ label="Patient index (0-based)",
146
+ choices=[],
147
+ interactive=True,
148
+ )
149
+
150
+ batch_output.change(
151
+ fn=update_patient_index_choices,
152
+ inputs=batch_output,
153
+ outputs=patient_index,
154
+ )
155
+
156
+ batch_explain_btn = gr.Button("Explain selected patient", variant="secondary")
157
+
158
+ batch_explanation_img = gr.Image(
159
+ label="Explanation",
160
+ type="filepath",
161
+ )
162
+
163
+ batch_explain_btn.click(
164
+ fn=Wrapper.batch_explanation,
165
+ inputs=[file_input, patient_index],
166
+ outputs=batch_explanation_img,
167
+ )
168
+
169
+ with gr.TabItem("ModelCard"):
170
+ io = load_page(io, Wrapper.get_model_card)
171
+
172
+ with gr.TabItem("DatasetCard"):
173
+ io = load_page(io, Wrapper.get_dataset_card)
174
+
175
+ with gr.TabItem("Hyperparameters"):
176
+ gr.Markdown("## Model Hyperparameters")
177
+ io = load_page(io, Wrapper.get_hyperparameters)
178
+
179
+ with gr.TabItem("Evaluation Metrics"):
180
+ gr.Markdown("## Model Performance Metrics")
181
+ io = load_page(io, Wrapper.get_metrics)
182
+
183
+ app = gr.mount_gradio_app(app, io, path="/")
predicting_outcomes_in_heart_failure/app/routers/cards.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+
3
+ from fastapi import APIRouter, HTTPException, Request
4
+ from loguru import logger
5
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
6
+ from predicting_outcomes_in_heart_failure.config import CARD_PATHS
7
+
8
+ router = APIRouter(tags=["Cards"])
9
+
10
+
11
+ @router.get("/cards/{card_type}")
12
+ @construct_response
13
+ def card(request: Request, card_type: str):
14
+ """Return card information.
15
+ card_type = dataset_card / model_card
16
+ """
17
+ logger.info(f"Received /cards/{card_type} request")
18
+
19
+ # Normalize the card_type to handle possible variants
20
+ card_type = card_type.lower().replace("-", "_")
21
+
22
+ path = CARD_PATHS.get(card_type)
23
+ if path is None:
24
+ logger.warning(f"Unsupported card_type requested: {card_type}")
25
+ raise HTTPException(
26
+ status_code=HTTPStatus.NOT_FOUND,
27
+ detail=f"Card type '{card_type}' not supported."
28
+ + f" Valid types: {', '.join(CARD_PATHS.keys())}",
29
+ )
30
+
31
+ try:
32
+ with open(path, encoding="utf-8") as f:
33
+ card_content = f.read()
34
+
35
+ logger.success(f"{path} loaded successfully")
36
+
37
+ return {
38
+ "message": HTTPStatus.OK.phrase,
39
+ "status-code": HTTPStatus.OK.value,
40
+ "data": {
41
+ "card_type": card_type,
42
+ "path": str(path),
43
+ "card_lines": card_content.split("\n"),
44
+ },
45
+ }
46
+
47
+ except Exception as e:
48
+ logger.exception(f"Failed to load card content from {path}: {e}")
49
+ raise HTTPException(
50
+ status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
51
+ detail=f"Error reading card file: {e}",
52
+ ) from e
predicting_outcomes_in_heart_failure/app/routers/general.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+
3
+ from fastapi import APIRouter, Request
4
+ from loguru import logger
5
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
6
+
7
+ router = APIRouter(tags=["General"])
8
+
9
+
10
+ @router.get("/")
11
+ @construct_response
12
+ def index(request: Request):
13
+ """Root endpoint."""
14
+ logger.info("General requested")
15
+ return {
16
+ "message": HTTPStatus.OK.phrase,
17
+ "status-code": HTTPStatus.OK,
18
+ "data": {"message": "Welcome to Heart Failure Predictor!"},
19
+ }
predicting_outcomes_in_heart_failure/app/routers/model_info.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+ import json
3
+ from typing import Any
4
+
5
+ from fastapi import APIRouter, Request
6
+ from loguru import logger
7
+ from predicting_outcomes_in_heart_failure.app.utils import construct_response
8
+ from predicting_outcomes_in_heart_failure.config import (
9
+ MODEL_PATH,
10
+ REPORTS_DIR,
11
+ TEST_METRICS_DIR,
12
+ )
13
+
14
+ router = APIRouter(tags=["Model"])
15
+
16
+
17
+ @router.get("/model/hyperparameters")
18
+ @construct_response
19
+ def get_model_hyperparameters(request: Request):
20
+ variant = MODEL_PATH.parent.name
21
+ model_name = MODEL_PATH.stem
22
+ hyperparams_path = REPORTS_DIR / variant / model_name / "cv_parameters.json"
23
+ logger.info(
24
+ f"Looking for hyperparameters file at {hyperparams_path} "
25
+ f"(model={model_name}, variant={variant})"
26
+ )
27
+
28
+ if not hyperparams_path.exists():
29
+ logger.warning("Hyperparameters file not found")
30
+ return {
31
+ "message": HTTPStatus.NOT_FOUND.phrase,
32
+ "status-code": HTTPStatus.NOT_FOUND,
33
+ "data": {
34
+ "detail": "Hyperparameters file not found. Run the training pipeline.",
35
+ "model_name": model_name,
36
+ "variant": variant,
37
+ "expected_path": str(hyperparams_path),
38
+ },
39
+ }
40
+
41
+ with hyperparams_path.open("r", encoding="utf-8") as f:
42
+ hyperparams_data = json.load(f)
43
+
44
+ data: dict[str, Any] = {
45
+ "model_path": str(MODEL_PATH),
46
+ "hyperparameters": hyperparams_data,
47
+ }
48
+
49
+ return {
50
+ "message": HTTPStatus.OK.phrase,
51
+ "status-code": HTTPStatus.OK,
52
+ "data": data,
53
+ }
54
+
55
+
56
+ @router.get("/model/metrics")
57
+ @construct_response
58
+ def get_model_metrics(request: Request):
59
+ variant = MODEL_PATH.parent.name
60
+ model_name = MODEL_PATH.stem
61
+ metrics_path = TEST_METRICS_DIR / variant / f"{model_name}.json"
62
+ logger.info(
63
+ f"Looking for metrics file at {metrics_path} (model={model_name}, variant={variant})"
64
+ )
65
+
66
+ if not metrics_path.exists():
67
+ logger.warning("Metrics file not found")
68
+ return {
69
+ "message": HTTPStatus.NOT_FOUND.phrase,
70
+ "status-code": HTTPStatus.NOT_FOUND,
71
+ "data": {
72
+ "detail": (
73
+ "Metrics file not found. Run the evaluation pipeline for this model first."
74
+ ),
75
+ "model_name": model_name,
76
+ "variant": variant,
77
+ "expected_path": str(metrics_path),
78
+ },
79
+ }
80
+
81
+ with metrics_path.open("r", encoding="utf-8") as f:
82
+ metrics_data = json.load(f)
83
+
84
+ data: dict[str, Any] = {
85
+ "model_path": str(MODEL_PATH),
86
+ "model_name": model_name,
87
+ "variant": variant,
88
+ "metrics": metrics_data.get("metrics", metrics_data),
89
+ }
90
+
91
+ return {
92
+ "message": HTTPStatus.OK.phrase,
93
+ "status-code": HTTPStatus.OK,
94
+ "data": data,
95
+ }
predicting_outcomes_in_heart_failure/app/routers/prediction.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import HTTPStatus
2
+ from typing import Any
3
+
4
+ from fastapi import APIRouter, Request
5
+ from loguru import logger
6
+ import pandas as pd
7
+ from predicting_outcomes_in_heart_failure.app.schema import HeartSample
8
+ from predicting_outcomes_in_heart_failure.app.utils import (
9
+ construct_response,
10
+ get_model_from_state,
11
+ )
12
+ from predicting_outcomes_in_heart_failure.config import FIGURES_DIR, MODEL_PATH
13
+ from predicting_outcomes_in_heart_failure.modeling.explainability import (
14
+ explain_prediction,
15
+ save_shap_waterfall_plot,
16
+ )
17
+ from predicting_outcomes_in_heart_failure.modeling.predict import preprocessing
18
+
19
+ router = APIRouter()
20
+
21
+
22
+ @router.post("/predictions", tags=["Prediction"])
23
+ @construct_response
24
+ def predict(request: Request, payload: HeartSample):
25
+ model = get_model_from_state(request)
26
+ if model is None:
27
+ return {
28
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
29
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
30
+ "data": {"detail": "Model is not loaded."},
31
+ }
32
+
33
+ X_raw = payload.to_dataframe()
34
+ X = preprocessing(X_raw)
35
+ y_pred = int(model.predict(X)[0])
36
+
37
+ data: dict[str, Any] = {
38
+ "input": payload.model_dump(),
39
+ "prediction": y_pred,
40
+ }
41
+
42
+ logger.success("Prediction completed successfully for /predictions")
43
+ return {
44
+ "message": HTTPStatus.OK.phrase,
45
+ "status-code": HTTPStatus.OK,
46
+ "data": data,
47
+ }
48
+
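+
+ # Illustrative client call for the endpoint above (host/port are assumptions):
+ # httpx.post("http://localhost:8000/predictions", json={
+ #     "Age": 61, "ChestPainType": "ASY", "RestingBP": 140, "Cholesterol": 289,
+ #     "FastingBS": 0, "RestingECG": "Normal", "MaxHR": 120,
+ #     "ExerciseAngina": "Y", "Oldpeak": 1.5, "ST_Slope": "Flat",
+ # })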
49
+
50
+ @router.post("/batch-predictions", tags=["Prediction"])
51
+ @construct_response
52
+ def predict_batch(request: Request, payload: list[HeartSample]):
53
+ model = get_model_from_state(request)
54
+ if model is None:
55
+ return {
56
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
57
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
58
+ "data": {"detail": "Model is not loaded."},
59
+ }
60
+
61
+ X_raw_list = [sample.to_dataframe() for sample in payload]
62
+ X_raw = pd.concat(X_raw_list, ignore_index=True)
63
+ X = preprocessing(X_raw)
64
+
65
+ y_pred = [int(y) for y in model.predict(X)]
66
+
67
+ results: list[dict[str, Any]] = []
68
+ for idx, (sample, pred) in enumerate(zip(payload, y_pred, strict=True)):
69
+ results.append(
70
+ {
71
+ "index": idx,
72
+ "input": sample.model_dump(),
73
+ "prediction": pred,
74
+ }
75
+ )
76
+
77
+ data: dict[str, Any] = {
78
+ "results": results,
79
+ "batch_size": len(results),
80
+ }
81
+
82
+ return {
83
+ "message": HTTPStatus.OK.phrase,
84
+ "status-code": HTTPStatus.OK,
85
+ "data": data,
86
+ }
87
+
88
+
89
+ @router.post("/explanations", tags=["Explainability"])
90
+ @construct_response
91
+ def explain(request: Request, payload: HeartSample):
92
+ model = get_model_from_state(request)
93
+ if model is None:
94
+ return {
95
+ "message": HTTPStatus.SERVICE_UNAVAILABLE.phrase,
96
+ "status-code": HTTPStatus.SERVICE_UNAVAILABLE,
97
+ "data": {"detail": "Model is not loaded."},
98
+ }
99
+
100
+ X_raw = payload.to_dataframe()
101
+ X = preprocessing(X_raw)
102
+
103
+ data: dict[str, Any] = {"input": payload.model_dump()}
104
+ model_type = MODEL_PATH.stem
105
+
106
+ try:
107
+ logger.info("Computing explanation for default model prediction...")
108
+ explanations = explain_prediction(model, X, model_type=model_type, top_k=5)
109
+ if explanations:
110
+ data["explanations"] = explanations
111
+ logger.success("Explanation computed successfully for default model.")
112
+ else:
113
+ logger.warning("No explanation available for default model.")
114
+ except Exception as e:
115
+ logger.exception(f"Failed to compute explanation: {e}")
116
+
117
+ try:
118
+ plot_path = FIGURES_DIR / f"shap_waterfall_default_{model_type}.png"
119
+ saved_path = save_shap_waterfall_plot(
120
+ model=model,
121
+ X=X,
122
+ model_type=model_type,
123
+ output_path=plot_path,
124
+ )
125
+ if saved_path is not None:
126
+ data["explanation_plot_url"] = f"/figures/{saved_path.name}"
127
+ except Exception as e:
128
+ logger.exception(f"Failed to generate explanation plot: {e}")
129
+
130
+ logger.success("Explanation completed successfully for /explanations")
131
+ return {
132
+ "message": HTTPStatus.OK.phrase,
133
+ "status-code": HTTPStatus.OK,
134
+ "data": data,
135
+ }
predicting_outcomes_in_heart_failure/app/schema.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from pydantic import BaseModel, field_validator
8
+
9
+
10
+ class HeartSample(BaseModel):
11
+ Age: int
12
+ ChestPainType: Literal["TA", "ATA", "NAP", "ASY"]
13
+ RestingBP: int
14
+ Cholesterol: int
15
+ FastingBS: int
16
+ RestingECG: Literal["Normal", "ST", "LVH"]
17
+ MaxHR: int
18
+ ExerciseAngina: Literal["Y", "N"]
19
+ Oldpeak: float
20
+ ST_Slope: Literal["Up", "Flat", "Down"]
21
+
22
+ @field_validator("Oldpeak")
23
+ @classmethod
24
+ def round_oldpeak(cls, v: float) -> float:
25
+ return float(np.round(v, 2))
26
+
27
+ def to_dataframe(self) -> pd.DataFrame:
28
+ return pd.DataFrame([self.model_dump()])
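+
+ # Illustrative usage (not part of the app's call path):
+ # sample = HeartSample(Age=61, ChestPainType="ASY", RestingBP=140, Cholesterol=289,
+ #                      FastingBS=0, RestingECG="Normal", MaxHR=120,
+ #                      ExerciseAngina="Y", Oldpeak=1.5, ST_Slope="Flat")
+ # sample.to_dataframe()  # -> single-row DataFrame with columns named after the fields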
predicting_outcomes_in_heart_failure/app/utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from functools import wraps
3
+
4
+ from fastapi import Request
5
+ import gradio as gr
6
+ from loguru import logger
7
+
8
+
9
+ def construct_response(f):
10
+ """Construct a JSON response for an endpoint's results."""
11
+
12
+ @wraps(f)
13
+ def wrap(request: Request, *args, **kwargs):
14
+ result = f(request, *args, **kwargs)
15
+ response = {
16
+ "message": result["message"],
17
+ "method": request.method,
18
+ "status-code": result["status-code"],
19
+ "timestamp": datetime.now().isoformat(),
20
+ "url": request.url._url,
21
+ }
22
+ if "data" in result:
23
+ response["data"] = result["data"]
24
+ return response
25
+
26
+ return wrap
27
+
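+ # Illustrative contract: a decorated endpoint returns a dict with "message" and
+ # "status-code" (plus optional "data"); the wrapper adds the request "method",
+ # a "timestamp", and the request "url" before FastAPI serializes it.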
28
+
29
+ def get_model_from_state(request: Request):
30
+ """Retrieve the model from the app state."""
31
+ model = getattr(request.app.state, "model", None)
32
+ if model is None:
33
+ logger.error("Model not loaded in app.state.model")
34
+ return model
35
+
36
+
37
+ def load_page(io, fn):
38
+ content = gr.Markdown("Loading...")
39
+
40
+ io.load(fn=fn, inputs=None, outputs=content)
41
+ return io
42
+
43
+
44
+ def update_patient_index_choices(df):
45
+ """Populate the dropdown with valid patient indices from the batch results."""
+
48
+ if df is None:
49
+ return gr.update(choices=[], value=None)
50
+
51
+ try:
52
+ indices = list(df["Patients's index"].astype(int))
53
+ except Exception:
54
+ return gr.update(choices=[], value=None)
55
+
56
+ if not indices:
57
+ return gr.update(choices=[], value=None)
58
+
59
+ return gr.update(choices=indices, value=indices[0])
predicting_outcomes_in_heart_failure/app/wrapper.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from loguru import logger
3
+ import pandas as pd
4
+
5
+ from predicting_outcomes_in_heart_failure.config import API_URL, FIGURES_DIR
6
+
7
+
8
+ async def _fetch_api(endpoint: str):
9
+ async with httpx.AsyncClient() as client:
10
+ try:
11
+ response = await client.get(f"{API_URL}/{endpoint}")
12
+ response.raise_for_status()
13
+ return response.json()
14
+ except Exception as e:
15
+ logger.error(f"Error fetching {endpoint}: {e}")
16
+ return {"error": str(e)}
17
+
18
+
19
+ class Wrapper:
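+ # Note: the methods below are used as plain async callbacks (accessed on the
+ # class itself, never on an instance), so they take no `self` parameter.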
20
+ async def prediction_with_explanation(
21
+ age,
22
+ chest_pain_type,
23
+ resting_bp,
24
+ cholesterol,
25
+ fasting_bs,
26
+ resting_ecg,
27
+ max_hr,
28
+ exercise_angina,
29
+ oldpeak,
30
+ st_slope,
31
+ ):
32
+ payload = {
33
+ "Age": age,
34
+ "ChestPainType": chest_pain_type,
35
+ "RestingBP": resting_bp,
36
+ "Cholesterol": cholesterol,
37
+ "FastingBS": fasting_bs,
38
+ "RestingECG": resting_ecg,
39
+ "MaxHR": max_hr,
40
+ "ExerciseAngina": exercise_angina,
41
+ "Oldpeak": round(oldpeak, 2),
42
+ "ST_Slope": st_slope,
43
+ }
44
+
45
+ async with httpx.AsyncClient() as client:
46
+ try:
47
+ pred_resp = await client.post(f"{API_URL}/predictions", json=payload)
48
+ pred_resp.raise_for_status()
49
+ pred_json = pred_resp.json()
50
+
51
+ prediction_value = pred_json["data"]["prediction"]
52
+ status = "🆘 Risk Detected" if prediction_value == 1 else "✅ No Risk Detected"
53
+ status_text = f"# Patient's status: {status}"
54
+ except Exception as e:
55
+ logger.error(f"Error making prediction: {e}")
56
+ return f"Error during prediction: {str(e)}", ""
57
+
58
+ try:
59
+ expl_resp = await client.post(f"{API_URL}/explanations", json=payload)
60
+ expl_resp.raise_for_status()
61
+ expl_json = expl_resp.json()
62
+
63
+ plot_rel_url = expl_json["data"].get("explanation_plot_url")
64
+ if not plot_rel_url:
65
+ logger.warning("No explanation_plot_url found in /explanations response.")
66
+ return status_text, ""
67
+
68
+ filename = plot_rel_url.split("/")[-1]
69
+ plot_path = FIGURES_DIR / filename
70
+ return status_text, str(plot_path)
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error getting explanation: {e}")
74
+ return status_text, ""
75
+
76
+ async def batch_prediction(file):
77
+ async with httpx.AsyncClient(timeout=30.0) as client:
78
+ try:
79
+ df = pd.read_csv(file)
80
+
81
+ payload = []
82
+ for _, row in df.iterrows():
83
+ sample = {
84
+ "Age": int(row["Age"]),
85
+ "ChestPainType": row["ChestPainType"],
86
+ "RestingBP": int(row["RestingBP"]),
87
+ "Cholesterol": int(row["Cholesterol"]),
88
+ "FastingBS": int(row["FastingBS"]),
89
+ "RestingECG": row["RestingECG"],
90
+ "MaxHR": int(row["MaxHR"]),
91
+ "ExerciseAngina": row["ExerciseAngina"],
92
+ "Oldpeak": round(float(row["Oldpeak"]), 2),
93
+ "ST_Slope": row["ST_Slope"],
94
+ }
95
+ payload.append(sample)
96
+
97
+ response = await client.post(f"{API_URL}/batch-predictions", json=payload)
98
+ response.raise_for_status()
99
+ result = response.json()
100
+
101
+ results = result["data"]["results"]
102
+ df_results = pd.DataFrame(
103
+ [
104
+ {
105
+ "Patients's index": r["index"],
106
+ "Patient's status": "🆘 Risk Detected"
107
+ if r["prediction"] == 1
108
+ else "✅ No Risk Detected",
109
+ }
110
+ for r in results
111
+ ]
112
+ )
113
+
114
+ return df_results
115
+ except Exception as e:
116
+ logger.error(f"Error making batch prediction: {e}")
117
+ return pd.DataFrame({"error": [str(e)]})
118
+
119
+ async def get_model_card():
120
+ data = await _fetch_api("cards/model_card")
121
+
122
+ card_lines = data.get("data").get("card_lines")
123
+ return "\n".join(card_lines)
124
+
125
+ async def get_dataset_card():
126
+ data = await _fetch_api("cards/dataset_card")
127
+
128
+ card_lines = data.get("data").get("card_lines")
129
+ return "\n".join(card_lines)
130
+
131
+ async def get_hyperparameters():
132
+ data = await _fetch_api("model/hyperparameters")
133
+ if "error" in data:
134
+ return f"## Error\n{data['error']}"
135
+
136
+ data = data.get("data", {}).get("hyperparameters", {}).get("cv", {})
137
+
138
+ md = ""
139
+ for key, value in data.items():
140
+ md += f"- **{key}**: {value}\n"
141
+ return md
142
+
143
+ async def get_metrics():
144
+ data = await _fetch_api("model/metrics")
145
+ if "error" in data:
146
+ return f"## Error\n{data['error']}"
147
+
148
+ metrics = data.get("data", {}).get("metrics", {})
149
+ if not metrics:
150
+ return "## No metrics found"
151
+
152
+ md = ""
153
+ for key, value in metrics.items():
154
+ md += f"- **{key}**: {value:.4f}\n"
155
+ return md
156
+
157
+ async def batch_explanation(file, patient_index: int):
158
+ """Return SHAP plot (filepath) for a specific patient in the uploaded CSV."""
159
+ try:
160
+ df = pd.read_csv(file)
161
+ except Exception as e:
162
+ logger.error(f"Error reading CSV for batch explanation: {e}")
163
+ return None
164
+
165
+ try:
166
+ idx = int(patient_index)
167
+ except (TypeError, ValueError):
168
+ logger.error(f"Invalid patient_index: {patient_index}")
169
+ return None
170
+
171
+ if idx < 0 or idx >= len(df):
172
+ logger.error(f"patient_index {idx} out of range (0..{len(df) - 1})")
173
+ return None
174
+
175
+ row = df.iloc[idx]
176
+
177
+ payload = {
178
+ "Age": int(row["Age"]),
179
+ "ChestPainType": row["ChestPainType"],
180
+ "RestingBP": int(row["RestingBP"]),
181
+ "Cholesterol": int(row["Cholesterol"]),
182
+ "FastingBS": int(row["FastingBS"]),
183
+ "RestingECG": row["RestingECG"],
184
+ "MaxHR": int(row["MaxHR"]),
185
+ "ExerciseAngina": row["ExerciseAngina"],
186
+ "Oldpeak": round(float(row["Oldpeak"]), 2),
187
+ "ST_Slope": row["ST_Slope"],
188
+ }
189
+
190
+ async with httpx.AsyncClient() as client:
191
+ try:
192
+ expl_resp = await client.post(f"{API_URL}/explanations", json=payload)
193
+ expl_resp.raise_for_status()
194
+ expl_json = expl_resp.json()
195
+
196
+ plot_rel_url = expl_json["data"].get("explanation_plot_url")
197
+ if not plot_rel_url:
198
+ logger.warning(
199
+ "No explanation_plot_url found in /explanations response (batch)."
200
+ )
201
+ return None
202
+
203
+ filename = plot_rel_url.split("/")[-1]
204
+ plot_path = FIGURES_DIR / filename
205
+
206
+ return str(plot_path)
207
+
208
+ except Exception as e:
209
+ logger.error(f"Error getting batch explanation: {e}")
210
+ return None
predicting_outcomes_in_heart_failure/config.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from dotenv import load_dotenv
4
+ from loguru import logger
5
+
6
+ load_dotenv()
7
+
8
+ # -------------------
9
+ # Experiment settings
10
+ # -------------------
11
+ VALID_VARIANTS = ["all", "female", "male", "nosex"]
12
+ VALID_MODELS = ["logreg", "random_forest", "decision_tree"]
13
+ EXPERIMENT_NAME = "Heart_Failure_Prediction"
14
+ DATASET_NAME = "fedesoriano/heart-failure-prediction"
15
+ TARGET_COL = "HeartDisease"
16
+ RANDOM_STATE = 42
17
+ TEST_SIZE = 0.30
18
+ N_SPLITS = 5
19
+ SCORING = {
20
+ "accuracy": "accuracy",
21
+ "f1": "f1",
22
+ "recall": "recall",
23
+ "roc_auc": "roc_auc",
24
+ }
25
+
26
+ NUM_COLS_DEFAULT = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
27
+ CAT_COLS_DEFAULT = [
28
+ "Sex",
29
+ "ChestPainType",
30
+ "FastingBS",
31
+ "RestingECG",
32
+ "ExerciseAngina",
33
+ "ST_Slope",
34
+ ]
35
+ MULTI_CAT = ["ChestPainType", "RestingECG", "ST_Slope"]
36
+
37
+ INPUT_COLUMNS = [
38
+ "Age",
39
+ "RestingBP",
40
+ "Cholesterol",
41
+ "FastingBS",
42
+ "MaxHR",
43
+ "ExerciseAngina",
44
+ "Oldpeak",
45
+ "ChestPainType_ASY",
46
+ "ChestPainType_ATA",
47
+ "ChestPainType_NAP",
48
+ "ChestPainType_TA",
49
+ "RestingECG_LVH",
50
+ "RestingECG_Normal",
51
+ "RestingECG_ST",
52
+ "ST_Slope_Down",
53
+ "ST_Slope_Flat",
54
+ "ST_Slope_Up",
55
+ ]
56
+ # -----------------------------------
57
+ # Model hyperparameter configurations
58
+ # -----------------------------------
59
+ CONFIG_RF = {
60
+ "n_estimators": [200, 400, 800],
61
+ "max_depth": [None, 6, 12],
62
+ "class_weight": [None, "balanced"],
63
+ }
64
+ CONFIG_DT = {
65
+ "criterion": ["gini", "entropy", "log_loss"],
66
+ "max_depth": [None, 3, 5, 7, 9, 12],
67
+ "min_samples_split": [2, 5, 10, 20],
68
+ "min_samples_leaf": [1, 2, 4, 8],
69
+ "max_features": [None, "sqrt", "log2"],
70
+ "class_weight": [None, "balanced"],
71
+ "ccp_alpha": [0.0, 0.001, 0.01],
72
+ }
73
+ CONFIG_LR = {"C": [0.01, 0.1, 1, 10], "penalty": ["l2"], "class_weight": [None, "balanced"]}
74
+
75
+ # ----------------------------
76
+ # Repository info
77
+ # ----------------------------
78
+ REPO_OWNER = "se4ai2526-uniba"
79
+ REPO_NAME = "CardioTrack"
80
+
81
+ # ----------------------------
82
+ # Great Expectations
83
+ # ----------------------------
84
+ SOURCE_NAME = "heart_data_source"
85
+ ASSET_NAME = "heart_failure"
86
+ SUITE_NAME = "heart_failure_data_quality"
87
+
88
+ # ----------------------------
89
+ # Paths
90
+ # ----------------------------
91
+ PROJ_ROOT = Path(__file__).resolve().parents[1]
92
+ logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
93
+
94
+ DATA_DIR = PROJ_ROOT / "data"
95
+ INTERIM_DATA_DIR = DATA_DIR / "interim"
96
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
97
+ RAW_DATA_DIR = DATA_DIR / "raw"
98
+ EXTERNAL_DATA_DIR = DATA_DIR / "external"
99
+
100
+ RAW_PATH = RAW_DATA_DIR / "heart.csv"
101
+ PREPROCESSED_CSV = INTERIM_DATA_DIR / "preprocessed.csv"
102
+ TRAIN_CSV = PROCESSED_DATA_DIR / "train.csv"
103
+ TEST_CSV = PROCESSED_DATA_DIR / "test.csv"
104
+
105
+ MODELS_DIR = PROJ_ROOT / "models"
106
+ REPORTS_DIR = PROJ_ROOT / "reports"
107
+ FIGURES_DIR = REPORTS_DIR / "figures"
108
+
109
+ METRICS_DIR = PROJ_ROOT / "metrics"
110
+ TEST_METRICS_DIR = METRICS_DIR / "test"
111
+
112
+ NOSEX_CSV = INTERIM_DATA_DIR / "preprocessed_no_sex_column.csv"
113
+ MALE_CSV = INTERIM_DATA_DIR / "preprocessed_male_only.csv"
114
+ FEMALE_CSV = INTERIM_DATA_DIR / "preprocessed_female_only.csv"
115
+
116
+ PREPROCESS_ARTIFACTS_DIR = INTERIM_DATA_DIR / "preprocess_artifacts"
117
+ SCALER_PATH = PREPROCESS_ARTIFACTS_DIR / "scaler.joblib"
118
+
119
+ MODEL_PATH = Path("models/nosex/random_forest.joblib")
120
+
121
+ CARD_PATHS = {
122
+ "dataset_card": DATA_DIR / "README.md",
123
+ "model_card": MODELS_DIR / "README.md",
124
+ }
125
+
126
+ # ----------------------------
127
+ # API
128
+ # ----------------------------
129
+ API_URL = "http://localhost:7860"
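Everything downstream imports its paths and constants from this module, so a short usage sketch may help (illustrative only; importing the module logs `PROJ_ROOT` as a side effect):

```python
# Illustrative usage of the central config module.
from predicting_outcomes_in_heart_failure.config import (
    INPUT_COLUMNS,
    MODEL_PATH,
    PROCESSED_DATA_DIR,
)

print(PROCESSED_DATA_DIR / "nosex" / "train.csv")  # per-variant split written by split_data.py
print(MODEL_PATH)                                  # default model used for inference
print(len(INPUT_COLUMNS))                          # 17 engineered feature columns
```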
predicting_outcomes_in_heart_failure/data/dataset.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+
4
+ import kagglehub
5
+ from loguru import logger
6
+ from predicting_outcomes_in_heart_failure.config import DATASET_NAME, RAW_DATA_DIR
7
+ import typer
8
+
9
+ app = typer.Typer()
10
+
11
+
12
+ @app.command()
13
+ def main():
14
+ logger.info("Downloading dataset from Kaggle...")
15
+ os.makedirs(RAW_DATA_DIR, exist_ok=True)
16
+ path = kagglehub.dataset_download(DATASET_NAME)
17
+ shutil.copytree(path, RAW_DATA_DIR, dirs_exist_ok=True)
18
+ logger.success("Dataset downloaded successfully to {RAW_DATA_DIR}.")
19
+
20
+
21
+ if __name__ == "__main__":
22
+ app()
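The downloader is exposed as a Typer CLI; a sketch of both invocation styles (assuming Kaggle credentials are configured for kagglehub):

```python
# From the shell (assumes kagglehub can authenticate to Kaggle):
#   python -m predicting_outcomes_in_heart_failure.data.dataset
# Or programmatically, since @app.command() leaves `main` directly callable:
from predicting_outcomes_in_heart_failure.data.dataset import main

main()  # copies fedesoriano/heart-failure-prediction into data/raw/
```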
predicting_outcomes_in_heart_failure/data/preprocess.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from loguru import logger
3
+ import pandas as pd
4
+ from predicting_outcomes_in_heart_failure.config import (
5
+ FEMALE_CSV,
6
+ INTERIM_DATA_DIR,
7
+ MALE_CSV,
8
+ NOSEX_CSV,
9
+ NUM_COLS_DEFAULT,
10
+ PREPROCESS_ARTIFACTS_DIR,
11
+ PREPROCESSED_CSV,
12
+ RAW_PATH,
13
+ SCALER_PATH,
14
+ TARGET_COL,
15
+ )
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ def save_scaler_artifact(scaler: StandardScaler):
20
+ """Save only the fitted scaler used during preprocessing for inference."""
21
+ PREPROCESS_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
22
+ joblib.dump(scaler, SCALER_PATH)
23
+ logger.success(f"Saved StandardScaler to {SCALER_PATH}")
24
+
25
+
26
+ def generate_gender_splits(df: pd.DataFrame):
27
+ """Create and save gender-based CSV splits (female, male, nosex)."""
28
+ if "Sex" in df.columns:
29
+ df_female = df[df["Sex"] == 0].copy()
30
+ df_female.to_csv(FEMALE_CSV, index=False)
31
+ logger.success(f"Saved female-only dataset: {FEMALE_CSV} (rows={len(df_female)})")
32
+
33
+ if "Sex" in df.columns:
34
+ df_male = df[df["Sex"] == 1].copy()
35
+ df_male.to_csv(MALE_CSV, index=False)
36
+ logger.success(f"Saved male-only dataset: {MALE_CSV} (rows={len(df_male)})")
37
+
38
+ df_nosex = df.drop(columns=["Sex"], errors="ignore").copy()
39
+ df_nosex.to_csv(NOSEX_CSV, index=False)
40
+ logger.success(f"Saved dataset without 'Sex': {NOSEX_CSV} (rows={len(df_nosex)})")
41
+
42
+
43
+ def preprocessing():
44
+ """Run the full preprocessing pipeline on the raw heart dataset."""
45
+ logger.info("Starting preprocessing pipeline...")
46
+
47
+ if not RAW_PATH.exists():
48
+ logger.error(f"Missing {RAW_PATH}. Put heart.csv under data/raw/ or adjust RAW_PATH.")
49
+ raise FileNotFoundError(f"Missing {RAW_PATH}.")
50
+
51
+ df = pd.read_csv(RAW_PATH)
52
+ logger.info(f"Loaded dataset: {RAW_PATH} (rows={len(df)}, cols={df.shape[1]})")
53
+
54
+ if len(df) < 2:
55
+ raise ValueError("Preprocessing requires at least 2 rows, got only 1.")
56
+
57
+ # Ensure target is integer
58
+ df[TARGET_COL] = df[TARGET_COL].astype(int)
59
+
60
+ # Remove invalid RestingBP rows
61
+ if "RestingBP" in df.columns:
62
+ before = len(df)
63
+ df = df[df["RestingBP"] != 0].reset_index(drop=True)
64
+ removed = before - len(df)
65
+ if removed > 0:
66
+ logger.warning(f"Removed {removed} rows with RestingBP == 0")
67
+
68
+ # Impute missing/zero Cholesterol
69
+ if "Cholesterol" in df.columns:
70
+ zero_mask = df["Cholesterol"] == 0
71
+ if zero_mask.any():
72
+ median_chol = df.loc[~zero_mask, "Cholesterol"].median()
73
+ df.loc[zero_mask, "Cholesterol"] = median_chol
74
+ logger.info(f"Imputed {zero_mask.sum()} Cholesterol==0 with median={median_chol}")
75
+
76
+ # Encode binary categorical features
77
+ if "Sex" in df.columns:
78
+ df["Sex"] = df["Sex"].map({"M": 1, "F": 0}).astype(int)
79
+ logger.debug("Encoded 'Sex' as binary.")
80
+
81
+ if "ExerciseAngina" in df.columns:
82
+ df["ExerciseAngina"] = df["ExerciseAngina"].map({"Y": 1, "N": 0}).astype(int)
83
+ logger.debug("Encoded 'ExerciseAngina' as binary.")
84
+
85
+ # One-hot encode multi-category features
86
+ multi_cat = [c for c in ["ChestPainType", "RestingECG", "ST_Slope"] if c in df.columns]
87
+ df = pd.get_dummies(df, columns=multi_cat, drop_first=False)
88
+ logger.debug(f"One-hot encoded columns: {multi_cat}")
89
+
90
+ # Scale numerical columns
91
+ num_cols = [c for c in NUM_COLS_DEFAULT if c in df.columns and c != TARGET_COL]
92
+ scaler = StandardScaler()
93
+ df[num_cols] = scaler.fit_transform(df[num_cols])
94
+ logger.info(f"Scaled numerical features: {num_cols}")
95
+
96
+ # Save processed dataset
97
+ df.to_csv(PREPROCESSED_CSV, index=False)
98
+ logger.success(
99
+ f"Saved preprocessed dataset: {PREPROCESSED_CSV} (rows={len(df)}, cols={df.shape[1]})"
100
+ )
101
+
102
+ # Log class distribution
103
+ count_0 = (df[TARGET_COL] == 0).sum()
104
+ count_1 = (df[TARGET_COL] == 1).sum()
105
+ logger.info(f"Target balance — 0: {count_0} | 1: {count_1}")
106
+
107
+ save_scaler_artifact(scaler)
108
+
109
+ logger.success("Preprocessing completed successfully.")
110
+ return df
111
+
112
+
113
+ if __name__ == "__main__":
114
+ INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)
115
+ df_processed = preprocessing()
116
+ generate_gender_splits(df_processed)
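To make the encoding steps concrete, here is a self-contained toy run of the same transforms on synthetic values (a sketch mirroring `preprocessing()`, not the project entry point):

```python
# Toy reproduction of the preprocessing steps: zero-Cholesterol imputation,
# binary encoding, one-hot encoding, and scaling. Values are synthetic.
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame(
    {
        "Age": [54, 61],
        "Sex": ["M", "F"],
        "ChestPainType": ["ASY", "ATA"],
        "Cholesterol": [239, 0],  # zero is treated as missing
    }
)

# Impute Cholesterol == 0 with the median of the non-zero values
zero = df["Cholesterol"] == 0
df.loc[zero, "Cholesterol"] = df.loc[~zero, "Cholesterol"].median()

# Binary-encode Sex, one-hot encode ChestPainType, scale numeric columns
df["Sex"] = df["Sex"].map({"M": 1, "F": 0}).astype(int)
df = pd.get_dummies(df, columns=["ChestPainType"], drop_first=False)
df[["Age", "Cholesterol"]] = StandardScaler().fit_transform(df[["Age", "Cholesterol"]])
print(df)
```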
predicting_outcomes_in_heart_failure/data/split_data.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from loguru import logger
5
+ import pandas as pd
6
+ from predicting_outcomes_in_heart_failure.config import (
7
+ FEMALE_CSV,
8
+ MALE_CSV,
9
+ NOSEX_CSV,
10
+ PREPROCESSED_CSV,
11
+ PROCESSED_DATA_DIR,
12
+ RANDOM_STATE,
13
+ TARGET_COL,
14
+ TEST_SIZE,
15
+ )
16
+ from sklearn.model_selection import train_test_split
17
+
18
+ VARIANTS = {
19
+ "all": PREPROCESSED_CSV,
20
+ "female": FEMALE_CSV,
21
+ "male": MALE_CSV,
22
+ "nosex": NOSEX_CSV,
23
+ }
24
+
25
+
26
+ def _safe_train_test_split(X, y, test_size, random_state):
27
+ """Perform a stratified train/test split with fallback if not possible."""
28
+ stratify_y = y if y.nunique() > 1 else None
29
+ try:
30
+ X_tr, X_te, y_tr, y_te = train_test_split(
31
+ X,
32
+ y,
33
+ test_size=test_size,
34
+ stratify=stratify_y,
35
+ random_state=random_state,
36
+ shuffle=True,
37
+ )
38
+ if stratify_y is None:
39
+ logger.warning("Target has only one class — performing non-stratified split.")
40
+ else:
41
+ logger.debug("Stratified split executed successfully.")
42
+ return X_tr, X_te, y_tr, y_te
43
+ except ValueError as e:
44
+ logger.warning(f"Stratified split failed ({e}). Falling back to non-stratified split.")
45
+ return train_test_split(
46
+ X,
47
+ y,
48
+ test_size=test_size,
49
+ stratify=None,
50
+ random_state=random_state,
51
+ shuffle=True,
52
+ )
53
+
54
+
55
+ def split_one(csv_path: Path, variant: str):
56
+ """Split a specific variant (all/female/male/nosex) into train/test sets."""
57
+ if not csv_path.exists():
58
+ logger.warning(f"[{variant}] Missing CSV file: {csv_path} — skipping.")
59
+ return
60
+
61
+ df = pd.read_csv(csv_path)
62
+ logger.info(f"[{variant}] Loaded {csv_path} (rows={len(df)}, cols={df.shape[1]})")
63
+
64
+ if TARGET_COL not in df.columns:
65
+ raise ValueError(f"[{variant}] Target column '{TARGET_COL}' not found in {csv_path}")
66
+
67
+ X = df.drop(columns=[TARGET_COL])
68
+ y = df[TARGET_COL].astype(int)
69
+
70
+ X_train, X_test, y_train, y_test = _safe_train_test_split(X, y, TEST_SIZE, RANDOM_STATE)
71
+
72
+ train_df = X_train.copy()
73
+ train_df[TARGET_COL] = y_train.values
74
+ test_df = X_test.copy()
75
+ test_df[TARGET_COL] = y_test.values
76
+
77
+ out_dir = PROCESSED_DATA_DIR / variant
78
+ out_dir.mkdir(parents=True, exist_ok=True)
79
+ train_p = out_dir / "train.csv"
80
+ test_p = out_dir / "test.csv"
81
+
82
+ train_df.to_csv(train_p, index=False)
83
+ test_df.to_csv(test_p, index=False)
84
+
85
+ logger.success(f"[{variant}] Saved TRAIN -> {train_p} (rows={len(train_df)})")
86
+ logger.success(f"[{variant}] Saved TEST -> {test_p} (rows={len(test_df)})")
87
+
88
+ train_counts = train_df[TARGET_COL].value_counts().to_dict()
89
+ test_counts = test_df[TARGET_COL].value_counts().to_dict()
90
+ logger.info(f"[{variant}] Class distribution — TRAIN: {train_counts} | TEST: {test_counts}")
91
+
92
+
93
+ def main():
94
+ parser = argparse.ArgumentParser()
95
+ parser.add_argument(
96
+ "--variant",
97
+ type=str,
98
+ choices=list(VARIANTS.keys()),
99
+ required=True,
100
+ help="Data variant to split: all, female, male, or nosex.",
101
+ )
102
+ args = parser.parse_args()
103
+
104
+ variant = args.variant
105
+ csv_path = VARIANTS[variant]
106
+
107
+ logger.info(f"Starting splitting for variant='{variant}'")
108
+ PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
109
+ split_one(csv_path, variant)
110
+ logger.success(f"Splitting completed for variant='{variant}'")
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
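Each variant is split independently into the layout that `train.py` and `evaluate.py` expect. A sketch of running one split and checking its outputs (paths follow `config.py`):

```python
# Shell invocation (one variant per call):
#   python -m predicting_outcomes_in_heart_failure.data.split_data --variant nosex
# Quick check that the 70/30 split landed where downstream stages expect it:
from predicting_outcomes_in_heart_failure.config import PROCESSED_DATA_DIR

for name in ("train.csv", "test.csv"):
    path = PROCESSED_DATA_DIR / "nosex" / name
    print(path, "->", "ok" if path.exists() else "missing")
```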
predicting_outcomes_in_heart_failure/modeling/__init__.py ADDED
File without changes
predicting_outcomes_in_heart_failure/modeling/evaluate.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import dagshub
6
+ import joblib
7
+ from loguru import logger
8
+ import mlflow
9
+ from mlflow.models.signature import infer_signature
10
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
11
+
12
+ from predicting_outcomes_in_heart_failure.config import (
13
+ DATASET_NAME,
14
+ EXPERIMENT_NAME,
15
+ MODELS_DIR,
16
+ PROCESSED_DATA_DIR,
17
+ REPO_NAME,
18
+ REPO_OWNER,
19
+ TARGET_COL,
20
+ TEST_METRICS_DIR,
21
+ VALID_MODELS,
22
+ VALID_VARIANTS,
23
+ )
24
+ from predicting_outcomes_in_heart_failure.modeling.train import load_split
25
+
26
+
27
+ def compute_metrics(model, X_test, y_test):
28
+ """Compute evaluation metrics (F1, recall, accuracy, ROC-AUC); return (metrics, y_pred)."""
29
+ y_pred = model.predict(X_test)
30
+ results = {
31
+ "test_f1": f1_score(y_test, y_pred, zero_division=0),
32
+ "test_recall": recall_score(y_test, y_pred, zero_division=0),
33
+ "test_accuracy": accuracy_score(y_test, y_pred),
34
+ }
35
+ if hasattr(model, "predict_proba"):
36
+ try:
37
+ y_prob = model.predict_proba(X_test)[:, 1]
38
+ results["test_roc_auc"] = roc_auc_score(y_test, y_prob)
39
+ except Exception as e:
40
+ logger.warning(f"ROC AUC not computed: {e}")
41
+ return results, y_pred
42
+
43
+
44
+ def evaluate_variant(variant: str, model_name: str | None = None):
45
+ """Evaluate trained models for a given variant, optionally by model."""
46
+ logger.info(f"=== Evaluation started (variant={variant}, model={model_name or 'ALL'}) ===")
47
+
48
+ test_path = PROCESSED_DATA_DIR / variant / "test.csv"
49
+ test_df = load_split(test_path)
50
+
51
+ X_test = test_df.drop(columns=[TARGET_COL])
52
+ y_test = test_df[TARGET_COL].astype(int)
53
+
54
+ models_dir_variant = MODELS_DIR / variant
55
+ if not models_dir_variant.exists():
56
+ logger.warning(
57
+ f"[{variant}] Models directory does not exist: {models_dir_variant} — skipping."
58
+ )
59
+ return
60
+
61
+ experiment_name = f"{EXPERIMENT_NAME}_{variant}"
62
+ experiment = mlflow.get_experiment_by_name(experiment_name)
63
+ if experiment is None:
64
+ logger.error(f"Experiment '{experiment_name}' not found.")
65
+ return
66
+
67
+ model_files = []
68
+ if model_name is not None:
69
+ model_files = [f"{model_name}.joblib"]
70
+ else:
71
+ model_files = [f for f in os.listdir(models_dir_variant) if f.endswith(".joblib")]
72
+
73
+ for file in model_files:
74
+ if not file.endswith(".joblib"):
75
+ continue
76
+
77
+ current_model_name = file.split(".joblib")[0]
78
+ run_name = f"{current_model_name}_{variant}"
79
+ logger.info(
80
+ f"[{variant} | {current_model_name}] Looking for training run '{run_name}' in MLflow."
81
+ )
82
+
83
+ runs = mlflow.search_runs(
84
+ experiment_ids=[experiment.experiment_id],
85
+ filter_string=f"tags.mlflow.runName = '{run_name}'",
86
+ order_by=["start_time DESC"],
87
+ max_results=1,
88
+ )
89
+
90
+ if runs.empty:
91
+ logger.warning(
92
+ f"[{variant} | {current_model_name}]No matching MLflow run found — skipping."
93
+ )
94
+ continue
95
+
96
+ tracked_id = runs.loc[0, "run_id"]
97
+
98
+ with mlflow.start_run(run_id=tracked_id):
99
+ rawdata = mlflow.data.from_pandas(test_df, name=f"{DATASET_NAME}_{variant}_test")
100
+ mlflow.log_input(rawdata, context="testing")
101
+
102
+ model_path = models_dir_variant / file
103
+ model = joblib.load(model_path)
104
+
105
+ metrics, _ = compute_metrics(model, X_test, y_test)
106
+ mlflow.log_metrics(metrics)
107
+
108
+ logger.info(f"[{variant} | {current_model_name}] Test set metrics:")
109
+ for k in ["test_f1", "test_recall", "test_accuracy", "test_roc_auc"]:
110
+ if k in metrics:
111
+ logger.info(f" - {k}: {metrics[k]:.4f}")
112
+
113
+ metrics_dir = TEST_METRICS_DIR / variant
114
+ metrics_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ metrics_path = metrics_dir / f"{current_model_name}.json"
117
+
118
+ to_save = {
119
+ "variant": variant,
120
+ "model_name": current_model_name,
121
+ "metrics": metrics,
122
+ }
123
+
124
+ with open(metrics_path, "w", encoding="utf-8") as f:
125
+ json.dump(to_save, f, indent=4)
126
+
127
+ logger.info(
128
+ f"[{variant} | {current_model_name}] Saved test metrics locally → {metrics_path}"
129
+ )
130
+
131
+ if (
132
+ metrics.get("test_f1", 0.0) >= 0.80
133
+ and metrics.get("test_recall", 0.0) >= 0.80
134
+ and metrics.get("test_accuracy", 0.0) >= 0.80
135
+ and metrics.get("test_roc_auc", 0.0) >= 0.85
136
+ ):
137
+ signature = infer_signature(X_test, model.predict(X_test))
138
+ registered_name = f"{current_model_name}_{variant}"
139
+ mlflow.sklearn.log_model(
140
+ sk_model=model,
141
+ artifact_path="Model_Info",
142
+ signature=signature,
143
+ input_example=X_test,
144
+ registered_model_name=registered_name,
145
+ )
146
+ logger.success(
147
+ f"[{variant} | {current_model_name}] "
148
+ f"Model promoted and registered as '{registered_name}'."
149
+ )
150
+
151
+ logger.success(
152
+ f"=== Evaluation completed (variant={variant}, model={model_name or 'ALL'}) ==="
153
+ )
154
+
155
+
156
+ def main():
157
+ parser = argparse.ArgumentParser()
158
+ parser.add_argument(
159
+ "--variant",
160
+ type=str,
161
+ choices=VALID_VARIANTS,
162
+ required=True,
163
+ help="Data variant to use: all, female, male, or nosex.",
164
+ )
165
+ parser.add_argument(
166
+ "--model",
167
+ type=str,
168
+ choices=VALID_MODELS,
169
+ required=False,
170
+ help=(
171
+ "Specific model to evaluate (logreg, random_forest, decision_tree)."
172
+ " If omitted, evaluate all models."
173
+ ),
174
+ )
175
+ args = parser.parse_args()
176
+
177
+ dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
178
+ evaluate_variant(args.variant, args.model)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
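Evaluation looks up each model's training run on the remote MLflow tracker, so DagsHub credentials must be available in the environment. A usage sketch, including a peek at the local metrics JSON the script writes (the exact file shown assumes the nosex random forest has been evaluated):

```python
# Shell invocations:
#   python -m predicting_outcomes_in_heart_failure.modeling.evaluate --variant nosex
#   python -m predicting_outcomes_in_heart_failure.modeling.evaluate --variant all --model logreg
import json

from predicting_outcomes_in_heart_failure.config import TEST_METRICS_DIR

# Metrics are also written locally, one JSON file per (variant, model) pair
with open(TEST_METRICS_DIR / "nosex" / "random_forest.json", encoding="utf-8") as f:
    print(json.load(f)["metrics"])
```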
predicting_outcomes_in_heart_failure/modeling/explainability.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import matplotlib
7
+
8
+ matplotlib.use("Agg")
9
+
10
+ from loguru import logger
11
+ import matplotlib.pyplot as plt
12
+ import numpy as np
13
+ import pandas as pd
14
+ import shap
15
+
16
+
17
+ def explain_prediction(
18
+ model: Any,
19
+ X: pd.DataFrame,
20
+ model_type: str,
21
+ top_k: int = 5,
22
+ ):
23
+ """
24
+ Build an explanation for a single sample (the first row of X).
25
+ """
26
+
27
+ if X.empty:
28
+ logger.warning("Received empty DataFrame for explanation; returning empty list.")
29
+ return []
30
+
31
+ model_type = model_type.lower()
32
+ x = X.iloc[[0]]
33
+ feature_names = x.columns.tolist()
34
+
35
+ # ---------------------------------------------------------------------
36
+ # 1) Logistic Regression → use coefficients
37
+ # ---------------------------------------------------------------------
38
+ if model_type in ("logreg", "logistic_regression"):
39
+ logger.info("Using coefficient-based explanation for Logistic Regression.")
40
+
41
+ if not hasattr(model, "coef_"):
42
+ logger.error(
43
+ "Model has no coef_ attribute;cannot build coefficient-based explanation."
44
+ )
45
+ return []
46
+
47
+ coef = np.asarray(model.coef_[0]).reshape(-1)
48
+ if coef.shape[0] != len(feature_names):
49
+ logger.warning(
50
+ f"Coefficient vector length ({coef.shape[0]}) does not match "
51
+ f"number of features ({len(feature_names)}). "
52
+ "Truncating to minimum length."
53
+ )
54
+
55
+ n = min(len(feature_names), coef.shape[0])
56
+ explanations = [
57
+ {
58
+ "feature": feature_names[i],
59
+ "value": float(coef[i]),
60
+ "abs_value": float(abs(coef[i])),
61
+ }
62
+ for i in range(n)
63
+ ]
64
+
65
+ explanations = sorted(explanations, key=lambda d: d["abs_value"], reverse=True)[:top_k]
66
+ logger.info(
67
+ f"Built coefficient-based explanation. Returning top {len(explanations)} features."
68
+ )
69
+ return explanations
70
+
71
+ # ---------------------------------------------------------------------
72
+ # 2) Tree-based models → SHAP TreeExplainer
73
+ # ---------------------------------------------------------------------
74
+ if model_type in ("random_forest", "decision_tree"):
75
+ logger.info("Using SHAP TreeExplainer for tree-based model.")
76
+
77
+ if X.empty:
78
+ logger.warning("Received empty DataFrame for SHAP explanation; returning empty list.")
79
+ return []
80
+
81
+ x = X.iloc[[0]]
82
+ feature_names = x.columns.tolist()
83
+
84
+ try:
85
+ explainer = shap.TreeExplainer(model)
86
+ shap_exp = explainer(x)
87
+ values = np.asarray(shap_exp.values)
88
+ logger.debug(f"Raw SHAP values shape: {values.shape!r}")
89
+ except Exception as e:
90
+ logger.error(f"SHAP TreeExplainer failed: {e}")
91
+ logger.warning("SHAP explanation not available for this model.")
92
+ return []
93
+
94
+ if values.ndim == 2:
95
+ shap_vec = values[0]
96
+
97
+ elif values.ndim == 3:
98
+ n_samples, dim2, dim3 = values.shape
99
+
100
+ if dim2 == x.shape[1]:
101
+ n_outputs = dim3
102
+ class_index = 1 if n_outputs > 1 else 0
103
+ shap_vec = values[0, :, class_index]
104
+
105
+ elif dim3 == x.shape[1]:
106
+ n_outputs = dim2
107
+ class_index = 1 if n_outputs > 1 else 0
108
+ shap_vec = values[0, class_index, :]
109
+
110
+ else:
111
+ logger.error(f"Unexpected SHAP shape {values.shape} for {x.shape[1]} features.")
112
+ return []
113
+
114
+ else:
115
+ logger.error(f"Unexpected SHAP values dimension: {values.ndim}")
116
+ return []
117
+
118
+ shap_vec = np.asarray(shap_vec).reshape(-1)
119
+
120
+ if shap_vec.shape[0] != len(feature_names):
121
+ logger.warning(
122
+ f"SHAP vector length ({shap_vec.shape[0]}) "
123
+ f"!= number of features ({len(feature_names)}). "
124
+ "Truncating to minimum length."
125
+ )
126
+
127
+ n = min(len(feature_names), shap_vec.shape[0])
128
+ explanations = [
129
+ {
130
+ "feature": feature_names[i],
131
+ "value": float(shap_vec[i]),
132
+ "abs_value": float(abs(shap_vec[i])),
133
+ }
134
+ for i in range(n)
135
+ ]
136
+
137
+ explanations = sorted(explanations, key=lambda d: d["abs_value"], reverse=True)[:top_k]
138
+
139
+ logger.info(f"Built SHAP-based explanation. Returning top {len(explanations)} features.")
140
+ return explanations
141
+
142
+
143
+ def save_shap_waterfall_plot(
144
+ model: Any,
145
+ X: pd.DataFrame,
146
+ model_type: str,
147
+ output_path: Path,
148
+ ) -> Path | None:
149
+ """
150
+ Save a SHAP waterfall plot for a single sample to the given output path.
151
+ """
152
+ model_type = model_type.lower()
153
+
154
+ if model_type not in ("random_forest", "decision_tree"):
155
+ logger.warning(
156
+ f"Waterfall plot is only supported for tree-based models. "
157
+ f"Got model_type='{model_type}'. Skipping plot generation."
158
+ )
159
+ return None
160
+
161
+ if X.empty:
162
+ logger.warning("Received empty DataFrame for SHAP plot; skipping.")
163
+ return None
164
+
165
+ x = X.iloc[[0]]
166
+ logger.info(f"Generating SHAP waterfall plot for model_type='{model_type}'.")
167
+
168
+ try:
169
+ explainer = shap.TreeExplainer(model)
170
+ shap_exp = explainer(x)
171
+ except Exception as e:
172
+ logger.error(f"Failed to build SHAP explainer for plot: {e}")
173
+ return None
174
+
175
+ try:
176
+ output_path.parent.mkdir(parents=True, exist_ok=True)
177
+
178
+ shap_to_plot = shap_exp
179
+ if np.asarray(shap_exp.values).ndim == 3:
180
+ vals = np.asarray(shap_exp.values)
181
+ if vals.shape[1] == x.shape[1]:
182
+ shap_to_plot = shap_exp[..., 1]
183
+ elif vals.shape[2] == x.shape[1]:
184
+ shap_to_plot = shap_exp[:, 1, :]
185
+ else:
186
+ logger.warning(
187
+ f"Unexpected shape for SHAP values in plot: {vals.shape}. "
188
+ "Falling back to shap_exp[0]."
189
+ )
190
+ shap_to_plot = shap_exp
191
+
192
+ plt.figure()
193
+ shap.plots.waterfall(shap_to_plot[0], show=False)
194
+ plt.tight_layout()
195
+ plt.savefig(output_path, bbox_inches="tight")
196
+ plt.close()
197
+
198
+ logger.success(f"SHAP waterfall plot saved to {output_path}")
199
+ return output_path
200
+ except Exception as e:
201
+ logger.error(f"Failed to save SHAP waterfall plot: {e}")
202
+ return None
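A self-contained sketch of `explain_prediction()` on a tiny fitted forest may clarify the expected output shape: a list of `{feature, value, abs_value}` dicts sorted by absolute attribution. The data below is synthetic and the feature names are illustrative:

```python
# Synthetic demo of explain_prediction() with a small random forest.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from predicting_outcomes_in_heart_failure.modeling.explainability import explain_prediction

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["Age", "MaxHR", "Oldpeak"])
y = (X["Age"] + X["Oldpeak"] > 0).astype(int)

model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)
for item in explain_prediction(model, X.head(1), model_type="random_forest", top_k=3):
    print(f"{item['feature']}: {item['value']:+.4f}")
```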
predicting_outcomes_in_heart_failure/modeling/predict.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ import joblib
6
+ from loguru import logger
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from predicting_outcomes_in_heart_failure.app.schema import HeartSample
11
+ from predicting_outcomes_in_heart_failure.config import (
12
+ FIGURES_DIR,
13
+ INPUT_COLUMNS,
14
+ MODEL_PATH,
15
+ MULTI_CAT,
16
+ NUM_COLS_DEFAULT,
17
+ SCALER_PATH,
18
+ )
19
+ from predicting_outcomes_in_heart_failure.modeling.explainability import (
20
+ explain_prediction,
21
+ save_shap_waterfall_plot,
22
+ )
23
+
24
+
25
+ def preprocessing(sample_df: pd.DataFrame) -> pd.DataFrame:
26
+ """
27
+ Apply the training-time preprocessing: encoding, one-hot expansion, column alignment, scaling.
28
+ """
29
+ logger.info("Applying preprocessing pipeline for inference...")
30
+
31
+ if not (SCALER_PATH.exists() and MODEL_PATH.exists()):
32
+ raise FileNotFoundError("Preprocessing artifacts missing.")
33
+
34
+ scaler = joblib.load(SCALER_PATH)
35
+ input_columns = INPUT_COLUMNS
36
+ multi_cat = MULTI_CAT
37
+ num_cols = NUM_COLS_DEFAULT
38
+
39
+ logger.debug(f"Loaded scaler from {SCALER_PATH}")
40
+ logger.debug(f"Using {len(input_columns)} input columns")
41
+
42
+ if "Sex" in sample_df.columns and "Sex" not in input_columns:
43
+ logger.debug("Dropping column 'Sex' since it's not used by the current model variant.")
44
+ sample_df = sample_df.drop(columns=["Sex"])
45
+
46
+ if "Sex" in sample_df.columns and "Sex" in input_columns:
47
+ sample_df["Sex"] = sample_df["Sex"].map({"M": 1, "F": 0}).astype(int)
48
+ logger.debug("Mapped 'Sex' to binary values (M=1, F=0).")
49
+
50
+ if "ExerciseAngina" in sample_df.columns and "ExerciseAngina" in input_columns:
51
+ sample_df["ExerciseAngina"] = sample_df["ExerciseAngina"].map({"Y": 1, "N": 0}).astype(int)
52
+ logger.debug("Mapped 'ExerciseAngina' to binary values (Y=1, N=0).")
53
+
54
+ present_multi = [c for c in multi_cat if c in sample_df.columns]
55
+ if present_multi:
56
+ logger.debug(f"Performing one-hot encoding on: {present_multi}")
57
+ sample_df = pd.get_dummies(sample_df, columns=present_multi, drop_first=False)
58
+
59
+ for col in input_columns:
60
+ if col not in sample_df.columns:
61
+ sample_df[col] = 0
62
+ sample_df = sample_df.reindex(columns=input_columns, fill_value=0)
63
+ logger.debug("Aligned input columns with training feature order.")
64
+
65
+ cols_to_scale = [c for c in num_cols if c in sample_df.columns]
66
+ sample_df[cols_to_scale] = scaler.transform(sample_df[cols_to_scale])
67
+ logger.debug(f"Scaled numerical columns: {cols_to_scale}")
68
+
69
+ logger.success("Preprocessing completed successfully.")
70
+ return sample_df
71
+
72
+
73
+ def main():
74
+ logger.info("Starting static inference...")
75
+
76
+ sample = HeartSample(
77
+ Age=54,
78
+ ChestPainType="ASY",
79
+ RestingBP=140,
80
+ Cholesterol=239,
81
+ FastingBS=0,
82
+ RestingECG="Normal",
83
+ MaxHR=160,
84
+ ExerciseAngina="N",
85
+ Oldpeak=0.0,
86
+ ST_Slope="Up",
87
+ )
88
+ logger.info("Sample created successfully.")
89
+
90
+ X_raw = sample.to_dataframe()
91
+ logger.debug(f"Raw input features:\n{X_raw}")
92
+ X = preprocessing(X_raw)
93
+
94
+ if not MODEL_PATH.exists():
95
+ raise FileNotFoundError(f"Model not found: {MODEL_PATH}")
96
+ model = joblib.load(MODEL_PATH)
97
+ logger.success(f"Loaded model from {MODEL_PATH}")
98
+
99
+ # Perform prediction
100
+ t0 = time.perf_counter()
101
+ y_pred = model.predict(X)[0]
102
+ inference_time = time.perf_counter() - t0
103
+ y_pred = int(y_pred) if np.issubdtype(type(y_pred), np.integer) else y_pred
104
+ result = {
105
+ "prediction": y_pred,
106
+ "inference_time_seconds": inference_time,
107
+ }
108
+
109
+ # Explainability
110
+ model = joblib.load(MODEL_PATH)
111
+ model_type = MODEL_PATH.stem
112
+ try:
113
+ logger.info("Computing explanation for the prediction...")
114
+ explanations = explain_prediction(model, X, model_type=model_type, top_k=5)
115
+ result["explanations"] = explanations
116
+ logger.success("Explanation computed successfully.")
117
+ except Exception as e:
118
+ logger.error(f"Failed to compute explanation: {e}")
119
+
120
+ try:
121
+ shap_path = FIGURES_DIR / f"shap_waterfall_{model_type}.png"
122
+ saved = save_shap_waterfall_plot(model, X, model_type=model_type, output_path=shap_path)
123
+ if saved is not None:
124
+ result["explanation_plot"] = str(saved)
125
+ except Exception as e:
126
+ logger.error(f"Failed to generate SHAP waterfall plot: {e}")
127
+
128
+ logger.info("Inference completed.")
129
+ logger.success(f"Prediction result: {result}")
130
+
131
+ return result
132
+
133
+
134
+ if __name__ == "__main__":
135
+ main()
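`main()` doubles as a smoke test for the serving path. A sketch of running it (assumes the scaler artifact and the default nosex random-forest model exist locally, e.g. after `dvc pull`):

```python
# Static inference demo; requires SCALER_PATH and MODEL_PATH artifacts on disk.
from predicting_outcomes_in_heart_failure.modeling.predict import main

result = main()
print(result["prediction"], f"{result['inference_time_seconds']:.4f}s")
```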
predicting_outcomes_in_heart_failure/modeling/train.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import dagshub
8
+ from imblearn.over_sampling import RandomOverSampler
9
+ import joblib
10
+ from loguru import logger
11
+ import mlflow
12
+ import pandas as pd
13
+ from sklearn.model_selection import GridSearchCV, StratifiedKFold
14
+
15
+ from predicting_outcomes_in_heart_failure.config import (
16
+ CONFIG_DT,
17
+ CONFIG_LR,
18
+ CONFIG_RF,
19
+ DATASET_NAME,
20
+ EXPERIMENT_NAME,
21
+ MODELS_DIR,
22
+ N_SPLITS,
23
+ PROCESSED_DATA_DIR,
24
+ RANDOM_STATE,
25
+ REPO_NAME,
26
+ REPO_OWNER,
27
+ REPORTS_DIR,
28
+ SCORING,
29
+ TARGET_COL,
30
+ VALID_MODELS,
31
+ VALID_VARIANTS,
32
+ )
33
+
34
+ REFIT = "f1"
35
+
36
+
37
+ def load_split(path: Path) -> pd.DataFrame:
38
+ if not path.exists():
39
+ logger.error(f"Missing split file: {path}. Run split_data.py first.")
40
+ raise FileNotFoundError(path)
41
+ df = pd.read_csv(path)
42
+ logger.info(f"Loaded {path} (rows={len(df)}, cols={df.shape[1]})")
43
+ return df
44
+
45
+
46
+ def apply_random_oversampling(
47
+ X: pd.DataFrame,
48
+ y: pd.Series,
49
+ model_name: str,
50
+ variant: str,
51
+ ):
52
+ """Apply RandomOverSampler to balance classes in the training set."""
53
+ logger.info(f"[{variant} | {model_name}] Applying RandomOverSampler on training data...")
54
+
55
+ # Log original class distribution
56
+ orig_counts = y.value_counts().to_dict()
57
+ logger.info(f"[{variant} | {model_name}] Original class distribution: {orig_counts}")
58
+
59
+ ros = RandomOverSampler(random_state=RANDOM_STATE)
60
+ X_res, y_res = ros.fit_resample(X, y)
61
+
62
+ # Log resampled class distribution
63
+ res_counts = y_res.value_counts().to_dict()
64
+ logger.info(f"[{variant} | {model_name}] Resampled class distribution: {res_counts}")
65
+
66
+ logger.success(f"[{variant} | {model_name}] RandomOverSampler applied successfully.")
67
+ return X_res, y_res
68
+
69
+
70
+ def get_model_and_grid(model_name: str):
71
+ """Return estimator and parameter grid for the selected model."""
72
+ if model_name == "decision_tree":
73
+ from sklearn.tree import DecisionTreeClassifier
74
+
75
+ estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
76
+ param_grid = CONFIG_DT
77
+ return estimator, param_grid
78
+
79
+ elif model_name == "logreg":
80
+ from sklearn.linear_model import LogisticRegression
81
+
82
+ estimator = LogisticRegression(max_iter=500, random_state=RANDOM_STATE)
83
+ param_grid = CONFIG_LR
84
+ return estimator, param_grid
85
+
86
+ elif model_name == "random_forest":
87
+ from sklearn.ensemble import RandomForestClassifier
88
+
89
+ estimator = RandomForestClassifier(random_state=RANDOM_STATE)
90
+ param_grid = CONFIG_RF
91
+ return estimator, param_grid
92
+
93
+ else:
94
+ raise ValueError(f"Unknown model_name: {model_name}")
95
+
96
+
97
+ def run_grid_search(
98
+ estimator,
99
+ param_grid,
100
+ X_train,
101
+ y_train,
102
+ model_name: str,
103
+ variant: str,
104
+ reports_dir: Path,
105
+ ):
106
+ """Run GridSearchCV for the specified model and log CV results."""
107
+ cv = StratifiedKFold(
108
+ n_splits=N_SPLITS,
109
+ shuffle=True,
110
+ random_state=RANDOM_STATE,
111
+ )
112
+ grid = GridSearchCV(
113
+ estimator=estimator,
114
+ param_grid=param_grid,
115
+ scoring=SCORING,
116
+ refit=REFIT,
117
+ cv=cv,
118
+ n_jobs=-1,
119
+ verbose=1,
120
+ return_train_score=True,
121
+ )
122
+
123
+ logger.info(f"[{variant} | {model_name}] Starting GridSearchCV …")
124
+ grid.fit(X_train, y_train)
125
+
126
+ logger.success(f"[{variant} | {model_name}] GridSearchCV completed.")
127
+ logger.info(f"[{variant} | {model_name}] Best params ({REFIT}): {grid.best_params_}")
128
+ logger.info(f"[{variant} | {model_name}] Best CV {REFIT}: {grid.best_score_:.4f}")
129
+
130
+ cv_results_path = reports_dir / "cv_results.csv"
131
+ df = pd.DataFrame(grid.cv_results_)
132
+ df.to_csv(cv_results_path, index=False)
133
+
134
+ mlflow.log_artifact(str(cv_results_path))
135
+ return grid.best_estimator_, grid, grid.best_params_
136
+
137
+
138
+ def save_artifacts(
139
+ model,
140
+ grid,
141
+ X_train,
142
+ model_name: str,
143
+ variant: str,
144
+ model_dir: Path,
145
+ reports_dir: Path,
146
+ ) -> None:
147
+ """Save model, parameters, and metadata to disk and MLflow."""
148
+ model_dir.mkdir(parents=True, exist_ok=True)
149
+ reports_dir.mkdir(parents=True, exist_ok=True)
150
+
151
+ model_path = model_dir / f"{model_name}.joblib"
152
+ joblib.dump(model, model_path)
153
+ logger.success(f"[{variant} | {model_name}] Saved model → {model_path}")
154
+
155
+ out = {
156
+ "model_name": model_name,
157
+ "data_variant": variant,
158
+ "cv": {
159
+ "refit": REFIT,
160
+ "best_score": getattr(grid, "best_score_", None),
161
+ "best_params": getattr(grid, "best_params_", None),
162
+ "scoring": list(SCORING.keys()),
163
+ "n_splits": N_SPLITS,
164
+ "random_state": RANDOM_STATE,
165
+ },
166
+ "features": list(X_train.columns),
167
+ }
168
+
169
+ cv_params_path = reports_dir / "cv_parameters.json"
170
+ with open(cv_params_path, "w", encoding="utf-8") as f:
171
+ json.dump(out, f, indent=4)
172
+
173
+ mlflow.log_artifact(str(cv_params_path))
174
+ logger.success(f"[{variant} | {model_name}] Saved artifacts.")
175
+
176
+
177
+ def train(model_name: str, variant: str):
178
+ """Train a model for a specific dataset variant and log results to MLflow."""
179
+ experiment_name = f"{EXPERIMENT_NAME}_{variant}"
180
+ if not mlflow.get_experiment_by_name(experiment_name):
181
+ mlflow.create_experiment(experiment_name)
182
+ mlflow.set_experiment(experiment_name)
183
+
184
+ train_path = PROCESSED_DATA_DIR / variant / "train.csv"
185
+ run_name = f"{model_name}_{variant}"
186
+
187
+ logger.info(f"=== Training started (model={model_name}, variant={variant}) ===")
188
+
189
+ with mlflow.start_run(run_name=run_name):
190
+ train_df = load_split(train_path)
191
+
192
+ rawdata = mlflow.data.from_pandas(train_df, name=f"{DATASET_NAME}_{variant}")
193
+ mlflow.log_input(rawdata, context="training")
194
+
195
+ X_train = train_df.drop(columns=[TARGET_COL])
196
+ y_train = train_df[TARGET_COL].astype(int)
197
+
198
+ X_train, y_train = apply_random_oversampling(
199
+ X_train,
200
+ y_train,
201
+ model_name=model_name,
202
+ variant=variant,
203
+ )
204
+
205
+ estimator, param_grid = get_model_and_grid(model_name)
206
+ mlflow.set_tag("estimator_name", estimator.__class__.__name__)
207
+ mlflow.set_tag("data_variant", variant)
208
+ mlflow.log_param("data_variant", variant)
209
+
210
+ model_dir = MODELS_DIR / variant
211
+ reports_dir = REPORTS_DIR / variant / model_name
212
+ reports_dir.mkdir(parents=True, exist_ok=True)
213
+
214
+ best_model, grid, params = run_grid_search(
215
+ estimator,
216
+ param_grid,
217
+ X_train,
218
+ y_train,
219
+ model_name=model_name,
220
+ variant=variant,
221
+ reports_dir=reports_dir,
222
+ )
223
+ mlflow.log_params(params)
224
+
225
+ save_artifacts(
226
+ best_model,
227
+ grid,
228
+ X_train,
229
+ model_name=model_name,
230
+ variant=variant,
231
+ model_dir=model_dir,
232
+ reports_dir=reports_dir,
233
+ )
234
+
235
+ logger.success(f"=== Training completed (model={model_name}, variant={variant}) ===")
236
+
237
+
238
+ def main():
239
+ parser = argparse.ArgumentParser()
240
+ parser.add_argument(
241
+ "--variant",
242
+ type=str,
243
+ choices=VALID_VARIANTS,
244
+ required=True,
245
+ help="Data variant to use: all, female, male, or nosex.",
246
+ )
247
+ parser.add_argument(
248
+ "--model",
249
+ type=str,
250
+ choices=VALID_MODELS,
251
+ required=True,
252
+ help="Model to train: logreg, random_forest, or decision_tree.",
253
+ )
254
+ args = parser.parse_args()
255
+
256
+ dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
257
+ train(args.model, args.variant)
258
+
259
+
260
+ if __name__ == "__main__":
261
+ main()
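Training one (model, variant) pair runs GridSearchCV with stratified 5-fold CV and logs everything to MLflow via DagsHub, so credentials are required. A usage sketch, including loading the persisted best estimator afterwards:

```python
# Shell invocation:
#   python -m predicting_outcomes_in_heart_failure.modeling.train --variant nosex --model random_forest
import joblib

from predicting_outcomes_in_heart_failure.config import MODELS_DIR

# After training, the refit best estimator is persisted per variant:
model = joblib.load(MODELS_DIR / "nosex" / "random_forest.joblib")
print(type(model).__name__)
```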
pyproject.toml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["flit_core >=3.2,<4"]
3
+ build-backend = "flit_core.buildapi"
4
+
5
+ [project]
6
+ name = "predicting_outcomes_in_heart_failure"
7
+ version = "0.0.1"
8
+ description = "This project develops a predictive pipeline for patient outcome prediction in heart failure, using a publicly available dataset of clinical records. The goal is to design and evaluate machine learning models within a reproducible workflow that can be integrated into larger systems for clinical decision support. The workflow addresses data heterogeneity, defines consistent preprocessing and feature engineering strategies, and explores alternative modeling approaches with systematic evaluation using clinically relevant metrics. It also emphasizes model transparency and auditability, ensuring that the resulting pipeline can be deployed as a reliable, adaptable software component in healthcare applications. The project aims not only to improve baseline predictive performance but also to demonstrate how data-driven models can be effectively integrated into end-to-end AI-enabled healthcare systems."
9
+ authors = [
10
+ { name = "CardioTrack" },
11
+ ]
12
+
13
+ readme = "README.md"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+
17
+ ]
18
+ dependencies = [
19
+ "asttokens>=3.0.0",
20
+ "dagshub>=0.6.3",
21
+ "gradio>=6.0.2",
22
+ "great-expectations>=1.9.0",
23
+ "httpx>=0.28.1",
24
+ "imbalanced-learn>=0.14.0",
25
+ "ipykernel>=7.1.0",
26
+ "kagglehub>=0.3.13",
27
+ "loguru",
28
+ "matplotlib>=3.10.7",
29
+ "mkdocs",
30
+ "mlflow==2.22.0",
31
+ "numpy>=2.3.4",
32
+ "pandas>=2.3.3",
33
+ "pip",
34
+ "pytest",
35
+ "python-dotenv",
36
+ "ruff",
37
+ "scikit-learn>=1.7.2",
38
+ "seaborn>=0.13.2",
39
+ "shap>=0.50.0",
40
+ "tqdm",
41
+ "typer",
42
+ ]
43
+ requires-python = "~=3.11.0"
44
+
45
+
46
+ [tool.ruff]
47
+ target-version = "py311"
48
+ line-length = 99
49
+ src = [
50
+ "predicting_outcomes_in_heart_failure",
51
+ "tests",
52
+ ]
53
+ include = ["pyproject.toml", "predicting_outcomes_in_heart_failure/**/*.py", "tests/**/*.py",]
54
+ extend-exclude = [
55
+ ".git/", ".venv/", ".ruff_cache/", ".mypy_cache/",
56
+ "data/", "artifacts/", "mlruns/", "notebooks/.ipynb_checkpoints/",
57
+ ]
58
+
59
+
60
+ [tool.ruff.lint]
61
+ # Enable sets of rules: basic style, static errors, import, upgrade, bugbear, simplify
62
+ select = ["E", "F", "I", "UP", "B", "SIM"]
63
+ ignore = ["E203"] # compatibility with slicing and formatter
64
+ per-file-ignores = {"tests/**" = ["S101", "D"], "notebooks/**" = ["E402", "F401", "D"]}
65
+
66
+ [tool.ruff.lint.isort]
67
+ force-sort-within-sections = true
68
+
69
+ [tool.ruff.format]
70
+ quote-style = "double"
71
+ indent-style = "space"
72
+ docstring-code-format = true # format code inside docstrings
73
+
74
+
75
+ [dependency-groups]
76
+ dev = [
77
+ "pynblint>=0.1.6",
78
+ "ruff>=0.14.2",
79
+ ]
80
+