IamGrooooot commited on
Commit
e69d4e4
·
1 Parent(s): bcaf08f

Initial release: 72-hour COPD exacerbation prediction model

Browse files
Files changed (40) hide show
  1. .gitignore +150 -0
  2. README.md +212 -0
  3. pipeline.yml +29 -0
  4. requirements.txt +1 -0
  5. setup.cfg +7 -0
  6. training/README.MD +15 -0
  7. training/copd.py +644 -0
  8. training/create_sh_lookup_table.py +43 -0
  9. training/cross_validation.py +208 -0
  10. training/cross_validation_algorithms.py +109 -0
  11. training/cross_validation_calibration.py +260 -0
  12. training/cross_validation_comorbs.py +191 -0
  13. training/define_exacerbations_prologic.py +466 -0
  14. training/fitbit_exploration.py +144 -0
  15. training/lookups/README.MD +1 -0
  16. training/lookups/type_lookup.txt +116 -0
  17. training/prepare_test_data.py +271 -0
  18. training/prepare_train_data.py +295 -0
  19. training/prepare_train_data_crossval.py +331 -0
  20. training/tests/__init__.py +0 -0
  21. training/tests/test_apply_logic_response_criterion.py +82 -0
  22. training/tests/test_bin_numeric_column.py +36 -0
  23. training/tests/test_calculate_days_since_last_event.py +36 -0
  24. training/tests/test_define_hospital admission.py +27 -0
  25. training/tests/test_define_service_exac_event.py +47 -0
  26. training/tests/test_extract_clinician_verified_exacerbations.py +42 -0
  27. training/tests/test_filter_symptom_diary.py +31 -0
  28. training/tests/test_get_logic_exacerbation_indices.py +56 -0
  29. training/tests/test_get_rescue_med_pro_responses.py +29 -0
  30. training/tests/test_logic_consecutive_negative_responses.py +175 -0
  31. training/tests/test_minimum_period_between_exacerbations.py +42 -0
  32. training/tests/test_remove_data_between_exacerbations.py +32 -0
  33. training/tests/test_remove_unknown_date_exacerbations.py +36 -0
  34. training/tests/test_rolling_mean_previous_period.py +101 -0
  35. training/tests/test_rolling_sum_previous_period.py +89 -0
  36. training/tests/test_set_prediction_window.py +28 -0
  37. training/tests/test_set_pro_exac_dates.py +46 -0
  38. training/tests/test_triple_inhaler_therapy_service.py +95 -0
  39. training/tests/test_unit_lookup.py +31 -0
  40. training/train_test_split.py +129 -0
.gitignore ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Folders for model cohort data, training data plots and logs
2
+ data/
3
+ training/logs/
4
+
5
+ # mlflow
6
+ training/tmp
7
+ training/mlruns
8
+ training/mlruns.sqlite
9
+
10
+ # VS Code
11
+ .vscode/
12
+
13
+ # Byte-compiled / optimized / DLL files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+
18
+ # C extensions
19
+ *.so
20
+
21
+ # Distribution / packaging
22
+ .Python
23
+ build/
24
+ develop-eggs/
25
+ dist/
26
+ downloads/
27
+ eggs/
28
+ .eggs/
29
+ lib/
30
+ lib64/
31
+ parts/
32
+ sdist/
33
+ var/
34
+ wheels/
35
+ share/python-wheels/
36
+ *.egg-info/
37
+ .installed.cfg
38
+ *.egg
39
+ MANIFEST
40
+
41
+ # PyInstaller
42
+ # Usually these files are written by a python script from a template
43
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
44
+ *.manifest
45
+ *.spec
46
+
47
+ # Installer logs
48
+ pip-log.txt
49
+ pip-delete-this-directory.txt
50
+
51
+ # Unit test / coverage reports
52
+ htmlcov/
53
+ .tox/
54
+ .nox/
55
+ .coverage
56
+ .coverage.*
57
+ .cache
58
+ nosetests.xml
59
+ coverage.xml
60
+ *.cover
61
+ *.py,cover
62
+ .hypothesis/
63
+ .pytest_cache/
64
+ cover/
65
+
66
+ # Translations
67
+ *.mo
68
+ *.pot
69
+
70
+ # Django stuff:
71
+ *.log
72
+ local_settings.py
73
+ db.sqlite3
74
+ db.sqlite3-journal
75
+
76
+ # Flask stuff:
77
+ instance/
78
+ .webassets-cache
79
+
80
+ # Scrapy stuff:
81
+ .scrapy
82
+
83
+ # Sphinx documentation
84
+ docs/_build/
85
+
86
+ # PyBuilder
87
+ .pybuilder/
88
+ target/
89
+
90
+ # Jupyter Notebook
91
+ .ipynb_checkpoints
92
+
93
+ # IPython
94
+ profile_default/
95
+ ipython_config.py
96
+
97
+ # pyenv
98
+ # For a library or package, you might want to ignore these files since the code is
99
+ # intended to run in multiple environments; otherwise, check them in:
100
+ # .python-version
101
+
102
+ # pipenv
103
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
105
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
106
+ # install all needed dependencies.
107
+ #Pipfile.lock
108
+
109
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110
+ __pypackages__/
111
+
112
+ # Celery stuff
113
+ celerybeat-schedule
114
+ celerybeat.pid
115
+
116
+ # SageMath parsed files
117
+ *.sage.py
118
+
119
+ # Environments
120
+ .env
121
+ .venv
122
+ env/
123
+ venv/
124
+ ENV/
125
+ env.bak/
126
+ venv.bak/
127
+
128
+ # Spyder project settings
129
+ .spyderproject
130
+ .spyproject
131
+
132
+ # Rope project settings
133
+ .ropeproject
134
+
135
+ # mkdocs documentation
136
+ /site
137
+
138
+ # mypy
139
+ .mypy_cache/
140
+ .dmypy.json
141
+ dmypy.json
142
+
143
+ # Pyre type checker
144
+ .pyre/
145
+
146
+ # pytype static type analyzer
147
+ .pytype/
148
+
149
+ # Cython debug symbols
150
+ cython_debug/
README.md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ tags:
5
+ - healthcare
6
+ - ehr
7
+ - copd
8
+ - clinical-risk
9
+ - tabular
10
+ - scikit-learn
11
+ - xgboost
12
+ - lightgbm
13
+ pipeline_tag: tabular-classification
14
+ library_name: sklearn
15
+ ---
16
+
17
+ # COPD Open Models — Model C (72-Hour Exacerbation Prediction)
18
+
19
+ ## Model Details
20
+
21
+ Model C predicts the risk of a COPD exacerbation within **72 hours** using features derived from NHS EHR datasets and patient-reported outcomes (PROs). It includes a reproducible training/evaluation pipeline and runs on standard Python ML libraries (pandas, scikit-learn, imbalanced-learn, plus optional gradient-boosting libraries).
22
+
23
+ ### Key Characteristics
24
+
25
+ - **PRO LOGIC** — a clinically-informed validation algorithm that deduplicates and filters patient-reported exacerbation events (14-day minimum between episodes, consecutive negative rescue-medication responses required for borderline events, 7-day rescue-med prescription spacing).
26
+ - Compares **10 algorithms** with per-fold preprocessing to prevent data leakage.
27
+ - Training code is fully decoupled from cloud infrastructure — runs locally with no Azure dependencies.
28
+
29
+ > **Note:** This repository contains no real patient-level data. All included data files are synthetic or example data for pipeline validation.
30
+
31
+ ### Model Type
32
+
33
+ Traditional tabular ML classifiers (multiple candidate estimators; see "Training Procedure").
34
+
35
+ ### Release Notes
36
+
37
+ - **Phase 1 (current):** Models C, E, H published as the initial "COPD Open Models" collection.
38
+ - **Phase 2 (planned):** Additional models may follow after codebase sanitisation.
39
+
40
+ ---
41
+
42
+ ## Intended Use
43
+
44
+ This model and code are published as **reference implementations** for research, education, and benchmarking on COPD prediction tasks.
45
+
46
+ ### Intended Users
47
+
48
+ - ML practitioners exploring tabular healthcare ML pipelines
49
+ - Researchers comparing feature engineering and evaluation approaches
50
+ - Developers building internal prototypes (non-clinical)
51
+
52
+ ### Out-of-Scope Uses
53
+
54
+ - **Not** for clinical decision-making, triage, diagnosis, or treatment planning.
55
+ - **Not** a substitute for clinical judgement or validated clinical tools.
56
+ - Do **not** deploy in healthcare settings without an appropriate regulatory, clinical safety, and information governance framework.
57
+
58
+ ### Regulatory Considerations (SaMD)
59
+
60
+ Regulatory status for software depends on the intended purpose expressed in documentation, labelling, and promotional materials. Downstream users integrating or deploying this model should determine whether their implementation qualifies as Software as a Medical Device (SaMD) and identify the legal "manufacturer" responsible for compliance and post-market obligations.
61
+
62
+ ---
63
+
64
+ ## Training Data
65
+
66
+ - **Source:** NHS EHR-derived datasets and Lenus COPD Service PRO data (training performed on controlled datasets; not distributed here).
67
+ - **Data available in this repo:** Synthetic/example datasets only.
68
+ - **Cohort:** ~302 COPD patients (84 RECEIVER + 218 Scale-Up). Daily predictions per patient.
69
+ - **Train/test split:** 85% / 15%, stratified by exacerbation status and sex.
70
+ - **Class balance:** Exacerbation days are minority class (~5–10% positive).
71
+
72
+ ### Features (35 total)
73
+
74
+ | Category | Features |
75
+ |----------|----------|
76
+ | **Daily PROs** | CAT Q1–Q8, CAT Score, Symptom Diary Q1–Q3, plus 3-day rolling mean difference variants for each |
77
+ | **Weekly PROs** | Q5 (rescue meds), Q8 (phlegm difficulty), Q9 (phlegm consistency), Q10 (phlegm colour) — target-encoded |
78
+ | **Clinical** | Sex_F, RequiredAcuteNIV, RequiredICUAdmission, HighestEosinophilCount_0_3, TripleTherapy, AsthmaOverlap |
79
+ | **Categorical (target-encoded)** | SmokingStatus, Age (binned: <50 / 50-59 / 60-69 / 70-79 / 80+), FEV1PercentPredicted (Mild / Moderate / Severe / Very Severe), Comorbidities (None / 1-2 / 3+), DaysSinceLastExac (binned) |
80
+ | **Temporal** | ExacsPrevYear (rolling 365-day sum), AdmissionsPrevYear (rolling 365-day sum) |
81
+
82
+ ### Data Preprocessing
83
+
84
+ 1. **Target encoding** — applied per-fold using K-fold encoding on categorical features.
85
+ 2. **MinMax scaling** — all features scaled to [0, 1], fit on training fold only.
86
+ 3. **Median imputation** — missing values imputed per-fold using training fold medians.
87
+
88
+ ---
89
+
90
+ ## Training Procedure
91
+
92
+ ### Training Framework
93
+
94
+ - pandas, scikit-learn, imbalanced-learn
95
+ - Optional: xgboost, lightgbm, interpret (for EBM)
96
+ - Experiment tracking: MLflow
97
+
98
+ ### Algorithms Evaluated
99
+
100
+ | # | Algorithm | Library |
101
+ |---|-----------|---------|
102
+ | 1 | RandomForestClassifier | sklearn |
103
+ | 2 | RandomForestClassifier (class_weight='balanced') | sklearn |
104
+ | 3 | BalancedBaggingClassifier | imblearn |
105
+ | 4 | **BalancedRandomForestClassifier** | imblearn |
106
+ | 5 | XGBClassifier | xgboost |
107
+ | 6 | XGBClassifier (scale_pos_weight) | xgboost |
108
+ | 7 | LGBMClassifier | lightgbm |
109
+ | 8 | ExplainableBoostingClassifier | interpret |
110
+ | 9 | LogisticRegression | sklearn |
111
+ | 10 | LogisticRegression (class_weight='balanced') | sklearn |
112
+
113
+ ### Evaluation Design
114
+
115
+ - **5-fold** stratified cross-validation, balanced by class and grouped by patient.
116
+ - Per-fold preprocessing (encoding, scaling, imputation) to prevent data leakage.
117
+ - Decision thresholds evaluated at: **0.3, 0.4, 0.5, 0.6, 0.7, 0.8**.
118
+ - Calibration tested: **sigmoid** and **isotonic** methods via CalibratedClassifierCV.
119
+
120
+ ---
121
+
122
+ ## Evaluation Results
123
+
124
+ > Replace this section with measured results from your training run.
125
+
126
+ | Metric | Value | Notes |
127
+ |--------|-------|-------|
128
+ | ROC-AUC | TBD | Cross-validation mean (± std) |
129
+ | AUC-PR | TBD | Primary metric for imbalanced outcome |
130
+ | F1 Score | TBD | At threshold 0.5 |
131
+ | Balanced Accuracy | TBD | Cross-validation mean |
132
+ | Precision | TBD | At chosen threshold |
133
+ | Recall | TBD | At chosen threshold |
134
+ | Brier Score | TBD | Probability calibration quality |
135
+
136
+ ### Caveats on Metrics
137
+
138
+ - Performance depends heavily on cohort definition, feature availability, and label construction.
139
+ - Reported metrics from controlled datasets may not transfer to other settings without recalibration and validation.
140
+ - Exacerbation labels are constructed via PRO LOGIC — different event definitions will produce different results.
141
+
142
+ ---
143
+
144
+ ## Bias, Risks, and Limitations
145
+
146
+ - **Dataset shift:** EHR coding practices, care pathways, and population characteristics vary across sites and time periods.
147
+ - **Label uncertainty:** Exacerbations may be incompletely observed in routine data; PRO LOGIC filtering may not generalise to all clinical contexts.
148
+ - **Fairness:** Outcomes and feature availability may vary by age, sex, deprivation, comorbidity burden, or service access.
149
+ - **Misuse risk:** Using predictions to drive clinical action without clinical safety processes can cause harm through false positives and negatives.
150
+ - **Cohort size:** ~302 patients is relatively small; results should be interpreted with appropriate uncertainty.
151
+
152
+ ---
153
+
154
+ ## How to Use
155
+
156
+ ### Pipeline Execution Order
157
+
158
+ ```bash
159
+ # 1. Install dependencies
160
+ pip install pandas numpy scikit-learn imbalanced-learn xgboost lightgbm interpret mlflow matplotlib seaborn
161
+
162
+ # 2. Define exacerbations with PRO LOGIC
163
+ python training/define_exacerbations_prologic.py
164
+
165
+ # 3. Train/test split (85/15, stratified)
166
+ python training/train_test_split.py
167
+
168
+ # 4. Prepare training data (encode, scale, impute)
169
+ python training/prepare_train_data.py
170
+
171
+ # 5. Prepare cross-validation folds (per-fold preprocessing)
172
+ python training/prepare_train_data_crossval.py
173
+
174
+ # 6. Prepare test data (using training encodings)
175
+ python training/prepare_test_data.py
176
+
177
+ # 7. Compare algorithms via cross-validation
178
+ python training/cross_validation_algorithms.py
179
+
180
+ # 8. Train final model (BalancedRandomForestClassifier)
181
+ python training/cross_validation.py
182
+
183
+ # 9. Evaluate calibration methods
184
+ python training/cross_validation_calibration.py
185
+ ```
186
+
187
+ ### Adapting to Your Data
188
+
189
+ Replace the input data paths in `define_exacerbations_prologic.py` with your own EHR extract. The pipeline expects CSV files with columns for patient ID, dates, diagnoses, PRO responses, and pharmacy records.
190
+
191
+ ---
192
+
193
+ ## Environmental Impact
194
+
195
+ Training computational requirements are minimal — all models are traditional tabular ML classifiers running on CPU. A full cross-validation sweep across 10 algorithms completes in minutes on a standard laptop.
196
+
197
+ ---
198
+
199
+ ## Citation
200
+
201
+ If you use this model or code, please cite:
202
+
203
+ - This repository: *(add citation format / Zenodo DOI if minted)*
204
+ - Associated publications: *(clinical trial results paper — forthcoming)*
205
+
206
+ ## Authors and Contributors
207
+
208
+ - **Storm ID** (maintainers)
209
+
210
+ ## License
211
+
212
+ This model and code are released under the **Apache 2.0** license.
pipeline.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ trigger:
2
+ branches:
3
+ include:
4
+ - main
5
+ - release/*
6
+
7
+ jobs:
8
+ - job: 'build'
9
+ pool:
10
+ vmImage: 'ubuntu-latest'
11
+
12
+ steps:
13
+ - task: UsePythonVersion@0
14
+ inputs:
15
+ versionSpec: '3.8'
16
+ architecture: 'x64'
17
+ displayName: 'Specify Python version'
18
+
19
+ - script: |
20
+ python -m pip install --upgrade pip
21
+ displayName: 'Install pip'
22
+
23
+ - script: |
24
+ pip install -r requirements.txt
25
+ displayName: 'Install CI dependencies'
26
+
27
+ - script: |
28
+ flake8
29
+ displayName: 'Run linting'
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ flake8
setup.cfg ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [tool:pytest]
2
+ filterwarnings =
3
+ ignore::DeprecationWarning
4
+ [flake8]
5
+ ignore = E501,W293,W292,W504
6
+ exclude = .git,__pycache__,docs/source/conf.py,old,build,dist
7
+ max-complexity = 10
training/README.MD ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * Community exacerbations confirmed to 16/03/2021
2
+ * Hospital exacerbations confirmed to 31/08/2021
3
+ * How to use community events data post 16/03/2021
4
+ * Option 1: Use them
5
+ * Option 2: Don't use them and discard patient data for approx. 1 month around that event
6
+ * Option 3: Attempt to verify them in a more automated way (looking at prescribing data etc)
7
+
8
+
9
+ * Rescue meds bnf codes
10
+ steroid_codes = ['0603020T0AAACAC','0603020T0AABKBK', '0603020T0AAAXAX',
11
+ '0603020T0AAAGAG','0603020T0AABHBH','0603020T0AAACAC','0603020T0AABKBK',
12
+ '0603020T0AABNBN', '0603020T0AAAGAG', '0603020T0AABHBH']
13
+
14
+ antib_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
15
+ '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD', '0501013K0AAAJAJ']
training/copd.py ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing code for model C (exacerbation prediction)."""
2
+ import numpy as np
3
+ import pandas as pd
4
+ from lenusml import encoding
5
+
6
+
7
def apply_logic_response_criterion(df, N=2, minimum_period=14, maximum_period=35):
    """
    Apply PRO LOGIC criterion 2 (consecutive negative Q5 replies required between events).

    For events that occur after the minimum required period following a previous exac,
    e.g. longer than 14 days, but before they are automatically considered as a new exac
    event, e.g. 35 days, PRO LOGIC considers weekly PRO responses between the two events.
    For subsequent events to count as separate events, there must be at least N
    consecutive negative responses (no rescue meds taken) to weekly PROs between each
    positive reply. Note PRO LOGIC is applied to both hospital and patient reported events.

    Args:
        df (pd.DataFrame): must contain columns for PatientId, DateOfEvent, Q5Answered,
            NegativeQ5, IsExac and DaysSinceLastExac.
        N (int): number of consecutive negative weekly Q5 responses required between
            two events for the later one to count as a separate exacerbation.
            Default is 2.
        minimum_period (int): minimum number of days since the previous exac (any exacs
            within this window will already be removed with PRO LOGIC criterion 1).
            Default value is 14 days.
        maximum_period (int): maximum number of days since the previous exac (any exacs
            occurring after this period will automatically count as a separate event).
            Default is 35 days.

    Returns:
        pd.DataFrame: input df with a new boolean column 'RemoveExac'.

    """
    # Retrieve dataframe indices of exacs falling under PRO LOGIC criterion 2 (Q5 replies)
    indices = get_logic_exacerbation_indices(df, minimum_period=minimum_period,
                                             maximum_period=maximum_period)
    # Evaluate the criterion for each candidate exac: 1 = remove, 0 = keep
    remove_flags = [logic_consecutive_negative_responses(df, exac_index, N)
                    for exac_index in indices]
    # Create dataframe containing exac indices and a boolean column stating whether to
    # remove that exac due to failing Q5 response criterion and merge with original df
    remove_exac = pd.DataFrame({'ind': indices, 'RemoveExac': remove_flags})
    df = df.merge(remove_exac.set_index('ind'), left_index=True, right_index=True,
                  how='left')
    return df
45
+
46
+
47
def bin_numeric_column(*, col, bins, labels):
    """
    Bin a numeric dataframe column into labelled string categories.

    Intervals are left-closed / right-open (pd.cut with right=False), and the
    categorical result is cast to plain strings.

    Args:
        col (pd.Series): dataframe column to be binned.
        bins (list): numeric bin edges.
        labels (list): labels for the bins (one fewer than the number of edges).

    Returns:
        pd.Series: binned column as strings.
    """
    binned = pd.cut(col, bins=bins, labels=labels, right=False)
    return binned.astype('str')
60
+
61
+
62
def calculate_days_since_last_event(*, df, event_col, output_col):
    """
    Calculate the days since the last event, e.g. exacerbation or rescue med prescription.

    Restarts the count from one the day following an event. Any days without a
    previous event have the output column set to -1.

    NOTE(review): assumes df has one row per day in date order with a contiguous
    integer index, so index arithmetic equals day arithmetic — confirm against callers.

    Args:
        df (pd.DataFrame): dataframe with a column containing dates and a boolean column
            stating whether an event occurred on that date.
        event_col (str): name of the boolean column for whether an event occurred.
        output_col (str): name of the new column to hold the day counts.

    Returns:
        pd.DataFrame: the input dataframe with an additional column stating the number
            of days since the previous event occurred (or -1 if no previous event).

    """
    # Get all events and remember each event's own row index in 'PrevEvent'
    all_events = df[df[event_col].eq(1)].copy()
    all_events['PrevEvent'] = all_events.index
    # Merge the full df with the event df on their indices to the closest date in the past
    # i.e. the most recent exacerbation
    df = pd.merge_asof(df, all_events['PrevEvent'],
                       left_index=True, right_index=True,
                       direction='backward')
    # Calculate the days since the previous event, restarting the count from 1 the
    # day following an exacerbation. The shift(1) makes the event day itself still
    # reference the event before it rather than itself.
    df[output_col] = df.index - df['PrevEvent'].shift(1)
    # Set to -1 for any rows without a prior event
    df[output_col] = df[output_col].fillna(-1).astype('int64')
    df = df.drop(columns=['PrevEvent'])
    return df
94
+
95
+
96
def calculate_diff_from_rolling_mean(*, df, cols):
    """
    Add difference-from-rolling-mean columns for the given features.

    For each name in cols, expects a matching '<col>_ave' column (the rolling mean)
    to already exist and writes a new '<col>_diff' column containing the current
    value minus that rolling mean.

    Args:
        df (pd.DataFrame): must contain each col and its '<col>_ave' counterpart.
        cols (list): names of columns to difference against their rolling means.

    Returns:
        pd.DataFrame: input df with the '<col>_diff' columns added.
    """
    for col in cols:
        # Positive values mean the current reading is above the recent average
        df[col + '_diff'] = df[col] - df[col + '_ave']
    return df
100
+
101
+
102
def extract_clinician_verified_exacerbations(df):
    """
    Extract verified events from clinician verification spreadsheets.

    Keep only rows that clinicians confirmed as exacerbations. Where a corrected
    event date was supplied it becomes the event date (considered the true date);
    otherwise the PRO response date is used and the row is flagged as having an
    unknown true date.

    Args:
        df (pd.DataFrame): event verification data supplied by clinicians.

    Returns:
        pd.DataFrame: contains StudyId, DateOfEvent (a mix of true event dates and PRO
            response dates if true dates unknown), IsCommExac (set to 1 here, used
            after merging later) and ExacDateUnknown (boolean, 1 if clinicians did not
            change the date).

    """
    # Keep only clinician-confirmed events
    verified = df[df['Exacerbation confirmed'] == 1].copy()
    # Normalise both candidate date columns to midnight UTC
    verified['DateRecorded'] = pd.to_datetime(verified.DateRecorded, utc=True).dt.normalize()
    verified['New Date'] = pd.to_datetime(verified['New Date'], utc=True).dt.normalize()
    # The clinician-supplied date takes precedence where one was entered; otherwise
    # fall back to the PRO response date and flag the true date as unknown
    date_changed = verified['Date changed'] == 1
    verified['DateOfEvent'] = np.where(date_changed, verified['New Date'],
                                       verified['DateRecorded'])
    verified['ExacDateUnknown'] = np.int64(np.where(date_changed, 0, 1))
    # Flag all events as community events (this df will merge with hospital events later)
    verified['IsCommExac'] = 1
    return verified[['StudyId', 'DateOfEvent', 'IsCommExac', 'ExacDateUnknown']]
134
+
135
+
136
def define_hospital_admission(events):
    """
    Define whether a COPD service event was an admission and return 1 (yes) or 0 (no).

    Args:
        events (pd.Series): COPD service event names (e.g. the EventName column after
            merging events with PatientEventTypes.txt). Must support .isin().

    Returns:
        np.ndarray: 1 where the event was a hospital admission, 0 otherwise.

    """
    # Both COPD-related and COPD-unrelated emergency admissions count as admissions
    hospital_event_names = ['Hospital admission - emergency, COPD related',
                            'Hospital admission - emergency, COPD unrelated']
    return np.where(events.isin(hospital_event_names), 1, 0)
152
+
153
+
154
def define_service_exac_event(*, events, event_name_col='EventName',
                              include_community=False):
    """State if a COPD service event was an exacerbation and return 1 (yes) or 0 (no).

    Args:
        events (pd.Series): COPD service event names (e.g. the EventName column after
            merging events with PatientEventTypes.txt). Must support .isin().
        event_name_col (str): name of the column containing COPD service event names.
            Currently unused; retained for interface compatibility with callers.
        include_community (bool): whether to include event types corresponding to
            patient reported exacerbations (e.g. community managed with rescue meds).
            Defaults to False.

    Returns:
        np.ndarray: 1 where the event was an exacerbation, 0 otherwise.

    """
    # Event types always counted as exacerbations
    exacerbation_event_names = ['Hospital admission - emergency, COPD related',
                                'GP review - emergency, COPD related',
                                'Emergency department attendance, COPD related',
                                'Exacerbation - started abs/steroid by clinical team']
    # Optionally also count patient-reported (community-managed) exacerbations
    if include_community is True:
        exacerbation_event_names.append('Exacerbation - self-managed with rescue pack')
    return np.where(events.isin(exacerbation_event_names), 1, 0)
182
+
183
+
184
def fill_column_by_patient(*, df, id_col, col):
    """
    Forward and back fill data by patient to fill gaps, e.g. from merges.

    Args:
        df (pd.DataFrame): patient data. Must contain col and id_col columns.
        id_col (str): name of column containing unique patient identifiers.
        col (str): name of column to be filled.

    Returns:
        pd.DataFrame: input data with col infilled (the column is also modified
            in place on the passed-in frame).
    """
    # transform (rather than apply) guarantees the result is aligned with df's
    # original index, so the assignment is safe across pandas versions
    df[col] = df.groupby(id_col)[col].transform(lambda x: x.ffill().bfill())
    return df
198
+
199
+
200
def filter_symptom_diary(*, df, patients, date_cutoff=None):
    """
    Filter COPD symptom diary data for patients and dates of interest.

    Args:
        df (pd.DataFrame): symptom diary data. Must contain 'SubmissionTime' and
            'PatientId' columns.
        patients (list): patient IDs of interest.
        date_cutoff (datetime-like, optional): if given, keep only submissions on or
            after this date (e.g. the date the weekly Q5 question changed). Must be
            comparable with tz-aware UTC timestamps. Defaults to None (no cutoff).

    Returns:
        pd.DataFrame: filtered symptom diary.
    """
    # Normalise submission timestamps to midnight UTC so comparisons are date-based
    df['SubmissionTime'] = pd.to_datetime(df.SubmissionTime, utc=True).dt.normalize()
    # Take only data from after the cutoff if provided (explicit None check so a
    # falsy-but-valid cutoff is not silently ignored)
    if date_cutoff is not None:
        df = df[df.SubmissionTime >= date_cutoff]
    # Filter for patients of interest
    df = df[df.PatientId.isin(patients)]
    return df
219
+
220
+
221
def get_logic_exacerbation_indices(df, minimum_period=14, maximum_period=35):
    """
    Return dataframe indices of exacs that need checking for PRO responses since last exac.

    Get the indices of exacerbations that occur long enough after the previous event to
    not be removed by PRO LOGIC criterion 1 (e.g. within 14 days of previous exac) but
    not long enough after to be counted as a separate event without further analysis.
    Called by apply_logic_response_criterion.

    Args:
        df (pd.DataFrame): must contain IsExac and DaysSinceLastExac columns.
        minimum_period (int): minimum number of days since the previous exac (any exacs
            within this window will already be removed with PRO LOGIC criterion 1).
            Default value is 14 days.
        maximum_period (int): maximum number of days since the previous exac (any exacs
            occurring after this period will automatically count as a separate event).
            Default is 35 days.

    Returns:
        list: dataframe indices of relevant events.
    """
    # Exacerbations falling in the ambiguous window (minimum_period, maximum_period]
    in_window = ((df.DaysSinceLastExac > minimum_period) &
                 (df.DaysSinceLastExac <= maximum_period))
    candidates = df[df.IsExac.eq(1) & in_window]
    return candidates.index.to_list()
246
+
247
+
248
def get_rescue_med_pro_responses(df):
    """Extract all responses to weekly PRO Q5 (rescue meds).

    Add new boolean columns stating if Q5 was answered, whether it was a negative
    response (no rescue meds taken in previous week) and whether the reply implies a
    community exacerbation. The latter two columns are always opposites of each other.

    Args:
        df (pd.DataFrame): PRO symptom diary responses.

    Returns:
        pd.DataFrame: filtered weekly PROs with additional boolean columns Q5Answered,
            NegativeQ5 and IsCommExac.

    """
    # Keep only rows where Q5 was actually answered
    answered = df[df.SymptomDiaryQ5.notna()].copy()
    answered['SymptomDiaryQ5'] = answered['SymptomDiaryQ5'].astype('int64')
    # Every remaining row is, by construction, an answered Q5
    answered['Q5Answered'] = 1
    # Negative response = no rescue meds taken in the previous week
    answered['NegativeQ5'] = answered.SymptomDiaryQ5.eq(0).astype('int64')
    # Positive response to Q5 defines a community exacerbation
    answered['IsCommExac'] = answered.SymptomDiaryQ5.eq(1).astype('int64')
    return answered
272
+
273
+
274
def logic_consecutive_negative_responses(df, i, N=2):
    """
    Calculate number of consecutive -ve Q5 replies since previous exac (PRO LOGIC).

    Given the dataframe index of the current exac identified as falling under the Q5
    criterion, calculate the number of negative replies to the weekly rescue med question
    and check if there are enough for the event to count as distinct from the previous.
    Called by apply_logic_response_criterion.

    NOTE(review): the positional slice below assumes one row per day, so that a
    positional offset equals a day offset — confirm against the caller's data prep.

    Args:
        df (pd.DataFrame): must contain weekly PRO replies and output from
            get_rescue_med_pro_responses, set_pro_exac_dates and
            calculate_days_since_exacerbation.
        i (int): index of exac of interest.
        N (int): number of consecutive negative rescue meds required for event to be
            counted as a separate event and retained in data. Default is 2.

    Returns:
        int: flag for whether the exac failed the criterion. Returns 1 for failed (exac to
            be removed) and 0 for passed (exac to be retained).

    """
    # Select data since the previous exacerbation (exclusive of both event days)
    days = int(df.iloc[i].DaysSinceLastExac)
    data = df.iloc[i - days + 1: i]

    # Select replies to Q5
    data = data[data.Q5Answered.eq(1)][['PatientId', 'DateOfEvent', 'Q5Answered',
                                        'NegativeQ5']]
    # Fewer than N replies at all cannot contain N consecutive negatives -> remove
    if len(data) < N:
        return 1
    else:
        # Resample to 7 days (weekly) to account for missing responses. Resampling using
        # the 'W' option can give spurious nans - use '7D' instead
        data = data.set_index('DateOfEvent').resample('7D',
                                                      origin='start').sum().reset_index()
        # Longest run of consecutive negative replies to Q5 (no rescue meds taken):
        # group the negative rows by the running count of positives (each positive
        # reply starts a new group) and take the largest group sum
        consecutive_negative_responses = data[data.NegativeQ5.eq(1)][
            'NegativeQ5'].groupby(data.NegativeQ5.eq(0).cumsum()).sum().reset_index(
            drop=True).max()

        return 1 if consecutive_negative_responses < N else 0
317
+
318
+
319
def minimum_period_between_exacerbations(df, minimum_days=14):
    """
    Flag exacerbations occurring too soon after the previous one.

    Uses DaysSinceLastExac: an event is flagged (1) when it falls within
    minimum_days of the patient's previous exacerbation, otherwise 0.

    Args:
        df (pd.DataFrame): must contain a DaysSinceLastExac column.
        minimum_days (int): minimum allowed gap between events in days.
            Default is 14.

    Returns:
        array: 1 where the event occurred too soon, otherwise 0.
    """
    gap = df['DaysSinceLastExac']
    too_soon = gap.gt(0) & gap.le(minimum_days)
    return np.where(too_soon, 1, 0)
334
+
335
+
336
def remove_data_between_exacerbations(df):
    """
    Flag for removal the days between a first exac and failed follow-on exacs.

    Ensures only the first in a series of related events is counted. Any
    subsequent exacerbation that occurred too close to the initial event
    without enough negative weekly PRO responses in between (RemoveExac == 1)
    causes every row from the day after the first event up to and including
    the failed event to be flagged in RemoveRow. Data following the final
    event in the series is handled by minimum_period_between_exacerbations.

    Args:
        df (pd.DataFrame): must contain RemoveExac and DaysSinceLastExac
            columns; assumes a default integer index matching row positions.

    Returns:
        pd.DataFrame: input dataframe with RemoveRow set to 1 over the spans
        between a first event and any subsequent event(s) that failed the Q5
        criterion.
    """
    # Exacerbations that failed the N-consecutive-negative-responses
    # criterion on Q5 of the weekly PROs (rescue meds)
    for failed_index in df.index[df.RemoveExac.eq(1)]:
        # Span back to the previous exacerbation
        gap = int(df.iloc[failed_index].DaysSinceLastExac)
        # Flag everything after the previous event up to and including this one
        df.loc[failed_index - gap + 1: failed_index, 'RemoveRow'] = 1
    return df
365
+
366
+
367
def remove_unknown_date_exacerbations(df, days_to_remove=7):
    """
    Flag for removal data leading up to an exacerbation with an unknown date.

    Args:
        df (pd.DataFrame): one row per day per patient for the full data
            window. Must include ExacDateUnknown and RemoveRow columns;
            assumes a default integer index matching row positions.
        days_to_remove (int): number of days of data to remove leading up to
            (and including) the PRO response date. Default is 7 days.

    Returns:
        pd.DataFrame: input dataframe with updated RemoveRow column.
    """
    # Exacerbations whose dates are flagged as unknown
    for unknown_index in df.index[df.ExacDateUnknown.eq(1)]:
        # Flag the preceding days_to_remove days, ending at the response date
        df.loc[unknown_index - days_to_remove + 1: unknown_index,
               'RemoveRow'] = 1
    return df
390
+
391
+
392
def rolling_mean_previous_period(*, df, cols, date_col, id_col, window):
    """
    Resample each patient's data to daily and compute rolling means over window.

    Uses daily resampling due to strange behaviour with weekly/yearly
    resampling when calculating rolling quantities with missing/NaN entries.
    The rolling mean is shifted by one day so that each date holds the mean
    of the *previous* period (e.g. a 365-day mean covers the prior 365 days
    and excludes the current date), keeping the current exacerbation or
    hospital admission out of its own aggregate.

    Args:
        df (pd.DataFrame): data of interest. Must contain the specified cols,
            date_col and id_col columns.
        cols (list of str): names of columns on which to calculate the
            rolling mean.
        date_col (str): name of the date column (set as index for
            aggregation).
        id_col (str): name of column containing unique patient identifiers.
        window (int): length of rolling window in days. Use window = 7 for a
            weekly mean and window = 365 for a yearly mean.

    Returns:
        pd.DataFrame: one row per patient per day with the rolling mean
        columns renamed with an '_ave' suffix (first day per patient is NaN).
    """
    # Resample to daily records per patient (copy keeps the caller's frame
    # untouched)
    daily = (df.copy()
             .set_index(date_col)
             .groupby(id_col)[cols]
             .resample('D')
             .mean()
             .reset_index())
    # Rolling mean over the specified window (in days)
    averages = (daily.set_index(date_col)
                .groupby(id_col)[cols]
                .rolling(window=window, min_periods=1)
                .mean()
                .reset_index())
    # Shift forward one day so the current date is excluded from its own mean
    averages[cols] = averages.groupby(id_col)[cols].shift(1)
    # Suffix marks the columns as aggregations
    return averages.rename(
        columns={name: name + '_ave' for name in averages.columns
                 if name in cols})
431
+
432
+
433
def rolling_sum_previous_period(*, df, col, date_col, id_col, window, output_col):
    """
    Resample each patient's data to daily and compute a rolling sum over window.

    Uses daily resampling due to strange behaviour with weekly/yearly
    resampling when calculating rolling quantities with missing/NaN entries.
    The rolling sum is shifted by one day so that each date holds the sum of
    the *previous* period (e.g. a 365-day sum covers the prior 365 days and
    excludes the current date), keeping the current exacerbation or hospital
    admission out of its own count.

    Args:
        df (pd.DataFrame): data of interest. Must contain the specified col,
            date_col and id_col columns.
        col (str): name of column on which to calculate the rolling sum.
        date_col (str): name of the date column (set as index for
            aggregation).
        id_col (str): name of column containing unique patient identifiers.
        window (int): length of rolling window in days. Use window = 7 for
            weekly sums and window = 365 for yearly sums.
        output_col (str): name of the rolling sum column in the output.

    Returns:
        pd.DataFrame: input dataframe with the rolling sum column merged on
        as output_col (int64, 0 where no previous data exists).
    """
    # Resample to daily records per patient (copy keeps the caller's frame
    # untouched)
    daily = (df.copy()
             .set_index(date_col)
             .groupby(id_col)[col]
             .resample('D')
             .sum()
             .reset_index())
    # Rolling sum over the specified window (in days)
    sums = (daily.set_index(date_col)
            .groupby(id_col)[col]
            .rolling(window=window, min_periods=1)
            .sum()
            .reset_index())
    # Shift forward one day so the current date is excluded from its own sum
    sums[col] = sums.groupby(id_col)[col].shift(1).fillna(0)
    sums = sums.rename(columns={col: output_col})
    # Attach the aggregate back onto the original frame
    merged = df.merge(sums, on=[id_col, date_col], how='left')
    merged[output_col] = merged[output_col].astype('int64')
    return merged
474
+
475
+
476
def set_prediction_window(*, df, prediction_window):
    """
    Extend the IsExac label to cover an N-day prediction window.

    For a window of N days (e.g. N = 3), the N-1 days before each recorded
    exacerbation date also receive IsExac = 1, so the label means "an
    exacerbation occurs within N days of this date" rather than on the exact
    date only.

    Args:
        df (pd.DataFrame): must contain an IsExac column holding the final
            list of exacerbation events to be used for modelling; assumes a
            default integer index matching row positions.
        prediction_window (int): length of model prediction window in days.

    Returns:
        pd.DataFrame: input dataframe with the extended exacerbation window.
    """
    # Index selection is evaluated once, before any labels are modified
    for exac_index in df.index[df.IsExac.eq(1)]:
        # Label the window leading up to and including the event date
        df.loc[exac_index - prediction_window + 1: exac_index, 'IsExac'] = 1
    return df
502
+
503
+
504
+ def set_pro_exac_dates(df):
505
+ """
506
+ Set date of community exacerbations reported in weekly PROs Q5 and flag unknown dates.
507
+
508
+ Args:
509
+ df (pd.DataFrame: processed weekly PROs Q5 respnses, e.g. output of
510
+ get_rescue_med_pro_responses
511
+
512
+ Returns:
513
+ pd.DataFrame: input dataframe with additional columns for DateOfEvent (datetime)
514
+ and ExacDateUnknown (0 or 1).
515
+ """
516
+ # Take known exacerbation (rescue med) dates from SymptomDiaryQ11b, otherwise set the
517
+ # date to the date of PRO response
518
+ df['DateOfEvent'] = np.where(df.SymptomDiaryQ11a == 2, df.SymptomDiaryQ11b,
519
+ df.SubmissionTime)
520
+ # Flag which dates were unknown from the PRO response
521
+ df['ExacDateUnknown'] = np.int64(np.where((df.IsCommExac == 1) &
522
+ (df.SymptomDiaryQ11a != 2), 1, 0))
523
+ df['DateOfEvent'] = pd.to_datetime(df.DateOfEvent, utc=True).dt.normalize()
524
+ df = df.drop_duplicates(keep='last', subset=['PatientId', 'DateOfEvent'])
525
+ return df
526
+
527
+
528
def triple_inhaler_therapy_service(*, df, id_col, inhaler_col, include_mitt=False):
    """
    Create a boolean (1/0) feature for triple inhaler therapy per patient.

    Single Inhaler Triple Therapy (SITT) is always included ('LAMA +LABA-ICS'
    or 'LABA-LAMA-ICS'); Multiple Inhaler Triple Therapy (MITT, the
    combination 'LAMA' + 'LABA-ICS') is included only when requested.

    Args:
        df (pd.DataFrame): dataframe containing list of inhaler names against
            patient IDs.
        id_col (str): name of patient ID column.
        inhaler_col (str): name of column containing inhaler types in the
            format of the COPD service data, e.g. LAMA, LABA, LABA-LAMA-ICS,
            LAMA +LABA-ICS etc.
        include_mitt (bool): whether to also count Multiple Inhaler Triple
            Therapy (MITT).

    Returns:
        pd.DataFrame: one row per patient with id_col and a TripleTherapy
        (1/0) column.
    """
    # One row per patient with a column per inhaler type they are taking
    wide = (df.drop_duplicates()
              .pivot(index=id_col, columns=inhaler_col, values=inhaler_col)
              .reset_index()
              .rename_axis(None, axis=1))
    # Ensure every service inhaler type has a column, even if absent from the
    # data, so the selections below never raise
    service_types = ['LABA-LAMA', 'LAMA', 'LABA-ICS', 'LAMA +LABA-ICS',
                     'LABA-LAMA-ICS', 'LABA']
    for inhaler_type in service_types:
        if inhaler_type not in wide:
            wide[inhaler_type] = np.nan
    # SITT: either single-inhaler triple therapy product
    sitt = wide['LABA-LAMA-ICS'].notna() | wide['LAMA +LABA-ICS'].notna()
    wide['TripleTherapy'] = np.int64(np.where(sitt, 1, 0))
    if include_mitt is True:
        # MITT: separate LAMA plus LABA-ICS inhalers
        mitt = wide['LAMA'].notna() & wide['LABA-ICS'].notna()
        wide['TripleTherapy'] = np.int64(
            np.where(mitt, 1, wide['TripleTherapy']))
    return wide[[id_col, 'TripleTherapy']]
566
+
567
+
568
def unit_lookup(units):
    """Convert Lenus platform unit codes to human readable units.

    Args:
        units (pd.Series): Lenus platform unit codes for a measurement.

    Returns:
        array: human readable measurement units; any code not in the lookup
        table becomes 'Undefined'.
    """
    # Mapping from Lenus platform unit code to description
    units_lookup = {0: 'Count',
                    1: 'CountPerSecond',
                    2: 'InternationalUnit',
                    3: 'Joule',
                    4: 'Kelvin',
                    5: 'Kilogram',
                    6: 'KilogramPerLiter',
                    7: 'KilogramPerSquareMeter',
                    8: 'Liter',
                    9: 'LiterPerKilogramSecond',
                    10: 'LiterPerSecond',
                    11: 'Meter',
                    12: 'Pascal',
                    13: 'Percent',
                    14: 'Second',
                    15: 'Siemen',
                    16: 'Undefined'}
    # Codes outside the table map to NaN, which becomes 'Undefined'
    translated = units.map(units_lookup)
    return np.where(translated.notna(), translated, 'Undefined')
600
+
601
+
602
def kfold_encode_train_data(*, df, id_col, fold_patients, cols_to_encode, target):
    """
    K-fold target encoding of train data.

    Fold by fold target encoding of train data is used to prevent data
    leakage in cross-validation (the same folds are used for encoding and
    CV). For example, in 10-fold target encoding, each fold is encoded using
    the other nine folds and that fold is then used as the validation fold
    in CV. (The complete train data set is used to target encode the holdout
    test data set, outside this function.)

    Parameters
    ----------
    df : dataframe
        complete train data set to be encoded.
    id_col : str
        name of the patient ID column used to assign rows to folds.
    fold_patients : iterable of array-like
        patient IDs belonging to each of the K folds.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target : str
        name of the target variable column.

    Returns
    -------
    df_encoded : dataframe
        train data with cols_to_encode target encoded fold by fold.
    """
    # Encode the train data fold by fold
    appended_data = []
    for i, fold in enumerate(fold_patients):
        print("Fold ", i)
        # Rows for patients in this fold are encoded using the remaining folds
        val_fold_data = df[df[id_col].isin(fold)]
        train_fold_data = df[~df[id_col].isin(fold)]
        encoded_fold_data, encodings = encoding.encode_validation_fold(
            val_fold=val_fold_data, train_folds=train_fold_data,
            cols_to_encode=cols_to_encode, target=target)
        appended_data.append(encoded_fold_data)
    # Reconstruct the full dataframe from the encoded folds
    df_encoded = pd.concat(appended_data)
    df_encoded.reset_index(inplace=True, drop=True)
    return df_encoded
training/create_sh_lookup_table.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build the SafeHaven ID to StudyId lookup table and sanity check it."""
import os

import pandas as pd

data_dir = '<YOUR_DATA_PATH>/'

# Read lookups for RECEIVER
# receiver = pd.read_csv(os.path.join(data_dir, 'Receiver_IDs', 'COHORT_CONSENTED_2.csv'))
# receiver = receiver.rename(columns={'Study number': 'StudyId'})
# receiver = receiver[['SafeHavenID', 'StudyId']]
receiver = pd.read_csv(os.path.join(data_dir, 'Cohort3Rand.csv'))
receiver = receiver.rename(columns={'RNo': 'StudyId'})

# Read lookups for scale up
scaleup = pd.read_csv(os.path.join(data_dir, 'SU_IDs', 'Scale_Up_lookup.csv'))
scaleup = scaleup.rename(columns={'Study_Number': 'StudyId'})

# Concatenate tables and drop missing SH IDs (some study patients not in data extract)
all_patients = pd.concat([receiver, scaleup]).dropna()

# Save final mapping between StudyId and SafeHavenID
all_patients.to_pickle(os.path.join(data_dir, 'sh_to_studyid_mapping.pkl'))

# Check for matching age and sex between SafeHaven and Lenus data (mapping sanity check)
lenus_demographics = pd.read_csv(os.path.join(data_dir, 'copd-dataset',
                                              'CopdDatasetPatientDetails.txt'),
                                 usecols=['StudyId', 'DateOfBirth', 'Sex'], sep='|')
sh_demographics = pd.read_csv(os.path.join(data_dir, 'EXAMPLE_STUDY_DATA',
                                           'Demographics_Cohort4.csv'),
                              usecols=['SafeHavenID', 'SEX', 'OBF_DOB'])

sh_demographics['OBF_DOB'] = pd.to_datetime(
    sh_demographics['OBF_DOB'], utc=True).dt.normalize()

mapping = all_patients.merge(sh_demographics, on='SafeHavenID', how='inner')
mapping = mapping.merge(lenus_demographics, on='StudyId', how='inner')

# Check patient sex matches. These checks were bare REPL-style expressions
# whose results were silently discarded when run as a script; print them so
# the sanity checks are actually visible.
print(mapping[mapping.SEX != mapping.Sex])
# There is one mismatch - inspect duplicated SafeHaven IDs
print(all_patients[all_patients.duplicated(subset='SafeHavenID')])

# Check patient DOB matches
print(mapping[mapping.OBF_DOB != mapping.DateOfBirth])
training/cross_validation.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform CV (with explainability) on different feature sets and log to mlflow.
2
+
3
+ Includes functionality to nest runs under parent run (e.g. different feature sets
4
+ under a main run) and set a decision threshold for model scores. Logs the following
5
+ artifacts as well as metrics and parameters:
6
+ 1. List of model features
7
+ 2. Feature correlation matrix
8
+ 3. Global explainability (averaged over K folds)
9
+ 4. Cumulative gains curve
10
+ 5. Lift curve
11
+ 6. Probability distributions with KDE
12
+ """
13
+ from imblearn.ensemble import BalancedRandomForestClassifier
14
+ from lenusml import splits, crossvalidation, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+ from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
19
+ import mlflow
20
+ import matplotlib.pyplot as plt
21
+ # from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
22
+
23
+
24
def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Build a dataframe of mean global feature importance across CV estimators.

    Args:
        feature_names (list): list of model feature names.
        crossval (dict): output of cross_validation_return_estimator_and_scores
            (fitted estimators are read from its 'estimator' entry).

    Returns:
        pd.DataFrame: feature names, global importance for each of the K
        estimators (Score_i columns), the mean importance across estimators
        and the mean scaled relative to the most important feature.
    """
    explanations_all = None
    # Collect global importance from each estimator used in cross validation
    for i, estimator in enumerate(crossval['estimator']):
        importances = pd.DataFrame([feature_names,
                                    estimator.feature_importances_]).T
        importances.columns = ['Feature', 'Score_{}'.format(i)]
        # Accumulate a wide frame with one score column per estimator
        if explanations_all is None:
            explanations_all = importances.copy()
        else:
            explanations_all = explanations_all.merge(importances,
                                                      on='Feature')

    # Average the importances across all models and sort descending
    explanations_all['Mean'] = explanations_all.drop(
        columns=['Feature']).mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)
    # Scale relative to the most important feature
    explanations_all['Mean_scaled'] = (explanations_all['Mean']
                                       / explanations_all['Mean'].abs().max())
    return explanations_all
57
+
58
+
59
# Paths to the prepared model data, cohort definitions and outputs
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, id_column='StudyId', train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']

####
# Feature drop out here
#####

# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Separate features from target
features = train_data[features_list].astype('float')
target = train_data.IsExac.astype('float')

# Save the list of features and a correlation heatmap to the artifacts directory (to
# be logged in mlflow)
artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between different runs
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
           delimiter=",", fmt='%s')

plots.plot_feature_correlations(features=features,
                                figsize=(len(features_list) // 2,
                                         len(features_list) // 2),
                                savefig=True, output_dir=artifact_dir,
                                figname='features_correlations.png')

# # Get the run_id of the best model from hyperparameter tuning and its parameters
# best_run = mlflow.search_runs([8], order_by=["metrics.precision DESC"]).iloc[0].run_id
# best_params = mlflow.get_run(best_run).data.params
# best_params

# params = {'inner_bags': 1,
#           'interactions': 4,
#           'learning_rate': 0.0012416471483555312,
#           'max_leaves': 12,
#           'max_rounds': 5000,
#           'min_samples_leaf': 5,
#           'outer_bags': 3,
#           'random_state': 0}

with mlflow.start_run(run_name='eosinophil_count_0.3_threshold'):
    # runid = mlflow.active_run().info.run_id
    # with mlflow.start_run(run_name='simplified_with_nanox', nested=True,
    #                       tags={MLFLOW_PARENT_RUN_ID: runid}):

    # Use the parameters from the best model in previous cross validation
    model = BalancedRandomForestClassifier(random_state=0)
    # crossval = cross_validate(model, features, target,
    #                           cv=cross_validation_fold_indices,
    #                           return_estimator=True, scoring=scoring)

    # Perform K-fold cross validation with custom folds
    # Set the probability threshold here if required
    crossval, model_scores =\
        crossvalidation.cross_validation_return_estimator_and_scores(
            model=model, features=features,
            target=target,
            fold_indices=cross_validation_fold_indices)

    # Log metrics averaged across folds
    for score in scoring:
        mlflow.log_metric(score, np.mean(crossval['test_' + score]))

    # Log model parameters
    params = model.get_params()
    for param in params:
        mlflow.log_param(param, params[param])

    # Calculate average global feature importances across K models
    explainability = get_crossvalidation_importance(feature_names=features_list,
                                                    crossval=crossval)
    explainability.to_csv(os.path.join(artifact_dir,
                                       'global_feature_importances.csv'),
                          index=False)
    plots.plot_global_explainability_cv(importances=explainability,
                                        scaled=True,
                                        figsize=(len(features_list) // 2.5,
                                                 len(features_list) // 6),
                                        savefig=True, output_dir=artifact_dir)
    # Plot lift and cumulative gains curves. FIX: the figname arguments were
    # previously swapped, saving each plot under the other plot's filename.
    plots.plot_lift_curve(scores=model_scores, savefig=True,
                          output_dir=artifact_dir,
                          figname='lift_curve.png')
    plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                      output_dir=artifact_dir,
                                      figname='cumulative_gains_curve.png')

    # Plot distribution of model scores (histogram plus KDE)
    plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                  negative_class_name='No exac', savefig=True,
                                  output_dir=artifact_dir,
                                  figname='model_score_distribution.png')

    # Plot CV confusion matrices with different decision thresholds
    for threshold in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        plots.plot_confusion_matrix(
            target_true=model_scores.true_label,
            target_predicted=np.where(model_scores.model_score > threshold, 1, 0),
            classes=['No exac', 'Exac'], savefig=True,
            output_dir=artifact_dir,
            figname='confusion_matrix_{}.png'.format(threshold))

    # Plot the ROC and Precision-Recall curves
    fig, ax = plt.subplots(figsize=(8, 6))
    RocCurveDisplay.from_predictions(y_true=model_scores.true_label,
                                     y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.legend(frameon=False)
    plt.tight_layout()
    plt.savefig(os.path.join(artifact_dir, 'roc_curve.png'), dpi=150)
    plt.close()

    fig, ax = plt.subplots(figsize=(8, 6))
    PrecisionRecallDisplay.from_predictions(y_true=model_scores.true_label,
                                            y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    plt.legend(frameon=False)
    plt.tight_layout()
    plt.savefig(os.path.join(artifact_dir, 'precision_recall_curve.png'), dpi=150)
    plt.close()

    # Log artifacts
    mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
# mlflow.end_run()
training/cross_validation_algorithms.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Perform cross validation using a variety of algorithms."""
import os
import pandas as pd
import numpy as np

from lenusml import splits, plots

# Model training and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from interpret.glassbox import ExplainableBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
import mlflow


# Paths to the prepared model data, cohort definitions and outputs
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, train_data=train_data, id_column='StudyId')

# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Separate features from target
features = train_data[features_list].astype('float')
target = train_data.IsExac.astype('float')

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
# Ratio of negative to positive samples, used by xgboost to rebalance classes.
# (Previously this was computed twice; the duplicate assignment was removed.)
scale_pos_weight = target.value_counts()[0] / target.value_counts()[1]

# Candidate algorithms to compare, each paired with its mlflow run name
models = []
models.append((RandomForestClassifier(random_state=0), 'random_forest'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_class_weight'))
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'balanced_random_forest'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'xgb'))
models.append((lgb.LGBMClassifier(random_state=0), 'lgbm'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'xgb_spw'))
models.append((ExplainableBoostingClassifier(random_state=0), 'ebm'))

with mlflow.start_run(run_name='model_selection'):
    # Perform K-fold cross validation with custom folds, one nested run per
    # candidate algorithm
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            # Create the artifacts directory if it doesn't exist
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            crossval = cross_validate(model[0], features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring)
            # Get the predicted probabilities from each model
            probabilities_cv = cross_val_predict(model[0], features, target,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # Plot distribution of model scores (histogram plus KDE)
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac',
                                          savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
training/cross_validation_calibration.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform model calibration in CV on different algorithms and log to mlflow.
2
+
3
+ Nests runs for different algos under parent run and logs the following
4
+ artifacts as well as metrics and parameters:
5
+ 1. Calibration curves for each child algo run (calibration in CV and calibration on
6
+ holdout test after applying isotonic and sigmoid calibration)
7
+ 2. Calibration curve under parent run to compare all algos in CV and post calibration
8
+ 3. Cumulative gains curve
9
+ 4. Lift curve
10
+ 5. Probability distributions with KDE (CV)
11
+ """
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.lines as mlines
14
+ from lenusml import splits, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+
19
+ from sklearn.model_selection import cross_val_predict, cross_validate
20
+ from sklearn.calibration import calibration_curve, CalibratedClassifierCV
21
+
22
+ from sklearn.linear_model import LogisticRegression
23
+ from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
24
+ from sklearn.ensemble import RandomForestClassifier
25
+ import xgboost as xgb
26
+ import lightgbm as lgb
27
+ from interpret.glassbox import ExplainableBoostingClassifier
28
+
29
+ import mlflow
30
+ from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
31
+
32
+
33
# --- Paths, CV folds and mlflow experiment setup ---------------------------
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data. fold_patients is a pickled object array —
# presumably one entry per fold listing that fold's patient ids; confirm
# against lenusml.splits.custom_cv_fold_indices.
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))
test_data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl'))
# Cross check fold patients with train data and turn the per-patient fold
# assignment into row-index (train, test) pairs usable as sklearn's cv=
cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients,
                                                              train_data=train_data,
                                                              id_column='StudyId')

# All runs/metrics/artifacts tracked in a local sqlite mlflow store
mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters.
# neg_brier_score is included because this script evaluates calibration.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
53
+
54
+
55
def plot_calibration_curves(calibration_curves, savefig=True, output_dir=None,
                            figname=None, figsize=(8, 7)):
    """Plot several calibration curves on one set of axes.

    Args:
        calibration_curves (list): tuples of ((prob_true, prob_pred), label),
            where (prob_true, prob_pred) is the output of
            sklearn.calibration.calibration_curve.
        savefig (bool): if True, save the figure to output_dir/figname.
        output_dir (str): directory for the saved figure (required if savefig).
        figname (str): file name for the saved figure (required if savefig).
        figsize (tuple): matplotlib figure size in inches.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Perfect-calibration reference line (y = x), drawn in axes coordinates
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)
    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')
    # One distinct rainbow colour per curve so many algos stay readable
    color = iter(plt.cm.rainbow(np.linspace(0, 1, len(calibration_curves))))
    for cal_curve in calibration_curves:
        c = next(color)
        # cal_curve[0] = (prob_true, prob_pred): predicted on x, true on y
        ax.plot(cal_curve[0][1], cal_curve[0][0], marker='o', c=c, linewidth=1,
                label=cal_curve[1])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False, bbox_to_anchor=(1, 1), loc="upper left")
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
        # Close the figure so repeated calls do not accumulate open figures
        # (matplotlib keeps every figure alive otherwise — memory leak).
        plt.close(fig)
77
+
78
+
79
def plot_calibration_curves_algo(calibration_curves, savefig=True, output_dir=None,
                                 figname=None, figsize=(8, 7)):
    """Plot one algorithm's calibration curves (uncalibrated/sigmoid/isotonic).

    Args:
        calibration_curves (list): tuples of ((prob_true, prob_pred), label),
            where (prob_true, prob_pred) is the output of
            sklearn.calibration.calibration_curve.
        savefig (bool): if True, save the figure to output_dir/figname.
        output_dir (str): directory for the saved figure (required if savefig).
        figname (str): file name for the saved figure (required if savefig).
        figsize (tuple): matplotlib figure size in inches.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Perfect-calibration reference line (y = x), drawn in axes coordinates
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)
    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')
    for cal_curve in calibration_curves:
        # cal_curve[0] = (prob_true, prob_pred): predicted on x, true on y
        ax.plot(cal_curve[0][1], cal_curve[0][0], marker='o', linewidth=1,
                label=cal_curve[1])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False)
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
        # Close the figure — this function is called once per model inside the
        # CV loop and un-closed figures accumulate (memory leak).
        plt.close(fig)
99
+
100
+
101
# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']

# Get the features list from the preferred model (hard-coded mlflow run id
# from a previous model-selection experiment; update if that run changes)
with open('./mlruns/2/7ebf60a5d17f49d9a79e41dd72dda858/artifacts/features.txt') as f:
    features_list = f.read().splitlines()

# Separate features from target ('IsExac' = exacerbation label)
features_train = train_data[features_list].astype('float')
target_train = train_data.IsExac.astype('float')
features_test = test_data[features_list].astype('float')
target_test = test_data.IsExac.astype('float')

artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between different runs
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

# Ratio of negative to positive samples, used to re-weight XGBoost (SPW run)
scale_pos_weight = target_train.value_counts()[0] / target_train.value_counts()[1]

# Create list of (estimator, mlflow run name) pairs to try
models = []
models.append((LogisticRegression(random_state=0, max_iter=200), 'LR'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'LR_CW_balanced'))
models.append((lgb.LGBMClassifier(random_state=0), 'LGBM'))
models.append((BalancedBaggingClassifier(random_state=0),
               'Balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'Balanced_RF'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'XGB'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'XGB_SPW'))
models.append((ExplainableBoostingClassifier(random_state=0), 'EBM'))
models.append((RandomForestClassifier(random_state=0), 'RF'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'RF_CW_balanced'))

# Calibration curves accumulated across all algorithms, plotted together under
# the parent run at the end of the script
calibration_curves_cv = []
calibration_curves_sigmoid = []
calibration_curves_isotonic = []

# Equal-width probability bins for sklearn's calibration_curve
cal_curve_strategy = 'uniform'
146
+
147
# --- Main experiment loop --------------------------------------------------
# One parent mlflow run with a nested child run per algorithm. Each child:
#   * scores the uncalibrated model in CV,
#   * fits sigmoid- and isotonic-calibrated versions (CalibratedClassifierCV)
#     and evaluates them on the holdout test set,
#   * logs metrics, parameters and diagnostic plots as artifacts.
with mlflow.start_run(run_name='sklearn_calibration_in_cv_uniform_bins'):
    # Perform K-fold cross validation
    runid = mlflow.active_run().info.run_id
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            calibration_curves_algo = []
            crossval = cross_validate(model[0], features_train, target_train,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring,
                                      error_score='raise')
            # Out-of-fold predicted probabilities for the positive class
            probabilities_cv = cross_val_predict(model[0], features_train, target_train,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]

            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target_train})
            model_scores = model_scores.sort_values(by='model_score', ascending=False)

            # Extract calibration curve (uncalibrated model, CV predictions)
            calibration_curves_cv.append((calibration_curve(target_train,
                                          probabilities_cv, n_bins=10,
                                          strategy=cal_curve_strategy), model[1]))

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Calibrate model in CV (refits on the same custom folds)
            calibrated_sigmoid = CalibratedClassifierCV(model[0], method='sigmoid',
                                                        cv=cross_validation_fold_indices)
            calibrated_sigmoid.fit(features_train, target_train)
            probabilities_sigmoid = calibrated_sigmoid.predict_proba(features_test)[:, 1]

            calibrated_isotonic = CalibratedClassifierCV(model[0], method='isotonic',
                                                         cv=cross_validation_fold_indices)
            calibrated_isotonic.fit(features_train, target_train)
            probabilities_isotonic = calibrated_isotonic.predict_proba(
                features_test)[:, 1]

            # Extract calibration curves. Calibrated variants are evaluated on
            # the holdout test set; the uncalibrated one on CV predictions.
            calibration_curves_sigmoid.append((calibration_curve(target_test,
                                               probabilities_sigmoid, n_bins=10,
                                               strategy=cal_curve_strategy),
                                               model[1] + ' sigmoid'))
            calibration_curves_isotonic.append((calibration_curve(target_test,
                                                probabilities_isotonic, n_bins=10,
                                                strategy=cal_curve_strategy),
                                                model[1] + ' isotonic'))
            calibration_curves_algo.append((calibration_curve(target_train,
                                            probabilities_cv, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' uncalibrated'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_sigmoid, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' sigmoid'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_isotonic, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' isotonic'))

            # Plot cumulative gains curves
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')
            # Plot lift curves
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')

            # Plot distribution of model scores (histogram plus KDE).
            # NOTE(review): 'postive_class_name' is the lenusml API spelling.
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Plot calibration curves for each algo
            plot_calibration_curves_algo(calibration_curves=calibration_curves_algo,
                                         savefig=True, output_dir=artifact_dir,
                                         figname='calibration_curves.png',
                                         figsize=(8, 7))

            # Log artifacts under child runs
            mlflow.log_artifacts(artifact_dir)
            mlflow.end_run()

# Log artifacts under parent run: clear the artifact directory, regenerate the
# cross-algorithm comparison plots, then reopen the (now finished) parent run
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

plot_calibration_curves(calibration_curves=calibration_curves_cv, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_cv.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_sigmoid, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_sigmoid.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_isotonic, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_isotonic.png', figsize=(15, 10))

with mlflow.start_run(run_id=runid):
    mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
training/cross_validation_comorbs.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform CV (with explainability) on different feature sets and log to mlflow.
2
+
3
+ Includes functionality to nest runs under parent run (e.g. different feature sets
4
+ under a main run) and set a decision threshold for model scores. Logs the following
5
+ artifacts as well as metrics and parameters:
6
+ 1. List of model features
7
+ 2. Feature correlation matrix
8
+ 3. Global explainability (averaged over K folds)
9
+ 4. Cumulative gains curve
10
+ 5. Lift curve
11
+ 6. Probability distributions with KDE
12
+ """
13
+ from imblearn.ensemble import BalancedRandomForestClassifier
14
+ from lenusml import splits, crossvalidation, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+
19
+ import mlflow
20
+ from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
21
+
22
+
23
def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Create dataframe of mean global feature importance for all EBMs used in CV.

    Args:
        feature_names (list): list of model feature names
        crossval (dict): output of cross_validation_return_estimator_and_scores;
            its 'estimator' entry holds the K fitted estimators, each exposing
            a ``feature_importances_`` attribute.

    Returns:
        pd.DataFrame: contains feature names, global importance for each of the K
        estimators, mean importance across the estimators and scaled mean importance
        relative to the most important feature.

    Raises:
        ValueError: if crossval contains no estimators.
    """
    estimators = crossval['estimator']
    if len(estimators) == 0:
        # Previously this fell through to an opaque UnboundLocalError below.
        raise ValueError("crossval['estimator'] is empty - nothing to average")

    # Obtain global importance from each estimator used in cross validation
    explanations_all = None
    for i, est in enumerate(estimators):
        # Build from a dict so the score column is numeric float64 (the old
        # list-of-rows + transpose produced object-dtype columns, making the
        # mean below rely on fragile object-dtype coercion).
        explanations = pd.DataFrame({
            'Feature': feature_names,
            'Score_{}'.format(i): np.asarray(est.feature_importances_,
                                             dtype=float)})

        # Accumulate global feature importances for all K estimators,
        # aligning on feature name so scores cannot be mis-paired
        if explanations_all is None:
            explanations_all = explanations
        else:
            explanations_all = explanations_all.merge(explanations, on='Feature')

    # Average the importances across all models
    explanations_all['Mean'] = explanations_all.drop(columns=['Feature']).mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)
    # Create a scaled mean importance relative to the most important feature
    explanations_all['Mean_scaled'] = explanations_all['Mean'] /\
        explanations_all['Mean'].abs().max()
    return explanations_all
57
+
58
# --- Paths, CV folds and comorbidity data ----------------------------------
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data (pickled object array, hence allow_pickle)
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data and turn the per-patient fold
# assignment into row-index (train, test) pairs usable as sklearn's cv=
cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients,
                                                              id_column='StudyId',
                                                              train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']
# Load comorbidity data and get list of conditions captured in COPD service.
# Every column except the identifier/timestamp columns is a comorbidity flag.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('Id')
comorbidity_list.remove('PatientId')
comorbidity_list.remove('Created')

# Add the StudyId column for merging with the train data
patient_details = pd.read_pickle(os.path.join('<YOUR_DATA_PATH>/copd-dataset',
                                              'patient_details.pkl'))
comorbidities = comorbidities.merge(patient_details[['PatientId', 'StudyId']],
                                    on='PatientId', how='left')

# Map the True/False columns to 1/0 so they can be used as numeric features
bool_mapping = {True: 1, False: 0}
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping)
96
+
97
# --- Comorbidity feature screening -----------------------------------------
# One parent mlflow run. For each comorbidity flag: merge it into the train
# data as an extra feature, run CV with the previously selected algorithm,
# log results as a nested child run, then drop the column again so each
# iteration starts from the original feature set.
with mlflow.start_run(run_name='individual_comorbidities_no_binned'):
    runid = mlflow.active_run().info.run_id
    # Merge each comorbidity separately and train a model nested under the parent run
    for comorbidity in comorbidity_list:
        print(comorbidity)
        # Merge comorb and fill missing data with 0 (patients absent from the
        # comorbidity extract are treated as not having the condition)
        train_data = train_data.merge(comorbidities[['StudyId', comorbidity]],
                                      on='StudyId', how='left')
        train_data[comorbidity] = train_data[comorbidity].fillna(0)

        with mlflow.start_run(run_name=comorbidity, nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            ####
            # Feature addition/drop out here
            #####
            # Create list of model features (everything except ids, target and
            # the target-encoded comorbidity summary column)
            cols_to_drop = ['StudyId', 'IsExac', 'Comorbidities_te']
            features_list = [col for col in train_data.columns if col not in cols_to_drop]

            # Separate features from target
            features = train_data[features_list].astype('float')
            target = train_data.IsExac.astype('float')

            # Save the list of features and a correlation heatmap to the artifacts
            # directory (to be logged in mlflow)
            artifact_dir = './tmp'
            # Create the artifacts directory if it doesn't exist
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
                       delimiter=",", fmt='%s')

            plots.plot_feature_correlations(
                features=features, figsize=(len(features_list) // 2,
                                            len(features_list) // 2),
                savefig=True, output_dir=artifact_dir,
                figname="feature_correlations.png")

            # Use the parameters from the best model in previous cross validation
            model = BalancedRandomForestClassifier(random_state=0)

            # Perform K-fold cross validation with custom folds
            # Set the probability threshold here if required
            crossval, model_scores =\
                crossvalidation.cross_validation_return_estimator_and_scores(
                    model=model, features=features,
                    target=target,
                    fold_indices=cross_validation_fold_indices)

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            params = model.get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Calculate average global feature importances across K models
            explainability = get_crossvalidation_importance(feature_names=features_list,
                                                            crossval=crossval)
            explainability.to_csv(os.path.join(artifact_dir,
                                  'global_feature_importances.csv'), index=False)
            plots.plot_global_explainability_cv(importances=explainability,
                                                scaled=True,
                                                figsize=(
                                                    len(features_list) // 2.5,
                                                    len(features_list) // 6),
                                                savefig=True, output_dir=artifact_dir)
            # Plot lift and cumulative gains curves
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # Plot distribution of model scores (histogram plus KDE).
            # NOTE(review): 'postive_class_name' is the lenusml API spelling.
            plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
            mlflow.end_run()
        # Drop the comorbidity column so the next iteration starts clean
        train_data = train_data.drop(columns=[comorbidity])
training/define_exacerbations_prologic.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Collate all hospital, clincian verified and patient reported events and apply LOGIC."""
2
+ import copd
3
+ import numpy as np
4
+ import os
5
+ import pandas as pd
6
+
7
data_dir = '<YOUR_DATA_PATH>/copd-dataset'

############################################################################
# Define model cohort and training data windows
############################################################################

# Read relevant info from patient details
patient_details = pd.read_csv(os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
                              usecols=['PatientId', 'FirstSubmissionDate',
                                       'MostRecentSubmissionDate',
                                       'DateOfBirth', 'Sex', 'StudyId'],
                              delimiter="|")

# Select patients for inclusion (those with up to date events in service)
# Create list of patients for model inclusion
# Original RECEIVER cohort study id list
receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85)]
# This patient needs removing
receiver_patients.remove('RC34')
# Scale up patients (subset)
scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)]
scaleup_patients.append('SU287')

# List of all valid patients for modelling
valid_patients = receiver_patients + scaleup_patients

# Filter for valid patients accounting for white spaces in StudyId (e.g. RC 26 and RC 52)
patient_details = patient_details[patient_details.StudyId.str.replace(' ', '').isin(
    valid_patients)]
# Select only non null entries in patient data start/end dates
patient_details = patient_details[(patient_details.FirstSubmissionDate.notna()) &
                                  (patient_details.MostRecentSubmissionDate.notna())]

# Create a column stating the latest date permitted based on events added to service data
patient_details['LatestPredictionDate'] = '2022-02-28'

# Parse all window boundaries as UTC dates truncated to midnight so later
# date comparisons/joins work on calendar days
date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda x: pd.to_datetime(x, utc=True).dt.normalize(), axis=1)

# Choose the earlier date out of the patient's last submission and the latest COPD data
# events
patient_details['LatestPredictionDate'] = patient_details[
    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

# Add N days to start of data window because predictions are made N days in advance
# N=3 for the 72 hr exac model
N = 3
patient_details['EarliestPredictionDate'] = patient_details['FirstSubmissionDate']\
    + pd.DateOffset(days=N)

# Remove any patients for whom the prediction start date overlaps the final submission
# date, i.e. they have too short a window of data
patient_details = patient_details[patient_details['EarliestPredictionDate'] <
                                  patient_details['LatestPredictionDate']]
# List of remaining patients
model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
    len(model_patients),
    len(patient_details[patient_details['StudyId'].str.startswith('RC')]),
    len(patient_details[patient_details['StudyId'].str.startswith('SU')])))

df = patient_details[['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
                      'FirstSubmissionDate', 'LatestPredictionDate']].copy()

# Create a dataframe with one row per patient per day of their data window.
# The window starts N days before first submission so the first predictions
# still have history behind them.
df["DateOfEvent"] = df.apply(lambda x: pd.date_range(x.FirstSubmissionDate -
                             pd.DateOffset(days=N), x.LatestPredictionDate, freq='D'),
                             axis=1)
df = df.explode('DateOfEvent').reset_index(drop=True)
82
+ ############################################################################
83
+ # Extract hospital exacerbations and admissions from COPD service data
84
+ # Includes 1 year pre-onboarding plus time on Lenus COPD service
85
+ ############################################################################
86
+
87
+ # Contains exacerbations among other event types
88
+ patient_events = pd.read_csv(os.path.join(data_dir, 'PatientEvents.txt'),
89
+ delimiter="|", usecols=['PatientId', 'DateOfEvent',
90
+ 'EventTypeId'])
91
+
92
+ # Filter for only patients in model cohort - will still contain events out of data windows
93
+ patient_events = patient_events[patient_events.PatientId.isin(model_patients)]
94
+
95
+ # Lookup table for patient event types
96
+ patient_event_types = pd.read_csv(os.path.join(data_dir, 'PatientEventTypes.txt'),
97
+ delimiter="|", usecols=['Id', 'Name'])
98
+ patient_event_types = patient_event_types.rename(columns={'Id': 'EventTypeId',
99
+ 'Name': 'EventName'})
100
+ # Merge patient events with lookup table)
101
+ patient_events = patient_events.merge(patient_event_types, on='EventTypeId')
102
+
103
+ # Identify hospital exacerbation events
104
+ patient_events['IsHospExac'] = copd.define_service_exac_event(
105
+ events=patient_events.EventName, include_community=False)
106
+ # Identify hospital admissions (all causes)
107
+ patient_events['IsHospAdmission'] = copd.define_hospital_admission(
108
+ patient_events.EventName)
109
+
110
+ admissions = patient_events[patient_events.IsHospAdmission == 1][['PatientId',
111
+ 'DateOfEvent',
112
+ 'IsHospAdmission']]
113
+ hosp_exacs = patient_events[patient_events.IsHospExac == 1][['PatientId',
114
+ 'DateOfEvent',
115
+ 'IsHospExac']]
116
+ admissions['DateOfEvent'] = pd.to_datetime(admissions.DateOfEvent,
117
+ utc=True).dt.normalize()
118
+ hosp_exacs['DateOfEvent'] = pd.to_datetime(hosp_exacs.DateOfEvent,
119
+ utc=True).dt.normalize()
120
+
121
+ hosp_exacs = hosp_exacs.drop_duplicates()
122
+ admissions = admissions.drop_duplicates()
123
+ # Save hospital exacerbations and admissions data
124
+ hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
125
+ admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))
126
+
127
##########################################################################################
# Extract all rescue meds for model cohort in the year prior to onboarding. These will be
# used as a proxy for community exacerbations pre-OB (not captured in service data)
##########################################################################################

# Read mapping between StudyId and SafeHavenID, and filter for model cohort
id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

# Read pharmacy data and filter for model cohort
pharmacy = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA',
                                    'Pharmacy_Cohort4.csv'))
pharmacy = pharmacy[pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)]

# Pull out rescue med prescriptions only — BNF item codes, presumably oral
# steroids and rescue-pack antibiotics; confirm against the BNF code lists
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AAACAC',
                 '0603020T0AABKBK', '0603020T0AABNBN', '0603020T0AAAGAG',
                 '0603020T0AABHBH']

antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
                    '0501013K0AAAJAJ']
rescue_med_bnf_codes = steroid_codes + antibiotic_codes
pharmacy = pharmacy[pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)]

# Get latest and earliest dates for model cohort
cohort_dates = id_mapping.merge(patient_details[
    ['PatientId', 'StudyId', 'FirstSubmissionDate', 'LatestPredictionDate']],
    on='StudyId')

# Merge and keep only rescue meds in the year before patient onboarding
pharmacy_exacs = cohort_dates.merge(pharmacy, on='SafeHavenID').drop(
    columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description',
             'DISP_DATE', 'SafeHavenID'])
pharmacy_exacs = pharmacy_exacs.rename(columns={'PRESC_DATE': 'DateOfEvent'})
pharmacy_exacs['DateOfEvent'] = pd.to_datetime(pharmacy_exacs['DateOfEvent'],
                                               utc=True).dt.normalize()
# Drop duplicates (one proxy event per patient per prescription date)
pharmacy_exacs = pharmacy_exacs.drop_duplicates()
# Filter on dates: keep prescriptions within one year before first submission
pharmacy_exacs = pharmacy_exacs[
    (pharmacy_exacs.DateOfEvent < pharmacy_exacs.FirstSubmissionDate) &
    (pharmacy_exacs.DateOfEvent >= pharmacy_exacs.FirstSubmissionDate -
     pd.DateOffset(years=1))]
# New column for rescue med exac type
pharmacy_exacs['IsRescueMedExac'] = 1
pharmacy_exacs = pharmacy_exacs.drop(
    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

# Save "pharmacy exacerbations" data
pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))
179
######################################################
# Extract patient reported exacerbation events
######################################################
########################
# Data post Q5 change
#######################

# Read file containing patient reported events (not patient_events because it contains
# the dates when patients answered PROs and not which date they reported as having taken
# their rescue meds)
symptom_diary = pd.read_csv(os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
                            usecols=['PatientId', 'StudyId', 'Score', 'SubmissionTime',
                                     'SymptomDiaryQ5', 'SymptomDiaryQ11a', 'SymptomDiaryQ11b'],
                            delimiter="|")

# Date from which symptom diary Q5 responses use the new rescue-med format;
# only responses after this cutoff are used here
Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)
symptom_diary = copd.filter_symptom_diary(df=symptom_diary, date_cutoff=Q5ChangeDate,
                                          patients=model_patients)

weekly_pros = copd.get_rescue_med_pro_responses(symptom_diary)
weekly_pros = copd.set_pro_exac_dates(weekly_pros)
weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
                           'DateOfEvent', 'ExacDateUnknown']]

#########################
# Pre Q5 change events
#########################

# RECEIVER cohort - community events verified up to 16/03/21
receiver = pd.read_excel('./LenusEvents/breakdown_of_com_exac_160321.xlsx')
receiver = receiver.rename(columns={'Study number': 'StudyId',
                                    'Exacerbation recorded': 'DateRecorded'})
receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

# Scale up cohort - community events verified up to 17/05/2021
scaleup = pd.read_excel('./LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx')
scaleup = scaleup.rename(columns={'Study Number': 'StudyId',
                                  'Date Exacerbation recorded': 'DateRecorded'})
# Forward-fill StudyId — presumably the spreadsheet lists it only on the
# first row per patient; confirm against the source file
scaleup['StudyId'] = scaleup['StudyId'].ffill()

scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

# Combine RECEIVER and scale up events into one df
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])
223
+
224
+ ####################################################################################
225
+ # Merge hospital, patient reported and rescue med exacs with daily patient records
226
+ #
227
+ # Exacerbations occurring in Lenus service period include verified clinician events
228
+ # pre-April 2021 (after onboarding) and community exacerbations recorded in weekly
229
+ # PROs post-April 2021. Hospital exacs occur in year prior to OB and on Lenus service.
230
+ # Rescue med exacs are only used for the year prior to OB.
231
+ # Need to ensure each record has both StudyId and PatientId to prevent losing events
232
+ ######################################################################################
233
+
234
+ # Patient reported, clinician verified (during COPD service time only, inner join)
235
+ df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')
236
+
237
+ # Patient reported, new rescue med PRO (April 2021 onwards, inner join)
238
+ df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')
239
+
240
+ # Hospital exacs (one year prior to OB plus time on service, outer join)
241
+ df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
242
+ df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')
243
+
244
+ # Pharmacy exacs, (one year prior to OB up to OB only, outer join)
245
+ df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
246
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')
247
+
248
+ # Respiratory hospital admissions (one year prior to OB plus time on service, outer join)
249
+ df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
250
+ df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')
251
+
252
+ # Combine cols from individual datasets into one
253
+ df['ExacDateUnknown'] = np.where((df.ExacDateUnknown_x == 1) |
254
+ (df.ExacDateUnknown_y == 1), 1, 0)
255
+ df['IsCommExac'] = np.where((df.IsCommExac_x == 1) |
256
+ (df.IsCommExac_y == 1) | (df.IsRescueMedExac == 1), 1, 0)
257
+
258
+ # Column for whether an exacerbation of any kind occurred on each date. To be filtered
259
+ # using (PRO) LOGIC
260
+ df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)
261
+
262
+ # Resample the df to one day per patient starting from the earliest record (may be a
263
+ # pre-onboarding exac. Complete daily records required for calculating DaysSinceLastExac)
264
+ df = df.set_index('DateOfEvent').groupby('StudyId').resample('D').asfreq().drop(
265
+ 'StudyId', axis=1).reset_index()
266
+
267
+ # Infill binary cols with zero where applicable
268
+ df[['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac', 'ExacDateUnknown', 'IsExac',
269
+ 'IsRescueMedExac', 'IsHospAdmission']] = df[
270
+ ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac', 'ExacDateUnknown',
271
+ 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']].fillna(0)
272
+
273
+ # Infill some columns by StudyId to populate entire df
274
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='FirstSubmissionDate')
275
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='LatestPredictionDate')
276
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')
277
+
278
+ # Retain only dates before the end of each patient's data window
279
+ df = df[df.DateOfEvent <= df.LatestPredictionDate]
280
+
281
+ print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
282
+ print('Exacerbations pre-onboarding to COPD service: {}'.format(
283
+ len(df[(df.IsExac == 1) & (df.DateOfEvent < df.FirstSubmissionDate)])))
284
+ print('Exacerbations post-onboarding to COPD service: {}'.format(
285
+ len(df[(df.IsExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate)])))
286
+ print('Number of unique exacerbation patients: {}'.format(
287
+ len(df[df.IsExac == 1].PatientId.unique())))
288
+ # print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
289
+ # .format(df.IsHospExac.sum(), df.IsCommExac.sum(),
290
+ # len(df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1)])))
291
+ print('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, \
292
+ {} prescription dates overlapping with hospital events)'
293
+ .format(len(df[df.IsRescueMedExac == 1]),
294
+ len(df[df.IsRescueMedExac == 1].StudyId.unique()),
295
+ len(df[(df.IsRescueMedExac == 1) & (df.IsHospExac == 1)])))
296
+ print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
297
+ .format(len(df[(df.IsHospExac == 1) &
298
+ (df.DateOfEvent < df.FirstSubmissionDate)]),
299
+ len(df[(df.IsHospExac == 1) &
300
+ (df.DateOfEvent < df.FirstSubmissionDate)].StudyId.unique())))
301
+ print('Hospital exacerbations post-OB: {} ({} unique patients)'
302
+ .format(len(df[(df.IsHospExac == 1) &
303
+ (df.DateOfEvent >= df.FirstSubmissionDate)]),
304
+ len(df[(df.IsHospExac == 1) &
305
+ (df.DateOfEvent >= df.FirstSubmissionDate)].StudyId.unique())))
306
+ print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
307
+ .format(len(df[df.IsCommExac_x == 1]),
308
+ len(df[df.IsCommExac_x == 1].StudyId.unique())))
309
+ print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
310
+ .format(len(df[df.IsCommExac_y == 1]),
311
+ len(df[df.IsCommExac_y == 1].StudyId.unique())))
312
+
313
+ print('Number of patient reported exacerbations with unknown dates: {} ({} overlapping\
314
+ with hospital events)'.format(df.ExacDateUnknown.sum(),
315
+ len(df[(df.IsHospExac == 1) & (df.ExacDateUnknown == 1)])))
316
+
317
+ # Check for any patient reported events with unknown dates that occurred on the same day
318
+ # as a hospital event. Hospital events are trusted so set the date to known
319
+ df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), 'ExacDateUnknown'] = 0
320
+ print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))
321
+
322
+ df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
323
+ 'ExacDateUnknown_y'])
324
+
325
+ ############################################################################
326
+ # Implement PRO LOGIC on hospital and patient reported exacerbation events
327
+ ############################################################################
328
+
329
+ # Define min and max days for PRO LOGIC. No predictions made or data used within
330
+ # logic_min_days after an exacerbation. Events falling between logic_min_days and
331
+ # logic_max_days after an event are subject to the weekly rescue med LOGIC criterion
332
+ logic_min_days = 14
333
+ logic_max_days = 35
334
+
335
+ # Calculate the days since last rescue med prescription
336
+ df = df.groupby('StudyId').apply(
337
+ lambda x: copd.calculate_days_since_last_event(
338
+ df=x, event_col='IsRescueMedExac',
339
+ output_col='DaysSinceLastRescueMeds')).reset_index(drop=True)
340
+
341
+ rescue_med_min_days = 7
342
+ print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
343
+ .format(rescue_med_min_days,
344
+ len(df[(df.DaysSinceLastRescueMeds > -1) &
345
+ (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
346
+ (df.IsRescueMedExac == 1)])))
347
+
348
+ # Reset IsExac to 0 for rescue med prescriptions within 7 days of a previous prescription
349
+ df.loc[(df.DaysSinceLastRescueMeds > -1) &
350
+ (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
351
+ (df.IsRescueMedExac == 1), 'IsExac'] = 0
352
+
353
+ # Calculate the days since the previous exacerbation for all patient days. Now includes
354
+ # events before patient onboarding
355
+ df = df.groupby('StudyId').apply(
356
+ lambda x: copd.calculate_days_since_last_event(
357
+ df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)
358
+
359
+ pre_onboarding_min_days = 14
360
+ print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
361
+ .format(pre_onboarding_min_days,
362
+ len(df[(df.IsExac == 1) &
363
+ (df.DaysSinceLastExac > -1) &
364
+ (df.DaysSinceLastExac <= pre_onboarding_min_days) &
365
+ (df.DateOfEvent < df.FirstSubmissionDate)])))
366
+
367
+ # Set IsExac to 0 for any pre-OB exacs within 14 days of a previous exac
368
+ df.loc[(df.DaysSinceLastExac > -1) & (df.DaysSinceLastExac <= pre_onboarding_min_days) &
369
+ (df.DateOfEvent < df.FirstSubmissionDate), 'IsExac'] = 0
370
+
371
+ # Recalculate DaysSinceLastExac to avoid counting exacs removed above
372
+ df = df.groupby('StudyId').apply(
373
+ lambda x: copd.calculate_days_since_last_event(
374
+ df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)
375
+
376
+ # Apply exclusion period following all exacerbations
377
+ df['RemoveRow'] = copd.minimum_period_between_exacerbations(
378
+ df, minimum_days=logic_min_days)
379
+ # Don't apply this criterion to pre-OB events (already accounted for above)
380
+ df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0
381
+
382
+ print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
383
+ logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1) &
384
+ (df.DateOfEvent >= df.FirstSubmissionDate)])))
385
+
386
+ # Apply criterion for negative weekly Q5 responses - doesn't capture anything post Q5
387
+ # change
388
+ consecutive_replies = 2
389
+ df = copd.apply_logic_response_criterion(df,
390
+ minimum_period=logic_min_days,
391
+ maximum_period=logic_max_days,
392
+ N=consecutive_replies)
393
+
394
+ print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} \
395
+ days after a previous event. {} consecutive negative replies required for the event to \
396
+ count as a new event'.format(logic_min_days, logic_max_days, consecutive_replies))
397
+ # Don't apply this criterion to pre-OB events (already accounted for above)
398
+ df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0
399
+
400
+ print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
401
+ df.RemoveExac.sum()))
402
+ print('Earliest and latest exacerbations excluded: {}, {}'.format(
403
+ df[df.RemoveExac == 1].DateOfEvent.min(), df[df.RemoveExac == 1].DateOfEvent.max()))
404
+
405
+ print('Remaining post-OB exacerbations: {}'.format(
406
+ len(df[(df.IsExac == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1) &
407
+ (df.DateOfEvent >= df.FirstSubmissionDate)])))
408
+
409
+ print('Remaining exacerbations with unknown dates: {}'.format(
410
+ len(df[(df.ExacDateUnknown == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)])))
411
+
412
+ # Remove data between segments of prolonged events, count only first occurrence
413
+ df = copd.remove_data_between_exacerbations(df)
414
+
415
+ # Remove 7 days before each reported exacerbation within unknown date (meds in last week)
416
+ df = copd.remove_unknown_date_exacerbations(df)
417
+
418
+ # New df with unwanted rows removed for events breakdown. Don't drop rows before setting
419
+ # the prediction window in case of events that occur immediately after the exclusion
420
+ # period (prediction window is set on df index rather than dates so want full daily df)
421
+ df_counts = df[(df.RemoveRow != 1) & (df.DateOfEvent >= df.FirstSubmissionDate)].copy()
422
+
423
+ print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))
424
+ exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
425
+ print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
426
+ len(exac_patients), exac_patients.str.startswith('RC').sum(),
427
+ exac_patients.str.startswith('SU').sum()))
428
+ print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
429
+ .format(df_counts.IsHospExac.sum(), df_counts.IsCommExac.sum(),
430
+ len(df_counts.loc[
431
+ (df_counts.IsCommExac == 1) & (df_counts.IsHospExac == 1)])))
432
+
433
+ #################################################################
434
+ # Set the prediction window to N days and remove unwanted rows
435
+ # Calculate rolling exac counts before removing pre-OB events
436
+ #################################################################
437
+ # Create column of exacerbations to use for rolling counts
438
+ df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)
439
+
440
+ # Calculate rolling 365 day sums of exacerbations and respiratory admissions
441
+ df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent', col='ExacsToKeep',
442
+ id_col='StudyId', window=365,
443
+ output_col='ExacsPrevYear')
444
+ df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent',
445
+ col='IsHospAdmission', id_col='StudyId', window=365,
446
+ output_col='AdmissionsPrevYear')
447
+
448
+ # Filter for data in the training data window (first submission date onwards)
449
+ df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]
450
+
451
+ print('Setting {} day prediction window'.format(N))
452
+ df = copd.set_prediction_window(df=df, prediction_window=N)
453
+
454
+ print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
455
+ df.IsExac.value_counts()[1], len(df),
456
+ 100 * df.IsExac.value_counts(normalize=True)[1]))
457
+
458
+ ################
459
+ # Save data
460
+ ################
461
+ df = df[['PatientId', 'StudyId', 'DateOfBirth', 'Sex',
462
+ 'DateOfEvent', 'IsExac', 'DaysSinceLastExac', 'FirstSubmissionDate',
463
+ 'LatestPredictionDate', 'ExacsPrevYear', 'AdmissionsPrevYear']]
464
+
465
+ df.to_pickle(os.path.join(data_dir, 'exac_data.pkl'))
466
+ patient_details.to_pickle(os.path.join(data_dir, 'patient_details.pkl'))
training/fitbit_exploration.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copd
2
+ import os
3
+ import pandas as pd
4
+ from scipy.stats import ks_2samp, cramervonmises_2samp
5
+ import seaborn as sns
6
+ import matplotlib.pyplot as plt
7
+ sns.set(style='darkgrid', context='talk')
8
+ sns.set_palette('dark')
9
+ muted = sns.palettes.color_palette(palette='muted')
10
+ dark = sns.palettes.color_palette(palette='dark')
11
+
12
+ data_dir = '<YOUR_DATA_PATH>/lenus-samples-dataset'
13
+
14
+ # Load platform data
15
+ # DataServerDatasetSample.txt contains columns: 'Id', 'CategoryValue', 'ClientAssignedId',
16
+ # 'ClientId', 'CreationDate', 'CreatorSubject', 'DiscriminatedTypeIdentifier', 'EndDate',
17
+ # 'QuantityId', 'SampleId', 'SampleTypeDiscriminator', 'StartDate', 'Subject',
18
+ # 'TypeIdentifier'
19
+ # 'QuantityId' is a unique identifier to link one platform measurement (steps, HR etc)
20
+ # 'CreatorSubject' refers to the patient. Links to 'Id' in DataServerDatasetQuantity
21
+ lenus_sample = pd.read_csv(os.path.join(data_dir, "DataServerDatasetSample.txt"),
22
+ delimiter="|", usecols=['StartDate', 'EndDate',
23
+ 'CreatorSubject', 'QuantityId',
24
+ 'TypeIdentifier', 'CreationDate'])
25
+
26
+ # Convert datetime columns (strings) to datetime objects (in UTC)
27
+ # Not using a pandas apply to all columns here because it's very slow
28
+ date_cols = ['StartDate', 'EndDate', 'CreationDate']
29
+ for col in date_cols:
30
+ lenus_sample[col] = pd.to_datetime(lenus_sample[col], utc=True).dt.normalize()
31
+
32
+ # DataServerDatasetQuantity.txt contains columns: 'Id', 'Unit', 'value'
33
+ # 'Id' links to 'QuantityId' in DataServerDatasetSample
34
+ lenus_quantity = pd.read_csv(os.path.join(data_dir, "DataServerDatasetQuantity.txt"),
35
+ delimiter="|")
36
+
37
+ # Merge platform data on measurement id
38
+ platform_data = lenus_sample.merge(lenus_quantity, left_on='QuantityId',
39
+ right_on='Id').drop(columns=['Id'])
40
+
41
+ # Apply lookups to units and measurement types
42
+ platform_data['Units'] = copd.unit_lookup(platform_data['Unit'])
43
+ type_lookup = pd.read_csv('./lookups/type_lookup.txt')
44
+ platform_data = platform_data.merge(type_lookup, left_on='TypeIdentifier',
45
+ right_on=type_lookup.index)
46
+
47
+ # Drop unwanted columns
48
+ platform_data = platform_data.drop(columns=['TypeIdentifier', 'Unit'])
49
+
50
+ # Pivot the platform data to obtain columns for each measurement type
51
+ platform_data = pd.pivot_table(platform_data, values='Value',
52
+ index=['StartDate', 'EndDate', 'CreationDate',
53
+ 'CreatorSubject'],
54
+ columns=['Description']).reset_index()
55
+
56
+ data = pd.read_pickle(os.path.join('<YOUR_DATA_PATH>/copd-dataset', 'exac_data.pkl'))
57
+ patients = data.LenusId.unique()
58
+
59
+
60
def filter_on_date_and_id(df, min_date, patients):
    """Keep rows created on or after *min_date* whose CreatorSubject is in *patients*."""
    recent_enough = df.CreationDate >= min_date
    known_patient = df.CreatorSubject.isin(patients)
    return df[recent_enough & known_patient]
62
+
63
+
64
def resample_and_merge_median(df, fitbit):
    """Resample *fitbit* readings to one median value per patient-day and merge onto *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Patient-day records with 'LenusId' and (day-normalized) 'DateOfEvent' columns.
    fitbit : pandas.DataFrame
        Raw readings with 'CreatorSubject', 'CreationDate' and measurement column(s).

    Returns
    -------
    pandas.DataFrame
        Inner join: only rows of *df* that have a Fitbit measurement on the same day.
    """
    # One value per patient per day: the median of that day's readings.
    # (Commented-out dead code removed; 'CreationDate' is day-normalized upstream.)
    daily = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
        '1d').median().dropna().reset_index()
    data = df.merge(daily, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'CreationDate'], how='inner')
    return data
72
+
73
+
74
+ def resample_and_merge_last(df, fitbit):
75
+ fitbit['DateOfEvent'] = fitbit['CreationDate']
76
+ # Resample for one value per day
77
+ fitbit = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
78
+ '1d').last().dropna().reset_index(drop=True)
79
+ data = df.merge(fitbit, left_on=['LenusId', 'DateOfEvent'],
80
+ right_on=['CreatorSubject', 'DateOfEvent'], how='inner')
81
+ return data
82
+
83
+
84
def print_numbers(df, measurement):
    """Print patient-day and exacerbation counts for a merged Fitbit dataset."""
    cohort_ids = pd.Series(df.StudyId.unique())
    n_days = len(df)
    n_patients = len(df.PatientId.unique())
    print('{} patient days with {} data across {} unique patients ({} RC and {} SU)'.
          format(n_days, measurement, n_patients,
                 cohort_ids.str.startswith('RC').sum(),
                 cohort_ids.str.startswith('SU').sum()))
    exac_rows = df[df.IsExac == 1]
    exac_ids = pd.Series(exac_rows.StudyId.unique())
    print('{} exacerbations across {} patients ({} RC and {} SU)'.format(
        df.IsExac.sum(), len(exac_rows.PatientId.unique()),
        exac_ids.str.startswith('RC').sum(),
        exac_ids.str.startswith('SU').sum()))
95
+
96
+
97
+ # Select heart rate data from all platform data
98
+ heart_rate = platform_data[platform_data['heart rate'].notna()][
99
+ ['CreationDate', 'CreatorSubject', 'heart rate']]
100
+
101
+ # Filter for patients and dates of interest
102
+ heart_rate = filter_on_date_and_id(heart_rate, min_date='2010-01-01', patients=patients)
103
+ heart_rate.columns
104
+
105
+ hr_data = resample_and_merge_last(df=data, fitbit=heart_rate)
106
+ print_numbers(hr_data, 'HR')
107
+
108
+ steps = platform_data[platform_data['number of steps taken;'].notna()][[
109
+ 'CreationDate', 'CreatorSubject', 'number of steps taken;']]
110
+ # Filter for patients and dates of interest
111
+ steps = filter_on_date_and_id(steps, min_date='2010-01-01', patients=patients)
112
+ steps_data = resample_and_merge_median(df=data, fitbit=steps)
113
+
114
+ print_numbers(steps_data, 'steps')
115
+
116
+ hr_exac_patients = hr_data[hr_data.IsExac == 1]['PatientId'].unique()
117
+ hr_data = hr_data[hr_data.PatientId.isin(hr_exac_patients)]
118
+
119
+ hr_exac = hr_data[hr_data.IsExac == 1]['heart rate']
120
+ hr_no_exac = hr_data[hr_data.IsExac == 0]['heart rate']
121
+
122
+ ks_2samp(hr_exac, hr_no_exac)
123
+ cramervonmises_2samp(hr_exac, hr_no_exac)
124
+
125
+ steps_exac_patients = steps_data[steps_data.IsExac == 1]['PatientId'].unique()
126
+ steps_data = steps_data[steps_data.PatientId.isin(steps_exac_patients)]
127
+
128
+ steps_exac = steps_data[steps_data.IsExac == 1]['number of steps taken;']
129
+ steps_no_exac = steps_data[steps_data.IsExac == 0]['number of steps taken;']
130
+
131
+ ks_2samp(steps_exac, steps_no_exac)
132
+ cramervonmises_2samp(steps_exac, steps_no_exac)
133
+
134
+ fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
135
+ constrained_layout=True, figsize=(8, 6))
136
+ sns.histplot(hr_data[hr_data.IsExac == 0], x="heart rate", binwidth=5, binrange=[50, 100],
137
+ alpha=.6, stat="density", legend=True, ax=axes[0], color=dark[0])
138
+ axes[0].set_xlabel(None)
139
+ plt.legend(['a'])
140
+ sns.histplot(hr_data[hr_data.IsExac == 1], x="heart rate", binwidth=5, binrange=[50, 100],
141
+ alpha=.6, stat="density", legend=True, ax=axes[1], color=dark[1])
142
+ axes[1].set_xlabel(None)
143
+ fig.supxlabel('heart rate')
144
+ plt.legend(['b'])
training/lookups/README.MD ADDED
@@ -0,0 +1 @@
 
 
1
+ .
training/lookups/type_lookup.txt ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Description
2
+ "body mass index";
3
+ "body fat percentage";
4
+ "height";
5
+ "weight (body mass)";
6
+ "lean body mass";
7
+ "waist circumference";
8
+ "number of steps taken";
9
+ "distance moved walking by walking or running";
10
+ "distance moved by cycling";
11
+ "distance moved using a wheelchair";
12
+ "resting energy used";
13
+ "active energy used";
14
+ "number of flights of stairs climbed";
15
+ "NikeFuel point earned";
16
+ "amount of time moved at an an average intensity of a brisk walk or greater";
17
+ "number of pushes performed while using a wheelchair";
18
+ "distance moved by swimming";
19
+ "number of strokes taken by swimming";
20
+ "vO2 max"
21
+ "heart rate"
22
+ "body temperature"
23
+ "body temperature during rest"
24
+ "systolic blood pressure"
25
+ "diastolic blood pressure"
26
+ "respiratory rate"
27
+ "heart rate at rest"
28
+ "average heart rate during walking"
29
+ "standard deviation of heartbeat intervals"
30
+ "oxygen saturation"
31
+ "peripheral perfusion index"
32
+ "blood glucose level"
33
+ "number of times fallen"
34
+ "electrodermal activity"
35
+ "number of puffs the user takes from their inhaler"
36
+ "amount of insulin delivered"
37
+ "blood alcohol content"
38
+ "amount of air that can be forcibly exhaled from the lungs after taking the deepest breath possible"
39
+ "amount of air that can be forcibly exhaled from the lungs during the first second of a forced exhalation"
40
+ "maximum flow rate generated during a forceful exhalation"
41
+ "total amount of fat consumed"
42
+ "amount of polyunsaturated fat consumed"
43
+ "amount of monounsaturated fat consumed"
44
+ "amount of saturated fat consumed"
45
+ "amount of cholesterol consumed"
46
+ "amount of sodium consumed"
47
+ "amount of carbohydrates consumed"
48
+ "amount of fiber consumed"
49
+ "amount of sugar consumed"
50
+ "amount of energy consumed"
51
+ "amount of protein consumed"
52
+ "amount of vitamin A consumed"
53
+ "amount of vitamin B6 consumed"
54
+ "amount of vitamin B12 consumed"
55
+ "amount of vitamin C consumed"
56
+ "amount of vitamin D consumed"
57
+ "amount of vitamin E consumed"
58
+ "amount of vitamin K consumed"
59
+ "amount of calcium consumed"
60
+ "amount of iron consumed"
61
+ "amount of thiamin consumed"
62
+ "amount of riboflavin consumed"
63
+ "amount of niacin consumed"
64
+ "amount of folate consumed"
65
+ "amount of biotin consumed"
66
+ "amount of pantothenic acid consumed"
67
+ "amount of phosphorus consumed"
68
+ "amount of iodine consumed"
69
+ "amount of magnesium consumed"
70
+ "amount of zinc consumed"
71
+ "amount of selenium consumed"
72
+ "amount of copper consumed"
73
+ "amount of manganese consumed"
74
+ "amount of chromium consumed"
75
+ "amount of molybdenum consumed"
76
+ "amount of chloride consumed"
77
+ "amount of potassium consumed"
78
+ "amount of caffeine consumed"
79
+ "amount of water consumed"
80
+ "exposure to UV radiation"
81
+ "tgt ipap value 50"
82
+ "tgt ipap value 95"
83
+ "tgt ipap maximum value"
84
+ "tgt epap value 50"
85
+ "tgt epap value 95"
86
+ "tgt epap maximum value"
87
+ "leak value 50"
88
+ "leak value 95"
89
+ "leak maximum value"
90
+ "resp rate value 50"
91
+ "resp rate value 95"
92
+ "resp rate maximum value"
93
+ "ie ratio value 50"
94
+ "ie ratio value 95"
95
+ "ie ratio maximum value"
96
+ "minute vent value 50"
97
+ "minute vent value 95"
98
+ "minute vent maximum value"
99
+ "tidal vol value 50"
100
+ "tidal vol value 95"
101
+ "tidal vol maximum value"
102
+ "alveolar ventilation value 50"
103
+ "alveolar ventilation value 95"
104
+ "alveolar ventilation maximum value"
105
+ "spo2 minimum value"
106
+ "spo2 value 50"
107
+ "spo2 value 95"
108
+ "spo2 minutes below 88 Percent"
109
+ "spo2 seconds below dynamic threshold"
110
+ "spont trigg breaths"
111
+ "spont cycled breaths"
112
+ "resp events AHI"
113
+ "resp events AI"
114
+ "resp events HI"
115
+ "resp events ODI"
116
+ "amb humidity"
training/prepare_test_data.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prepare final test set for modelling (K fold encoded, scaled and imputed)."""
2
+ import copd
3
+ import json
4
+ import joblib
5
+ from lenusml import encoding
6
+ import os
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ data_dir = '<YOUR_DATA_PATH>/test_data/'
11
+ cohort_info_dir = '../data/cohort_info/'
12
+ output_data_dir = '../data/models/model1'
13
+ artifact_dir = os.path.join(output_data_dir, 'artifacts')
14
+
15
+ data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl'))
16
+
17
+ ###############################################
18
+ # Map the True/False cols to integers
19
+ ###############################################
20
+ bool_mapping = {True: 1, False: 0}
21
+ data['RequiredAcuteNIV'] = data.RequiredAcuteNIV.replace(bool_mapping)
22
+ data['RequiredICUAdmission'] = data.RequiredICUAdmission.replace(bool_mapping)
23
+
24
+ # Map the M and F sex column to binary (1=F)
25
+ sex_mapping = {'F': 1, 'M': 0}
26
+ data['Sex_F'] = data.Sex.map(sex_mapping)
27
+ data = data.drop(columns=['Sex'])
28
+
29
+ ##############################################################
30
+ # Read daily PRO responses, calculate aggregations and merge
31
+ ##############################################################
32
+ cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
33
+ delimiter="|")
34
+
35
+ symptom_diary = pd.read_csv(
36
+ os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
37
+ usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
38
+ 'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
39
+ delimiter="|")
40
+
41
+ cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
42
+ utc=True).dt.normalize()
43
+ symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
44
+ utc=True).dt.normalize()
45
+
46
+
47
+ # Filter for test patients
48
+ cat = cat[cat.PatientId.isin(data.PatientId)]
49
+ symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]
50
+
51
+ # Merge daily PROs accounting for days where patients answered the same PRO more than once
52
+ # per day
53
+ daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
54
+ symptom_diary.drop_duplicates(subset=['PatientId',
55
+ 'SubmissionTime']),
56
+ on=['PatientId', 'SubmissionTime'], how='inner')
57
+
58
+ # Calculate rolling mean on previous days for numeric PROs
59
+ numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
60
+ 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']
61
+
62
+ mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
63
+ date_col='SubmissionTime',
64
+ id_col='StudyId', window=3)
65
+
66
+ # Merge the averaged PROs with the original responses and calculate differences
67
+ daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')
68
+
69
+ daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)
70
+
71
+ # Remove the rolling average columns
72
+ daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]
73
+
74
+ # Merge PROs with full test data
75
+ test_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
76
+ columns=['StudyId']).sort_values(by='SubmissionTime'),
77
+ left_on='DateOfEvent', right_on='SubmissionTime',
78
+ by='PatientId', direction='backward')
79
+
80
+ ################################################
81
+ # Include comorbidities from Lenus service
82
+ ################################################
83
+ comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
84
+ delimiter='|')
85
+ comorbidities = comorbidities.drop(columns=['Id', 'Created'])
86
+ # Get list of comorbidities captured in the service
87
+ comorbidity_list = list(comorbidities.columns)
88
+ comorbidity_list.remove('PatientId')
89
+
90
+ # Filter for test patients
91
+ comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
92
+ print('Test patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
93
+ len(comorbidities), len(data.PatientId.unique())))
94
+ comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
95
+ bool_mapping).fillna(0)
96
+ print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())
97
+
98
+ # Merge with test data, infill nans and get counts
99
+ test_data = test_data.merge(comorbidities, on='PatientId', how='left')
100
+ print('Comorbidity counts after merging with patient days:', '\n',
101
+ test_data[comorbidity_list].sum())
102
+ test_data[comorbidity_list] = test_data[comorbidity_list].fillna(0)
103
+
104
+ # Get comorb counts for each patient
105
+ test_data['Comorbidities'] = test_data[comorbidity_list].sum(axis=1)
106
+ comorb_counts = test_data.groupby('StudyId')['Comorbidities'].max().reset_index()
107
+ print('Patient comorbidity counts after infilling missing values: \n',
108
+ comorb_counts.value_counts())
109
+
110
+ # Drop comorbidities columns from test data but retain AsthmaOverlap
111
+ comorbidity_list.remove('AsthmaOverlap')
112
+ test_data = test_data.drop(columns=comorbidity_list)
113
+
114
+ ###############################################################
115
+ # Include inhaler type from Lenus service
116
+ ###############################################################
117
+ # Load inhaler data
118
+ inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
119
+ delimiter='|', usecols=['StudyId', 'InhalerType'])
120
+ # Filter for train patients
121
+ inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
122
+ # Create new feature for triple therapy ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS')
123
+ inhaler_type = copd.triple_inhaler_therapy_service(
124
+ df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)
125
+
126
+ print('Patients taking triple inhaler therapy: ', '\n',
127
+ inhaler_type.TripleTherapy.value_counts())
128
+ test_data = test_data.merge(inhaler_type, on='StudyId', how='left')
129
+
130
+ #####################################
131
+ # Map some categorical features
132
+ #####################################
133
+
134
+ # Replace SDQ8 with strings for phlegm difficulty and infill as None where no phlegm
135
+ # reported in CAT
136
+ test_data['SymptomDiaryQ8'] = test_data.SymptomDiaryQ8.replace(
137
+ {1: 'Not difficult', 2: 'A little difficult', 3: 'Quite difficult',
138
+ 4: 'Very difficult', np.nan: 'None'})
139
+
140
+ # Replace SDQ9 with strings for phlegm consistency and infill as None where no phlegm
141
+ # reported in CAT
142
+ test_data['SymptomDiaryQ9'] = test_data.SymptomDiaryQ9.replace(
143
+ {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid', np.nan: 'None'})
144
+
145
+ # Replace SDQ10 with strings for phlegm colour and infill as None where no phlegm
146
+ # reported in CAT
147
+ test_data['SymptomDiaryQ10'] = test_data.SymptomDiaryQ10.replace(
148
+ {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green', np.nan: 'None'})
149
+
150
+ # Replace smoking status with strings
151
+ test_data['SmokingStatus'] = test_data.SmokingStatus.replace(
152
+ {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})
153
+
154
+ test_data['InExacWindow'] = test_data.IsExac.replace({0: False, 1: True})
155
+
156
+ #####################################################
157
+ # Calculate DaysSinceCAT and filter data if required
158
+ #####################################################
159
+
160
+ test_data['DaysSinceCAT'] = (test_data.DateOfEvent -
161
+ test_data.SubmissionTime).dt.days.astype('int')
162
+
163
+ DaysSinceCAT_cutoff = 14
164
+ test_data = test_data[test_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
165
+
166
+ #####################################
167
+ # Bin some numeric features
168
+ #####################################
169
+
170
+ # Bin days since last exacerbation
171
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
172
+ exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
173
+
174
+ test_data['DaysSinceLastExac'] = copd.bin_numeric_column(
175
+ col=test_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)
176
+
177
+ # Bin patient age
178
+ age_bins = [0, 50, 60, 70, 80, np.inf]
179
+ age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
180
+
181
+ test_data['Age'] = copd.bin_numeric_column(
182
+ col=test_data['Age'], bins=age_bins, labels=age_labels)
183
+
184
+ # Bin number of comorbidities
185
+ comorb_bins = [0, 1, 3, np.inf]
186
+ comorb_labels = ['None', '1-2', '3+']
187
+ test_data['Comorbidities'] = copd.bin_numeric_column(
188
+ col=test_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
189
+
190
+ comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
191
+ col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
192
+
193
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

test_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=test_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

test_data = test_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
test_data.loc[
    test_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged like the other checks.
print('FEV1PercentPredicted counts:', '\n',
      test_data['FEV1PercentPredicted'].value_counts())
206
+
207
##################################
# Service eosinophils feature
##################################
# Binary flag: highest recorded eosinophil count at or above 0.3.
test_data['HighestEosinophilCount_0_3'] = np.where(
    test_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
test_data = test_data.drop(columns=['LabsHighestEosinophilCount'])

# Target encode categorical data
categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                       'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
                       'FEV1PercentPredicted']

test_data[categorical_columns] = test_data[categorical_columns].astype("str")

# Encode test set based on entire train set.
# Fix: use a context manager so the JSON file handle is closed
# deterministically (the original open() handle was never closed).
with open(os.path.join(artifact_dir, "target_encodings.json")) as f:
    target_encodings = json.load(f)

data_encoded = encoding.apply_target_encodings(data=test_data, encodings=target_encodings,
                                               cols_to_encode=categorical_columns)
227
+
228
+ ###################################
229
+ # Scale data
230
+ ###################################
231
+ data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
232
+ 'DateOfEvent', 'SubmissionTime',
233
+ 'FirstSubmissionDate', 'LatestPredictionDate'])
234
+
235
+
236
+ scaler = joblib.load(os.path.join(artifact_dir, 'scaler.pkl'))
237
+
238
+ # Scale data ignoring patient ID and target
239
+ test_data_scaled = scaler.transform(
240
+ data_encoded.drop(columns=['StudyId', 'IsExac']))
241
+
242
+ # Place scaled results back into dataframe and add back the patient ID and cohort columns
243
+ test_data_scaled = pd.DataFrame(test_data_scaled, columns=data_encoded.drop(
244
+ columns=['StudyId', 'IsExac']).columns)
245
+ test_data_scaled.insert(0, 'StudyId', data_encoded.StudyId.values)
246
+ test_data_scaled['IsExac'] = data_encoded.IsExac.values
247
+ print('Test data scaled')
248
+
249
###################################
# Infill missing data with median
###################################
# Median imputer fitted on the train set (loaded from the run's artifacts).
imputer = joblib.load(os.path.join(artifact_dir, 'imputer.pkl'))
# Fix: removed a stray bare `imputer` expression (REPL leftover with no
# effect in a script).

# Use scaled data, ignoring patient ID and target.
test_data_imputed = imputer.transform(test_data_scaled.drop(
    columns=['StudyId', 'IsExac']))

# Place imputed results back into dataframe and add back the patient ID and target columns
test_data_imputed = pd.DataFrame(test_data_imputed, columns=test_data_scaled.drop(
    columns=['StudyId', 'IsExac']).columns)
test_data_imputed.insert(0, 'StudyId', test_data_scaled.StudyId.values)
test_data_imputed['IsExac'] = test_data_scaled.IsExac.values
print('Test data imputed')
264
+
265
+ ########################################
266
+ # Save final data
267
+ #########################################
268
+
269
+ # test data
270
+ test_data_imputed.to_pickle(os.path.join(output_data_dir, 'test_data.pkl'))
271
+ print('Final test data saved')
training/prepare_train_data.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Prepare final train set (encoded, scaled and imputed) and save artifacts."""
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

import copd
from lenusml import encoding

# Input/output locations for this modelling run.
data_dir = '<YOUR_DATA_PATH>/train_data/'
cohort_info_dir = '../data/cohort_info/'
output_data_dir = '../data/models/model1'

# Patient-day level training table produced by the upstream pipeline.
data = pd.read_pickle(os.path.join(data_dir, 'train_data.pkl'))
17
+
18
###############################################
# Map the True/False cols to integers
###############################################
# Service flags arrive as booleans; model features need 0/1.
# bool_mapping is reused later when the comorbidity flags are converted.
bool_mapping = {True: 1, False: 0}
for flag_col in ['RequiredAcuteNIV', 'RequiredICUAdmission']:
    data[flag_col] = data[flag_col].replace(bool_mapping)

# Map the M and F sex column to binary (1=F) and drop the original column.
data['Sex_F'] = data.Sex.map({'F': 1, 'M': 0})
data = data.drop(columns=['Sex'])
29
+
30
##############################################################
# Read daily PRO responses, calculate aggregations and merge
##############################################################
# CAT questionnaire export (pipe-delimited); one row per submission.
cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
                  delimiter="|")

# Symptom-diary export; only the questions used downstream are loaded.
symptom_diary = pd.read_csv(
    os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
    usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
             'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
    delimiter="|")

# Normalize timestamps to UTC midnight so the two PROs join at day granularity.
cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
                                       utc=True).dt.normalize()
symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
                                                 utc=True).dt.normalize()


# Filter for train patients
cat = cat[cat.PatientId.isin(data.PatientId)]
symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]

# Merge daily PROs accounting for days where patients answered the same PRO more than once
# per day (drop_duplicates keeps the first submission of each day).
daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
                      symptom_diary.drop_duplicates(subset=['PatientId',
                                                            'SubmissionTime']),
                      on=['PatientId', 'SubmissionTime'], how='inner')

# Calculate rolling mean on previous days for numeric PROs
numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
                'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']

# NOTE(review): duplicates are dropped per PatientId but the rolling mean
# groups by StudyId — confirm the two identifiers map 1:1 per patient.
mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
                                              date_col='SubmissionTime',
                                              id_col='StudyId', window=3)

# Merge the averaged PROs with the original responses and calculate differences
daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')

daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Remove the rolling average columns (only the diff features are retained)
daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]

# Merge PROs with full train data: for each patient day take the most recent
# PRO submission on or before DateOfEvent (backward as-of join per patient).
train_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
    columns=['StudyId']).sort_values(by='SubmissionTime'),
    left_on='DateOfEvent', right_on='SubmissionTime',
    by='PatientId', direction='backward')
80
+
81
################################################
# Include comorbidities from Lenus service
################################################
# Service comorbidity table: one row per patient, one flag column per condition.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])
# Every column other than the patient identifier is a comorbidity flag.
comorbidity_list = [col for col in comorbidities.columns if col != 'PatientId']

# Keep only patients present in the train set.
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    len(comorbidities), len(data.PatientId.unique())))
# Convert the True/False flags to 0/1 and treat missing flags as absent.
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Merge with train data, infill nans and get counts.
train_data = train_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      train_data[comorbidity_list].sum())
train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)

# Total comorbidity count per row, and a per-patient summary (max over days).
train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()

# Drop the individual comorbidity columns but retain AsthmaOverlap as a feature.
comorbidity_list.remove('AsthmaOverlap')
train_data = train_data.drop(columns=comorbidity_list)
114
+
115
###############################################################
# Include inhaler type from Lenus service
###############################################################
# Usual-therapies export: the patient's inhaler regimen.
inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
                           delimiter='|', usecols=['StudyId', 'InhalerType'])
# Keep only patients present in the train set.
inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
# Derive a triple-therapy flag ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS').
inhaler_type = copd.triple_inhaler_therapy_service(
    df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type.TripleTherapy.value_counts())
train_data = train_data.merge(inhaler_type, on='StudyId', how='left')
130
+
131
#####################################
# Map some categorical features
#####################################

# Symptom-diary phlegm questions: map numeric codes to readable labels and
# infill as 'None' where no phlegm was reported in the CAT.
sd_label_maps = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult',
                       3: 'Quite difficult', 4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid',
                       4: 'Solid', np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green',
                        np.nan: 'None'},
}
for sd_col, label_map in sd_label_maps.items():
    train_data[sd_col] = train_data[sd_col].replace(label_map)

# Replace smoking status codes with strings.
train_data['SmokingStatus'] = train_data.SmokingStatus.replace(
    {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})

# Boolean view of the target, used for inspection/plotting.
train_data['InExacWindow'] = train_data.IsExac.replace({0: False, 1: True})
156
+
157
#####################################################
# Calculate DaysSinceCAT and filter data if required
#####################################################
# Days between the prediction day and the PRO submission matched to it.
# NOTE(review): assumes every row matched a PRO in the as-of join — a
# missing SubmissionTime would make astype('int') raise; confirm upstream.
days_since_cat = (train_data.DateOfEvent - train_data.SubmissionTime).dt.days
train_data['DaysSinceCAT'] = days_since_cat.astype('int')

# Discard rows whose most recent PRO is too stale to be informative.
DaysSinceCAT_cutoff = 14
train_data = train_data[train_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
166
+
167
#####################################
# Bin some numeric features
#####################################

# Bin days since last exacerbation (the -1..0 bin captures 'no previous one').
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
train_data['DaysSinceLastExac'] = copd.bin_numeric_column(
    col=train_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)

# Bin patient age into decades.
age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
train_data['Age'] = copd.bin_numeric_column(
    col=train_data['Age'], bins=age_bins, labels=age_labels)

# Bin number of comorbidities.
comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']
train_data['Comorbidities'] = copd.bin_numeric_column(
    col=train_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
# Binned per-patient counts retained for cohort summaries.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
193
+
194
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
train_data.loc[
    train_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged like the other checks.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
207
+
208
##################################
# Service eosinophils feature
##################################
# Binary flag: highest recorded eosinophil count at or above 0.3.
train_data['HighestEosinophilCount_0_3'] = np.where(
    train_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])

# Target encode categorical data.
categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                       'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
                       'FEV1PercentPredicted']
train_data[categorical_columns] = train_data[categorical_columns].astype("str")

# Get encodings from entire train set (to be retained for holdout test
# or new patients).
target_encodings = encoding.get_target_encodings(train_data=train_data,
                                                 cols_to_encode=categorical_columns,
                                                 target='IsExac')

# Encode entire train set with those encodings.
data_encoded = encoding.apply_target_encodings(data=train_data,
                                               encodings=target_encodings,
                                               cols_to_encode=categorical_columns)
232
+
233
###################################
# Scale data
###################################
# Identifier/date columns are not model features.
data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
                                          'DateOfEvent', 'SubmissionTime',
                                          'FirstSubmissionDate', 'LatestPredictionDate'])

# Feature columns exclude the patient ID and the target.
feature_columns = data_encoded.columns.drop(['StudyId', 'IsExac'])

# Fit the scaler on the features only, then rebuild a labelled dataframe.
scaler = MinMaxScaler()
train_data_scaled = pd.DataFrame(scaler.fit_transform(data_encoded[feature_columns]),
                                 columns=feature_columns)
# Re-attach the patient ID and target columns.
train_data_scaled.insert(0, 'StudyId', data_encoded.StudyId.values)
train_data_scaled['IsExac'] = data_encoded.IsExac.values
print('Train data scaled')
251
+
252
###################################
# Infill missing data with median
###################################
# Median imputation on the scaled features (patient ID and target excluded).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

imputed_columns = train_data_scaled.columns.drop(['StudyId', 'IsExac'])
train_data_imputed = pd.DataFrame(
    imputer.fit_transform(train_data_scaled[imputed_columns]),
    columns=imputed_columns)
# Re-attach the patient ID and target columns.
train_data_imputed.insert(0, 'StudyId', train_data_scaled.StudyId.values)
train_data_imputed['IsExac'] = train_data_scaled.IsExac.values
print('Train data imputed')
267
+
268
############################################
# Save encodings, imputer and scaler
############################################
artifact_dir = os.path.join(output_data_dir, 'artifacts')
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between runs.
# Fix: only unlink regular files — os.remove raises on subdirectories.
for entry in os.listdir(artifact_dir):
    entry_path = os.path.join(artifact_dir, entry)
    if os.path.isfile(entry_path):
        os.remove(entry_path)


# Encodings. Fix: use a context manager so the file handle is flushed and
# closed deterministically (the original open() handle was never closed).
with open(os.path.join(artifact_dir, 'target_encodings.json'), 'w') as f:
    json.dump(target_encodings, f)
# Scaler
joblib.dump(scaler, os.path.join(artifact_dir, 'scaler.pkl'))
print('Minmax scaler saved')

# Imputer
joblib.dump(imputer, os.path.join(artifact_dir, 'imputer.pkl'))
print('Median imputer saved')

########################################
# Save final data
#########################################

# Train data
train_data_imputed.to_pickle(os.path.join(output_data_dir, 'train_data.pkl'))
print('Final train data saved')
training/prepare_train_data_crossval.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prepare final train set for cross-validation (K fold encoded, scaled and imputed)."""
2
+ import copd
3
+ # import matplotlib.pyplot as plt
4
+ from lenusml import crossvalidation
5
+ import os
6
+ import pandas as pd
7
+ import numpy as np
8
+ import seaborn as sns
9
+ from sklearn.preprocessing import MinMaxScaler
10
+ from sklearn.impute import SimpleImputer
11
+
12
+ sns.set(style='darkgrid', context='talk')
13
+ sns.set_palette('dark')
14
+ muted = sns.palettes.color_palette(palette='muted')
15
+ dark = sns.palettes.color_palette(palette='dark')
16
+
17
+ data_dir = '<YOUR_DATA_PATH>/train_data/'
18
+ cohort_info_dir = '../data/cohort_info/'
19
+ output_data_dir = '../data/models/model1'
20
+
21
+ fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
22
+ allow_pickle=True)
23
+
24
+ data = pd.read_pickle(os.path.join(data_dir, 'train_data.pkl'))
25
+
26
+ exacs = data[data.IsExac == 1]
27
+ exac_patients = exacs.StudyId.unique()
28
+ # non_exac_patients = np.setdiff1d(data.StudyId, exac_patients)
29
+ # len(non_exac_patients)
30
+ # exac_counts = exacs.groupby('StudyId')['IsExac'].count().reset_index()
31
+ # exac_counts = pd.concat([exac_counts,
32
+ # pd.DataFrame({'StudyId': non_exac_patients,
33
+ # 'IsExac': len(non_exac_patients)*[0]})])
34
+ #
35
+ # exac_counts = exac_counts.merge(data[['StudyId', 'Sex', 'SmokingStatus',
36
+ # 'RequiredAcuteNIV', 'RequiredICUAdmission']],
37
+ # on='StudyId', how='left')
38
+
39
+ ###############################################
40
+ # Map the True/False cols to integers
41
+ ###############################################
42
+ bool_mapping = {True: 1, False: 0}
43
+ data['RequiredAcuteNIV'] = data.RequiredAcuteNIV.replace(bool_mapping)
44
+ data['RequiredICUAdmission'] = data.RequiredICUAdmission.replace(bool_mapping)
45
+
46
+ # Map the M and F sex column to binary (1=F)
47
+ sex_mapping = {'F': 1, 'M': 0}
48
+ data['Sex_F'] = data.Sex.map(sex_mapping)
49
+ data = data.drop(columns=['Sex'])
50
+
51
+ ##############################################################
52
+ # Read daily PRO responses, calculate aggregations and merge
53
+ ##############################################################
54
+ cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
55
+ delimiter="|")
56
+
57
+ symptom_diary = pd.read_csv(
58
+ os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
59
+ usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
60
+ 'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
61
+ delimiter="|")
62
+
63
+ cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
64
+ utc=True).dt.normalize()
65
+ symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
66
+ utc=True).dt.normalize()
67
+
68
+
69
+ # Filter for train patients
70
+ cat = cat[cat.PatientId.isin(data.PatientId)]
71
+ symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]
72
+
73
+ # Merge daily PROs accounting for days where patients answered the same PRO more than once
74
+ # per day
75
+ daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
76
+ symptom_diary.drop_duplicates(subset=['PatientId',
77
+ 'SubmissionTime']),
78
+ on=['PatientId', 'SubmissionTime'], how='inner')
79
+
80
+ # Calculate rolling mean on previous days for numeric PROs
81
+ numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
82
+ 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']
83
+
84
+ mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
85
+ date_col='SubmissionTime',
86
+ id_col='StudyId', window=3)
87
+
88
+ # Merge the averaged PROs with the original responses and calculate differences
89
+ daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')
90
+
91
+ daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)
92
+
93
+ # Remove the rolling average columns
94
+ daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]
95
+
96
+ # Merge PROs with full train data
97
+ train_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
98
+ columns=['StudyId']).sort_values(by='SubmissionTime'),
99
+ left_on='DateOfEvent', right_on='SubmissionTime',
100
+ by='PatientId', direction='backward')
101
+
102
+ ################################################
103
+ # Include comorbidities from Lenus service
104
+ ################################################
105
+ comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
106
+ delimiter='|')
107
+ comorbidities = comorbidities.drop(columns=['Id', 'Created'])
108
+ # Get list of comorbidities captured in the service
109
+ comorbidity_list = list(comorbidities.columns)
110
+ comorbidity_list.remove('PatientId')
111
+
112
+ # Filter for train patients
113
+ comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
114
+ print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
115
+ len(comorbidities), len(data.PatientId.unique())))
116
+ comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
117
+ bool_mapping).fillna(0)
118
+ print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())
119
+
120
+ # Merge with train data, infill nans and get counts
121
+ train_data = train_data.merge(comorbidities, on='PatientId', how='left')
122
+ print('Comorbidity counts after merging with patient days:', '\n',
123
+ train_data[comorbidity_list].sum())
124
+ train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)
125
+
126
+ # Get comorb counts for each patient
127
+ train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
128
+ comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()
129
+ # print('Patient comorbidity counts after infilling missing values: \n',
130
+ # comorb_counts.value_counts())
131
+
132
+ comorb_counts.loc[comorb_counts.StudyId.isin(exac_patients), 'IsExacPatient'] = 1
133
+ comorb_counts['IsExacPatient'] = comorb_counts['IsExacPatient'].fillna(0)
134
+
135
+ # Drop comorbidities columns from train data but retain AsthmaOverlap
136
+ comorbidity_list.remove('AsthmaOverlap')
137
+ train_data = train_data.drop(columns=comorbidity_list)
138
+
139
+ ###############################################################
140
+ # Include inhaler type from Lenus service
141
+ ###############################################################
142
+ # Load inhaler data
143
+ inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
144
+ delimiter='|', usecols=['StudyId', 'InhalerType'])
145
+ # Filter for train patients
146
+ inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
147
+ # Create new feature for triple therapy ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS')
148
+ inhaler_type = copd.triple_inhaler_therapy_service(
149
+ df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)
150
+
151
+ print('Patients taking triple inhaler therapy: ', '\n',
152
+ inhaler_type.TripleTherapy.value_counts())
153
+ train_data = train_data.merge(inhaler_type, on='StudyId', how='left')
154
+
155
+ #####################################
156
+ # Map some categorical features
157
+ #####################################
158
+
159
+ # Replace SDQ8 with strings for phlegm difficulty and infill as None where no phlegm
160
+ # reported in CAT
161
+ train_data['SymptomDiaryQ8'] = train_data.SymptomDiaryQ8.replace(
162
+ {1: 'Not difficult', 2: 'A little difficult', 3: 'Quite difficult',
163
+ 4: 'Very difficult', np.nan: 'None'})
164
+
165
+ # Replace SDQ9 with strings for phlegm consistency and infill as None where no phlegm
166
+ # reported in CAT
167
+ train_data['SymptomDiaryQ9'] = train_data.SymptomDiaryQ9.replace(
168
+ {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid', np.nan: 'None'})
169
+
170
+ # Replace SDQ10 with strings for phlegm colour and infill as None where no phlegm
171
+ # reported in CAT
172
+ train_data['SymptomDiaryQ10'] = train_data.SymptomDiaryQ10.replace(
173
+ {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green', np.nan: 'None'})
174
+
175
+ # Replace smoking status with strings
176
+ train_data['SmokingStatus'] = train_data.SmokingStatus.replace(
177
+ {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})
178
+
179
+ train_data['InExacWindow'] = train_data.IsExac.replace({0: False, 1: True})
180
+
181
+ #####################################################
182
+ # Calculate DaysSinceCAT and filter data if required
183
+ #####################################################
184
+
185
+ train_data['DaysSinceCAT'] = (train_data.DateOfEvent -
186
+ train_data.SubmissionTime).dt.days.astype('int')
187
+
188
+ DaysSinceCAT_cutoff = 14
189
+ train_data = train_data[train_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
190
+ #####################################
191
+ # Bin some numeric features
192
+ #####################################
193
+
194
+ # Bin days since last exacerbation
195
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
196
+ exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
197
+
198
+ train_data['DaysSinceLastExac'] = copd.bin_numeric_column(
199
+ col=train_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)
200
+
201
+ # Bin patient age
202
+ age_bins = [0, 50, 60, 70, 80, np.inf]
203
+ age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
204
+
205
+ train_data['Age'] = copd.bin_numeric_column(
206
+ col=train_data['Age'], bins=age_bins, labels=age_labels)
207
+
208
+ # Bin number of comorbidities
209
+ comorb_bins = [0, 1, 3, np.inf]
210
+ comorb_labels = ['None', '1-2', '3+']
211
+ train_data['Comorbidities'] = copd.bin_numeric_column(
212
+ col=train_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
213
+
214
+ comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
215
+ col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
216
+
217
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
train_data.loc[
    train_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
230
+
231
+ ##################################
232
+ # Service eosinophils feature
233
+ ##################################
234
+ train_data['HighestEosinophilCount_0_3'] = np.where(
235
+ train_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
236
+ train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])
237
+
238
+ # import matplotlib.pyplot as plt
239
+ # def plot_categorical_against_target(*, df, column, target, savefig=False,
240
+ # output_dir=None, label_rotation=None,
241
+ # category_order=None):
242
+ # (df.groupby(target)[column].value_counts(normalize=True).mul(100).rename('Percent')
243
+ # .reset_index().pipe((sns.catplot, 'data'), x=column, y='Percent', hue=target,
244
+ # kind='bar', alpha=0.8, order=category_order))
245
+ # if label_rotation:
246
+ # plt.xticks(rotation=label_rotation, ha='right', rotation_mode='anchor')
247
+ # if savefig:
248
+ # plt.savefig(os.path.join(output_dir, column + '.png'), bbox_inches='tight',
249
+ # dpi=150)
250
+
251
+ # plot_categorical_against_target(df=train_data, column= 'SymptomDiaryQ10',
252
+ # target='InExacWindow', label_rotation=45,
253
+ # category_order=None, savefig=True,
254
+ # output_dir='../data/plots/')
255
+ # plt.show()
256
+
257
+ # plot_categorical_against_target(df=eosinophils, column= 'HighestEosinophilCount_0_3',
258
+ # target='IsExacPatient', label_rotation=None,
259
+ # category_order=None, savefig=True,
260
+ # output_dir='../data/plots/')
261
+ # plt.show()
262
+ # categorical_cols = ['Sex_F', 'RequiredAcuteNIV', 'RequiredICUAdmission',
263
+ # 'SmokingStatus', 'Comorbidities',
264
+ # 'CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
265
+ # 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'SymptomDiaryQ3']
266
+
267
+ # for column in categorical_cols:
268
+ # plot_categorical_against_target(df=train_data, column=column, target='InExacWindow',
269
+ # savefig=True, output_dir='../data/plots/')
270
+
271
+
272
+ # def plot_numerical_against_target(*, df, column, target, bins=10, savefig=False,
273
+ # output_dir=None):
274
+ # sns.displot(x=column, hue=target, data=df, stat='density', bins=bins,
275
+ # common_norm=False)
276
+ # if savefig:
277
+ # plt.savefig(os.path.join(output_dir, column + '.png'), bbox_inches='tight',
278
+ # dpi=150)
279
+
280
+ # for col in numeric_pros:
281
+ # plot_numerical_against_target(
282
+ # df=train_data, column=col + '_diff', target='InExacWindow', bins=10,
283
+ # savefig=True, output_dir='../data/plots')
284
+ # plt.show()
285
+ # plot_numerical_against_target(
286
+ # df=spirometry, column='LungFunction_FEV1PercentPredicted',
287
+ # target='IsExacPatient', bins=20,
288
+ # savefig=True, output_dir='../data/plots')
289
+ # plt.show()
290
+
291
+ categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
292
+ 'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
293
+ 'FEV1PercentPredicted']
294
+ train_data[categorical_columns] = train_data[categorical_columns].astype("str")
295
+ data_encoded = copd.kfold_encode_train_data(df=train_data, fold_patients=fold_patients,
296
+ cols_to_encode=categorical_columns,
297
+ target='IsExac', id_col='StudyId')
298
+ data_encoded = data_encoded.drop(columns=categorical_columns, axis=1)
299
+
300
+ ###################################
301
+ # Scale data
302
+ ###################################
303
+ data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
304
+ 'DateOfEvent', 'SubmissionTime',
305
+ 'FirstSubmissionDate', 'LatestPredictionDate'])
306
+
307
+ scaler = MinMaxScaler()
308
+ train_data_scaled = crossvalidation.kfold_process_train_data(df=data_encoded,
309
+ fold_patients=fold_patients,
310
+ processor=scaler,
311
+ id_col='StudyId',
312
+ target='IsExac')
313
+
314
+ ###################################
315
+ # Infill missing data with median
316
+ ###################################
317
+ # K-fold impute data with the median
318
+ imputer = SimpleImputer(missing_values=np.nan, strategy='median')
319
+ train_data_imputed = crossvalidation.kfold_process_train_data(df=train_data_scaled,
320
+ fold_patients=fold_patients,
321
+ processor=imputer,
322
+ id_col='StudyId',
323
+ target='IsExac')
324
+ #########################################
325
+ # Save final data
326
+ #########################################
327
+
328
+ # Train data
329
+ os.makedirs(output_data_dir, exist_ok=True)
330
+ train_data_imputed.to_pickle(os.path.join(output_data_dir, 'train_data_cv.pkl'))
331
+ print('Final train data saved (CV)')
training/tests/__init__.py ADDED
File without changes
training/tests/test_apply_logic_response_criterion.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the apply_logic_response_criterion function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def exacerbation_event():
11
+ """Dataframe index (27) of the exacerbation event of interest."""
12
+ return 27
13
+
14
+
15
+ @pytest.fixture
16
+ def first_pro_response():
17
+ """Dataframe index (8) of the first weekly PRO response."""
18
+ return 8
19
+
20
+
21
+ @pytest.fixture
22
+ def second_pro_response(first_pro_response):
23
+ """Dataframe index of the second weekly PRO response. Seven days after first."""
24
+ return first_pro_response + 7
25
+
26
+
27
+ @pytest.fixture
28
+ def input_df(exacerbation_event):
29
+ """Sample input dataframe template - specific cases to be added in each test.
30
+
31
+ This initial dataframe has no PRO responses between the initial exac and the event of
32
+ interest with DaysSinceLastExac=25. Interim PRO responses should be added in tests.
33
+ """
34
+ df = pd.DataFrame({'PatientId': ['1'] * 31,
35
+ 'DateOfEvent': pd.date_range('2022-01-01', '2022-01-31'),
36
+ 'Q5Answered': [0] * 31,
37
+ 'NegativeQ5': [np.nan] * 31,
38
+ 'IsExac': [0] * 31,
39
+ 'DaysSinceLastExac': [-1, -1, -1] + list(np.arange(1, 26)) +
40
+ list(np.arange(1, 4))})
41
+ # Add initial event to simulate DaysSinceLastExac restart from 1
42
+ df.loc[2, 'Q5Answered'] = 1
43
+ df.loc[2, 'NegativeQ5'] = 0
44
+ df.loc[2, 'IsExac'] = 1
45
+ # Add event of interest (DaysSinceLastExac = 25)
46
+ df.loc[exacerbation_event, 'Q5Answered'] = 1
47
+ df.loc[exacerbation_event, 'NegativeQ5'] = 0
48
+ df.loc[exacerbation_event, 'IsExac'] = 1
49
+
50
+ # Add a negative response after the event of interest (should not be counted)
51
+ df.loc[exacerbation_event + 2, 'Q5Answered'] = 1
52
+ df.loc[exacerbation_event + 2, 'NegativeQ5'] = 1
53
+ return df
54
+
55
+
56
+ def test_output_equals_expected_criterion_failed(input_df, exacerbation_event):
57
+ """Test output is as expected for failed LOGIC response criterion."""
58
+ # Output should be same as input with additional RemoveExac column
59
+ expected_df = input_df.copy()
60
+ # Insert RemoveExac column with exac flagged for removal
61
+ expected_df['RemoveExac'] = np.nan
62
+ expected_df.loc[exacerbation_event, 'RemoveExac'] = 1
63
+ output_df = copd.apply_logic_response_criterion(input_df)
64
+ pd.testing.assert_frame_equal(output_df, expected_df)
65
+
66
+
67
+ def test_output_equals_expected_criterion_passed(
68
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
69
+ """Test output is as expected for passed LOGIC response criterion."""
70
+ # Add PRO responses needed to pass LOGIC criterion
71
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
72
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
73
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
74
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
75
+
76
+ # Output should be same as input with additional RemoveExac column
77
+ expected_df = input_df.copy()
78
+ # Insert RemoveExac column with exac flagged for removal
79
+ expected_df['RemoveExac'] = np.nan
80
+ expected_df.loc[exacerbation_event, 'RemoveExac'] = 0
81
+ output_df = copd.apply_logic_response_criterion(input_df)
82
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_bin_numeric_column.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the bin_numeric_column function."""
2
+ import copd
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+
7
+ def test_binned_ages():
8
+ """Test output is as expected for typical age binning values."""
9
+ age_bins = [0, 50, 60, 70, 80, np.inf]
10
+ labels = ['<50', '50-59', '60-69', '70-79', '80+']
11
+ df = pd.DataFrame({'Age': [10, 49, 50, 55, 59, 60, 65, 69, 70, 75, 79, 80, 85, 100]})
12
+ output = copd.bin_numeric_column(col=df['Age'], bins=age_bins, labels=labels)
13
+ assert list(output.values) == ['<50', '<50', '50-59', '50-59', '50-59', '60-69',
14
+ '60-69', '60-69', '70-79', '70-79', '70-79', '80+',
15
+ '80+', '80+']
16
+
17
+
18
+ def test_binned_days_since():
19
+ """Test output is as expected for typical days since last exac binning."""
20
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
21
+ labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
22
+ df = pd.DataFrame({'DaysSince': [-1, 0, 10, 21, 25, 89, 90, 150, 179, 180, 200]})
23
+ output = copd.bin_numeric_column(col=df['DaysSince'], bins=exac_bins, labels=labels)
24
+ assert list(output) == ['None', '<21 days', '<21 days', '21 - 89 days',
25
+ '21 - 89 days', '21 - 89 days', '90 - 179 days',
26
+ '90 - 179 days', '90 - 179 days', '>= 180 days',
27
+ '>= 180 days']
28
+
29
+
30
+ def test_binned_comorbidities():
31
+ """Test output is as expected for typical comorbidity count binning."""
32
+ comorb_bins = [0, 1, 3, np.inf]
33
+ labels = ['None', '1-2', '3+']
34
+ df = pd.DataFrame({'Comorbs': [0, 1, 2, 3, 4, 5]})
35
+ output = copd.bin_numeric_column(col=df['Comorbs'], bins=comorb_bins, labels=labels)
36
+ assert list(output) == ['None', '1-2', '1-2', '3+', '3+', '3+']
training/tests/test_calculate_days_since_last_event.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the calculate_days_since_last_event function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+
6
+
7
+ def test_output_equals_expected_exac():
8
+ """Compare the output and expected dataframes."""
9
+ input_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
10
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
11
+
12
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
13
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
14
+ 'DaysSinceLastExac': [-1, -1, -1, 1, 2, 3, 1, 1, 2, 3, 4,
15
+ 5, 6, 7, 8]})
16
+
17
+ output_df = copd.calculate_days_since_last_event(
18
+ df=input_df, event_col='IsExac', output_col='DaysSinceLastExac')
19
+ pd.testing.assert_frame_equal(output_df, expected_df)
20
+
21
+
22
+ def test_output_equals_expected_rescue_meds():
23
+ """Compare the output and expected dataframes."""
24
+ input_df = pd.DataFrame({
25
+ 'Date': pd.date_range('2022-02-01', '2022-02-15'),
26
+ 'IsRescueMedExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]})
27
+
28
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-02-01', '2022-02-15'),
29
+ 'IsRescueMedExac': [
30
+ 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
31
+ 'DaysSinceLastRescueMeds': [
32
+ -1, -1, -1, 1, 2, 3, 1, 1, 2, 3, 4, 5, 1, 2, 3]})
33
+
34
+ output_df = copd.calculate_days_since_last_event(
35
+ df=input_df, event_col='IsRescueMedExac', output_col='DaysSinceLastRescueMeds')
36
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_define_hospital admission.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the define_hospital_admission function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("event",
9
+ ['Hospital admission - emergency, COPD related',
10
+ 'Hospital admission - emergency, COPD unrelated'])
11
+ def test_admission_event(event):
12
+ """Test patient event definitions for hospital admissions."""
13
+ assert copd.define_hospital_admission(
14
+ pd.Series(event)) == 1
15
+
16
+
17
+ @pytest.mark.parametrize("event",
18
+ ['Death',
19
+ 'NHS 24 review - emergency, COPD related',
20
+ 'Exacerbation - self-managed with rescue pack',
21
+ 'GP review - emergency, COPD related',
22
+ 'Emergency department attendance, COPD related',
23
+ 'Exacerbation - started abs/steroid by clinical team'])
24
+ def test_non_admission_event(event):
25
+ """Test patient event definitions for non hospital admission events."""
26
+ assert copd.define_hospital_admission(
27
+ pd.Series(event)) == 0
training/tests/test_define_service_exac_event.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the define_service_exac_event function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("event",
9
+ ['Hospital admission - emergency, COPD related',
10
+ 'GP review - emergency, COPD related',
11
+ 'Emergency department attendance, COPD related',
12
+ 'Exacerbation - started abs/steroid by clinical team'])
13
+ def test_positive_event_no_community(event):
14
+ """Test patient event definitions for COPD exacerbations - no community events."""
15
+ assert copd.define_service_exac_event(events=pd.Series(event)) == 1
16
+
17
+
18
+ @pytest.mark.parametrize("event",
19
+ ['Hospital admission - emergency, COPD unrelated',
20
+ 'Death',
21
+ 'NHS 24 review - emergency, COPD related',
22
+ 'Exacerbation - self-managed with rescue pack'])
23
+ def test_negative_event_no_community(event):
24
+ """Test patient event definitions for non-exac events - no community events."""
25
+ assert copd.define_service_exac_event(events=pd.Series(event)) == 0
26
+
27
+
28
+ @pytest.mark.parametrize("event",
29
+ ['Hospital admission - emergency, COPD related',
30
+ 'GP review - emergency, COPD related',
31
+ 'Emergency department attendance, COPD related',
32
+ 'Exacerbation - started abs/steroid by clinical team',
33
+ 'Exacerbation - self-managed with rescue pack'])
34
+ def test_positive_event_with_community(event):
35
+ """Test patient event definitions for COPD exacerbations - with community events."""
36
+ assert copd.define_service_exac_event(events=pd.Series(event),
37
+ include_community=(True)) == 1
38
+
39
+
40
+ @pytest.mark.parametrize("event",
41
+ ['Hospital admission - emergency, COPD unrelated',
42
+ 'Death',
43
+ 'NHS 24 review - emergency, COPD related'])
44
+ def test_negative_event_with_community(event):
45
+ """Test patient event definitions for non-exac events - with community events."""
46
+ assert copd.define_service_exac_event(events=pd.Series(event),
47
+ include_community=(True)) == 0
training/tests/test_extract_clinician_verified_exacerbations.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the extract_clinician_verified_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data.
12
+
13
+ Input data covers the following cases:
14
+ 1. Non-verified events (input rows 0 and 3)
15
+ 2. Verified exacerbation with known date (row 1)
16
+ 3. Verified exacerbation with unknown date (row 2)
17
+ """
18
+ return pd.DataFrame({'StudyId': [1, '2a', 1, 4],
19
+ 'Exacerbation confirmed': [0, 1, 1, 0],
20
+ 'DateRecorded': pd.to_datetime(['2022-01-03', '2022-01-05',
21
+ '2022-01-06', '2022-01-09']),
22
+ 'New Date': [np.nan, '2022-01-05', np.nan, np.nan],
23
+ 'Date changed': [np.nan, 1, 0, np.nan],
24
+ 'Extra column': [1, 3, 'a', '4']})
25
+
26
+
27
+ @pytest.fixture
28
+ def expected_df():
29
+ """Define expected output dataframe."""
30
+ return pd.DataFrame({'StudyId': ['2a', 1],
31
+ 'DateOfEvent': pd.to_datetime(pd.Series(['2022-01-05',
32
+ '2022-01-06']),
33
+ utc=True).dt.normalize(),
34
+ 'IsCommExac': [1, 1],
35
+ 'ExacDateUnknown': [0, 1]})
36
+
37
+
38
+ def test_output_equals_expected(input_df, expected_df):
39
+ """Test output is as expected."""
40
+ output_df = copd.extract_clinician_verified_exacerbations(input_df).reset_index(
41
+ drop=True)
42
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_filter_symptom_diary.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the filter_symptom_diary function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df():
10
+ """Sample input data."""
11
+ return pd.DataFrame({'PatientId': [1, '2a', 1, '2a', 3, 4, 4, 5, 5, 4],
12
+ 'SubmissionTime': pd.date_range('2022-01-01', '2022-01-10')})
13
+
14
+
15
+ def test_output_no_date_cutoff(input_df):
16
+ """Test output is as expected when called without a date cut off."""
17
+ output_df = copd.filter_symptom_diary(df=input_df, patients=[1, '2a', 3])
18
+ expected_df = pd.DataFrame({'PatientId': [1, '2a', 1, '2a', 3],
19
+ 'SubmissionTime':
20
+ pd.date_range('2022-01-01', '2022-01-05', tz='utc')})
21
+ pd.testing.assert_frame_equal(output_df, expected_df)
22
+
23
+
24
+ def test_output_with_date_cutoff(input_df):
25
+ """Test output is as expected when called with a date cut off."""
26
+ output_df = copd.filter_symptom_diary(df=input_df, patients=[1, '2a', 3],
27
+ date_cutoff='2022-01-03').reset_index(drop=True)
28
+ expected_df = pd.DataFrame({'PatientId': [1, '2a', 3],
29
+ 'SubmissionTime':
30
+ pd.date_range('2022-01-03', '2022-01-05', tz='utc')})
31
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_get_logic_exacerbation_indices.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the get_logic_exacerbation_indices function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ def test_returns_empty_list_for_no_exacs():
10
+ """Test output for input with no exacerbations."""
11
+ input_df = pd.DataFrame({'DaysSinceLastExac': np.arange(15, 20),
12
+ 'IsExac': [0, 0, 0, 0, 0]})
13
+ output_list = copd.get_logic_exacerbation_indices(input_df)
14
+ assert not output_list
15
+
16
+
17
+ @pytest.fixture
18
+ def input_df_no_relevant_exacs():
19
+ """Sample input data containing no relevant exacerbations."""
20
+ return pd.DataFrame({'DaysSinceLastExac': np.arange(40, 45),
21
+ 'IsExac': [0, 0, 1, 0, 0]})
22
+
23
+
24
+ def test_returns_empty_list_for_no_relevant_exacs_default(input_df_no_relevant_exacs):
25
+ """Test output for input with no relevant exacerbations. Default options."""
26
+ output_list = copd.get_logic_exacerbation_indices(input_df_no_relevant_exacs)
27
+ assert not output_list
28
+
29
+
30
+ def test_returns_empty_list_for_no_relevant_exacs_non_default(input_df_no_relevant_exacs):
31
+ """Test output for input with no relevant exacerbations. Specified time window."""
32
+ output_list = copd.get_logic_exacerbation_indices(input_df_no_relevant_exacs,
33
+ minimum_period=20,
34
+ maximum_period=38)
35
+ assert not output_list
36
+
37
+
38
+ @pytest.fixture
39
+ def input_df_with_exacs():
40
+ """Sample input data containing exacerbations."""
41
+ return pd.DataFrame({'DaysSinceLastExac': [15, 20, 42, 37, 22, 18],
42
+ 'IsExac': [1, 0, 1, 1, 1, 0]})
43
+
44
+
45
+ def test_returns_relevant_exacs_default(input_df_with_exacs):
46
+ """Test output for input with relevant exacerbations. Default options."""
47
+ output_list = copd.get_logic_exacerbation_indices(input_df_with_exacs)
48
+ assert output_list == [0, 4]
49
+
50
+
51
+ def test_returns_relevant_exacs_non_default(input_df_with_exacs):
52
+ """Test output for input with relevant exacerbations. Specified time window."""
53
+ output_list = copd.get_logic_exacerbation_indices(input_df_with_exacs,
54
+ minimum_period=20,
55
+ maximum_period=38)
56
+ assert output_list == [3, 4]
training/tests/test_get_rescue_med_pro_responses.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the get_rescue_med_pro_responses function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data."""
12
+ return pd.DataFrame({'PatientId': [1, '2a', 1],
13
+ 'SymptomDiaryQ5': [0, 1, np.nan]})
14
+
15
+
16
+ @pytest.fixture
17
+ def expected_df():
18
+ """Define expected output dataframe."""
19
+ return pd.DataFrame({'PatientId': [1, '2a'],
20
+ 'SymptomDiaryQ5': [0, 1],
21
+ 'Q5Answered': [1, 1],
22
+ 'NegativeQ5': [1, 0],
23
+ 'IsCommExac': [0, 1]})
24
+
25
+
26
+ def test_output_equals_expected(input_df, expected_df):
27
+ """Test output is as expected."""
28
+ output_df = copd.get_rescue_med_pro_responses(input_df)
29
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_logic_consecutive_negative_responses.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the logic_consecutive_negative_responses function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def exacerbation_event():
11
+ """Dataframe index (27) of the exacerbation event of interest."""
12
+ return 27
13
+
14
+
15
+ @pytest.fixture
16
+ def first_pro_response():
17
+ """Dataframe index (8) of the first weekly PRO response."""
18
+ return 8
19
+
20
+
21
+ @pytest.fixture
22
+ def second_pro_response(first_pro_response):
23
+ """Dataframe index of the second weekly PRO response. Seven days after first."""
24
+ return first_pro_response + 7
25
+
26
+
27
+ @pytest.fixture
28
+ def third_pro_response(second_pro_response):
29
+ """Dataframe index of the third weekly PRO response. Seven days after second."""
30
+ return second_pro_response + 7
31
+
32
+
33
+ @pytest.fixture
34
+ def input_df(exacerbation_event):
35
+ """Sample input dataframe template - specific cases to be added in each test.
36
+
37
+ This initial dataframe has no PRO responses between the initial exacerbation at index
38
+ 2 and the event of interest with DaysSinceLastExac=25 at index exacerbation_event (set
39
+ to 27). Interim PRO responses should be added in tests. Each row is a different day
40
+ (in chronological order). Add/subtract N from exacerbation_event to refer to N days
41
+ before or after the event by the dataframe index, e.g. exacerbation_event - 7 refers
42
+ to the day a week prior.
43
+ """
44
+ df = pd.DataFrame({'PatientId': ['1'] * 31,
45
+ 'DateOfEvent': pd.date_range('2022-01-01', '2022-01-31'),
46
+ 'Q5Answered': [0] * 31,
47
+ 'NegativeQ5': [np.nan] * 31,
48
+ 'DaysSinceLastExac': [-1, -1, -1] + list(np.arange(1, 26)) +
49
+ list(np.arange(1, 4))})
50
+ # Add initial event to simulate DaysSinceLastExac restart from 1
51
+ df.loc[2, 'Q5Answered'] = 1
52
+ df.loc[2, 'NegativeQ5'] = 0
53
+ # Add event of interest (DaysSinceLastExac = 25)
54
+ df.loc[exacerbation_event, 'Q5Answered'] = 1
55
+ df.loc[exacerbation_event, 'NegativeQ5'] = 0
56
+
57
+ # Add a negative response 2 days after the event of interest (should not be counted)
58
+ df.loc[exacerbation_event + 2, 'Q5Answered'] = 1
59
+ df.loc[exacerbation_event + 2, 'NegativeQ5'] = 1
60
+ return df
61
+
62
+
63
+ def test_returns_one_when_no_responses(input_df, exacerbation_event):
64
+ """Verify returns 1 (flag for removal) for no interim PRO responses."""
65
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
66
+
67
+
68
+ def test_returns_one_too_few_responses(input_df, exacerbation_event):
69
+ """Verify returns 1 (flag for removal) for too few interim PRO responses."""
70
+ # Add a single negative response 7 days before the exacerbation event. Should fail PRO
71
+ # LOGIC because the negative response at index 29 is after the event of interest.
72
+ input_df.loc[exacerbation_event - 7, 'Q5Answered'] = 1
73
+ input_df.loc[exacerbation_event - 7, 'NegativeQ5'] = 1
74
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
75
+
76
+
77
+ def test_returns_one_too_few_negative_responses(
78
+ input_df, exacerbation_event, second_pro_response, third_pro_response):
79
+ """Verify returns 1 (flag for removal) for too few interim PRO responses."""
80
+ # Add a positive response and a single negative response. Should return one because
81
+ # the response at index 29 is after the period of interest.
82
+
83
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
84
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 0
85
+
86
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
87
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
88
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
89
+
90
+
91
+ def test_returns_one_too_few_consecutive_negative_responses_missing(
92
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
93
+ third_pro_response):
94
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
95
+
96
+ Input has a missing response between the two negative responses.
97
+ """
98
+ # Add negative responses at indices 8 and 22 (missing response at 15)
99
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
100
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
101
+
102
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
103
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
104
+
105
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
106
+
107
+
108
+ def test_returns_one_too_few_consecutive_negative_responses_positive(
109
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
110
+ third_pro_response):
111
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
112
+
113
+ Input has a positive response between the two negative responses.
114
+ """
115
+ # Add negative responses at indices 8 and 22, and a positive response at 15
116
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
117
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
118
+
119
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
120
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 0
121
+
122
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
123
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
124
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
125
+
126
+
127
+ def test_returns_zero_enough_consecutive_negative_responses_default(
128
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
129
+ """Verify returns 0 (pass LOGIC criterion) for required consecutive -ve PRO responses.
130
+
131
+ Input has two consecutive negative responses. Should return 1 with default options.
132
+ """
133
+ # Add negative responses at indices 8 and 15
134
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
135
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
136
+
137
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
138
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
139
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 0
140
+
141
+
142
+ def test_returns_one_too_few_consecutive_negative_responses_non_default(
143
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
144
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
145
+
146
+ Input has two consecutive negative responses. Should return 0 with N=3.
147
+ """
148
+ # Add negative responses at indices 8 and 15
149
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
150
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
151
+
152
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
153
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
154
+ assert copd.logic_consecutive_negative_responses(
155
+ input_df, exacerbation_event, N=3) == 1
156
+
157
+
158
+ def test_returns_zero_too_few_consecutive_negative_responses_non_default(
159
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
160
+ third_pro_response):
161
+ """Verify returns 0 (pass LOGIC criterion) for required consecutive -ve PRO responses.
162
+
163
+ Input has three consecutive negative responses. Should return 0 with N=3
164
+ """
165
+ # Add negative responses at indices 8, 15, and 22
166
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
167
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
168
+
169
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
170
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
171
+
172
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
173
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
174
+ assert copd.logic_consecutive_negative_responses(
175
+ input_df, exacerbation_event, N=3) == 0
training/tests/test_minimum_period_between_exacerbations.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the minimum_period_between_exacerbations function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("input_values,expected_output",
9
+ [(pd.DataFrame({'DaysSinceLastExac': [-1]}), 0),
10
+ (pd.DataFrame({'DaysSinceLastExac': [7]}), 1),
11
+ (pd.DataFrame({'DaysSinceLastExac': [14]}), 1),
12
+ (pd.DataFrame({'DaysSinceLastExac': [20]}), 0)])
13
+ def test_threshold_equals_default(input_values, expected_output):
14
+ """Test output for a variety of input values.
15
+
16
+ Test cases cover:
17
+ 1. No previous exacerbation
18
+ 2. Very recent exac
19
+ 3. Exac on the threshold value (should count as too recent)
20
+ 4. Previous non-recent exac
21
+ for the default threshold of 14 days
22
+ """
23
+ assert copd.minimum_period_between_exacerbations(input_values) == expected_output
24
+
25
+
26
+ @pytest.mark.parametrize("input_values,expected_output",
27
+ [(pd.DataFrame({'DaysSinceLastExac': [-1]}), 0),
28
+ (pd.DataFrame({'DaysSinceLastExac': [6]}), 1),
29
+ (pd.DataFrame({'DaysSinceLastExac': [7]}), 1),
30
+ (pd.DataFrame({'DaysSinceLastExac': [14]}), 0)])
31
+ def test_threshold_equals_seven(input_values, expected_output):
32
+ """Test output for a variety of input values.
33
+
34
+ Test cases cover:
35
+ 1. No previous exacerbation
36
+ 2. Very recent exac
37
+ 3. Exac on the threshold value (should count as too recent)
38
+ 4. Previous non-recent exac
39
+ for a threshold of 7 days.
40
+ """
41
+ assert copd.minimum_period_between_exacerbations(
42
+ input_values, minimum_days=7) == expected_output
training/tests/test_remove_data_between_exacerbations.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the remove_data_between_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data including an exacerbation flagged for removal."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
14
+ 'DaysSinceLastExac': [-1, -1, 1, 2, 3, 4, 5, 1, 2, 1],
15
+ 'RemoveExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]})
16
+
17
+
18
+ @pytest.fixture
19
+ def expected_df():
20
+ """Define expected output dataframe."""
21
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
22
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
23
+ 'DaysSinceLastExac': [-1, -1, 1, 2, 3, 4, 5, 1, 2, 1],
24
+ 'RemoveExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
25
+ 'RemoveRow': [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan,
26
+ np.nan, np.nan]})
27
+
28
+
29
+ def test_output_equals_expected(input_df, expected_df):
30
+ """Test output is as expected."""
31
+ output_df = copd.remove_data_between_exacerbations(input_df)
32
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_remove_unknown_date_exacerbations.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the remove_unknown_date_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data including an exacerbation with an uncertain date."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
14
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]})
15
+
16
+
17
+ def test_check_correct_rows_flagged_default(input_df):
18
+ """Check the correct rows are flagged for removal using default option (7 days)."""
19
+ output_df = copd.remove_unknown_date_exacerbations(input_df)
20
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
21
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
22
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
23
+ 'RemoveRow': [np.nan, np.nan, 1, 1, 1, 1, 1, 1, 1,
24
+ np.nan]})
25
+ pd.testing.assert_frame_equal(output_df, expected_df)
26
+
27
+
28
+ def test_check_correct_rows_flagged_non_default(input_df):
29
+ """Check the correct rows are flagged for removal when specifying 5 days."""
30
+ output_df = copd.remove_unknown_date_exacerbations(input_df, days_to_remove=5)
31
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
32
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
33
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
34
+ 'RemoveRow': [np.nan, np.nan, np.nan, np.nan, 1, 1, 1, 1,
35
+ 1, np.nan]})
36
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_rolling_mean_previous_period.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the rolling_mean_previous_period function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import numpy as np
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df_single_patient():
11
+ """Sample daily input data including PRO responses for a single patient."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
14
+ 'Q1': [5, 6, 1, np.nan, 0, 4, 3, 7, np.nan, 1]})
15
+
16
+
17
+ def test_seven_day_window_single_patient_columns(input_df_single_patient):
18
+ """Compare the output and expected columns for a single patient ID.
19
+
20
+ Uses a seven day rolling window for the mean.
21
+ """
22
+ output_df = copd.rolling_mean_previous_period(
23
+ df=input_df_single_patient, cols='Q1', id_col='StudyId', date_col='Date',
24
+ window=7)
25
+ # pd.testing.assert_frame_equal(output_df, expected_df)
26
+ assert set(output_df.columns) == set(['Date', 'StudyId', 'Q1_ave'])
27
+
28
+
29
+ def test_seven_day_window_single_patient_values(input_df_single_patient):
30
+ """Compare the output and expected dataframes for a single patient ID.
31
+
32
+ Uses a seven day rolling window for the sum.
33
+ """
34
+ expected_df = pd.DataFrame(
35
+ {'Date': pd.date_range('2022-01-01', '2022-01-10'),
36
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
37
+ 'Q1_ave': [np.nan, 5.0, 5.5, 4.0, 4.0, 3.0, 3.2, 3.16666667, 3.5, 3.0]})
38
+
39
+ output_df = copd.rolling_mean_previous_period(
40
+ df=input_df_single_patient, cols='Q1', id_col='StudyId', date_col='Date',
41
+ window=7)
42
+ pd.testing.assert_frame_equal(
43
+ output_df[['Date', 'StudyId', 'Q1_ave']],
44
+ expected_df[['Date', 'StudyId', 'Q1_ave']])
45
+
46
+
47
+ @pytest.fixture
48
+ def input_df_several_patients_two_columns():
49
+ """Sample daily input data including PRO responses for three patients."""
50
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
51
+ 'StudyId': [1, 2, 1, 1, 2, 1, 2, '3', 1, '3'],
52
+ 'Q1': [5, 6, 1, np.nan, 0, 4, 3, 7, np.nan, 1],
53
+ 'Q2': [-5, -6, -1, np.nan, 0, -4, -3, -7, np.nan, -1]})
54
+
55
+
56
+ @pytest.fixture
57
+ def expected_df_several_patients_two_columns():
58
+ """Create expected output df including mean PRO responses for three patients."""
59
+ # Create expected output dataframe (daily records for each patient with rolling means)
60
+ patient1 = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-09'),
61
+ 'StudyId': [1] * 9,
62
+ 'Q1_ave': [np.nan, 5.0, 5.0, 3.0, 1.0, 1.0, 4.0, 4.0, 4.0],
63
+ 'Q2_ave': [np.nan, -5.0, -5.0, -3.0, -1.0, -1.0, -4.0, -4.0,
64
+ -4.0]})
65
+ patient2 = pd.DataFrame({'Date': pd.date_range('2022-01-02', '2022-01-07'),
66
+ 'StudyId': [2] * 6,
67
+ 'Q1_ave': [np.nan, 6.0, 6.0, 6.0, 0.0, 0.0],
68
+ 'Q2_ave': [np.nan, -6.0, -6.0, -6.0, 0.0, 0.0]})
69
+ patient3 = pd.DataFrame({'Date': pd.date_range('2022-01-08', '2022-01-10'),
70
+ 'StudyId': ['3'] * 3,
71
+ 'Q1_ave': [np.nan, 7.0, 7.0],
72
+ 'Q2_ave': [np.nan, -7.0, -7.0]})
73
+ # Combine individual patient series into one df
74
+ expected_df = pd.concat([patient1, patient2, patient3]).reset_index(drop=True)
75
+ return expected_df
76
+
77
+
78
+ def test_three_day_window_several_patients_columns(
79
+ input_df_several_patients_two_columns, expected_df_several_patients_two_columns):
80
+ """Compare the output and expected columns for three patient IDs and two mean columns.
81
+
82
+ Uses a three day rolling window for the mean.
83
+ """
84
+ output_df = copd.rolling_mean_previous_period(
85
+ df=input_df_several_patients_two_columns, cols=['Q1', 'Q2'], id_col='StudyId',
86
+ date_col='Date', window=3)
87
+ assert set(output_df.columns) == set(expected_df_several_patients_two_columns.columns)
88
+
89
+
90
+ def test_three_day_window_several_patients_values(
91
+ input_df_several_patients_two_columns, expected_df_several_patients_two_columns):
92
+ """Compare the output and expected dataframes for three patient IDs.
93
+
94
+ Uses a three day rolling window for the mean.
95
+ """
96
+ output_df = copd.rolling_mean_previous_period(
97
+ df=input_df_several_patients_two_columns, cols=['Q1', 'Q2'], id_col='StudyId',
98
+ date_col='Date', window=3)
99
+ pd.testing.assert_frame_equal(
100
+ expected_df_several_patients_two_columns[['Date', 'StudyId', 'Q1_ave', 'Q2_ave']],
101
+ output_df[['Date', 'StudyId', 'Q1_ave', 'Q2_ave']], check_like=True)
training/tests/test_rolling_sum_previous_period.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the rolling_sum_previous_period function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df_single_patient():
10
+ """Sample daily input data including exacerbations for a single patient."""
11
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
12
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
13
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
14
+
15
+
16
+ def test_seven_day_window_single_patient(input_df_single_patient):
17
+ """Compare the output and expected dataframes for a single patient ID.
18
+
19
+ Uses a seven day rolling window for the sum.
20
+ """
21
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
22
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
23
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
24
+ 'ExacsPrevPeriod': [
25
+ 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 2, 2, 1, 0]})
26
+
27
+ output_df = copd.rolling_sum_previous_period(
28
+ df=input_df_single_patient, col='IsExac', output_col='ExacsPrevPeriod',
29
+ id_col='StudyId', date_col='Date', window=7)
30
+ pd.testing.assert_frame_equal(output_df, expected_df)
31
+
32
+
33
+ def test_three_day_window_single_patient(input_df_single_patient):
34
+ """Compare the output and expected dataframes for a single patient ID.
35
+
36
+ Uses a three day rolling window for the sum.
37
+ """
38
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
39
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
40
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
41
+ 'ExacsPrevPeriod': [
42
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0]})
43
+
44
+ output_df = copd.rolling_sum_previous_period(
45
+ df=input_df_single_patient, col='IsExac', output_col='ExacsPrevPeriod',
46
+ id_col='StudyId', date_col='Date', window=3)
47
+ pd.testing.assert_frame_equal(output_df, expected_df)
48
+
49
+
50
+ @pytest.fixture
51
+ def input_df_several_patients():
52
+ """Sample daily input data including exacerbations for a single patient."""
53
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
54
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
55
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
56
+
57
+
58
+ def test_seven_day_window_several_patients(input_df_several_patients):
59
+ """Compare the output and expected dataframes for three patient IDs.
60
+
61
+ Uses a seven day rolling window for the sum.
62
+ """
63
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
64
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
65
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
66
+ 'ExacsPrevPeriod': [
67
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 1, 1, 0, 0]})
68
+
69
+ output_df = copd.rolling_sum_previous_period(
70
+ df=input_df_several_patients, col='IsExac', output_col='ExacsPrevPeriod',
71
+ id_col='StudyId', date_col='Date', window=7)
72
+ pd.testing.assert_frame_equal(output_df, expected_df)
73
+
74
+
75
+ def test_three_day_window_several_patients(input_df_several_patients):
76
+ """Compare the output and expected dataframes for three patient IDs.
77
+
78
+ Uses a three day rolling window for the sum.
79
+ """
80
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
81
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
82
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
83
+ 'ExacsPrevPeriod': [
84
+ 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]})
85
+
86
+ output_df = copd.rolling_sum_previous_period(
87
+ df=input_df_several_patients, col='IsExac', output_col='ExacsPrevPeriod',
88
+ id_col='StudyId', date_col='Date', window=3)
89
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_set_prediction_window.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the set_prediction_window function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df():
10
+ """Sample input data including an exacerbation."""
11
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
12
+ 'IsExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]})
13
+
14
+
15
+ def test_check_correct_three_day_window_set(input_df):
16
+ """Check the correct rows are set to exacerbations for a three day window."""
17
+ output_df = copd.set_prediction_window(df=input_df, prediction_window=3)
18
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
19
+ 'IsExac': [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]})
20
+ pd.testing.assert_frame_equal(output_df, expected_df)
21
+
22
+
23
+ def test_check_correct_five_day_window_set(input_df):
24
+ """Check the correct rows are set to exacerbations for a five day window."""
25
+ output_df = copd.set_prediction_window(df=input_df, prediction_window=5)
26
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
27
+ 'IsExac': [0, 0, 1, 1, 1, 1, 1, 0, 0, 0]})
28
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_set_pro_exac_dates.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the set_pro_exac_dates function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data.
12
+
13
+ Input data covers the following cases:
14
+ 1. Duplicate non-exacerbation response (input rows 0 and 3)
15
+ 2. Exacerbation with known date (row 1)
16
+ 3. Exacerbation with unknown date (row 2)
17
+ """
18
+ return pd.DataFrame({'PatientId': [1, 2, 3, 1],
19
+ 'SymptomDiaryQ11a': [1, 2, np.nan, 1],
20
+ 'SymptomDiaryQ11b': [np.nan, pd.to_datetime('2022-01-01'),
21
+ np.nan, np.nan],
22
+ 'SubmissionTime': pd.to_datetime(['2022-01-03', '2022-01-05',
23
+ '2022-01-06', '2022-01-03']),
24
+ 'IsCommExac': [1, 1, 0, 1]})
25
+
26
+
27
+ @pytest.fixture
28
+ def expected_df():
29
+ """Define expected output dataframe."""
30
+ return pd.DataFrame({'PatientId': [2, 3, 1],
31
+ 'SymptomDiaryQ11a': [2, np.nan, 1],
32
+ 'SymptomDiaryQ11b': [pd.to_datetime('2022-01-01'), np.nan,
33
+ np.nan],
34
+ 'SubmissionTime': pd.to_datetime(['2022-01-05', '2022-01-06',
35
+ '2022-01-03']),
36
+ 'IsCommExac': [1, 0, 1],
37
+ 'DateOfEvent': pd.to_datetime(['2022-01-01', '2022-01-06',
38
+ '2022-01-03'], utc=True
39
+ ).normalize(),
40
+ 'ExacDateUnknown': [0, 0, 1]})
41
+
42
+
43
+ def test_output_equals_expected(input_df, expected_df):
44
+ """Test output is as expected."""
45
+ output_df = copd.set_pro_exac_dates(input_df).reset_index(drop=True)
46
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_triple_inhaler_therapy_service.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the triple_inhaler_therapy_service function."""
2
+ import copd
3
+ import pandas as pd
4
+
5
+
6
+ def test_returns_zero_single_therapy_sitt():
7
+ """Check output for single inhaler types. Single Inhaler Triple Therapy only."""
8
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
9
+ 'InhalerType': ['LAMA', 'LABA', 'LAMA', 'LABA']})
10
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
11
+ 'TripleTherapy': [0, 0, 0]})
12
+ output_df = copd.triple_inhaler_therapy_service(
13
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
14
+ pd.testing.assert_frame_equal(expected_df, output_df)
15
+
16
+
17
+ def test_returns_zero_single_therapy_mitt():
18
+ """Check output for single inhaler types. Includes Multiple Inhaler Triple Therapy."""
19
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
20
+ 'InhalerType': ['LAMA', 'LABA', 'LAMA', 'LABA']})
21
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
22
+ 'TripleTherapy': [0, 0, 0]})
23
+ output_df = copd.triple_inhaler_therapy_service(
24
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
25
+ pd.testing.assert_frame_equal(expected_df, output_df)
26
+
27
+
28
+ def test_returns_zero_double_therapy_sitt():
29
+ """Check output for double inhaler types. Single Inhaler Triple Therapy only."""
30
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
31
+ 'InhalerType': ['LABA-LAMA', 'LABA-ICS', 'LABA-LAMA',
32
+ 'LABA-ICS']})
33
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
34
+ 'TripleTherapy': [0, 0, 0]})
35
+ output_df = copd.triple_inhaler_therapy_service(
36
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
37
+ pd.testing.assert_frame_equal(expected_df, output_df)
38
+
39
+
40
+ def test_returns_zero_double_therapy_mitt():
41
+ """Check output for double inhaler types. Includes Multiple Inhaler Triple Therapy."""
42
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
43
+ 'InhalerType': ['LABA-LAMA', 'LABA-ICS', 'LABA-LAMA',
44
+ 'LABA-ICS']})
45
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
46
+ 'TripleTherapy': [0, 0, 0]})
47
+ output_df = copd.triple_inhaler_therapy_service(
48
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
49
+ pd.testing.assert_frame_equal(expected_df, output_df)
50
+
51
+
52
+ def test_returns_one_triple_therapy_sitt():
53
+ """Check output for triple inhaler types. Single Inhaler Triple Therapy only."""
54
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
55
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
56
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS']})
57
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
58
+ 'TripleTherapy': [1, 1, 1]})
59
+ output_df = copd.triple_inhaler_therapy_service(
60
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
61
+ pd.testing.assert_frame_equal(expected_df, output_df)
62
+
63
+
64
+ def test_returns_zero_triple_therapy_sitt():
65
+ """Check output for triple inhaler types. Single Inhaler Triple Therapy only.
66
+
67
+ Input df includes SITT and also a patient with a valid MITT combination. Should return
68
+ zero for that patient as SITT only is required.
69
+ """
70
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1, 4, 4],
71
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
72
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS', 'LAMA',
73
+ 'LABA-ICS']})
74
+ expected_df = pd.DataFrame({'Id': [1, 2, 4, '3'],
75
+ 'TripleTherapy': [1, 1, 0, 1]})
76
+ output_df = copd.triple_inhaler_therapy_service(
77
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
78
+ pd.testing.assert_frame_equal(expected_df, output_df)
79
+
80
+
81
+ def test_returns_one_triple_therapy_mitt():
82
+ """Check output for triple inhaler types. Includes Multiple Inhaler Triple Therapy.
83
+
84
+ Input df includes SITT and also a patient with a valid MITT combination. Should return
85
+ one for all patients.
86
+ """
87
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1, 4, 4],
88
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
89
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS', 'LAMA',
90
+ 'LABA-ICS']})
91
+ expected_df = pd.DataFrame({'Id': [1, 2, 4, '3'],
92
+ 'TripleTherapy': [1, 1, 1, 1]})
93
+ output_df = copd.triple_inhaler_therapy_service(
94
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
95
+ pd.testing.assert_frame_equal(expected_df, output_df)
training/tests/test_unit_lookup.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the unit_lookup function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+
6
+
7
+ def test_unit_lookup_defined():
8
+ """Test unit lookup output for all defined unit codes."""
9
+ assert copd.unit_lookup(pd.Series(0)) == 'Count'
10
+ assert copd.unit_lookup(pd.Series(1)) == 'CountPerSecond'
11
+ assert copd.unit_lookup(pd.Series(2)) == 'InternationalUnit'
12
+ assert copd.unit_lookup(pd.Series(3)) == 'Joule'
13
+ assert copd.unit_lookup(pd.Series(4)) == 'Kelvin'
14
+ assert copd.unit_lookup(pd.Series(5)) == 'Kilogram'
15
+ assert copd.unit_lookup(pd.Series(6)) == 'KilogramPerLiter'
16
+ assert copd.unit_lookup(pd.Series(7)) == 'KilogramPerSquareMeter'
17
+ assert copd.unit_lookup(pd.Series(8)) == 'Liter'
18
+ assert copd.unit_lookup(pd.Series(9)) == 'LiterPerKilogramSecond'
19
+ assert copd.unit_lookup(pd.Series(10)) == 'LiterPerSecond'
20
+ assert copd.unit_lookup(pd.Series(11)) == 'Meter'
21
+ assert copd.unit_lookup(pd.Series(12)) == 'Pascal'
22
+ assert copd.unit_lookup(pd.Series(13)) == 'Percent'
23
+ assert copd.unit_lookup(pd.Series(14)) == 'Second'
24
+ assert copd.unit_lookup(pd.Series(15)) == 'Siemen'
25
+
26
+
27
+ def test_unit_lookup_undefined():
28
+ """Test unit lookup output for undefined unit code and other input."""
29
+ assert copd.unit_lookup(pd.Series(16)) == 'Undefined'
30
+ assert copd.unit_lookup(pd.Series(42)) == 'Undefined'
31
+ assert copd.unit_lookup(pd.Series('A')) == 'Undefined'
training/train_test_split.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Splits the model C cohort and patient days into stratified train and test sets.
2
+
3
+ The train set retains these characteristics of the full data set:
4
+ Exac days to non-exac days ratio (within 5%). Individual patients can only appear in
5
+ either train or test.
6
+ Sex ratio (within 0.05)
7
+ Age distribution (minimum p-value for Kolmogorov-Smirnov test=0.9)
8
+
9
+ This script also splits the train data into balanced folds for cross-validation. Patient
10
+ IDs for train, test and all data folds are stored for use in subsequent scripts.
11
+
12
+ All data sets are divided into train and test and stored in separate folders.
13
+ """
14
+
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+ import pickle
19
+
20
+ from lenusml import splits
21
+
22
+ data_dir = '<YOUR_DATA_PATH>/copd-dataset/'
23
+ output_train_data_dir = '<YOUR_DATA_PATH>/train_data'
24
+ output_test_data_dir = '<YOUR_DATA_PATH>/test_data'
25
+ cohort_info_dir = '../data/cohort_info/'
26
+
27
+ save_cohort_info = True
28
+
29
+ data = pd.read_pickle(os.path.join(data_dir, 'exac_data.pkl'))
30
+
31
+ ##########################################
32
+ # Prepare demographic info for splitting
33
+ ##########################################
34
+ # Calculate decimal age on DateOfEvent
35
+ data['DateOfBirth'] = pd.to_datetime(data['DateOfBirth'], utc=True)
36
+
37
+
38
+ def calculate_age_decimal(dob, date):
39
+ age = date - dob
40
+ decimal_age = (age.days + age.seconds / 86400.0) / 365.2425
41
+ return decimal_age
42
+
43
+
44
+ data['Age'] = data.apply(lambda x: calculate_age_decimal(
45
+ x['DateOfBirth'], x['DateOfEvent']), axis=1)
46
+
47
+ data = data.drop(columns=['DateOfBirth'])
48
+
49
+ ##########################################
50
+ # Merge with COPD status and inhaler data
51
+ ##########################################
52
+ patient_details = pd.read_csv(os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
53
+ usecols=['StudyId', 'CopdStatusDetailsId'],
54
+ delimiter="|")
55
+
56
+ copd_status = pd.read_csv(os.path.join(data_dir, 'CopdDatasetCopdStatusDetails.txt'),
57
+ usecols=['Id', 'SmokingStatus', 'RequiredAcuteNIV',
58
+ 'RequiredICUAdmission',
59
+ 'LungFunction_FEV1PercentPredicted',
60
+ 'LabsHighestEosinophilCount'],
61
+ delimiter="|")
62
+
63
+ # Strip out % signs from spirometry and convert to float
64
+ copd_status['LungFunction_FEV1PercentPredicted'] = copd_status[
65
+ 'LungFunction_FEV1PercentPredicted'].str.strip('%').astype('float')
66
+
67
+ patient_details = patient_details.merge(
68
+ copd_status, left_on='CopdStatusDetailsId', right_on='Id',
69
+ how='left').drop(columns=['CopdStatusDetailsId', 'Id'])
70
+ data = data.merge(patient_details, on='StudyId', how='left')
71
+
72
+ #################################
73
+ # Define train and test cohorts
74
+ #################################
75
+
76
+ print('Split data into train and test')
77
+ # Set the class ratio tolerance to 5% of the data class ratio
78
+ class_ratio_tolerance = 0.05 * data.IsExac.value_counts(normalize=True)[0] /\
79
+ data.IsExac.value_counts(normalize=True)[1]
80
+ print("Class ratio tolerance: ", class_ratio_tolerance)
81
+ # Set the sex ratio tolerance to 5% of the data class ratio
82
+ sex_ratio_tolerance = 0.05 * data.Sex.value_counts(normalize=True)['M'] /\
83
+ data.Sex.value_counts(normalize=True)['F']
84
+ print("Sex ratio tolerance: ", sex_ratio_tolerance)
85
+
86
+ train_data, test_data, train_ids, test_ids = splits.train_test_stratified_class_sex(
87
+ data=data, id_column='StudyId', class_column='IsExac', sex_column='Sex',
88
+ train_proportion=0.85,
89
+ proportion_tolerance=0.05, class_ratio_tolerance=class_ratio_tolerance,
90
+ sex_ratio_tolerance=sex_ratio_tolerance, random_seed=42)
91
+
92
+ #################################
93
+ # Create cross validation folds
94
+ #################################
95
+ fold_proportions, fold_class_ratios, fold_patients = splits.group_kfold_class_balanced(
96
+ data=train_data, id_column='StudyId', class_column='IsExac', K=5,
97
+ fold_proportion_tolerance=0.05,
98
+ fold_class_ratio_tolerance=class_ratio_tolerance, random_seed=42)
99
+ if save_cohort_info:
100
+ os.makedirs(cohort_info_dir, exist_ok=True)
101
+ with open(os.path.join(cohort_info_dir, "test_ids.pkl"), 'wb') as f:
102
+ pickle.dump(list(test_ids), f)
103
+
104
+ with open(os.path.join(cohort_info_dir, "train_ids.pkl"), 'wb') as f:
105
+ pickle.dump(list(train_ids), f)
106
+ print('Train and test patient IDs saved')
107
+
108
+ with open(os.path.join(cohort_info_dir, "fold_proportions.pkl"), 'wb') as f:
109
+ pickle.dump(list(fold_proportions), f)
110
+
111
+ with open(os.path.join(cohort_info_dir, "fold_class_ratios.pkl"), 'wb') as f:
112
+ pickle.dump(list(fold_class_ratios), f)
113
+
114
+ np.save(os.path.join(cohort_info_dir, 'fold_patients.npy'), fold_patients,
115
+ allow_pickle=True)
116
+ print('Cross validation fold information saved')
117
+
118
+ ###############################
119
+ # Save train and test sets
120
+ ###############################
121
+
122
+ # Create the output directories
123
+ os.makedirs(output_train_data_dir, exist_ok=True)
124
+ os.makedirs(output_test_data_dir, exist_ok=True)
125
+
126
+ # Save exac and patient details info
127
+ train_data.to_pickle(os.path.join(output_train_data_dir, 'train_data.pkl'))
128
+ test_data.to_pickle(os.path.join(output_test_data_dir, 'test_data.pkl'))
129
+ print('Patient details/exac data saved')