IamGrooooot commited on
Commit
e69d4e4
·
1 Parent(s): bcaf08f

Initial release: 72-hour COPD exacerbation prediction model

Browse files
Files changed (40) hide show
  1. .gitignore +150 -0
  2. README.md +212 -0
  3. pipeline.yml +29 -0
  4. requirements.txt +1 -0
  5. setup.cfg +7 -0
  6. training/README.MD +15 -0
  7. training/copd.py +644 -0
  8. training/create_sh_lookup_table.py +43 -0
  9. training/cross_validation.py +208 -0
  10. training/cross_validation_algorithms.py +109 -0
  11. training/cross_validation_calibration.py +260 -0
  12. training/cross_validation_comorbs.py +191 -0
  13. training/define_exacerbations_prologic.py +466 -0
  14. training/fitbit_exploration.py +144 -0
  15. training/lookups/README.MD +1 -0
  16. training/lookups/type_lookup.txt +116 -0
  17. training/prepare_test_data.py +271 -0
  18. training/prepare_train_data.py +295 -0
  19. training/prepare_train_data_crossval.py +331 -0
  20. training/tests/__init__.py +0 -0
  21. training/tests/test_apply_logic_response_criterion.py +82 -0
  22. training/tests/test_bin_numeric_column.py +36 -0
  23. training/tests/test_calculate_days_since_last_event.py +36 -0
  24. training/tests/test_define_hospital admission.py +27 -0
  25. training/tests/test_define_service_exac_event.py +47 -0
  26. training/tests/test_extract_clinician_verified_exacerbations.py +42 -0
  27. training/tests/test_filter_symptom_diary.py +31 -0
  28. training/tests/test_get_logic_exacerbation_indices.py +56 -0
  29. training/tests/test_get_rescue_med_pro_responses.py +29 -0
  30. training/tests/test_logic_consecutive_negative_responses.py +175 -0
  31. training/tests/test_minimum_period_between_exacerbations.py +42 -0
  32. training/tests/test_remove_data_between_exacerbations.py +32 -0
  33. training/tests/test_remove_unknown_date_exacerbations.py +36 -0
  34. training/tests/test_rolling_mean_previous_period.py +101 -0
  35. training/tests/test_rolling_sum_previous_period.py +89 -0
  36. training/tests/test_set_prediction_window.py +28 -0
  37. training/tests/test_set_pro_exac_dates.py +46 -0
  38. training/tests/test_triple_inhaler_therapy_service.py +95 -0
  39. training/tests/test_unit_lookup.py +31 -0
  40. training/train_test_split.py +129 -0
.gitignore ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Folders for model cohort data, training data plots and logs
2
+ data/
3
+ training/logs/
4
+
5
+ # mlflow
6
+ training/tmp
7
+ training/mlruns
8
+ training/mlruns.sqlite
9
+
10
+ # VS Code
11
+ .vscode/
12
+
13
+ # Byte-compiled / optimized / DLL files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+
18
+ # C extensions
19
+ *.so
20
+
21
+ # Distribution / packaging
22
+ .Python
23
+ build/
24
+ develop-eggs/
25
+ dist/
26
+ downloads/
27
+ eggs/
28
+ .eggs/
29
+ lib/
30
+ lib64/
31
+ parts/
32
+ sdist/
33
+ var/
34
+ wheels/
35
+ share/python-wheels/
36
+ *.egg-info/
37
+ .installed.cfg
38
+ *.egg
39
+ MANIFEST
40
+
41
+ # PyInstaller
42
+ # Usually these files are written by a python script from a template
43
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
44
+ *.manifest
45
+ *.spec
46
+
47
+ # Installer logs
48
+ pip-log.txt
49
+ pip-delete-this-directory.txt
50
+
51
+ # Unit test / coverage reports
52
+ htmlcov/
53
+ .tox/
54
+ .nox/
55
+ .coverage
56
+ .coverage.*
57
+ .cache
58
+ nosetests.xml
59
+ coverage.xml
60
+ *.cover
61
+ *.py,cover
62
+ .hypothesis/
63
+ .pytest_cache/
64
+ cover/
65
+
66
+ # Translations
67
+ *.mo
68
+ *.pot
69
+
70
+ # Django stuff:
71
+ *.log
72
+ local_settings.py
73
+ db.sqlite3
74
+ db.sqlite3-journal
75
+
76
+ # Flask stuff:
77
+ instance/
78
+ .webassets-cache
79
+
80
+ # Scrapy stuff:
81
+ .scrapy
82
+
83
+ # Sphinx documentation
84
+ docs/_build/
85
+
86
+ # PyBuilder
87
+ .pybuilder/
88
+ target/
89
+
90
+ # Jupyter Notebook
91
+ .ipynb_checkpoints
92
+
93
+ # IPython
94
+ profile_default/
95
+ ipython_config.py
96
+
97
+ # pyenv
98
+ # For a library or package, you might want to ignore these files since the code is
99
+ # intended to run in multiple environments; otherwise, check them in:
100
+ # .python-version
101
+
102
+ # pipenv
103
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
105
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
106
+ # install all needed dependencies.
107
+ #Pipfile.lock
108
+
109
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110
+ __pypackages__/
111
+
112
+ # Celery stuff
113
+ celerybeat-schedule
114
+ celerybeat.pid
115
+
116
+ # SageMath parsed files
117
+ *.sage.py
118
+
119
+ # Environments
120
+ .env
121
+ .venv
122
+ env/
123
+ venv/
124
+ ENV/
125
+ env.bak/
126
+ venv.bak/
127
+
128
+ # Spyder project settings
129
+ .spyderproject
130
+ .spyproject
131
+
132
+ # Rope project settings
133
+ .ropeproject
134
+
135
+ # mkdocs documentation
136
+ /site
137
+
138
+ # mypy
139
+ .mypy_cache/
140
+ .dmypy.json
141
+ dmypy.json
142
+
143
+ # Pyre type checker
144
+ .pyre/
145
+
146
+ # pytype static type analyzer
147
+ .pytype/
148
+
149
+ # Cython debug symbols
150
+ cython_debug/
README.md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ tags:
5
+ - healthcare
6
+ - ehr
7
+ - copd
8
+ - clinical-risk
9
+ - tabular
10
+ - scikit-learn
11
+ - xgboost
12
+ - lightgbm
13
+ pipeline_tag: tabular-classification
14
+ library_name: sklearn
15
+ ---
16
+
17
+ # COPD Open Models — Model C (72-Hour Exacerbation Prediction)
18
+
19
+ ## Model Details
20
+
21
+ Model C predicts the risk of a COPD exacerbation within **72 hours** using features derived from NHS EHR datasets and patient-reported outcomes (PROs). It includes a reproducible training/evaluation pipeline and runs on standard Python ML libraries (pandas, scikit-learn, imbalanced-learn, plus optional gradient-boosting libraries).
22
+
23
+ ### Key Characteristics
24
+
25
+ - **PRO LOGIC** — a clinically-informed validation algorithm that deduplicates and filters patient-reported exacerbation events (14-day minimum between episodes, consecutive negative rescue-medication responses required for borderline events, 7-day rescue-med prescription spacing).
26
+ - Compares **10 algorithms** with per-fold preprocessing to prevent data leakage.
27
+ - Training code is fully decoupled from cloud infrastructure — runs locally with no Azure dependencies.
28
+
29
+ > **Note:** This repository contains no real patient-level data. All included data files are synthetic or example data for pipeline validation.
30
+
31
+ ### Model Type
32
+
33
+ Traditional tabular ML classifiers (multiple candidate estimators; see "Training Procedure").
34
+
35
+ ### Release Notes
36
+
37
+ - **Phase 1 (current):** Models C, E, H published as the initial "COPD Open Models" collection.
38
+ - **Phase 2 (planned):** Additional models may follow after codebase sanitisation.
39
+
40
+ ---
41
+
42
+ ## Intended Use
43
+
44
+ This model and code are published as **reference implementations** for research, education, and benchmarking on COPD prediction tasks.
45
+
46
+ ### Intended Users
47
+
48
+ - ML practitioners exploring tabular healthcare ML pipelines
49
+ - Researchers comparing feature engineering and evaluation approaches
50
+ - Developers building internal prototypes (non-clinical)
51
+
52
+ ### Out-of-Scope Uses
53
+
54
+ - **Not** for clinical decision-making, triage, diagnosis, or treatment planning.
55
+ - **Not** a substitute for clinical judgement or validated clinical tools.
56
+ - Do **not** deploy in healthcare settings without an appropriate regulatory, clinical safety, and information governance framework.
57
+
58
+ ### Regulatory Considerations (SaMD)
59
+
60
+ Regulatory status for software depends on the intended purpose expressed in documentation, labelling, and promotional materials. Downstream users integrating or deploying this model should determine whether their implementation qualifies as Software as a Medical Device (SaMD) and identify the legal "manufacturer" responsible for compliance and post-market obligations.
61
+
62
+ ---
63
+
64
+ ## Training Data
65
+
66
+ - **Source:** NHS EHR-derived datasets and Lenus COPD Service PRO data (training performed on controlled datasets; not distributed here).
67
+ - **Data available in this repo:** Synthetic/example datasets only.
68
+ - **Cohort:** ~302 COPD patients (84 RECEIVER + 218 Scale-Up). Daily predictions per patient.
69
+ - **Train/test split:** 85% / 15%, stratified by exacerbation status and sex.
70
+ - **Class balance:** Exacerbation days are minority class (~5–10% positive).
71
+
72
+ ### Features (35 total)
73
+
74
+ | Category | Features |
75
+ |----------|----------|
76
+ | **Daily PROs** | CAT Q1–Q8, CAT Score, Symptom Diary Q1–Q3, plus 3-day rolling mean difference variants for each |
77
+ | **Weekly PROs** | Q5 (rescue meds), Q8 (phlegm difficulty), Q9 (phlegm consistency), Q10 (phlegm colour) — target-encoded |
78
+ | **Clinical** | Sex_F, RequiredAcuteNIV, RequiredICUAdmission, HighestEosinophilCount_0_3, TripleTherapy, AsthmaOverlap |
79
+ | **Categorical (target-encoded)** | SmokingStatus, Age (binned: <50 / 50-59 / 60-69 / 70-79 / 80+), FEV1PercentPredicted (Mild / Moderate / Severe / Very Severe), Comorbidities (None / 1-2 / 3+), DaysSinceLastExac (binned) |
80
+ | **Temporal** | ExacsPrevYear (rolling 365-day sum), AdmissionsPrevYear (rolling 365-day sum) |
81
+
82
+ ### Data Preprocessing
83
+
84
+ 1. **Target encoding** — applied per-fold using K-fold encoding on categorical features.
85
+ 2. **MinMax scaling** — all features scaled to [0, 1], fit on training fold only.
86
+ 3. **Median imputation** — missing values imputed per-fold using training fold medians.
87
+
88
+ ---
89
+
90
+ ## Training Procedure
91
+
92
+ ### Training Framework
93
+
94
+ - pandas, scikit-learn, imbalanced-learn
95
+ - Optional: xgboost, lightgbm, interpret (for EBM)
96
+ - Experiment tracking: MLflow
97
+
98
+ ### Algorithms Evaluated
99
+
100
+ | # | Algorithm | Library |
101
+ |---|-----------|---------|
102
+ | 1 | RandomForestClassifier | sklearn |
103
+ | 2 | RandomForestClassifier (class_weight='balanced') | sklearn |
104
+ | 3 | BalancedBaggingClassifier | imblearn |
105
+ | 4 | **BalancedRandomForestClassifier** | imblearn |
106
+ | 5 | XGBClassifier | xgboost |
107
+ | 6 | XGBClassifier (scale_pos_weight) | xgboost |
108
+ | 7 | LGBMClassifier | lightgbm |
109
+ | 8 | ExplainableBoostingClassifier | interpret |
110
+ | 9 | LogisticRegression | sklearn |
111
+ | 10 | LogisticRegression (class_weight='balanced') | sklearn |
112
+
113
+ ### Evaluation Design
114
+
115
+ - **5-fold** stratified cross-validation, balanced by class and grouped by patient.
116
+ - Per-fold preprocessing (encoding, scaling, imputation) to prevent data leakage.
117
+ - Decision thresholds evaluated at: **0.3, 0.4, 0.5, 0.6, 0.7, 0.8**.
118
+ - Calibration tested: **sigmoid** and **isotonic** methods via CalibratedClassifierCV.
119
+
120
+ ---
121
+
122
+ ## Evaluation Results
123
+
124
+ > Replace this section with measured results from your training run.
125
+
126
+ | Metric | Value | Notes |
127
+ |--------|-------|-------|
128
+ | ROC-AUC | TBD | Cross-validation mean (± std) |
129
+ | AUC-PR | TBD | Primary metric for imbalanced outcome |
130
+ | F1 Score | TBD | At threshold 0.5 |
131
+ | Balanced Accuracy | TBD | Cross-validation mean |
132
+ | Precision | TBD | At chosen threshold |
133
+ | Recall | TBD | At chosen threshold |
134
+ | Brier Score | TBD | Probability calibration quality |
135
+
136
+ ### Caveats on Metrics
137
+
138
+ - Performance depends heavily on cohort definition, feature availability, and label construction.
139
+ - Reported metrics from controlled datasets may not transfer to other settings without recalibration and validation.
140
+ - Exacerbation labels are constructed via PRO LOGIC — different event definitions will produce different results.
141
+
142
+ ---
143
+
144
+ ## Bias, Risks, and Limitations
145
+
146
+ - **Dataset shift:** EHR coding practices, care pathways, and population characteristics vary across sites and time periods.
147
+ - **Label uncertainty:** Exacerbations may be incompletely observed in routine data; PRO LOGIC filtering may not generalise to all clinical contexts.
148
+ - **Fairness:** Outcomes and feature availability may vary by age, sex, deprivation, comorbidity burden, or service access.
149
+ - **Misuse risk:** Using predictions to drive clinical action without clinical safety processes can cause harm through false positives and negatives.
150
+ - **Cohort size:** ~302 patients is relatively small; results should be interpreted with appropriate uncertainty.
151
+
152
+ ---
153
+
154
+ ## How to Use
155
+
156
+ ### Pipeline Execution Order
157
+
158
+ ```bash
159
+ # 1. Install dependencies
160
+ pip install pandas numpy scikit-learn imbalanced-learn xgboost lightgbm interpret mlflow matplotlib seaborn
161
+
162
+ # 2. Define exacerbations with PRO LOGIC
163
+ python training/define_exacerbations_prologic.py
164
+
165
+ # 3. Train/test split (85/15, stratified)
166
+ python training/train_test_split.py
167
+
168
+ # 4. Prepare training data (encode, scale, impute)
169
+ python training/prepare_train_data.py
170
+
171
+ # 5. Prepare cross-validation folds (per-fold preprocessing)
172
+ python training/prepare_train_data_crossval.py
173
+
174
+ # 6. Prepare test data (using training encodings)
175
+ python training/prepare_test_data.py
176
+
177
+ # 7. Compare algorithms via cross-validation
178
+ python training/cross_validation_algorithms.py
179
+
180
+ # 8. Train final model (BalancedRandomForestClassifier)
181
+ python training/cross_validation.py
182
+
183
+ # 9. Evaluate calibration methods
184
+ python training/cross_validation_calibration.py
185
+ ```
186
+
187
+ ### Adapting to Your Data
188
+
189
+ Replace the input data paths in `define_exacerbations_prologic.py` with your own EHR extract. The pipeline expects CSV files with columns for patient ID, dates, diagnoses, PRO responses, and pharmacy records.
190
+
191
+ ---
192
+
193
+ ## Environmental Impact
194
+
195
+ Training computational requirements are minimal — all models are traditional tabular ML classifiers running on CPU. A full cross-validation sweep across 10 algorithms completes in minutes on a standard laptop.
196
+
197
+ ---
198
+
199
+ ## Citation
200
+
201
+ If you use this model or code, please cite:
202
+
203
+ - This repository: *(add citation format / Zenodo DOI if minted)*
204
+ - Associated publications: *(clinical trial results paper — forthcoming)*
205
+
206
+ ## Authors and Contributors
207
+
208
+ - **Storm ID** (maintainers)
209
+
210
+ ## License
211
+
212
+ This model and code are released under the **Apache 2.0** license.
pipeline.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ trigger:
2
+ branches:
3
+ include:
4
+ - main
5
+ - release/*
6
+
7
+ jobs:
8
+ - job: 'build'
9
+ pool:
10
+ vmImage: 'ubuntu-latest'
11
+
12
+ steps:
13
+ - task: UsePythonVersion@0
14
+ inputs:
15
+ versionSpec: '3.8'
16
+ architecture: 'x64'
17
+ displayName: 'Specify Python version'
18
+
19
+ - script: |
20
+ python -m pip install --upgrade pip
21
+ displayName: 'Install pip'
22
+
23
+ - script: |
24
+ pip install -r requirements.txt
25
+ displayName: 'Install CI dependencies'
26
+
27
+ - script: |
28
+ flake8
29
+ displayName: 'Run linting'
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ flake8
setup.cfg ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [tool:pytest]
2
+ filterwarnings =
3
+ ignore::DeprecationWarning
4
+ [flake8]
5
+ ignore = E501,W293,W292,W504
6
+ exclude = .git,__pycache__,docs/source/conf.py,old,build,dist
7
+ max-complexity = 10
training/README.MD ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * Community exacerbations confirmed to 16/03/2021
2
+ * Hospital exacerbations confirmed to 31/08/2021
3
+ * How to use community events data post 16/03/2021
4
+ * Option 1: Use them
5
+ * Option 2: Don't use them and discard patient data for approx. 1 month around that event
6
+ * Option 3: Attempt to verify them in a more automated way (looking at prescribing data etc)
7
+
8
+
9
+ * Rescue meds bnf codes
10
+ steroid_codes = ['0603020T0AAACAC','0603020T0AABKBK', '0603020T0AAAXAX',
11
+ '0603020T0AAAGAG','0603020T0AABHBH','0603020T0AAACAC','0603020T0AABKBK',
12
+ '0603020T0AABNBN', '0603020T0AAAGAG', '0603020T0AABHBH']
13
+
14
+ antib_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
15
+ '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD', '0501013K0AAAJAJ']
training/copd.py ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module containing code for model C (exacerbation prediction)."""
2
+ import numpy as np
3
+ import pandas as pd
4
+ from lenusml import encoding
5
+
6
+
7
def apply_logic_response_criterion(df, N=2, minimum_period=14, maximum_period=35):
    """
    Apply PRO LOGIC criterion 2 (consecutive negative Q5 replies required between events).

    For events that occur after the minimum required period following a previous exac,
    e.g. longer than 14 days, but before they are automatically considered as a new exac
    event, e.g. 35 days, PRO LOGIC considers weekly PRO responses between the two events.
    For subsequent events to count as separate events, there must be at least N
    consecutive negative responses (no rescue meds taken) to weekly PROs between each
    positive reply. Note PRO LOGIC is applied to both hospital and patient reported events.

    Args:
        df (pd.DataFrame): must contain columns for PatientId, DateOfEvent, Q5Answered,
            NegativeQ5, IsExac and DaysSinceLastExac.
        N (int): number of consecutive negative weekly Q5 responses required between
            two events for the later one to count as a separate exacerbation.
            Default is 2.
        minimum_period (int): minimum number of days since the previous exac (any exacs
            within this window will already be removed with PRO LOGIC criterion 1).
            Default value is 14 days.
        maximum_period (int): maximum number of days since the previous exac (any exacs
            occurring after this period will automatically count as a separate event).
            Default is 35 days.

    Returns:
        pd.DataFrame: input df with a new boolean column 'RemoveExac'.

    """
    # Retrieve dataframe indices of exacs falling under PRO LOGIC criterion 2 (Q5 replies)
    indices = get_logic_exacerbation_indices(df, minimum_period=minimum_period,
                                             maximum_period=maximum_period)
    # Evaluate the criterion for each candidate exac: 1 = remove, 0 = keep
    remove_flags = [logic_consecutive_negative_responses(df, exac_index, N)
                    for exac_index in indices]
    # Create dataframe containing exac indices and a boolean column stating whether to
    # remove that exac due to failing Q5 response criterion and merge with original df
    remove_exac = pd.DataFrame({'ind': indices, 'RemoveExac': remove_flags})
    df = df.merge(remove_exac.set_index('ind'), left_index=True, right_index=True,
                  how='left')
    return df
45
+
46
+
47
def bin_numeric_column(*, col, bins, labels):
    """
    Bin a numeric dataframe column into labelled string categories.

    Intervals are left-closed / right-open (pd.cut with right=False), and the
    categorical result is cast to plain strings.

    Args:
        col (pd.Series): dataframe column to be binned.
        bins (list): numeric bin edges.
        labels (list): labels for the bins (one fewer than the number of edges).

    Returns:
        pd.Series: binned column as strings.
    """
    binned = pd.cut(col, bins=bins, labels=labels, right=False)
    return binned.astype('str')
60
+
61
+
62
def calculate_days_since_last_event(*, df, event_col, output_col):
    """
    Calculate the days since the last event, e.g. exacerbation or rescue med prescription.

    Restarts the count from one the day following an event. Any days without a
    previous event have the output column set to -1.

    NOTE(review): assumes df has one row per day in date order with a contiguous
    integer index, so index arithmetic equals day arithmetic — confirm against callers.

    Args:
        df (pd.DataFrame): dataframe with a column containing dates and a boolean column
            stating whether an event occurred on that date.
        event_col (str): name of the boolean column for whether an event occurred.
        output_col (str): name of the new column to hold the day counts.

    Returns:
        pd.DataFrame: the input dataframe with an additional column stating the number
            of days since the previous event occurred (or -1 if no previous event).

    """
    # Get all events and remember each event's own row index in 'PrevEvent'
    all_events = df[df[event_col].eq(1)].copy()
    all_events['PrevEvent'] = all_events.index
    # Merge the full df with the event df on their indices to the closest date in the past
    # i.e. the most recent exacerbation
    df = pd.merge_asof(df, all_events['PrevEvent'],
                       left_index=True, right_index=True,
                       direction='backward')
    # Calculate the days since the previous event, restarting the count from 1 the
    # day following an exacerbation. The shift(1) makes the event day itself still
    # reference the event before it rather than itself.
    df[output_col] = df.index - df['PrevEvent'].shift(1)
    # Set to -1 for any rows without a prior event
    df[output_col] = df[output_col].fillna(-1).astype('int64')
    df = df.drop(columns=['PrevEvent'])
    return df
94
+
95
+
96
def calculate_diff_from_rolling_mean(*, df, cols):
    """
    Add difference-from-rolling-mean columns for the given features.

    For each name in cols, expects a matching '<col>_ave' column (the rolling mean)
    to already exist and writes a new '<col>_diff' column containing the current
    value minus that rolling mean.

    Args:
        df (pd.DataFrame): must contain each col and its '<col>_ave' counterpart.
        cols (list): names of columns to difference against their rolling means.

    Returns:
        pd.DataFrame: input df with the '<col>_diff' columns added.
    """
    for col in cols:
        # Positive values mean the current reading is above the recent average
        df[col + '_diff'] = df[col] - df[col + '_ave']
    return df
100
+
101
+
102
def extract_clinician_verified_exacerbations(df):
    """
    Extract verified events from clinician verification spreadsheets.

    Keep only rows that clinicians confirmed as exacerbations. Where a corrected
    event date was supplied it becomes the event date (considered the true date);
    otherwise the PRO response date is used and the row is flagged as having an
    unknown true date.

    Args:
        df (pd.DataFrame): event verification data supplied by clinicians.

    Returns:
        pd.DataFrame: contains StudyId, DateOfEvent (a mix of true event dates and PRO
            response dates if true dates unknown), IsCommExac (set to 1 here, used
            after merging later) and ExacDateUnknown (boolean, 1 if clinicians did not
            change the date).

    """
    # Keep only clinician-confirmed events
    verified = df[df['Exacerbation confirmed'] == 1].copy()
    # Normalise both candidate date columns to midnight UTC
    verified['DateRecorded'] = pd.to_datetime(verified.DateRecorded, utc=True).dt.normalize()
    verified['New Date'] = pd.to_datetime(verified['New Date'], utc=True).dt.normalize()
    # The clinician-supplied date takes precedence where one was entered; otherwise
    # fall back to the PRO response date and flag the true date as unknown
    date_changed = verified['Date changed'] == 1
    verified['DateOfEvent'] = np.where(date_changed, verified['New Date'],
                                       verified['DateRecorded'])
    verified['ExacDateUnknown'] = np.int64(np.where(date_changed, 0, 1))
    # Flag all events as community events (this df will merge with hospital events later)
    verified['IsCommExac'] = 1
    return verified[['StudyId', 'DateOfEvent', 'IsCommExac', 'ExacDateUnknown']]
134
+
135
+
136
def define_hospital_admission(events):
    """
    Define whether a COPD service event was an admission and return 1 (yes) or 0 (no).

    Args:
        events (pd.Series): COPD service event names (e.g. the EventName column after
            merging events with PatientEventTypes.txt). Must support .isin().

    Returns:
        np.ndarray: 1 where the event was a hospital admission, 0 otherwise.

    """
    # Both COPD-related and COPD-unrelated emergency admissions count as admissions
    hospital_event_names = ['Hospital admission - emergency, COPD related',
                            'Hospital admission - emergency, COPD unrelated']
    return np.where(events.isin(hospital_event_names), 1, 0)
152
+
153
+
154
def define_service_exac_event(*, events, event_name_col='EventName',
                              include_community=False):
    """State if a COPD service event was an exacerbation and return 1 (yes) or 0 (no).

    Args:
        events (pd.Series): COPD service event names (e.g. the EventName column after
            merging events with PatientEventTypes.txt). Must support .isin().
        event_name_col (str): name of the column containing COPD service event names.
            Currently unused; retained for interface compatibility with callers.
        include_community (bool): whether to include event types corresponding to
            patient reported exacerbations (e.g. community managed with rescue meds).
            Defaults to False.

    Returns:
        np.ndarray: 1 where the event was an exacerbation, 0 otherwise.

    """
    # Event types always counted as exacerbations
    exacerbation_event_names = ['Hospital admission - emergency, COPD related',
                                'GP review - emergency, COPD related',
                                'Emergency department attendance, COPD related',
                                'Exacerbation - started abs/steroid by clinical team']
    # Optionally also count patient-reported (community-managed) exacerbations
    if include_community is True:
        exacerbation_event_names.append('Exacerbation - self-managed with rescue pack')
    return np.where(events.isin(exacerbation_event_names), 1, 0)
182
+
183
+
184
def fill_column_by_patient(*, df, id_col, col):
    """
    Forward and back fill data by patient to fill gaps, e.g. from merges.

    Args:
        df (pd.DataFrame): patient data. Must contain col and id_col columns.
        id_col (str): name of column containing unique patient identifiers.
        col (str): name of column to be filled.

    Returns:
        pd.DataFrame: input data with col infilled (the column is also modified
            in place on the passed-in frame).
    """
    # transform (rather than apply) guarantees the result is aligned with df's
    # original index, so the assignment is safe across pandas versions
    df[col] = df.groupby(id_col)[col].transform(lambda x: x.ffill().bfill())
    return df
198
+
199
+
200
def filter_symptom_diary(*, df, patients, date_cutoff=None):
    """
    Filter COPD symptom diary data for patients and dates of interest.

    Args:
        df (pd.DataFrame): symptom diary data. Must contain 'SubmissionTime' and
            'PatientId' columns.
        patients (list): patient IDs of interest.
        date_cutoff (datetime-like, optional): if given, keep only submissions on or
            after this date (e.g. the date the weekly Q5 question changed). Must be
            comparable with tz-aware UTC timestamps. Defaults to None (no cutoff).

    Returns:
        pd.DataFrame: filtered symptom diary.
    """
    # Normalise submission timestamps to midnight UTC so comparisons are date-based
    df['SubmissionTime'] = pd.to_datetime(df.SubmissionTime, utc=True).dt.normalize()
    # Take only data from after the cutoff if provided (explicit None check so a
    # falsy-but-valid cutoff is not silently ignored)
    if date_cutoff is not None:
        df = df[df.SubmissionTime >= date_cutoff]
    # Filter for patients of interest
    df = df[df.PatientId.isin(patients)]
    return df
219
+
220
+
221
def get_logic_exacerbation_indices(df, minimum_period=14, maximum_period=35):
    """
    Return dataframe indices of exacs that need checking for PRO responses since last exac.

    Get the indices of exacerbations that occur long enough after the previous event to
    not be removed by PRO LOGIC criterion 1 (e.g. within 14 days of previous exac) but
    not long enough after to be counted as a separate event without further analysis.
    Called by apply_logic_response_criterion.

    Args:
        df (pd.DataFrame): must contain IsExac and DaysSinceLastExac columns.
        minimum_period (int): minimum number of days since the previous exac (any exacs
            within this window will already be removed with PRO LOGIC criterion 1).
            Default value is 14 days.
        maximum_period (int): maximum number of days since the previous exac (any exacs
            occurring after this period will automatically count as a separate event).
            Default is 35 days.

    Returns:
        list: dataframe indices of relevant events.
    """
    # Exacerbations falling in the ambiguous window (minimum_period, maximum_period]
    in_window = ((df.DaysSinceLastExac > minimum_period) &
                 (df.DaysSinceLastExac <= maximum_period))
    candidates = df[df.IsExac.eq(1) & in_window]
    return candidates.index.to_list()
246
+
247
+
248
def get_rescue_med_pro_responses(df):
    """Extract all responses to weekly PRO Q5 (rescue meds).

    Add new boolean columns stating if Q5 was answered, whether it was a negative
    response (no rescue meds taken in previous week) and whether the reply implies a
    community exacerbation. The latter two columns are always opposites of each other.

    Args:
        df (pd.DataFrame): PRO symptom diary responses.

    Returns:
        pd.DataFrame: filtered weekly PROs with additional boolean columns Q5Answered,
            NegativeQ5 and IsCommExac.

    """
    # Keep only rows where Q5 was actually answered
    answered = df[df.SymptomDiaryQ5.notna()].copy()
    answered['SymptomDiaryQ5'] = answered['SymptomDiaryQ5'].astype('int64')
    # Every remaining row is, by construction, an answered Q5
    answered['Q5Answered'] = 1
    # Negative response = no rescue meds taken in the previous week
    answered['NegativeQ5'] = answered.SymptomDiaryQ5.eq(0).astype('int64')
    # Positive response to Q5 defines a community exacerbation
    answered['IsCommExac'] = answered.SymptomDiaryQ5.eq(1).astype('int64')
    return answered
272
+
273
+
274
def logic_consecutive_negative_responses(df, i, N=2):
    """
    Calculate number of consecutive -ve Q5 replies since previous exac (PRO LOGIC).

    Given the dataframe index of the current exac identified as falling under the Q5
    criterion, calculate the number of negative replies to the weekly rescue med question
    and check if there are enough for the event to count as distinct from the previous.
    Called by apply_logic_response_criterion.

    NOTE(review): the positional slice below assumes one row per day, so that a
    positional offset equals a day offset — confirm against the caller's data prep.

    Args:
        df (pd.DataFrame): must contain weekly PRO replies and output from
            get_rescue_med_pro_responses, set_pro_exac_dates and
            calculate_days_since_exacerbation.
        i (int): index of exac of interest.
        N (int): number of consecutive negative rescue meds required for event to be
            counted as a separate event and retained in data. Default is 2.

    Returns:
        int: flag for whether the exac failed the criterion. Returns 1 for failed (exac to
            be removed) and 0 for passed (exac to be retained).

    """
    # Select data since the previous exacerbation (exclusive of both event days)
    days = int(df.iloc[i].DaysSinceLastExac)
    data = df.iloc[i - days + 1: i]

    # Select replies to Q5
    data = data[data.Q5Answered.eq(1)][['PatientId', 'DateOfEvent', 'Q5Answered',
                                        'NegativeQ5']]
    # Fewer than N replies at all cannot contain N consecutive negatives -> remove
    if len(data) < N:
        return 1
    else:
        # Resample to 7 days (weekly) to account for missing responses. Resampling using
        # the 'W' option can give spurious nans - use '7D' instead
        data = data.set_index('DateOfEvent').resample('7D',
                                                      origin='start').sum().reset_index()
        # Longest run of consecutive negative replies to Q5 (no rescue meds taken):
        # group the negative rows by the running count of positives (each positive
        # reply starts a new group) and take the largest group sum
        consecutive_negative_responses = data[data.NegativeQ5.eq(1)][
            'NegativeQ5'].groupby(data.NegativeQ5.eq(0).cumsum()).sum().reset_index(
            drop=True).max()

        return 1 if consecutive_negative_responses < N else 0
317
+
318
+
319
def minimum_period_between_exacerbations(df, minimum_days=14):
    """
    Flag exacerbations occurring too soon after the previous one.

    Uses DaysSinceLastExac: an event is flagged (1) when it falls within
    minimum_days of the patient's previous exacerbation, otherwise 0.

    Args:
        df (pd.DataFrame): must contain a DaysSinceLastExac column.
        minimum_days (int): minimum allowed gap between events in days.
            Default is 14.

    Returns:
        array: 1 where the event occurred too soon, otherwise 0.
    """
    gap = df['DaysSinceLastExac']
    too_soon = gap.gt(0) & gap.le(minimum_days)
    return np.where(too_soon, 1, 0)
334
+
335
+
336
def remove_data_between_exacerbations(df):
    """
    Flag for removal the days between a first exac and failed follow-on exacs.

    Ensures only the first in a series of related events is counted. Any
    subsequent exacerbation that occurred too close to the initial event
    without enough negative weekly PRO responses in between (RemoveExac == 1)
    causes every row from the day after the first event up to and including
    the failed event to be flagged in RemoveRow. Data following the final
    event in the series is handled by minimum_period_between_exacerbations.

    Args:
        df (pd.DataFrame): must contain RemoveExac and DaysSinceLastExac
            columns; assumes a default integer index matching row positions.

    Returns:
        pd.DataFrame: input dataframe with RemoveRow set to 1 over the spans
        between a first event and any subsequent event(s) that failed the Q5
        criterion.
    """
    # Exacerbations that failed the N-consecutive-negative-responses
    # criterion on Q5 of the weekly PROs (rescue meds)
    for failed_index in df.index[df.RemoveExac.eq(1)]:
        # Span back to the previous exacerbation
        gap = int(df.iloc[failed_index].DaysSinceLastExac)
        # Flag everything after the previous event up to and including this one
        df.loc[failed_index - gap + 1: failed_index, 'RemoveRow'] = 1
    return df
365
+
366
+
367
def remove_unknown_date_exacerbations(df, days_to_remove=7):
    """
    Flag for removal data leading up to an exacerbation with an unknown date.

    Args:
        df (pd.DataFrame): one row per day per patient for the full data
            window. Must include ExacDateUnknown and RemoveRow columns;
            assumes a default integer index matching row positions.
        days_to_remove (int): number of days of data to remove leading up to
            (and including) the PRO response date. Default is 7 days.

    Returns:
        pd.DataFrame: input dataframe with updated RemoveRow column.
    """
    # Exacerbations whose dates are flagged as unknown
    for unknown_index in df.index[df.ExacDateUnknown.eq(1)]:
        # Flag the preceding days_to_remove days, ending at the response date
        df.loc[unknown_index - days_to_remove + 1: unknown_index,
               'RemoveRow'] = 1
    return df
390
+
391
+
392
def rolling_mean_previous_period(*, df, cols, date_col, id_col, window):
    """
    Resample each patient's data to daily and compute rolling means over window.

    Uses daily resampling due to strange behaviour with weekly/yearly
    resampling when calculating rolling quantities with missing/NaN entries.
    The rolling mean is shifted by one day so that each date holds the mean
    of the *previous* period (e.g. a 365-day mean covers the prior 365 days
    and excludes the current date), keeping the current exacerbation or
    hospital admission out of its own aggregate.

    Args:
        df (pd.DataFrame): data of interest. Must contain the specified cols,
            date_col and id_col columns.
        cols (list of str): names of columns on which to calculate the
            rolling mean.
        date_col (str): name of the date column (set as index for
            aggregation).
        id_col (str): name of column containing unique patient identifiers.
        window (int): length of rolling window in days. Use window = 7 for a
            weekly mean and window = 365 for a yearly mean.

    Returns:
        pd.DataFrame: one row per patient per day with the rolling mean
        columns renamed with an '_ave' suffix (first day per patient is NaN).
    """
    # Resample to daily records per patient (copy keeps the caller's frame
    # untouched)
    daily = (df.copy()
             .set_index(date_col)
             .groupby(id_col)[cols]
             .resample('D')
             .mean()
             .reset_index())
    # Rolling mean over the specified window (in days)
    averages = (daily.set_index(date_col)
                .groupby(id_col)[cols]
                .rolling(window=window, min_periods=1)
                .mean()
                .reset_index())
    # Shift forward one day so the current date is excluded from its own mean
    averages[cols] = averages.groupby(id_col)[cols].shift(1)
    # Suffix marks the columns as aggregations
    return averages.rename(
        columns={name: name + '_ave' for name in averages.columns
                 if name in cols})
431
+
432
+
433
def rolling_sum_previous_period(*, df, col, date_col, id_col, window, output_col):
    """
    Resample each patient's data to daily and compute a rolling sum over window.

    Uses daily resampling due to strange behaviour with weekly/yearly
    resampling when calculating rolling quantities with missing/NaN entries.
    The rolling sum is shifted by one day so that each date holds the sum of
    the *previous* period (e.g. a 365-day sum covers the prior 365 days and
    excludes the current date), keeping the current exacerbation or hospital
    admission out of its own count.

    Args:
        df (pd.DataFrame): data of interest. Must contain the specified col,
            date_col and id_col columns.
        col (str): name of column on which to calculate the rolling sum.
        date_col (str): name of the date column (set as index for
            aggregation).
        id_col (str): name of column containing unique patient identifiers.
        window (int): length of rolling window in days. Use window = 7 for
            weekly sums and window = 365 for yearly sums.
        output_col (str): name of the rolling sum column in the output.

    Returns:
        pd.DataFrame: input dataframe with the rolling sum column merged on
        as output_col (int64, 0 where no previous data exists).
    """
    # Resample to daily records per patient (copy keeps the caller's frame
    # untouched)
    daily = (df.copy()
             .set_index(date_col)
             .groupby(id_col)[col]
             .resample('D')
             .sum()
             .reset_index())
    # Rolling sum over the specified window (in days)
    sums = (daily.set_index(date_col)
            .groupby(id_col)[col]
            .rolling(window=window, min_periods=1)
            .sum()
            .reset_index())
    # Shift forward one day so the current date is excluded from its own sum
    sums[col] = sums.groupby(id_col)[col].shift(1).fillna(0)
    sums = sums.rename(columns={col: output_col})
    # Attach the aggregate back onto the original frame
    merged = df.merge(sums, on=[id_col, date_col], how='left')
    merged[output_col] = merged[output_col].astype('int64')
    return merged
474
+
475
+
476
def set_prediction_window(*, df, prediction_window):
    """
    Extend the IsExac label to cover an N-day prediction window.

    For a window of N days (e.g. N = 3), the N-1 days before each recorded
    exacerbation date also receive IsExac = 1, so the label means "an
    exacerbation occurs within N days of this date" rather than on the exact
    date only.

    Args:
        df (pd.DataFrame): must contain an IsExac column holding the final
            list of exacerbation events to be used for modelling; assumes a
            default integer index matching row positions.
        prediction_window (int): length of model prediction window in days.

    Returns:
        pd.DataFrame: input dataframe with the extended exacerbation window.
    """
    # Index selection is evaluated once, before any labels are modified
    for exac_index in df.index[df.IsExac.eq(1)]:
        # Label the window leading up to and including the event date
        df.loc[exac_index - prediction_window + 1: exac_index, 'IsExac'] = 1
    return df
502
+
503
+
504
+ def set_pro_exac_dates(df):
505
+ """
506
+ Set date of community exacerbations reported in weekly PROs Q5 and flag unknown dates.
507
+
508
+ Args:
509
+ df (pd.DataFrame: processed weekly PROs Q5 respnses, e.g. output of
510
+ get_rescue_med_pro_responses
511
+
512
+ Returns:
513
+ pd.DataFrame: input dataframe with additional columns for DateOfEvent (datetime)
514
+ and ExacDateUnknown (0 or 1).
515
+ """
516
+ # Take known exacerbation (rescue med) dates from SymptomDiaryQ11b, otherwise set the
517
+ # date to the date of PRO response
518
+ df['DateOfEvent'] = np.where(df.SymptomDiaryQ11a == 2, df.SymptomDiaryQ11b,
519
+ df.SubmissionTime)
520
+ # Flag which dates were unknown from the PRO response
521
+ df['ExacDateUnknown'] = np.int64(np.where((df.IsCommExac == 1) &
522
+ (df.SymptomDiaryQ11a != 2), 1, 0))
523
+ df['DateOfEvent'] = pd.to_datetime(df.DateOfEvent, utc=True).dt.normalize()
524
+ df = df.drop_duplicates(keep='last', subset=['PatientId', 'DateOfEvent'])
525
+ return df
526
+
527
+
528
def triple_inhaler_therapy_service(*, df, id_col, inhaler_col, include_mitt=False):
    """
    Create a boolean (1/0) feature for triple inhaler therapy per patient.

    Single Inhaler Triple Therapy (SITT) is always included ('LAMA +LABA-ICS'
    or 'LABA-LAMA-ICS'); Multiple Inhaler Triple Therapy (MITT, the
    combination 'LAMA' + 'LABA-ICS') is included only when requested.

    Args:
        df (pd.DataFrame): dataframe containing list of inhaler names against
            patient IDs.
        id_col (str): name of patient ID column.
        inhaler_col (str): name of column containing inhaler types in the
            format of the COPD service data, e.g. LAMA, LABA, LABA-LAMA-ICS,
            LAMA +LABA-ICS etc.
        include_mitt (bool): whether to also count Multiple Inhaler Triple
            Therapy (MITT).

    Returns:
        pd.DataFrame: one row per patient with id_col and a TripleTherapy
        (1/0) column.
    """
    # One row per patient with a column per inhaler type they are taking
    wide = (df.drop_duplicates()
              .pivot(index=id_col, columns=inhaler_col, values=inhaler_col)
              .reset_index()
              .rename_axis(None, axis=1))
    # Ensure every service inhaler type has a column, even if absent from the
    # data, so the selections below never raise
    service_types = ['LABA-LAMA', 'LAMA', 'LABA-ICS', 'LAMA +LABA-ICS',
                     'LABA-LAMA-ICS', 'LABA']
    for inhaler_type in service_types:
        if inhaler_type not in wide:
            wide[inhaler_type] = np.nan
    # SITT: either single-inhaler triple therapy product
    sitt = wide['LABA-LAMA-ICS'].notna() | wide['LAMA +LABA-ICS'].notna()
    wide['TripleTherapy'] = np.int64(np.where(sitt, 1, 0))
    if include_mitt is True:
        # MITT: separate LAMA plus LABA-ICS inhalers
        mitt = wide['LAMA'].notna() & wide['LABA-ICS'].notna()
        wide['TripleTherapy'] = np.int64(
            np.where(mitt, 1, wide['TripleTherapy']))
    return wide[[id_col, 'TripleTherapy']]
566
+
567
+
568
def unit_lookup(units):
    """Convert Lenus platform unit codes to human readable units.

    Args:
        units (pd.Series): Lenus platform unit codes for a measurement.

    Returns:
        array: human readable measurement units; any code not in the lookup
        table becomes 'Undefined'.
    """
    # Mapping from Lenus platform unit code to description
    units_lookup = {0: 'Count',
                    1: 'CountPerSecond',
                    2: 'InternationalUnit',
                    3: 'Joule',
                    4: 'Kelvin',
                    5: 'Kilogram',
                    6: 'KilogramPerLiter',
                    7: 'KilogramPerSquareMeter',
                    8: 'Liter',
                    9: 'LiterPerKilogramSecond',
                    10: 'LiterPerSecond',
                    11: 'Meter',
                    12: 'Pascal',
                    13: 'Percent',
                    14: 'Second',
                    15: 'Siemen',
                    16: 'Undefined'}
    # Codes outside the table map to NaN, which becomes 'Undefined'
    translated = units.map(units_lookup)
    return np.where(translated.notna(), translated, 'Undefined')
600
+
601
+
602
def kfold_encode_train_data(*, df, id_col, fold_patients, cols_to_encode, target):
    """
    K-fold target encoding of train data.

    Fold by fold target encoding of train data is used to prevent data
    leakage in cross-validation (the same folds are used for encoding and
    CV). For example, in 10-fold target encoding, each fold is encoded using
    the other nine folds and that fold is then used as the validation fold
    in CV. (The complete train data set is used to target encode the holdout
    test data set, outside this function.)

    Parameters
    ----------
    df : dataframe
        complete train data set to be encoded.
    id_col : str
        name of the patient ID column used to assign rows to folds.
    fold_patients : iterable of array-like
        patient IDs belonging to each of the K folds.
    cols_to_encode : list of strings
        names of columns to be encoded.
    target : str
        name of the target variable column.

    Returns
    -------
    df_encoded : dataframe
        train data with cols_to_encode target encoded fold by fold.
    """
    # Encode the train data fold by fold
    appended_data = []
    for i, fold in enumerate(fold_patients):
        print("Fold ", i)
        # Rows for patients in this fold are encoded using the remaining folds
        val_fold_data = df[df[id_col].isin(fold)]
        train_fold_data = df[~df[id_col].isin(fold)]
        encoded_fold_data, encodings = encoding.encode_validation_fold(
            val_fold=val_fold_data, train_folds=train_fold_data,
            cols_to_encode=cols_to_encode, target=target)
        appended_data.append(encoded_fold_data)
    # Reconstruct the full dataframe from the encoded folds
    df_encoded = pd.concat(appended_data)
    df_encoded.reset_index(inplace=True, drop=True)
    return df_encoded
training/create_sh_lookup_table.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build the SafeHaven ID to StudyId lookup table and sanity check it."""
import os

import pandas as pd

data_dir = '<YOUR_DATA_PATH>/'

# Read lookups for RECEIVER
# receiver = pd.read_csv(os.path.join(data_dir, 'Receiver_IDs', 'COHORT_CONSENTED_2.csv'))
# receiver = receiver.rename(columns={'Study number': 'StudyId'})
# receiver = receiver[['SafeHavenID', 'StudyId']]
receiver = pd.read_csv(os.path.join(data_dir, 'Cohort3Rand.csv'))
receiver = receiver.rename(columns={'RNo': 'StudyId'})

# Read lookups for scale up
scaleup = pd.read_csv(os.path.join(data_dir, 'SU_IDs', 'Scale_Up_lookup.csv'))
scaleup = scaleup.rename(columns={'Study_Number': 'StudyId'})

# Concatenate tables and drop missing SH IDs (some study patients not in data extract)
all_patients = pd.concat([receiver, scaleup]).dropna()

# Save final mapping between StudyId and SafeHavenID
all_patients.to_pickle(os.path.join(data_dir, 'sh_to_studyid_mapping.pkl'))

# Check for matching age and sex between SafeHaven and Lenus data (mapping sanity check)
lenus_demographics = pd.read_csv(os.path.join(data_dir, 'copd-dataset',
                                              'CopdDatasetPatientDetails.txt'),
                                 usecols=['StudyId', 'DateOfBirth', 'Sex'], sep='|')
sh_demographics = pd.read_csv(os.path.join(data_dir, 'EXAMPLE_STUDY_DATA',
                                           'Demographics_Cohort4.csv'),
                              usecols=['SafeHavenID', 'SEX', 'OBF_DOB'])

sh_demographics['OBF_DOB'] = pd.to_datetime(
    sh_demographics['OBF_DOB'], utc=True).dt.normalize()

mapping = all_patients.merge(sh_demographics, on='SafeHavenID', how='inner')
mapping = mapping.merge(lenus_demographics, on='StudyId', how='inner')

# Check patient sex matches. These checks were bare REPL-style expressions
# whose results were silently discarded when run as a script; print them so
# the sanity checks are actually visible.
print(mapping[mapping.SEX != mapping.Sex])
# There is one mismatch - inspect duplicated SafeHaven IDs
print(all_patients[all_patients.duplicated(subset='SafeHavenID')])

# Check patient DOB matches
print(mapping[mapping.OBF_DOB != mapping.DateOfBirth])
training/cross_validation.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform CV (with explainability) on different feature sets and log to mlflow.
2
+
3
+ Includes functionality to nest runs under parent run (e.g. different feature sets
4
+ under a main run) and set a decision threshold for model scores. Logs the following
5
+ artifacts as well as metrics and parameters:
6
+ 1. List of model features
7
+ 2. Feature correlation matrix
8
+ 3. Global explainability (averaged over K folds)
9
+ 4. Cumulative gains curve
10
+ 5. Lift curve
11
+ 6. Probability distributions with KDE
12
+ """
13
+ from imblearn.ensemble import BalancedRandomForestClassifier
14
+ from lenusml import splits, crossvalidation, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+ from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
19
+ import mlflow
20
+ import matplotlib.pyplot as plt
21
+ # from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
22
+
23
+
24
def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Build a dataframe of mean global feature importance across CV estimators.

    Args:
        feature_names (list): list of model feature names.
        crossval (dict): output of cross_validation_return_estimator_and_scores
            (fitted estimators are read from its 'estimator' entry).

    Returns:
        pd.DataFrame: feature names, global importance for each of the K
        estimators (Score_i columns), the mean importance across estimators
        and the mean scaled relative to the most important feature.
    """
    explanations_all = None
    # Collect global importance from each estimator used in cross validation
    for i, estimator in enumerate(crossval['estimator']):
        importances = pd.DataFrame([feature_names,
                                    estimator.feature_importances_]).T
        importances.columns = ['Feature', 'Score_{}'.format(i)]
        # Accumulate a wide frame with one score column per estimator
        if explanations_all is None:
            explanations_all = importances.copy()
        else:
            explanations_all = explanations_all.merge(importances,
                                                      on='Feature')

    # Average the importances across all models and sort descending
    explanations_all['Mean'] = explanations_all.drop(
        columns=['Feature']).mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)
    # Scale relative to the most important feature
    explanations_all['Mean_scaled'] = (explanations_all['Mean']
                                       / explanations_all['Mean'].abs().max())
    return explanations_all
57
+
58
+
59
# Paths to the prepared model data, cohort definitions and outputs
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, id_column='StudyId', train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']

####
# Feature drop out here
#####

# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Separate features from target
features = train_data[features_list].astype('float')
target = train_data.IsExac.astype('float')

# Save the list of features and a correlation heatmap to the artifacts directory (to
# be logged in mlflow)
artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between different runs
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
           delimiter=",", fmt='%s')

plots.plot_feature_correlations(features=features,
                                figsize=(len(features_list) // 2,
                                         len(features_list) // 2),
                                savefig=True, output_dir=artifact_dir,
                                figname='features_correlations.png')

# # Get the run_id of the best model from hyperparameter tuning and its parameters
# best_run = mlflow.search_runs([8], order_by=["metrics.precision DESC"]).iloc[0].run_id
# best_params = mlflow.get_run(best_run).data.params
# best_params

# params = {'inner_bags': 1,
#           'interactions': 4,
#           'learning_rate': 0.0012416471483555312,
#           'max_leaves': 12,
#           'max_rounds': 5000,
#           'min_samples_leaf': 5,
#           'outer_bags': 3,
#           'random_state': 0}

with mlflow.start_run(run_name='eosinophil_count_0.3_threshold'):
    # runid = mlflow.active_run().info.run_id
    # with mlflow.start_run(run_name='simplified_with_nanox', nested=True,
    #                       tags={MLFLOW_PARENT_RUN_ID: runid}):

    # Use the parameters from the best model in previous cross validation
    model = BalancedRandomForestClassifier(random_state=0)
    # crossval = cross_validate(model, features, target,
    #                           cv=cross_validation_fold_indices,
    #                           return_estimator=True, scoring=scoring)

    # Perform K-fold cross validation with custom folds
    # Set the probability threshold here if required
    crossval, model_scores =\
        crossvalidation.cross_validation_return_estimator_and_scores(
            model=model, features=features,
            target=target,
            fold_indices=cross_validation_fold_indices)

    # Log metrics averaged across folds
    for score in scoring:
        mlflow.log_metric(score, np.mean(crossval['test_' + score]))

    # Log model parameters
    params = model.get_params()
    for param in params:
        mlflow.log_param(param, params[param])

    # Calculate average global feature importances across K models
    explainability = get_crossvalidation_importance(feature_names=features_list,
                                                    crossval=crossval)
    explainability.to_csv(os.path.join(artifact_dir,
                                       'global_feature_importances.csv'),
                          index=False)
    plots.plot_global_explainability_cv(importances=explainability,
                                        scaled=True,
                                        figsize=(len(features_list) // 2.5,
                                                 len(features_list) // 6),
                                        savefig=True, output_dir=artifact_dir)
    # Plot lift and cumulative gains curves. FIX: the figname arguments were
    # previously swapped, saving each plot under the other plot's filename.
    plots.plot_lift_curve(scores=model_scores, savefig=True,
                          output_dir=artifact_dir,
                          figname='lift_curve.png')
    plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                      output_dir=artifact_dir,
                                      figname='cumulative_gains_curve.png')

    # Plot distribution of model scores (histogram plus KDE)
    plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                  negative_class_name='No exac', savefig=True,
                                  output_dir=artifact_dir,
                                  figname='model_score_distribution.png')

    # Plot CV confusion matrices with different decision thresholds
    for threshold in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        plots.plot_confusion_matrix(
            target_true=model_scores.true_label,
            target_predicted=np.where(model_scores.model_score > threshold, 1, 0),
            classes=['No exac', 'Exac'], savefig=True,
            output_dir=artifact_dir,
            figname='confusion_matrix_{}.png'.format(threshold))

    # Plot the ROC and Precision-Recall curves
    fig, ax = plt.subplots(figsize=(8, 6))
    RocCurveDisplay.from_predictions(y_true=model_scores.true_label,
                                     y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.legend(frameon=False)
    plt.tight_layout()
    plt.savefig(os.path.join(artifact_dir, 'roc_curve.png'), dpi=150)
    plt.close()

    fig, ax = plt.subplots(figsize=(8, 6))
    PrecisionRecallDisplay.from_predictions(y_true=model_scores.true_label,
                                            y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    plt.legend(frameon=False)
    plt.tight_layout()
    plt.savefig(os.path.join(artifact_dir, 'precision_recall_curve.png'), dpi=150)
    plt.close()

    # Log artifacts
    mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
# mlflow.end_run()
training/cross_validation_algorithms.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Perform cross validation using a variety of algorithms."""
import os
import pandas as pd
import numpy as np

from lenusml import splits, plots

# Model training and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from interpret.glassbox import ExplainableBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
import mlflow


# Paths to the prepared model data, cohort definitions and outputs
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, train_data=train_data, id_column='StudyId')

# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Separate features from target
features = train_data[features_list].astype('float')
target = train_data.IsExac.astype('float')

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
# Ratio of negative to positive samples, used by xgboost to rebalance classes.
# (Previously this was computed twice; the duplicate assignment was removed.)
scale_pos_weight = target.value_counts()[0] / target.value_counts()[1]

# Candidate algorithms to compare, each paired with its mlflow run name
models = []
models.append((RandomForestClassifier(random_state=0), 'random_forest'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_class_weight'))
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'balanced_random_forest'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'xgb'))
models.append((lgb.LGBMClassifier(random_state=0), 'lgbm'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'xgb_spw'))
models.append((ExplainableBoostingClassifier(random_state=0), 'ebm'))

with mlflow.start_run(run_name='model_selection'):
    # Perform K-fold cross validation with custom folds, one nested run per
    # candidate algorithm
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            # Create the artifacts directory if it doesn't exist
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            crossval = cross_validate(model[0], features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring)
            # Get the predicted probabilities from each model
            probabilities_cv = cross_val_predict(model[0], features, target,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # Plot distribution of model scores (histogram plus KDE)
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac',
                                          savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
training/cross_validation_calibration.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform model calibration in CV on different algorithms and log to mlflow.
2
+
3
+ Nests runs for different algos under parent run and logs the following
4
+ artifacts as well as metrics and parameters:
5
+ 1. Calibration curves for each child algo run (calibration in CV and calibration on
6
+ holdout test after applying isotonic and sigmoid calibration)
7
+ 2. Calibration curve under parent run to compare all algos in CV and post calibration
8
+ 3. Cumulative gains curve
9
+ 4. Lift curve
10
+ 5. Probability distributions with KDE (CV)
11
+ """
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.lines as mlines
14
+ from lenusml import splits, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+
19
+ from sklearn.model_selection import cross_val_predict, cross_validate
20
+ from sklearn.calibration import calibration_curve, CalibratedClassifierCV
21
+
22
+ from sklearn.linear_model import LogisticRegression
23
+ from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
24
+ from sklearn.ensemble import RandomForestClassifier
25
+ import xgboost as xgb
26
+ import lightgbm as lgb
27
+ from interpret.glassbox import ExplainableBoostingClassifier
28
+
29
+ import mlflow
30
+ from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
31
+
32
+
33
# --- Paths, CV folds and mlflow experiment setup ---------------------------
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data. fold_patients is a pickled object array —
# presumably one entry per fold listing that fold's patient ids; confirm
# against lenusml.splits.custom_cv_fold_indices.
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))
test_data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl'))
# Cross check fold patients with train data and turn the per-patient fold
# assignment into row-index (train, test) pairs usable as sklearn's cv=
cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients,
                                                              train_data=train_data,
                                                              id_column='StudyId')

# All runs/metrics/artifacts tracked in a local sqlite mlflow store
mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters.
# neg_brier_score is included because this script evaluates calibration.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
53
+
54
+
55
def plot_calibration_curves(calibration_curves, savefig=True, output_dir=None,
                            figname=None, figsize=(8, 7)):
    """Plot several calibration curves on one set of axes.

    Args:
        calibration_curves (list): tuples of ((prob_true, prob_pred), label),
            where (prob_true, prob_pred) is the output of
            sklearn.calibration.calibration_curve.
        savefig (bool): if True, save the figure to output_dir/figname.
        output_dir (str): directory for the saved figure (required if savefig).
        figname (str): file name for the saved figure (required if savefig).
        figsize (tuple): matplotlib figure size in inches.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Perfect-calibration reference line (y = x), drawn in axes coordinates
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)
    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')
    # One distinct rainbow colour per curve so many algos stay readable
    color = iter(plt.cm.rainbow(np.linspace(0, 1, len(calibration_curves))))
    for cal_curve in calibration_curves:
        c = next(color)
        # cal_curve[0] = (prob_true, prob_pred): predicted on x, true on y
        ax.plot(cal_curve[0][1], cal_curve[0][0], marker='o', c=c, linewidth=1,
                label=cal_curve[1])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False, bbox_to_anchor=(1, 1), loc="upper left")
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
        # Close the figure so repeated calls do not accumulate open figures
        # (matplotlib keeps every figure alive otherwise — memory leak).
        plt.close(fig)
77
+
78
+
79
def plot_calibration_curves_algo(calibration_curves, savefig=True, output_dir=None,
                                 figname=None, figsize=(8, 7)):
    """Plot one algorithm's calibration curves (uncalibrated/sigmoid/isotonic).

    Args:
        calibration_curves (list): tuples of ((prob_true, prob_pred), label),
            where (prob_true, prob_pred) is the output of
            sklearn.calibration.calibration_curve.
        savefig (bool): if True, save the figure to output_dir/figname.
        output_dir (str): directory for the saved figure (required if savefig).
        figname (str): file name for the saved figure (required if savefig).
        figsize (tuple): matplotlib figure size in inches.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Perfect-calibration reference line (y = x), drawn in axes coordinates
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)
    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')
    for cal_curve in calibration_curves:
        # cal_curve[0] = (prob_true, prob_pred): predicted on x, true on y
        ax.plot(cal_curve[0][1], cal_curve[0][0], marker='o', linewidth=1,
                label=cal_curve[1])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False)
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
        # Close the figure — this function is called once per model inside the
        # CV loop and un-closed figures accumulate (memory leak).
        plt.close(fig)
99
+
100
+
101
# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']

# Get the features list from the preferred model (hard-coded mlflow run id
# from a previous model-selection experiment; update if that run changes)
with open('./mlruns/2/7ebf60a5d17f49d9a79e41dd72dda858/artifacts/features.txt') as f:
    features_list = f.read().splitlines()

# Separate features from target ('IsExac' = exacerbation label)
features_train = train_data[features_list].astype('float')
target_train = train_data.IsExac.astype('float')
features_test = test_data[features_list].astype('float')
target_test = test_data.IsExac.astype('float')

artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between different runs
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

# Ratio of negative to positive samples, used to re-weight XGBoost (SPW run)
scale_pos_weight = target_train.value_counts()[0] / target_train.value_counts()[1]

# Create list of (estimator, mlflow run name) pairs to try
models = []
models.append((LogisticRegression(random_state=0, max_iter=200), 'LR'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'LR_CW_balanced'))
models.append((lgb.LGBMClassifier(random_state=0), 'LGBM'))
models.append((BalancedBaggingClassifier(random_state=0),
               'Balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'Balanced_RF'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'XGB'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'XGB_SPW'))
models.append((ExplainableBoostingClassifier(random_state=0), 'EBM'))
models.append((RandomForestClassifier(random_state=0), 'RF'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'RF_CW_balanced'))

# Calibration curves accumulated across all algorithms, plotted together under
# the parent run at the end of the script
calibration_curves_cv = []
calibration_curves_sigmoid = []
calibration_curves_isotonic = []

# Equal-width probability bins for sklearn's calibration_curve
cal_curve_strategy = 'uniform'
146
+
147
# --- Main experiment loop --------------------------------------------------
# One parent mlflow run with a nested child run per algorithm. Each child:
#   * scores the uncalibrated model in CV,
#   * fits sigmoid- and isotonic-calibrated versions (CalibratedClassifierCV)
#     and evaluates them on the holdout test set,
#   * logs metrics, parameters and diagnostic plots as artifacts.
with mlflow.start_run(run_name='sklearn_calibration_in_cv_uniform_bins'):
    # Perform K-fold cross validation
    runid = mlflow.active_run().info.run_id
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            calibration_curves_algo = []
            crossval = cross_validate(model[0], features_train, target_train,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring,
                                      error_score='raise')
            # Out-of-fold predicted probabilities for the positive class
            probabilities_cv = cross_val_predict(model[0], features_train, target_train,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]

            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target_train})
            model_scores = model_scores.sort_values(by='model_score', ascending=False)

            # Extract calibration curve (uncalibrated model, CV predictions)
            calibration_curves_cv.append((calibration_curve(target_train,
                                          probabilities_cv, n_bins=10,
                                          strategy=cal_curve_strategy), model[1]))

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Calibrate model in CV (refits on the same custom folds)
            calibrated_sigmoid = CalibratedClassifierCV(model[0], method='sigmoid',
                                                        cv=cross_validation_fold_indices)
            calibrated_sigmoid.fit(features_train, target_train)
            probabilities_sigmoid = calibrated_sigmoid.predict_proba(features_test)[:, 1]

            calibrated_isotonic = CalibratedClassifierCV(model[0], method='isotonic',
                                                         cv=cross_validation_fold_indices)
            calibrated_isotonic.fit(features_train, target_train)
            probabilities_isotonic = calibrated_isotonic.predict_proba(
                features_test)[:, 1]

            # Extract calibration curves. Calibrated variants are evaluated on
            # the holdout test set; the uncalibrated one on CV predictions.
            calibration_curves_sigmoid.append((calibration_curve(target_test,
                                               probabilities_sigmoid, n_bins=10,
                                               strategy=cal_curve_strategy),
                                               model[1] + ' sigmoid'))
            calibration_curves_isotonic.append((calibration_curve(target_test,
                                                probabilities_isotonic, n_bins=10,
                                                strategy=cal_curve_strategy),
                                                model[1] + ' isotonic'))
            calibration_curves_algo.append((calibration_curve(target_train,
                                            probabilities_cv, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' uncalibrated'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_sigmoid, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' sigmoid'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_isotonic, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' isotonic'))

            # Plot cumulative gains curves
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')
            # Plot lift curves
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')

            # Plot distribution of model scores (histogram plus KDE).
            # NOTE(review): 'postive_class_name' is the lenusml API spelling.
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Plot calibration curves for each algo
            plot_calibration_curves_algo(calibration_curves=calibration_curves_algo,
                                         savefig=True, output_dir=artifact_dir,
                                         figname='calibration_curves.png',
                                         figsize=(8, 7))

            # Log artifacts under child runs
            mlflow.log_artifacts(artifact_dir)
            mlflow.end_run()

# Log artifacts under parent run: clear the artifact directory, regenerate the
# cross-algorithm comparison plots, then reopen the (now finished) parent run
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

plot_calibration_curves(calibration_curves=calibration_curves_cv, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_cv.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_sigmoid, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_sigmoid.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_isotonic, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_isotonic.png', figsize=(15, 10))

with mlflow.start_run(run_id=runid):
    mlflow.log_artifacts(artifact_dir)
mlflow.end_run()
training/cross_validation_comorbs.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform CV (with explainability) on different feature sets and log to mlflow.
2
+
3
+ Includes functionality to nest runs under parent run (e.g. different feature sets
4
+ under a main run) and set a decision threshold for model scores. Logs the following
5
+ artifacts as well as metrics and parameters:
6
+ 1. List of model features
7
+ 2. Feature correlation matrix
8
+ 3. Global explainability (averaged over K folds)
9
+ 4. Cumulative gains curve
10
+ 5. Lift curve
11
+ 6. Probability distributions with KDE
12
+ """
13
+ from imblearn.ensemble import BalancedRandomForestClassifier
14
+ from lenusml import splits, crossvalidation, plots
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+
19
+ import mlflow
20
+ from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
21
+
22
+
23
def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Create dataframe of mean global feature importance for all EBMs used in CV.

    Args:
        feature_names (list): list of model feature names
        crossval (dict): output of cross_validation_return_estimator_and_scores;
            its 'estimator' entry holds the K fitted estimators, each exposing
            a ``feature_importances_`` attribute.

    Returns:
        pd.DataFrame: contains feature names, global importance for each of the K
        estimators, mean importance across the estimators and scaled mean importance
        relative to the most important feature.

    Raises:
        ValueError: if crossval contains no estimators.
    """
    estimators = crossval['estimator']
    if len(estimators) == 0:
        # Previously this fell through to an opaque UnboundLocalError below.
        raise ValueError("crossval['estimator'] is empty - nothing to average")

    # Obtain global importance from each estimator used in cross validation
    explanations_all = None
    for i, est in enumerate(estimators):
        # Build from a dict so the score column is numeric float64 (the old
        # list-of-rows + transpose produced object-dtype columns, making the
        # mean below rely on fragile object-dtype coercion).
        explanations = pd.DataFrame({
            'Feature': feature_names,
            'Score_{}'.format(i): np.asarray(est.feature_importances_,
                                             dtype=float)})

        # Accumulate global feature importances for all K estimators,
        # aligning on feature name so scores cannot be mis-paired
        if explanations_all is None:
            explanations_all = explanations
        else:
            explanations_all = explanations_all.merge(explanations, on='Feature')

    # Average the importances across all models
    explanations_all['Mean'] = explanations_all.drop(columns=['Feature']).mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)
    # Create a scaled mean importance relative to the most important feature
    explanations_all['Mean_scaled'] = explanations_all['Mean'] /\
        explanations_all['Mean'].abs().max()
    return explanations_all
57
+
58
# --- Paths, CV folds and comorbidity data ----------------------------------
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data (pickled object array, hence allow_pickle)
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data and turn the per-patient fold
# assignment into row-index (train, test) pairs usable as sklearn's cv=
cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients,
                                                              id_column='StudyId',
                                                              train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']
# Load comorbidity data and get list of conditions captured in COPD service.
# Every column except the identifier/timestamp columns is a comorbidity flag.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('Id')
comorbidity_list.remove('PatientId')
comorbidity_list.remove('Created')

# Add the StudyId column for merging with the train data
patient_details = pd.read_pickle(os.path.join('<YOUR_DATA_PATH>/copd-dataset',
                                              'patient_details.pkl'))
comorbidities = comorbidities.merge(patient_details[['PatientId', 'StudyId']],
                                    on='PatientId', how='left')

# Map the True/False columns to 1/0 so they can be used as numeric features
bool_mapping = {True: 1, False: 0}
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping)
96
+
97
# --- Comorbidity feature screening -----------------------------------------
# One parent mlflow run. For each comorbidity flag: merge it into the train
# data as an extra feature, run CV with the previously selected algorithm,
# log results as a nested child run, then drop the column again so each
# iteration starts from the original feature set.
with mlflow.start_run(run_name='individual_comorbidities_no_binned'):
    runid = mlflow.active_run().info.run_id
    # Merge each comorbidity separately and train a model nested under the parent run
    for comorbidity in comorbidity_list:
        print(comorbidity)
        # Merge comorb and fill missing data with 0 (patients absent from the
        # comorbidity extract are treated as not having the condition)
        train_data = train_data.merge(comorbidities[['StudyId', comorbidity]],
                                      on='StudyId', how='left')
        train_data[comorbidity] = train_data[comorbidity].fillna(0)

        with mlflow.start_run(run_name=comorbidity, nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            ####
            # Feature addition/drop out here
            #####
            # Create list of model features (everything except ids, target and
            # the target-encoded comorbidity summary column)
            cols_to_drop = ['StudyId', 'IsExac', 'Comorbidities_te']
            features_list = [col for col in train_data.columns if col not in cols_to_drop]

            # Separate features from target
            features = train_data[features_list].astype('float')
            target = train_data.IsExac.astype('float')

            # Save the list of features and a correlation heatmap to the artifacts
            # directory (to be logged in mlflow)
            artifact_dir = './tmp'
            # Create the artifacts directory if it doesn't exist
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
                       delimiter=",", fmt='%s')

            plots.plot_feature_correlations(
                features=features, figsize=(len(features_list) // 2,
                                            len(features_list) // 2),
                savefig=True, output_dir=artifact_dir,
                figname="feature_correlations.png")

            # Use the parameters from the best model in previous cross validation
            model = BalancedRandomForestClassifier(random_state=0)

            # Perform K-fold cross validation with custom folds
            # Set the probability threshold here if required
            crossval, model_scores =\
                crossvalidation.cross_validation_return_estimator_and_scores(
                    model=model, features=features,
                    target=target,
                    fold_indices=cross_validation_fold_indices)

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            params = model.get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Calculate average global feature importances across K models
            explainability = get_crossvalidation_importance(feature_names=features_list,
                                                            crossval=crossval)
            explainability.to_csv(os.path.join(artifact_dir,
                                  'global_feature_importances.csv'), index=False)
            plots.plot_global_explainability_cv(importances=explainability,
                                                scaled=True,
                                                figsize=(
                                                    len(features_list) // 2.5,
                                                    len(features_list) // 6),
                                                savefig=True, output_dir=artifact_dir)
            # Plot lift and cumulative gains curves
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # Plot distribution of model scores (histogram plus KDE).
            # NOTE(review): 'postive_class_name' is the lenusml API spelling.
            plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
            mlflow.end_run()
        # Drop the comorbidity column so the next iteration starts clean
        train_data = train_data.drop(columns=[comorbidity])
training/define_exacerbations_prologic.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Collate all hospital, clincian verified and patient reported events and apply LOGIC."""
2
+ import copd
3
+ import numpy as np
4
+ import os
5
+ import pandas as pd
6
+
7
data_dir = '<YOUR_DATA_PATH>/copd-dataset'

############################################################################
# Define model cohort and training data windows
############################################################################

# Read relevant info from patient details
patient_details = pd.read_csv(os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
                              usecols=['PatientId', 'FirstSubmissionDate',
                                       'MostRecentSubmissionDate',
                                       'DateOfBirth', 'Sex', 'StudyId'],
                              delimiter="|")

# Select patients for inclusion (those with up to date events in service)
# Create list of patients for model inclusion
# Original RECEIVER cohort study id list
receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85)]
# This patient needs removing
receiver_patients.remove('RC34')
# Scale up patients (subset)
scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)]
scaleup_patients.append('SU287')

# List of all valid patients for modelling
valid_patients = receiver_patients + scaleup_patients

# Filter for valid patients accounting for white spaces in StudyId (e.g. RC 26 and RC 52)
patient_details = patient_details[patient_details.StudyId.str.replace(' ', '').isin(
    valid_patients)]
# Select only non null entries in patient data start/end dates
patient_details = patient_details[(patient_details.FirstSubmissionDate.notna()) &
                                  (patient_details.MostRecentSubmissionDate.notna())]

# Create a column stating the latest date permitted based on events added to service data
patient_details['LatestPredictionDate'] = '2022-02-28'

# Parse all window boundaries as UTC dates truncated to midnight so later
# date comparisons/joins work on calendar days
date_cols = ['FirstSubmissionDate', 'MostRecentSubmissionDate', 'LatestPredictionDate']
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda x: pd.to_datetime(x, utc=True).dt.normalize(), axis=1)

# Choose the earlier date out of the patient's last submission and the latest COPD data
# events
patient_details['LatestPredictionDate'] = patient_details[
    ['MostRecentSubmissionDate', 'LatestPredictionDate']].min(axis=1)

# Add N days to start of data window because predictions are made N days in advance
# N=3 for the 72 hr exac model
N = 3
patient_details['EarliestPredictionDate'] = patient_details['FirstSubmissionDate']\
    + pd.DateOffset(days=N)

# Remove any patients for whom the prediction start date overlaps the final submission
# date, i.e. they have too short a window of data
patient_details = patient_details[patient_details['EarliestPredictionDate'] <
                                  patient_details['LatestPredictionDate']]
# List of remaining patients
model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

print('Model cohort: {} patients. {} RECEIVER and {} SU'.format(
    len(model_patients),
    len(patient_details[patient_details['StudyId'].str.startswith('RC')]),
    len(patient_details[patient_details['StudyId'].str.startswith('SU')])))

df = patient_details[['PatientId', 'DateOfBirth', 'Sex', 'StudyId',
                      'FirstSubmissionDate', 'LatestPredictionDate']].copy()

# Create a dataframe with one row per patient per day of their data window.
# The window starts N days before first submission so the first predictions
# still have history behind them.
df["DateOfEvent"] = df.apply(lambda x: pd.date_range(x.FirstSubmissionDate -
                             pd.DateOffset(days=N), x.LatestPredictionDate, freq='D'),
                             axis=1)
df = df.explode('DateOfEvent').reset_index(drop=True)
82
+ ############################################################################
83
+ # Extract hospital exacerbations and admissions from COPD service data
84
+ # Includes 1 year pre-onboarding plus time on Lenus COPD service
85
+ ############################################################################
86
+
87
+ # Contains exacerbations among other event types
88
+ patient_events = pd.read_csv(os.path.join(data_dir, 'PatientEvents.txt'),
89
+ delimiter="|", usecols=['PatientId', 'DateOfEvent',
90
+ 'EventTypeId'])
91
+
92
+ # Filter for only patients in model cohort - will still contain events out of data windows
93
+ patient_events = patient_events[patient_events.PatientId.isin(model_patients)]
94
+
95
+ # Lookup table for patient event types
96
+ patient_event_types = pd.read_csv(os.path.join(data_dir, 'PatientEventTypes.txt'),
97
+ delimiter="|", usecols=['Id', 'Name'])
98
+ patient_event_types = patient_event_types.rename(columns={'Id': 'EventTypeId',
99
+ 'Name': 'EventName'})
100
+ # Merge patient events with lookup table)
101
+ patient_events = patient_events.merge(patient_event_types, on='EventTypeId')
102
+
103
+ # Identify hospital exacerbation events
104
+ patient_events['IsHospExac'] = copd.define_service_exac_event(
105
+ events=patient_events.EventName, include_community=False)
106
+ # Identify hospital admissions (all causes)
107
+ patient_events['IsHospAdmission'] = copd.define_hospital_admission(
108
+ patient_events.EventName)
109
+
110
+ admissions = patient_events[patient_events.IsHospAdmission == 1][['PatientId',
111
+ 'DateOfEvent',
112
+ 'IsHospAdmission']]
113
+ hosp_exacs = patient_events[patient_events.IsHospExac == 1][['PatientId',
114
+ 'DateOfEvent',
115
+ 'IsHospExac']]
116
+ admissions['DateOfEvent'] = pd.to_datetime(admissions.DateOfEvent,
117
+ utc=True).dt.normalize()
118
+ hosp_exacs['DateOfEvent'] = pd.to_datetime(hosp_exacs.DateOfEvent,
119
+ utc=True).dt.normalize()
120
+
121
+ hosp_exacs = hosp_exacs.drop_duplicates()
122
+ admissions = admissions.drop_duplicates()
123
+ # Save hospital exacerbations and admissions data
124
+ hosp_exacs.to_pickle(os.path.join(data_dir, 'hospital_exacerbations.pkl'))
125
+ admissions.to_pickle(os.path.join(data_dir, 'admissions.pkl'))
126
+
127
##########################################################################################
# Extract all rescue meds for model cohort in the year prior to onboarding. These will be
# used as a proxy for community exacerbations pre-OB (not captured in service data)
##########################################################################################

# Read mapping between StudyId and SafeHavenID, and filter for model cohort
id_mapping = pd.read_pickle('../data/sh_to_studyid_mapping.pkl')
id_mapping = id_mapping[id_mapping.StudyId.isin(model_study_ids)]

# Read pharmacy data and filter for model cohort
pharmacy = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA',
                                    'Pharmacy_Cohort4.csv'))
pharmacy = pharmacy[pharmacy.SafeHavenID.isin(id_mapping.SafeHavenID)]

# Pull out rescue med prescriptions only — BNF item codes, presumably oral
# steroids and rescue-pack antibiotics; confirm against the BNF code lists
steroid_codes = ['0603020T0AAACAC', '0603020T0AABKBK', '0603020T0AAAXAX',
                 '0603020T0AAAGAG', '0603020T0AABHBH', '0603020T0AAACAC',
                 '0603020T0AABKBK', '0603020T0AABNBN', '0603020T0AAAGAG',
                 '0603020T0AABHBH']

antibiotic_codes = ['0501013B0AAAAAA', '0501013B0AAABAB', '0501030I0AAABAB',
                    '0501030I0AAAAAA', '0501050B0AAAAAA', '0501050B0AAADAD',
                    '0501013K0AAAJAJ']
rescue_med_bnf_codes = steroid_codes + antibiotic_codes
pharmacy = pharmacy[pharmacy.PI_BNF_Item_Code.isin(rescue_med_bnf_codes)]

# Get latest and earliest dates for model cohort
cohort_dates = id_mapping.merge(patient_details[
    ['PatientId', 'StudyId', 'FirstSubmissionDate', 'LatestPredictionDate']],
    on='StudyId')

# Merge and keep only rescue meds in the year before patient onboarding
pharmacy_exacs = cohort_dates.merge(pharmacy, on='SafeHavenID').drop(
    columns=['PatientId', 'PI_BNF_Item_Code', 'PI_BNF_Item_Description',
             'DISP_DATE', 'SafeHavenID'])
pharmacy_exacs = pharmacy_exacs.rename(columns={'PRESC_DATE': 'DateOfEvent'})
pharmacy_exacs['DateOfEvent'] = pd.to_datetime(pharmacy_exacs['DateOfEvent'],
                                               utc=True).dt.normalize()
# Drop duplicates (one proxy event per patient per prescription date)
pharmacy_exacs = pharmacy_exacs.drop_duplicates()
# Filter on dates: keep prescriptions within one year before first submission
pharmacy_exacs = pharmacy_exacs[
    (pharmacy_exacs.DateOfEvent < pharmacy_exacs.FirstSubmissionDate) &
    (pharmacy_exacs.DateOfEvent >= pharmacy_exacs.FirstSubmissionDate -
     pd.DateOffset(years=1))]
# New column for rescue med exac type
pharmacy_exacs['IsRescueMedExac'] = 1
pharmacy_exacs = pharmacy_exacs.drop(
    columns=['FirstSubmissionDate', 'LatestPredictionDate'])

# Save "pharmacy exacerbations" data
pharmacy_exacs.to_pickle(os.path.join(data_dir, 'pharmacy_exacerbations.pkl'))
179
######################################################
# Extract patient reported exacerbation events
######################################################
########################
# Data post Q5 change
#######################

# Read file containing patient reported events (not patient_events because it contains
# the dates when patients answered PROs and not which date they reported as having taken
# their rescue meds)
symptom_diary = pd.read_csv(os.path.join(data_dir, 'CopdDatasetProSymptomDiary.txt'),
                            usecols=['PatientId', 'StudyId', 'Score', 'SubmissionTime',
                                     'SymptomDiaryQ5', 'SymptomDiaryQ11a', 'SymptomDiaryQ11b'],
                            delimiter="|")

# Date from which symptom diary Q5 responses use the new rescue-med format;
# only responses after this cutoff are used here
Q5ChangeDate = pd.to_datetime('2021-04-22', utc=True)
symptom_diary = copd.filter_symptom_diary(df=symptom_diary, date_cutoff=Q5ChangeDate,
                                          patients=model_patients)

weekly_pros = copd.get_rescue_med_pro_responses(symptom_diary)
weekly_pros = copd.set_pro_exac_dates(weekly_pros)
weekly_pros = weekly_pros[['PatientId', 'Q5Answered', 'NegativeQ5', 'IsCommExac',
                           'DateOfEvent', 'ExacDateUnknown']]

#########################
# Pre Q5 change events
#########################

# RECEIVER cohort - community events verified up to 16/03/21
receiver = pd.read_excel('./LenusEvents/breakdown_of_com_exac_160321.xlsx')
receiver = receiver.rename(columns={'Study number': 'StudyId',
                                    'Exacerbation recorded': 'DateRecorded'})
receiver_exacs = copd.extract_clinician_verified_exacerbations(receiver)

# Scale up cohort - community events verified up to 17/05/2021
scaleup = pd.read_excel('./LenusEvents/Scale_Up_comm_exac_count_V9_deident.xlsx')
scaleup = scaleup.rename(columns={'Study Number': 'StudyId',
                                  'Date Exacerbation recorded': 'DateRecorded'})
# Forward-fill StudyId — presumably the spreadsheet lists it only on the
# first row per patient; confirm against the source file
scaleup['StudyId'] = scaleup['StudyId'].ffill()

scaleup_exacs = copd.extract_clinician_verified_exacerbations(scaleup)

# Combine RECEIVER and scale up events into one df
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])
223
+
224
+ ####################################################################################
225
+ # Merge hospital, patient reported and rescue med exacs with daily patient records
226
+ #
227
+ # Exacerbations occurring in Lenus service period include verified clinician events
228
+ # pre-April 2021 (after onboarding) and community exacerbations recorded in weekly
229
+ # PROs post-April 2021. Hospital exacs occur in year prior to OB and on Lenus service.
230
+ # Rescue med exacs are only used for the year prior to OB.
231
+ # Need to ensure each record has both StudyId and PatientId to prevent losing events
232
+ ######################################################################################
233
+
234
+ # Patient reported, clinician verified (during COPD service time only, inner join)
235
+ df = df.merge(verified_exacs, on=['StudyId', 'DateOfEvent'], how='left')
236
+
237
+ # Patient reported, new rescue med PRO (April 2021 onwards, inner join)
238
+ df = df.merge(weekly_pros, on=['PatientId', 'DateOfEvent'], how='left')
239
+
240
+ # Hospital exacs (one year prior to OB plus time on service, outer join)
241
+ df = df.merge(hosp_exacs, on=['PatientId', 'DateOfEvent'], how='outer')
242
+ df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')
243
+
244
+ # Pharmacy exacs, (one year prior to OB up to OB only, outer join)
245
+ df = df.merge(pharmacy_exacs, on=['StudyId', 'DateOfEvent'], how='outer')
246
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')
247
+
248
+ # Respiratory hospital admissions (one year prior to OB plus time on service, outer join)
249
+ df = df.merge(admissions, on=['PatientId', 'DateOfEvent'], how='outer')
250
+ df = copd.fill_column_by_patient(df=df, id_col='PatientId', col='StudyId')
251
+
252
+ # Combine cols from individual datasets into one
253
+ df['ExacDateUnknown'] = np.where((df.ExacDateUnknown_x == 1) |
254
+ (df.ExacDateUnknown_y == 1), 1, 0)
255
+ df['IsCommExac'] = np.where((df.IsCommExac_x == 1) |
256
+ (df.IsCommExac_y == 1) | (df.IsRescueMedExac == 1), 1, 0)
257
+
258
+ # Column for whether an exacerbation of any kind occurred on each date. To be filtered
259
+ # using (PRO) LOGIC
260
+ df['IsExac'] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0)
261
+
262
+ # Resample the df to one day per patient starting from the earliest record (may be a
263
+ # pre-onboarding exac. Complete daily records required for calculating DaysSinceLastExac)
264
+ df = df.set_index('DateOfEvent').groupby('StudyId').resample('D').asfreq().drop(
265
+ 'StudyId', axis=1).reset_index()
266
+
267
+ # Infill binary cols with zero where applicable
268
+ df[['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac', 'ExacDateUnknown', 'IsExac',
269
+ 'IsRescueMedExac', 'IsHospAdmission']] = df[
270
+ ['Q5Answered', 'NegativeQ5', 'IsHospExac', 'IsCommExac', 'ExacDateUnknown',
271
+ 'IsExac', 'IsRescueMedExac', 'IsHospAdmission']].fillna(0)
272
+
273
+ # Infill some columns by StudyId to populate entire df
274
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='FirstSubmissionDate')
275
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='LatestPredictionDate')
276
+ df = copd.fill_column_by_patient(df=df, id_col='StudyId', col='PatientId')
277
+
278
+ # Retain only dates before the end of each patient's data window
279
+ df = df[df.DateOfEvent <= df.LatestPredictionDate]
280
+
281
+ print('Starting number of exacerbations: {}'.format(df.IsExac.sum()))
282
+ print('Exacerbations pre-onboarding to COPD service: {}'.format(
283
+ len(df[(df.IsExac == 1) & (df.DateOfEvent < df.FirstSubmissionDate)])))
284
+ print('Exacerbations post-onboarding to COPD service: {}'.format(
285
+ len(df[(df.IsExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate)])))
286
+ print('Number of unique exacerbation patients: {}'.format(
287
+ len(df[df.IsExac == 1].PatientId.unique())))
288
+ # print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
289
+ # .format(df.IsHospExac.sum(), df.IsCommExac.sum(),
290
+ # len(df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1)])))
291
+ print('Rescue med prescriptions in year prior to onboarding: {} ({} unique patients, \
292
+ {} prescription dates overlapping with hospital events)'
293
+ .format(len(df[df.IsRescueMedExac == 1]),
294
+ len(df[df.IsRescueMedExac == 1].StudyId.unique()),
295
+ len(df[(df.IsRescueMedExac == 1) & (df.IsHospExac == 1)])))
296
+ print('Hospital exacerbations in year prior to onboarding: {} ({} unique patients)'
297
+ .format(len(df[(df.IsHospExac == 1) &
298
+ (df.DateOfEvent < df.FirstSubmissionDate)]),
299
+ len(df[(df.IsHospExac == 1) &
300
+ (df.DateOfEvent < df.FirstSubmissionDate)].StudyId.unique())))
301
+ print('Hospital exacerbations post-OB: {} ({} unique patients)'
302
+ .format(len(df[(df.IsHospExac == 1) &
303
+ (df.DateOfEvent >= df.FirstSubmissionDate)]),
304
+ len(df[(df.IsHospExac == 1) &
305
+ (df.DateOfEvent >= df.FirstSubmissionDate)].StudyId.unique())))
306
+ print('Clinician verified community exacerbations post-OB: {} ({} unique patients)'
307
+ .format(len(df[df.IsCommExac_x == 1]),
308
+ len(df[df.IsCommExac_x == 1].StudyId.unique())))
309
+ print('Community exacerbations post-OB from weekly PROs: {} ({} unique patients)'
310
+ .format(len(df[df.IsCommExac_y == 1]),
311
+ len(df[df.IsCommExac_y == 1].StudyId.unique())))
312
+
313
+ print('Number of patient reported exacerbations with unknown dates: {} ({} overlapping\
314
+ with hospital events)'.format(df.ExacDateUnknown.sum(),
315
+ len(df[(df.IsHospExac == 1) & (df.ExacDateUnknown == 1)])))
316
+
317
+ # Check for any patient reported events with unknown dates that occurred on the same day
318
+ # as a hospital event. Hospital events are trusted so set the date to known
319
+ df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), 'ExacDateUnknown'] = 0
320
+ print('Remaining exacerbations with unknown dates: {}'.format(df.ExacDateUnknown.sum()))
321
+
322
+ df = df.drop(columns=['IsCommExac_x', 'IsCommExac_y', 'ExacDateUnknown_x',
323
+ 'ExacDateUnknown_y'])
324
+
325
+ ############################################################################
326
+ # Implement PRO LOGIC on hospital and patient reported exacerbation events
327
+ ############################################################################
328
+
329
+ # Define min and max days for PRO LOGIC. No predictions made or data used within
330
+ # logic_min_days after an exacerbation. Events falling between logic_min_days and
331
+ # logic_max_days after an event are subject to the weekly rescue med LOGIC criterion
332
+ logic_min_days = 14
333
+ logic_max_days = 35
334
+
335
+ # Calculate the days since last rescue med prescription
336
+ df = df.groupby('StudyId').apply(
337
+ lambda x: copd.calculate_days_since_last_event(
338
+ df=x, event_col='IsRescueMedExac',
339
+ output_col='DaysSinceLastRescueMeds')).reset_index(drop=True)
340
+
341
+ rescue_med_min_days = 7
342
+ print('Rescue med prescriptions occuring within {} days of a previous prescription: {}'
343
+ .format(rescue_med_min_days,
344
+ len(df[(df.DaysSinceLastRescueMeds > -1) &
345
+ (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
346
+ (df.IsRescueMedExac == 1)])))
347
+
348
+ # Reset IsExac to 0 for rescue med prescriptions within 7 days of a previous prescription
349
+ df.loc[(df.DaysSinceLastRescueMeds > -1) &
350
+ (df.DaysSinceLastRescueMeds <= rescue_med_min_days) &
351
+ (df.IsRescueMedExac == 1), 'IsExac'] = 0
352
+
353
+ # Calculate the days since the previous exacerbation for all patient days. Now includes
354
+ # events before patient onboarding
355
+ df = df.groupby('StudyId').apply(
356
+ lambda x: copd.calculate_days_since_last_event(
357
+ df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)
358
+
359
+ pre_onboarding_min_days = 14
360
+ print('Pre-onboarding exacerbations occuring within {} days of a previous exac: {}'
361
+ .format(pre_onboarding_min_days,
362
+ len(df[(df.IsExac == 1) &
363
+ (df.DaysSinceLastExac > -1) &
364
+ (df.DaysSinceLastExac <= pre_onboarding_min_days) &
365
+ (df.DateOfEvent < df.FirstSubmissionDate)])))
366
+
367
+ # Set IsExac to 0 for any pre-OB exacs within 14 days of a previous exac
368
+ df.loc[(df.DaysSinceLastExac > -1) & (df.DaysSinceLastExac <= pre_onboarding_min_days) &
369
+ (df.DateOfEvent < df.FirstSubmissionDate), 'IsExac'] = 0
370
+
371
+ # Recalculate DaysSinceLastExac to avoid counting exacs removed above
372
+ df = df.groupby('StudyId').apply(
373
+ lambda x: copd.calculate_days_since_last_event(
374
+ df=x, event_col='IsExac', output_col='DaysSinceLastExac')).reset_index(drop=True)
375
+
376
+ # Apply exclusion period following all exacerbations
377
+ df['RemoveRow'] = copd.minimum_period_between_exacerbations(
378
+ df, minimum_days=logic_min_days)
379
+ # Don't apply this criterion to pre-OB events (already accounted for above)
380
+ df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveRow'] = 0
381
+
382
+ print('Number of post-OB exacerbations excluded by PRO LOGIC {} day criterion: {}'.format(
383
+ logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1) &
384
+ (df.DateOfEvent >= df.FirstSubmissionDate)])))
385
+
386
+ # Apply criterion for negative weekly Q5 responses - doesn't capture anything post Q5
387
+ # change
388
+ consecutive_replies = 2
389
+ df = copd.apply_logic_response_criterion(df,
390
+ minimum_period=logic_min_days,
391
+ maximum_period=logic_max_days,
392
+ N=consecutive_replies)
393
+
394
+ print('Weekly rescue med (Q5) criterion applied to events occurring between {} and {} \
395
+ days after a previous event. {} consecutive negative replies required for the event to \
396
+ count as a new event'.format(logic_min_days, logic_max_days, consecutive_replies))
397
+ # Don't apply this criterion to pre-OB events (already accounted for above)
398
+ df.loc[(df.DateOfEvent < df.FirstSubmissionDate), 'RemoveExac'] = 0
399
+
400
+ print('Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}'.format(
401
+ df.RemoveExac.sum()))
402
+ print('Earliest and latest exacerbations excluded: {}, {}'.format(
403
+ df[df.RemoveExac == 1].DateOfEvent.min(), df[df.RemoveExac == 1].DateOfEvent.max()))
404
+
405
+ print('Remaining post-OB exacerbations: {}'.format(
406
+ len(df[(df.IsExac == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1) &
407
+ (df.DateOfEvent >= df.FirstSubmissionDate)])))
408
+
409
+ print('Remaining exacerbations with unknown dates: {}'.format(
410
+ len(df[(df.ExacDateUnknown == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)])))
411
+
412
+ # Remove data between segments of prolonged events, count only first occurrence
413
+ df = copd.remove_data_between_exacerbations(df)
414
+
415
+ # Remove 7 days before each reported exacerbation within unknown date (meds in last week)
416
+ df = copd.remove_unknown_date_exacerbations(df)
417
+
418
+ # New df with unwanted rows removed for events breakdown. Don't drop rows before setting
419
+ # the prediction window in case of events that occur immediately after the exclusion
420
+ # period (prediction window is set on df index rather than dates so want full daily df)
421
+ df_counts = df[(df.RemoveRow != 1) & (df.DateOfEvent >= df.FirstSubmissionDate)].copy()
422
+
423
+ print('Final number of exacerbations: {}'.format(df_counts.IsExac.sum()))
424
+ exac_patients = pd.Series(df_counts[df_counts.IsExac == 1].StudyId.unique())
425
+ print('Number of unique exacerbation patients: {} ({} RC and {} SU)'.format(
426
+ len(exac_patients), exac_patients.str.startswith('RC').sum(),
427
+ exac_patients.str.startswith('SU').sum()))
428
+ print('Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping'
429
+ .format(df_counts.IsHospExac.sum(), df_counts.IsCommExac.sum(),
430
+ len(df_counts.loc[
431
+ (df_counts.IsCommExac == 1) & (df_counts.IsHospExac == 1)])))
432
+
433
+ #################################################################
434
+ # Set the prediction window to N days and remove unwanted rows
435
+ # Calculate rolling exac counts before removing pre-OB events
436
+ #################################################################
437
+ # Create column of exacerbations to use for rolling counts
438
+ df['ExacsToKeep'] = np.where((df.RemoveRow != 1) & (df.RemoveExac != 1), df.IsExac, 0)
439
+
440
+ # Calculate rolling 365 day sums of exacerbations and respiratory admissions
441
+ df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent', col='ExacsToKeep',
442
+ id_col='StudyId', window=365,
443
+ output_col='ExacsPrevYear')
444
+ df = copd.rolling_sum_previous_period(df=df, date_col='DateOfEvent',
445
+ col='IsHospAdmission', id_col='StudyId', window=365,
446
+ output_col='AdmissionsPrevYear')
447
+
448
+ # Filter for data in the training data window (first submission date onwards)
449
+ df = df[(df.DateOfEvent >= df.FirstSubmissionDate) & (df.RemoveRow != 1)]
450
+
451
+ print('Setting {} day prediction window'.format(N))
452
+ df = copd.set_prediction_window(df=df, prediction_window=N)
453
+
454
+ print('Full data set now contains {} exacerbation days out of {} ({:.1f}%)'.format(
455
+ df.IsExac.value_counts()[1], len(df),
456
+ 100 * df.IsExac.value_counts(normalize=True)[1]))
457
+
458
+ ################
459
+ # Save data
460
+ ################
461
+ df = df[['PatientId', 'StudyId', 'DateOfBirth', 'Sex',
462
+ 'DateOfEvent', 'IsExac', 'DaysSinceLastExac', 'FirstSubmissionDate',
463
+ 'LatestPredictionDate', 'ExacsPrevYear', 'AdmissionsPrevYear']]
464
+
465
+ df.to_pickle(os.path.join(data_dir, 'exac_data.pkl'))
466
+ patient_details.to_pickle(os.path.join(data_dir, 'patient_details.pkl'))
training/fitbit_exploration.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copd
2
+ import os
3
+ import pandas as pd
4
+ from scipy.stats import ks_2samp, cramervonmises_2samp
5
+ import seaborn as sns
6
+ import matplotlib.pyplot as plt
7
+ sns.set(style='darkgrid', context='talk')
8
+ sns.set_palette('dark')
9
+ muted = sns.palettes.color_palette(palette='muted')
10
+ dark = sns.palettes.color_palette(palette='dark')
11
+
12
+ data_dir = '<YOUR_DATA_PATH>/lenus-samples-dataset'
13
+
14
+ # Load platform data
15
+ # DataServerDatasetSample.txt contains columns: 'Id', 'CategoryValue', 'ClientAssignedId',
16
+ # 'ClientId', 'CreationDate', 'CreatorSubject', 'DiscriminatedTypeIdentifier', 'EndDate',
17
+ # 'QuantityId', 'SampleId', 'SampleTypeDiscriminator', 'StartDate', 'Subject',
18
+ # 'TypeIdentifier'
19
+ # 'QuantityId' is a unique identifier to link one platform measurement (steps, HR etc)
20
+ # 'CreatorSubject' refers to the patient. Links to 'Id' in DataServerDatasetQuantity
21
+ lenus_sample = pd.read_csv(os.path.join(data_dir, "DataServerDatasetSample.txt"),
22
+ delimiter="|", usecols=['StartDate', 'EndDate',
23
+ 'CreatorSubject', 'QuantityId',
24
+ 'TypeIdentifier', 'CreationDate'])
25
+
26
+ # Convert datetime columns (strings) to datetime objects (in UTC)
27
+ # Not using a pandas apply to all columns here because it's very slow
28
+ date_cols = ['StartDate', 'EndDate', 'CreationDate']
29
+ for col in date_cols:
30
+ lenus_sample[col] = pd.to_datetime(lenus_sample[col], utc=True).dt.normalize()
31
+
32
+ # DataServerDatasetQuantity.txt contains columns: 'Id', 'Unit', 'value'
33
+ # 'Id' links to 'QuantityId' in DataServerDatasetSample
34
+ lenus_quantity = pd.read_csv(os.path.join(data_dir, "DataServerDatasetQuantity.txt"),
35
+ delimiter="|")
36
+
37
+ # Merge platform data on measurement id
38
+ platform_data = lenus_sample.merge(lenus_quantity, left_on='QuantityId',
39
+ right_on='Id').drop(columns=['Id'])
40
+
41
+ # Apply lookups to units and measurement types
42
+ platform_data['Units'] = copd.unit_lookup(platform_data['Unit'])
43
+ type_lookup = pd.read_csv('./lookups/type_lookup.txt')
44
+ platform_data = platform_data.merge(type_lookup, left_on='TypeIdentifier',
45
+ right_on=type_lookup.index)
46
+
47
+ # Drop unwanted columns
48
+ platform_data = platform_data.drop(columns=['TypeIdentifier', 'Unit'])
49
+
50
+ # Pivot the platform data to obtain columns for each measurement type
51
+ platform_data = pd.pivot_table(platform_data, values='Value',
52
+ index=['StartDate', 'EndDate', 'CreationDate',
53
+ 'CreatorSubject'],
54
+ columns=['Description']).reset_index()
55
+
56
+ data = pd.read_pickle(os.path.join('<YOUR_DATA_PATH>/copd-dataset', 'exac_data.pkl'))
57
+ patients = data.LenusId.unique()
58
+
59
+
60
def filter_on_date_and_id(df, min_date, patients):
    """Keep rows created on or after *min_date* whose CreatorSubject is in *patients*."""
    recent_enough = df.CreationDate >= min_date
    known_patient = df.CreatorSubject.isin(patients)
    return df[recent_enough & known_patient]
62
+
63
+
64
def resample_and_merge_median(df, fitbit):
    """Resample *fitbit* readings to one median value per patient-day and merge onto *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Patient-day records with 'LenusId' and (day-normalized) 'DateOfEvent' columns.
    fitbit : pandas.DataFrame
        Raw readings with 'CreatorSubject', 'CreationDate' and measurement column(s).

    Returns
    -------
    pandas.DataFrame
        Inner join: only rows of *df* that have a Fitbit measurement on the same day.
    """
    # One value per patient per day: the median of that day's readings.
    # (Commented-out dead code removed; 'CreationDate' is day-normalized upstream.)
    daily = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
        '1d').median().dropna().reset_index()
    data = df.merge(daily, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'CreationDate'], how='inner')
    return data
72
+
73
+
74
+ def resample_and_merge_last(df, fitbit):
75
+ fitbit['DateOfEvent'] = fitbit['CreationDate']
76
+ # Resample for one value per day
77
+ fitbit = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
78
+ '1d').last().dropna().reset_index(drop=True)
79
+ data = df.merge(fitbit, left_on=['LenusId', 'DateOfEvent'],
80
+ right_on=['CreatorSubject', 'DateOfEvent'], how='inner')
81
+ return data
82
+
83
+
84
def print_numbers(df, measurement):
    """Print patient-day and exacerbation counts for a merged Fitbit dataset."""
    cohort_ids = pd.Series(df.StudyId.unique())
    n_days = len(df)
    n_patients = len(df.PatientId.unique())
    print('{} patient days with {} data across {} unique patients ({} RC and {} SU)'.
          format(n_days, measurement, n_patients,
                 cohort_ids.str.startswith('RC').sum(),
                 cohort_ids.str.startswith('SU').sum()))
    exac_rows = df[df.IsExac == 1]
    exac_ids = pd.Series(exac_rows.StudyId.unique())
    print('{} exacerbations across {} patients ({} RC and {} SU)'.format(
        df.IsExac.sum(), len(exac_rows.PatientId.unique()),
        exac_ids.str.startswith('RC').sum(),
        exac_ids.str.startswith('SU').sum()))
95
+
96
+
97
+ # Select heart rate data from all platform data
98
+ heart_rate = platform_data[platform_data['heart rate'].notna()][
99
+ ['CreationDate', 'CreatorSubject', 'heart rate']]
100
+
101
+ # Filter for patients and dates of interest
102
+ heart_rate = filter_on_date_and_id(heart_rate, min_date='2010-01-01', patients=patients)
103
+ heart_rate.columns
104
+
105
+ hr_data = resample_and_merge_last(df=data, fitbit=heart_rate)
106
+ print_numbers(hr_data, 'HR')
107
+
108
+ steps = platform_data[platform_data['number of steps taken;'].notna()][[
109
+ 'CreationDate', 'CreatorSubject', 'number of steps taken;']]
110
+ # Filter for patients and dates of interest
111
+ steps = filter_on_date_and_id(steps, min_date='2010-01-01', patients=patients)
112
+ steps_data = resample_and_merge_median(df=data, fitbit=steps)
113
+
114
+ print_numbers(steps_data, 'steps')
115
+
116
+ hr_exac_patients = hr_data[hr_data.IsExac == 1]['PatientId'].unique()
117
+ hr_data = hr_data[hr_data.PatientId.isin(hr_exac_patients)]
118
+
119
+ hr_exac = hr_data[hr_data.IsExac == 1]['heart rate']
120
+ hr_no_exac = hr_data[hr_data.IsExac == 0]['heart rate']
121
+
122
+ ks_2samp(hr_exac, hr_no_exac)
123
+ cramervonmises_2samp(hr_exac, hr_no_exac)
124
+
125
+ steps_exac_patients = steps_data[steps_data.IsExac == 1]['PatientId'].unique()
126
+ steps_data = steps_data[steps_data.PatientId.isin(steps_exac_patients)]
127
+
128
+ steps_exac = steps_data[steps_data.IsExac == 1]['number of steps taken;']
129
+ steps_no_exac = steps_data[steps_data.IsExac == 0]['number of steps taken;']
130
+
131
+ ks_2samp(steps_exac, steps_no_exac)
132
+ cramervonmises_2samp(steps_exac, steps_no_exac)
133
+
134
+ fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
135
+ constrained_layout=True, figsize=(8, 6))
136
+ sns.histplot(hr_data[hr_data.IsExac == 0], x="heart rate", binwidth=5, binrange=[50, 100],
137
+ alpha=.6, stat="density", legend=True, ax=axes[0], color=dark[0])
138
+ axes[0].set_xlabel(None)
139
+ plt.legend(['a'])
140
+ sns.histplot(hr_data[hr_data.IsExac == 1], x="heart rate", binwidth=5, binrange=[50, 100],
141
+ alpha=.6, stat="density", legend=True, ax=axes[1], color=dark[1])
142
+ axes[1].set_xlabel(None)
143
+ fig.supxlabel('heart rate')
144
+ plt.legend(['b'])
training/lookups/README.MD ADDED
@@ -0,0 +1 @@
 
 
1
+ .
training/lookups/type_lookup.txt ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Description
2
+ "body mass index";
3
+ "body fat percentage";
4
+ "height";
5
+ "weight (body mass)";
6
+ "lean body mass";
7
+ "waist circumference";
8
+ "number of steps taken";
9
+ "distance moved walking by walking or running";
10
+ "distance moved by cycling";
11
+ "distance moved using a wheelchair";
12
+ "resting energy used";
13
+ "active energy used";
14
+ "number of flights of stairs climbed";
15
+ "NikeFuel point earned";
16
+ "amount of time moved at an an average intensity of a brisk walk or greater";
17
+ "number of pushes performed while using a wheelchair";
18
+ "distance moved by swimming";
19
+ "number of strokes taken by swimming";
20
+ "vO2 max"
21
+ "heart rate"
22
+ "body temperature"
23
+ "body temperature during rest"
24
+ "systolic blood pressure"
25
+ "diastolic blood pressure"
26
+ "respiratory rate"
27
+ "heart rate at rest"
28
+ "average heart rate during walking"
29
+ "standard deviation of heartbeat intervals"
30
+ "oxygen saturation"
31
+ "peripheral perfusion index"
32
+ "blood glucose level"
33
+ "number of times fallen"
34
+ "electrodermal activity"
35
+ "number of puffs the user takes from their inhaler"
36
+ "amount of insulin delivered"
37
+ "blood alcohol content"
38
+ "amount of air that can be forcibly exhaled from the lungs after taking the deepest breath possible"
39
+ "amount of air that can be forcibly exhaled from the lungs during the first second of a forced exhalation"
40
+ "maximum flow rate generated during a forceful exhalation"
41
+ "total amount of fat consumed"
42
+ "amount of polyunsaturated fat consumed"
43
+ "amount of monounsaturated fat consumed"
44
+ "amount of saturated fat consumed"
45
+ "amount of cholesterol consumed"
46
+ "amount of sodium consumed"
47
+ "amount of carbohydrates consumed"
48
+ "amount of fiber consumed"
49
+ "amount of sugar consumed"
50
+ "amount of energy consumed"
51
+ "amount of protein consumed"
52
+ "amount of vitamin A consumed"
53
+ "amount of vitamin B6 consumed"
54
+ "amount of vitamin B12 consumed"
55
+ "amount of vitamin C consumed"
56
+ "amount of vitamin D consumed"
57
+ "amount of vitamin E consumed"
58
+ "amount of vitamin K consumed"
59
+ "amount of calcium consumed"
60
+ "amount of iron consumed"
61
+ "amount of thiamin consumed"
62
+ "amount of riboflavin consumed"
63
+ "amount of niacin consumed"
64
+ "amount of folate consumed"
65
+ "amount of biotin consumed"
66
+ "amount of pantothenic acid consumed"
67
+ "amount of phosphorus consumed"
68
+ "amount of iodine consumed"
69
+ "amount of magnesium consumed"
70
+ "amount of zinc consumed"
71
+ "amount of selenium consumed"
72
+ "amount of copper consumed"
73
+ "amount of manganese consumed"
74
+ "amount of chromium consumed"
75
+ "amount of molybdenum consumed"
76
+ "amount of chloride consumed"
77
+ "amount of potassium consumed"
78
+ "amount of caffeine consumed"
79
+ "amount of water consumed"
80
+ "exposure to UV radiation"
81
+ "tgt ipap value 50"
82
+ "tgt ipap value 95"
83
+ "tgt ipap maximum value"
84
+ "tgt epap value 50"
85
+ "tgt epap value 95"
86
+ "tgt epap maximum value"
87
+ "leak value 50"
88
+ "leak value 95"
89
+ "leak maximum value"
90
+ "resp rate value 50"
91
+ "resp rate value 95"
92
+ "resp rate maximum value"
93
+ "ie ratio value 50"
94
+ "ie ratio value 95"
95
+ "ie ratio maximum value"
96
+ "minute vent value 50"
97
+ "minute vent value 95"
98
+ "minute vent maximum value"
99
+ "tidal vol value 50"
100
+ "tidal vol value 95"
101
+ "tidal vol maximum value"
102
+ "alveolar ventilation value 50"
103
+ "alveolar ventilation value 95"
104
+ "alveolar ventilation maximum value"
105
+ "spo2 minimum value"
106
+ "spo2 value 50"
107
+ "spo2 value 95"
108
+ "spo2 minutes below 88 Percent"
109
+ "spo2 seconds below dynamic threshold"
110
+ "spont trigg breaths"
111
+ "spont cycled breaths"
112
+ "resp events AHI"
113
+ "resp events AI"
114
+ "resp events HI"
115
+ "resp events ODI"
116
+ "amb humidity"
training/prepare_test_data.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prepare final test set for modelling (K fold encoded, scaled and imputed)."""
2
+ import copd
3
+ import json
4
+ import joblib
5
+ from lenusml import encoding
6
+ import os
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ data_dir = '<YOUR_DATA_PATH>/test_data/'
11
+ cohort_info_dir = '../data/cohort_info/'
12
+ output_data_dir = '../data/models/model1'
13
+ artifact_dir = os.path.join(output_data_dir, 'artifacts')
14
+
15
+ data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl'))
16
+
17
+ ###############################################
18
+ # Map the True/False cols to integers
19
+ ###############################################
20
+ bool_mapping = {True: 1, False: 0}
21
+ data['RequiredAcuteNIV'] = data.RequiredAcuteNIV.replace(bool_mapping)
22
+ data['RequiredICUAdmission'] = data.RequiredICUAdmission.replace(bool_mapping)
23
+
24
+ # Map the M and F sex column to binary (1=F)
25
+ sex_mapping = {'F': 1, 'M': 0}
26
+ data['Sex_F'] = data.Sex.map(sex_mapping)
27
+ data = data.drop(columns=['Sex'])
28
+
29
+ ##############################################################
30
+ # Read daily PRO responses, calculate aggregations and merge
31
+ ##############################################################
32
+ cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
33
+ delimiter="|")
34
+
35
+ symptom_diary = pd.read_csv(
36
+ os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
37
+ usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
38
+ 'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
39
+ delimiter="|")
40
+
41
+ cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
42
+ utc=True).dt.normalize()
43
+ symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
44
+ utc=True).dt.normalize()
45
+
46
+
47
+ # Filter for test patients
48
+ cat = cat[cat.PatientId.isin(data.PatientId)]
49
+ symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]
50
+
51
+ # Merge daily PROs accounting for days where patients answered the same PRO more than once
52
+ # per day
53
+ daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
54
+ symptom_diary.drop_duplicates(subset=['PatientId',
55
+ 'SubmissionTime']),
56
+ on=['PatientId', 'SubmissionTime'], how='inner')
57
+
58
+ # Calculate rolling mean on previous days for numeric PROs
59
+ numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
60
+ 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']
61
+
62
+ mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
63
+ date_col='SubmissionTime',
64
+ id_col='StudyId', window=3)
65
+
66
+ # Merge the averaged PROs with the original responses and calculate differences
67
+ daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')
68
+
69
+ daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)
70
+
71
+ # Remove the rolling average columns
72
+ daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]
73
+
74
+ # Merge PROs with full test data
75
+ test_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
76
+ columns=['StudyId']).sort_values(by='SubmissionTime'),
77
+ left_on='DateOfEvent', right_on='SubmissionTime',
78
+ by='PatientId', direction='backward')
79
+
80
+ ################################################
81
+ # Include comorbidities from Lenus service
82
+ ################################################
83
+ comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
84
+ delimiter='|')
85
+ comorbidities = comorbidities.drop(columns=['Id', 'Created'])
86
+ # Get list of comorbidities captured in the service
87
+ comorbidity_list = list(comorbidities.columns)
88
+ comorbidity_list.remove('PatientId')
89
+
90
+ # Filter for test patients
91
+ comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
92
+ print('Test patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
93
+ len(comorbidities), len(data.PatientId.unique())))
94
+ comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
95
+ bool_mapping).fillna(0)
96
+ print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())
97
+
98
+ # Merge with test data, infill nans and get counts
99
+ test_data = test_data.merge(comorbidities, on='PatientId', how='left')
100
+ print('Comorbidity counts after merging with patient days:', '\n',
101
+ test_data[comorbidity_list].sum())
102
+ test_data[comorbidity_list] = test_data[comorbidity_list].fillna(0)
103
+
104
+ # Get comorb counts for each patient
105
+ test_data['Comorbidities'] = test_data[comorbidity_list].sum(axis=1)
106
+ comorb_counts = test_data.groupby('StudyId')['Comorbidities'].max().reset_index()
107
+ print('Patient comorbidity counts after infilling missing values: \n',
108
+ comorb_counts.value_counts())
109
+
110
+ # Drop comorbidities columns from test data but retain AsthmaOverlap
111
+ comorbidity_list.remove('AsthmaOverlap')
112
+ test_data = test_data.drop(columns=comorbidity_list)
113
+
114
+ ###############################################################
115
+ # Include inhaler type from Lenus service
116
+ ###############################################################
117
+ # Load inhaler data
118
+ inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
119
+ delimiter='|', usecols=['StudyId', 'InhalerType'])
120
+ # Filter for train patients
121
+ inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
122
+ # Create new feature for triple therapy ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS')
123
+ inhaler_type = copd.triple_inhaler_therapy_service(
124
+ df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)
125
+
126
+ print('Patients taking triple inhaler therapy: ', '\n',
127
+ inhaler_type.TripleTherapy.value_counts())
128
+ test_data = test_data.merge(inhaler_type, on='StudyId', how='left')
129
+
130
+ #####################################
131
+ # Map some categorical features
132
+ #####################################
133
+
134
+ # Replace SDQ8 with strings for phlegm difficulty and infill as None where no phlegm
135
+ # reported in CAT
136
+ test_data['SymptomDiaryQ8'] = test_data.SymptomDiaryQ8.replace(
137
+ {1: 'Not difficult', 2: 'A little difficult', 3: 'Quite difficult',
138
+ 4: 'Very difficult', np.nan: 'None'})
139
+
140
+ # Replace SDQ9 with strings for phlegm consistency and infill as None where no phlegm
141
+ # reported in CAT
142
+ test_data['SymptomDiaryQ9'] = test_data.SymptomDiaryQ9.replace(
143
+ {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid', np.nan: 'None'})
144
+
145
+ # Replace SDQ10 with strings for phlegm colour and infill as None where no phlegm
146
+ # reported in CAT
147
+ test_data['SymptomDiaryQ10'] = test_data.SymptomDiaryQ10.replace(
148
+ {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green', np.nan: 'None'})
149
+
150
+ # Replace smoking status with strings
151
+ test_data['SmokingStatus'] = test_data.SmokingStatus.replace(
152
+ {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})
153
+
154
+ test_data['InExacWindow'] = test_data.IsExac.replace({0: False, 1: True})
155
+
156
+ #####################################################
157
+ # Calculate DaysSinceCAT and filter data if required
158
+ #####################################################
159
+
160
+ test_data['DaysSinceCAT'] = (test_data.DateOfEvent -
161
+ test_data.SubmissionTime).dt.days.astype('int')
162
+
163
+ DaysSinceCAT_cutoff = 14
164
+ test_data = test_data[test_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
165
+
166
+ #####################################
167
+ # Bin some numeric features
168
+ #####################################
169
+
170
+ # Bin days since last exacerbation
171
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
172
+ exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
173
+
174
+ test_data['DaysSinceLastExac'] = copd.bin_numeric_column(
175
+ col=test_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)
176
+
177
+ # Bin patient age
178
+ age_bins = [0, 50, 60, 70, 80, np.inf]
179
+ age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
180
+
181
+ test_data['Age'] = copd.bin_numeric_column(
182
+ col=test_data['Age'], bins=age_bins, labels=age_labels)
183
+
184
+ # Bin number of comorbidities
185
+ comorb_bins = [0, 1, 3, np.inf]
186
+ comorb_labels = ['None', '1-2', '3+']
187
+ test_data['Comorbidities'] = copd.bin_numeric_column(
188
+ col=test_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
189
+
190
+ comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
191
+ col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
192
+
193
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

test_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=test_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

test_data = test_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
test_data.loc[
    test_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged like the other checks.
print('FEV1PercentPredicted counts:', '\n',
      test_data['FEV1PercentPredicted'].value_counts())
206
+
207
##################################
# Service eosinophils feature
##################################
# Binary flag: highest recorded eosinophil count at or above 0.3.
test_data['HighestEosinophilCount_0_3'] = np.where(
    test_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
test_data = test_data.drop(columns=['LabsHighestEosinophilCount'])

# Target encode categorical data
categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                       'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
                       'FEV1PercentPredicted']

test_data[categorical_columns] = test_data[categorical_columns].astype("str")

# Encode test set based on entire train set.
# Fix: use a context manager so the JSON file handle is closed
# deterministically (the original open() handle was never closed).
with open(os.path.join(artifact_dir, "target_encodings.json")) as f:
    target_encodings = json.load(f)

data_encoded = encoding.apply_target_encodings(data=test_data, encodings=target_encodings,
                                               cols_to_encode=categorical_columns)
227
+
228
+ ###################################
229
+ # Scale data
230
+ ###################################
231
+ data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
232
+ 'DateOfEvent', 'SubmissionTime',
233
+ 'FirstSubmissionDate', 'LatestPredictionDate'])
234
+
235
+
236
+ scaler = joblib.load(os.path.join(artifact_dir, 'scaler.pkl'))
237
+
238
+ # Scale data ignoring patient ID and target
239
+ test_data_scaled = scaler.transform(
240
+ data_encoded.drop(columns=['StudyId', 'IsExac']))
241
+
242
+ # Place scaled results back into dataframe and add back the patient ID and cohort columns
243
+ test_data_scaled = pd.DataFrame(test_data_scaled, columns=data_encoded.drop(
244
+ columns=['StudyId', 'IsExac']).columns)
245
+ test_data_scaled.insert(0, 'StudyId', data_encoded.StudyId.values)
246
+ test_data_scaled['IsExac'] = data_encoded.IsExac.values
247
+ print('Test data scaled')
248
+
249
###################################
# Infill missing data with median
###################################
# Median imputer fitted on the train set (loaded from the run's artifacts).
imputer = joblib.load(os.path.join(artifact_dir, 'imputer.pkl'))
# Fix: removed a stray bare `imputer` expression (REPL leftover with no
# effect in a script).

# Use scaled data, ignoring patient ID and target.
test_data_imputed = imputer.transform(test_data_scaled.drop(
    columns=['StudyId', 'IsExac']))

# Place imputed results back into dataframe and add back the patient ID and target columns
test_data_imputed = pd.DataFrame(test_data_imputed, columns=test_data_scaled.drop(
    columns=['StudyId', 'IsExac']).columns)
test_data_imputed.insert(0, 'StudyId', test_data_scaled.StudyId.values)
test_data_imputed['IsExac'] = test_data_scaled.IsExac.values
print('Test data imputed')
264
+
265
+ ########################################
266
+ # Save final data
267
+ #########################################
268
+
269
+ # test data
270
+ test_data_imputed.to_pickle(os.path.join(output_data_dir, 'test_data.pkl'))
271
+ print('Final test data saved')
training/prepare_train_data.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Prepare final train set (encoded, scaled and imputed) and save artifacts."""
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

import copd
from lenusml import encoding

# Input/output locations for this modelling run.
data_dir = '<YOUR_DATA_PATH>/train_data/'
cohort_info_dir = '../data/cohort_info/'
output_data_dir = '../data/models/model1'

# Patient-day level training table produced by the upstream pipeline.
data = pd.read_pickle(os.path.join(data_dir, 'train_data.pkl'))
17
+
18
###############################################
# Map the True/False cols to integers
###############################################
# Service flags arrive as booleans; model features need 0/1.
# bool_mapping is reused later when the comorbidity flags are converted.
bool_mapping = {True: 1, False: 0}
for flag_col in ['RequiredAcuteNIV', 'RequiredICUAdmission']:
    data[flag_col] = data[flag_col].replace(bool_mapping)

# Map the M and F sex column to binary (1=F) and drop the original column.
data['Sex_F'] = data.Sex.map({'F': 1, 'M': 0})
data = data.drop(columns=['Sex'])
29
+
30
##############################################################
# Read daily PRO responses, calculate aggregations and merge
##############################################################
# CAT questionnaire export (pipe-delimited); one row per submission.
cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
                  delimiter="|")

# Symptom-diary export; only the questions used downstream are loaded.
symptom_diary = pd.read_csv(
    os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
    usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
             'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
    delimiter="|")

# Normalize timestamps to UTC midnight so the two PROs join at day granularity.
cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
                                       utc=True).dt.normalize()
symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
                                                 utc=True).dt.normalize()


# Filter for train patients
cat = cat[cat.PatientId.isin(data.PatientId)]
symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]

# Merge daily PROs accounting for days where patients answered the same PRO more than once
# per day (drop_duplicates keeps the first submission of each day).
daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
                      symptom_diary.drop_duplicates(subset=['PatientId',
                                                            'SubmissionTime']),
                      on=['PatientId', 'SubmissionTime'], how='inner')

# Calculate rolling mean on previous days for numeric PROs
numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
                'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']

# NOTE(review): duplicates are dropped per PatientId but the rolling mean
# groups by StudyId — confirm the two identifiers map 1:1 per patient.
mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
                                              date_col='SubmissionTime',
                                              id_col='StudyId', window=3)

# Merge the averaged PROs with the original responses and calculate differences
daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')

daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Remove the rolling average columns (only the diff features are retained)
daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]

# Merge PROs with full train data: for each patient day take the most recent
# PRO submission on or before DateOfEvent (backward as-of join per patient).
train_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
    columns=['StudyId']).sort_values(by='SubmissionTime'),
    left_on='DateOfEvent', right_on='SubmissionTime',
    by='PatientId', direction='backward')
80
+
81
################################################
# Include comorbidities from Lenus service
################################################
# Service comorbidity table: one row per patient, one flag column per condition.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])
# Every column other than the patient identifier is a comorbidity flag.
comorbidity_list = [col for col in comorbidities.columns if col != 'PatientId']

# Keep only patients present in the train set.
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    len(comorbidities), len(data.PatientId.unique())))
# Convert the True/False flags to 0/1 and treat missing flags as absent.
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Merge with train data, infill nans and get counts.
train_data = train_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      train_data[comorbidity_list].sum())
train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)

# Total comorbidity count per row, and a per-patient summary (max over days).
train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()

# Drop the individual comorbidity columns but retain AsthmaOverlap as a feature.
comorbidity_list.remove('AsthmaOverlap')
train_data = train_data.drop(columns=comorbidity_list)
114
+
115
###############################################################
# Include inhaler type from Lenus service
###############################################################
# Usual-therapies export: the patient's inhaler regimen.
inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
                           delimiter='|', usecols=['StudyId', 'InhalerType'])
# Keep only patients present in the train set.
inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
# Derive a triple-therapy flag ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS').
inhaler_type = copd.triple_inhaler_therapy_service(
    df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type.TripleTherapy.value_counts())
train_data = train_data.merge(inhaler_type, on='StudyId', how='left')
130
+
131
#####################################
# Map some categorical features
#####################################

# Symptom-diary phlegm questions: map numeric codes to readable labels and
# infill as 'None' where no phlegm was reported in the CAT.
sd_label_maps = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult',
                       3: 'Quite difficult', 4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid',
                       4: 'Solid', np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green',
                        np.nan: 'None'},
}
for sd_col, label_map in sd_label_maps.items():
    train_data[sd_col] = train_data[sd_col].replace(label_map)

# Replace smoking status codes with strings.
train_data['SmokingStatus'] = train_data.SmokingStatus.replace(
    {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})

# Boolean view of the target, used for inspection/plotting.
train_data['InExacWindow'] = train_data.IsExac.replace({0: False, 1: True})
156
+
157
#####################################################
# Calculate DaysSinceCAT and filter data if required
#####################################################
# Days between the prediction day and the PRO submission matched to it.
# NOTE(review): assumes every row matched a PRO in the as-of join — a
# missing SubmissionTime would make astype('int') raise; confirm upstream.
days_since_cat = (train_data.DateOfEvent - train_data.SubmissionTime).dt.days
train_data['DaysSinceCAT'] = days_since_cat.astype('int')

# Discard rows whose most recent PRO is too stale to be informative.
DaysSinceCAT_cutoff = 14
train_data = train_data[train_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
166
+
167
#####################################
# Bin some numeric features
#####################################

# Bin days since last exacerbation (the -1..0 bin captures 'no previous one').
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
train_data['DaysSinceLastExac'] = copd.bin_numeric_column(
    col=train_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)

# Bin patient age into decades.
age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
train_data['Age'] = copd.bin_numeric_column(
    col=train_data['Age'], bins=age_bins, labels=age_labels)

# Bin number of comorbidities.
comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']
train_data['Comorbidities'] = copd.bin_numeric_column(
    col=train_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
# Binned per-patient counts retained for cohort summaries.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
193
+
194
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
train_data.loc[
    train_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged like the other checks.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
207
+
208
##################################
# Service eosinophils feature
##################################
# Binary flag: highest recorded eosinophil count at or above 0.3.
train_data['HighestEosinophilCount_0_3'] = np.where(
    train_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])

# Target encode categorical data.
categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                       'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
                       'FEV1PercentPredicted']
train_data[categorical_columns] = train_data[categorical_columns].astype("str")

# Get encodings from entire train set (to be retained for holdout test
# or new patients).
target_encodings = encoding.get_target_encodings(train_data=train_data,
                                                 cols_to_encode=categorical_columns,
                                                 target='IsExac')

# Encode entire train set with those encodings.
data_encoded = encoding.apply_target_encodings(data=train_data,
                                               encodings=target_encodings,
                                               cols_to_encode=categorical_columns)
232
+
233
###################################
# Scale data
###################################
# Identifier/date columns are not model features.
data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
                                          'DateOfEvent', 'SubmissionTime',
                                          'FirstSubmissionDate', 'LatestPredictionDate'])

# Feature columns exclude the patient ID and the target.
feature_columns = data_encoded.columns.drop(['StudyId', 'IsExac'])

# Fit the scaler on the features only, then rebuild a labelled dataframe.
scaler = MinMaxScaler()
train_data_scaled = pd.DataFrame(scaler.fit_transform(data_encoded[feature_columns]),
                                 columns=feature_columns)
# Re-attach the patient ID and target columns.
train_data_scaled.insert(0, 'StudyId', data_encoded.StudyId.values)
train_data_scaled['IsExac'] = data_encoded.IsExac.values
print('Train data scaled')
251
+
252
###################################
# Infill missing data with median
###################################
# Median imputation on the scaled features (patient ID and target excluded).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

imputed_columns = train_data_scaled.columns.drop(['StudyId', 'IsExac'])
train_data_imputed = pd.DataFrame(
    imputer.fit_transform(train_data_scaled[imputed_columns]),
    columns=imputed_columns)
# Re-attach the patient ID and target columns.
train_data_imputed.insert(0, 'StudyId', train_data_scaled.StudyId.values)
train_data_imputed['IsExac'] = train_data_scaled.IsExac.values
print('Train data imputed')
267
+
268
############################################
# Save encodings, imputer and scaler
############################################
artifact_dir = os.path.join(output_data_dir, 'artifacts')
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between runs.
# Fix: only unlink regular files — os.remove raises on subdirectories.
for entry in os.listdir(artifact_dir):
    entry_path = os.path.join(artifact_dir, entry)
    if os.path.isfile(entry_path):
        os.remove(entry_path)


# Encodings. Fix: use a context manager so the file handle is flushed and
# closed deterministically (the original open() handle was never closed).
with open(os.path.join(artifact_dir, 'target_encodings.json'), 'w') as f:
    json.dump(target_encodings, f)
# Scaler
joblib.dump(scaler, os.path.join(artifact_dir, 'scaler.pkl'))
print('Minmax scaler saved')

# Imputer
joblib.dump(imputer, os.path.join(artifact_dir, 'imputer.pkl'))
print('Median imputer saved')

########################################
# Save final data
#########################################

# Train data
train_data_imputed.to_pickle(os.path.join(output_data_dir, 'train_data.pkl'))
print('Final train data saved')
training/prepare_train_data_crossval.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prepare final train set for cross-validation (K fold encoded, scaled and imputed)."""
2
+ import copd
3
+ # import matplotlib.pyplot as plt
4
+ from lenusml import crossvalidation
5
+ import os
6
+ import pandas as pd
7
+ import numpy as np
8
+ import seaborn as sns
9
+ from sklearn.preprocessing import MinMaxScaler
10
+ from sklearn.impute import SimpleImputer
11
+
12
+ sns.set(style='darkgrid', context='talk')
13
+ sns.set_palette('dark')
14
+ muted = sns.palettes.color_palette(palette='muted')
15
+ dark = sns.palettes.color_palette(palette='dark')
16
+
17
+ data_dir = '<YOUR_DATA_PATH>/train_data/'
18
+ cohort_info_dir = '../data/cohort_info/'
19
+ output_data_dir = '../data/models/model1'
20
+
21
+ fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
22
+ allow_pickle=True)
23
+
24
+ data = pd.read_pickle(os.path.join(data_dir, 'train_data.pkl'))
25
+
26
+ exacs = data[data.IsExac == 1]
27
+ exac_patients = exacs.StudyId.unique()
28
+ # non_exac_patients = np.setdiff1d(data.StudyId, exac_patients)
29
+ # len(non_exac_patients)
30
+ # exac_counts = exacs.groupby('StudyId')['IsExac'].count().reset_index()
31
+ # exac_counts = pd.concat([exac_counts,
32
+ # pd.DataFrame({'StudyId': non_exac_patients,
33
+ # 'IsExac': len(non_exac_patients)*[0]})])
34
+ #
35
+ # exac_counts = exac_counts.merge(data[['StudyId', 'Sex', 'SmokingStatus',
36
+ # 'RequiredAcuteNIV', 'RequiredICUAdmission']],
37
+ # on='StudyId', how='left')
38
+
39
+ ###############################################
40
+ # Map the True/False cols to integers
41
+ ###############################################
42
+ bool_mapping = {True: 1, False: 0}
43
+ data['RequiredAcuteNIV'] = data.RequiredAcuteNIV.replace(bool_mapping)
44
+ data['RequiredICUAdmission'] = data.RequiredICUAdmission.replace(bool_mapping)
45
+
46
+ # Map the M and F sex column to binary (1=F)
47
+ sex_mapping = {'F': 1, 'M': 0}
48
+ data['Sex_F'] = data.Sex.map(sex_mapping)
49
+ data = data.drop(columns=['Sex'])
50
+
51
+ ##############################################################
52
+ # Read daily PRO responses, calculate aggregations and merge
53
+ ##############################################################
54
+ cat = pd.read_csv(os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProCat.txt'),
55
+ delimiter="|")
56
+
57
+ symptom_diary = pd.read_csv(
58
+ os.path.join('<YOUR_DATA_PATH>/copd-dataset/', 'CopdDatasetProSymptomDiary.txt'),
59
+ usecols=['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
60
+ 'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'],
61
+ delimiter="|")
62
+
63
+ cat['SubmissionTime'] = pd.to_datetime(cat.SubmissionTime,
64
+ utc=True).dt.normalize()
65
+ symptom_diary['SubmissionTime'] = pd.to_datetime(symptom_diary.SubmissionTime,
66
+ utc=True).dt.normalize()
67
+
68
+
69
+ # Filter for train patients
70
+ cat = cat[cat.PatientId.isin(data.PatientId)]
71
+ symptom_diary = symptom_diary[symptom_diary.PatientId.isin(data.PatientId)]
72
+
73
+ # Merge daily PROs accounting for days where patients answered the same PRO more than once
74
+ # per day
75
+ daily_pros = pd.merge(cat.drop_duplicates(subset=['PatientId', 'SubmissionTime']),
76
+ symptom_diary.drop_duplicates(subset=['PatientId',
77
+ 'SubmissionTime']),
78
+ on=['PatientId', 'SubmissionTime'], how='inner')
79
+
80
+ # Calculate rolling mean on previous days for numeric PROs
81
+ numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
82
+ 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']
83
+
84
+ mean_pros = copd.rolling_mean_previous_period(df=daily_pros, cols=numeric_pros,
85
+ date_col='SubmissionTime',
86
+ id_col='StudyId', window=3)
87
+
88
+ # Merge the averaged PROs with the original responses and calculate differences
89
+ daily_pros = daily_pros.merge(mean_pros, on=['StudyId', 'SubmissionTime'], how='left')
90
+
91
+ daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)
92
+
93
+ # Remove the rolling average columns
94
+ daily_pros = daily_pros.loc[:, ~daily_pros.columns.str.endswith('_ave')]
95
+
96
+ # Merge PROs with full train data
97
+ train_data = pd.merge_asof(data.sort_values(by='DateOfEvent'), daily_pros.drop(
98
+ columns=['StudyId']).sort_values(by='SubmissionTime'),
99
+ left_on='DateOfEvent', right_on='SubmissionTime',
100
+ by='PatientId', direction='backward')
101
+
102
+ ################################################
103
+ # Include comorbidities from Lenus service
104
+ ################################################
105
+ comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
106
+ delimiter='|')
107
+ comorbidities = comorbidities.drop(columns=['Id', 'Created'])
108
+ # Get list of comorbidities captured in the service
109
+ comorbidity_list = list(comorbidities.columns)
110
+ comorbidity_list.remove('PatientId')
111
+
112
+ # Filter for train patients
113
+ comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)]
114
+ print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
115
+ len(comorbidities), len(data.PatientId.unique())))
116
+ comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
117
+ bool_mapping).fillna(0)
118
+ print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())
119
+
120
+ # Merge with train data, infill nans and get counts
121
+ train_data = train_data.merge(comorbidities, on='PatientId', how='left')
122
+ print('Comorbidity counts after merging with patient days:', '\n',
123
+ train_data[comorbidity_list].sum())
124
+ train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)
125
+
126
+ # Get comorb counts for each patient
127
+ train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
128
+ comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()
129
+ # print('Patient comorbidity counts after infilling missing values: \n',
130
+ # comorb_counts.value_counts())
131
+
132
+ comorb_counts.loc[comorb_counts.StudyId.isin(exac_patients), 'IsExacPatient'] = 1
133
+ comorb_counts['IsExacPatient'] = comorb_counts['IsExacPatient'].fillna(0)
134
+
135
+ # Drop comorbidities columns from train data but retain AsthmaOverlap
136
+ comorbidity_list.remove('AsthmaOverlap')
137
+ train_data = train_data.drop(columns=comorbidity_list)
138
+
139
+ ###############################################################
140
+ # Include inhaler type from Lenus service
141
+ ###############################################################
142
+ # Load inhaler data
143
+ inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
144
+ delimiter='|', usecols=['StudyId', 'InhalerType'])
145
+ # Filter for train patients
146
+ inhaler_type = inhaler_type[inhaler_type.StudyId.isin(data.StudyId)]
147
+ # Create new feature for triple therapy ('LABA-LAMA-ICS' or 'LAMA +LABA-ICS')
148
+ inhaler_type = copd.triple_inhaler_therapy_service(
149
+ df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)
150
+
151
+ print('Patients taking triple inhaler therapy: ', '\n',
152
+ inhaler_type.TripleTherapy.value_counts())
153
+ train_data = train_data.merge(inhaler_type, on='StudyId', how='left')
154
+
155
+ #####################################
156
+ # Map some categorical features
157
+ #####################################
158
+
159
+ # Replace SDQ8 with strings for phlegm difficulty and infill as None where no phlegm
160
+ # reported in CAT
161
+ train_data['SymptomDiaryQ8'] = train_data.SymptomDiaryQ8.replace(
162
+ {1: 'Not difficult', 2: 'A little difficult', 3: 'Quite difficult',
163
+ 4: 'Very difficult', np.nan: 'None'})
164
+
165
+ # Replace SDQ9 with strings for phlegm consistency and infill as None where no phlegm
166
+ # reported in CAT
167
+ train_data['SymptomDiaryQ9'] = train_data.SymptomDiaryQ9.replace(
168
+ {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid', np.nan: 'None'})
169
+
170
+ # Replace SDQ10 with strings for phlegm colour and infill as None where no phlegm
171
+ # reported in CAT
172
+ train_data['SymptomDiaryQ10'] = train_data.SymptomDiaryQ10.replace(
173
+ {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green', np.nan: 'None'})
174
+
175
+ # Replace smoking status with strings
176
+ train_data['SmokingStatus'] = train_data.SmokingStatus.replace(
177
+ {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'})
178
+
179
+ train_data['InExacWindow'] = train_data.IsExac.replace({0: False, 1: True})
180
+
181
+ #####################################################
182
+ # Calculate DaysSinceCAT and filter data if required
183
+ #####################################################
184
+
185
+ train_data['DaysSinceCAT'] = (train_data.DateOfEvent -
186
+ train_data.SubmissionTime).dt.days.astype('int')
187
+
188
+ DaysSinceCAT_cutoff = 14
189
+ train_data = train_data[train_data.DaysSinceCAT <= DaysSinceCAT_cutoff]
190
+ #####################################
191
+ # Bin some numeric features
192
+ #####################################
193
+
194
+ # Bin days since last exacerbation
195
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
196
+ exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
197
+
198
+ train_data['DaysSinceLastExac'] = copd.bin_numeric_column(
199
+ col=train_data['DaysSinceLastExac'], bins=exac_bins, labels=exac_labels)
200
+
201
+ # Bin patient age
202
+ age_bins = [0, 50, 60, 70, 80, np.inf]
203
+ age_labels = ['<50', '50-59', '60-69', '70-79', '80+']
204
+
205
+ train_data['Age'] = copd.bin_numeric_column(
206
+ col=train_data['Age'], bins=age_bins, labels=age_labels)
207
+
208
+ # Bin number of comorbidities
209
+ comorb_bins = [0, 1, 3, np.inf]
210
+ comorb_labels = ['None', '1-2', '3+']
211
+ train_data['Comorbidities'] = copd.bin_numeric_column(
212
+ col=train_data['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
213
+
214
+ comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
215
+ col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
216
+
217
# Bin patient spirometry at onboarding (FEV1 % predicted severity bands).
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])
# Assign patients without spirometry in service data to the Mild category
# NOTE(review): relies on missing bins stringifying to 'nan' — confirm
# copd.bin_numeric_column returns string labels rather than a Categorical.
train_data.loc[
    train_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Fix: the bare value_counts() expression had no effect in a script;
# print it so the distribution is actually logged.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
230
+
231
+ ##################################
232
+ # Service eosinophils feature
233
+ ##################################
234
+ train_data['HighestEosinophilCount_0_3'] = np.where(
235
+ train_data['LabsHighestEosinophilCount'] >= 0.3, 1, 0)
236
+ train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])
237
+
238
+ # import matplotlib.pyplot as plt
239
+ # def plot_categorical_against_target(*, df, column, target, savefig=False,
240
+ # output_dir=None, label_rotation=None,
241
+ # category_order=None):
242
+ # (df.groupby(target)[column].value_counts(normalize=True).mul(100).rename('Percent')
243
+ # .reset_index().pipe((sns.catplot, 'data'), x=column, y='Percent', hue=target,
244
+ # kind='bar', alpha=0.8, order=category_order))
245
+ # if label_rotation:
246
+ # plt.xticks(rotation=label_rotation, ha='right', rotation_mode='anchor')
247
+ # if savefig:
248
+ # plt.savefig(os.path.join(output_dir, column + '.png'), bbox_inches='tight',
249
+ # dpi=150)
250
+
251
+ # plot_categorical_against_target(df=train_data, column= 'SymptomDiaryQ10',
252
+ # target='InExacWindow', label_rotation=45,
253
+ # category_order=None, savefig=True,
254
+ # output_dir='../data/plots/')
255
+ # plt.show()
256
+
257
+ # plot_categorical_against_target(df=eosinophils, column= 'HighestEosinophilCount_0_3',
258
+ # target='IsExacPatient', label_rotation=None,
259
+ # category_order=None, savefig=True,
260
+ # output_dir='../data/plots/')
261
+ # plt.show()
262
+ # categorical_cols = ['Sex_F', 'RequiredAcuteNIV', 'RequiredICUAdmission',
263
+ # 'SmokingStatus', 'Comorbidities',
264
+ # 'CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
265
+ # 'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'SymptomDiaryQ3']
266
+
267
+ # for column in categorical_cols:
268
+ # plot_categorical_against_target(df=train_data, column=column, target='InExacWindow',
269
+ # savefig=True, output_dir='../data/plots/')
270
+
271
+
272
+ # def plot_numerical_against_target(*, df, column, target, bins=10, savefig=False,
273
+ # output_dir=None):
274
+ # sns.displot(x=column, hue=target, data=df, stat='density', bins=bins,
275
+ # common_norm=False)
276
+ # if savefig:
277
+ # plt.savefig(os.path.join(output_dir, column + '.png'), bbox_inches='tight',
278
+ # dpi=150)
279
+
280
+ # for col in numeric_pros:
281
+ # plot_numerical_against_target(
282
+ # df=train_data, column=col + '_diff', target='InExacWindow', bins=10,
283
+ # savefig=True, output_dir='../data/plots')
284
+ # plt.show()
285
+ # plot_numerical_against_target(
286
+ # df=spirometry, column='LungFunction_FEV1PercentPredicted',
287
+ # target='IsExacPatient', bins=20,
288
+ # savefig=True, output_dir='../data/plots')
289
+ # plt.show()
290
+
291
+ categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
292
+ 'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
293
+ 'FEV1PercentPredicted']
294
+ train_data[categorical_columns] = train_data[categorical_columns].astype("str")
295
+ data_encoded = copd.kfold_encode_train_data(df=train_data, fold_patients=fold_patients,
296
+ cols_to_encode=categorical_columns,
297
+ target='IsExac', id_col='StudyId')
298
+ data_encoded = data_encoded.drop(columns=categorical_columns, axis=1)
299
+
300
+ ###################################
301
+ # Scale data
302
+ ###################################
303
+ data_encoded = data_encoded.drop(columns=['PatientId', 'InExacWindow',
304
+ 'DateOfEvent', 'SubmissionTime',
305
+ 'FirstSubmissionDate', 'LatestPredictionDate'])
306
+
307
+ scaler = MinMaxScaler()
308
+ train_data_scaled = crossvalidation.kfold_process_train_data(df=data_encoded,
309
+ fold_patients=fold_patients,
310
+ processor=scaler,
311
+ id_col='StudyId',
312
+ target='IsExac')
313
+
314
+ ###################################
315
+ # Infill missing data with median
316
+ ###################################
317
+ # K-fold impute data with the median
318
+ imputer = SimpleImputer(missing_values=np.nan, strategy='median')
319
+ train_data_imputed = crossvalidation.kfold_process_train_data(df=train_data_scaled,
320
+ fold_patients=fold_patients,
321
+ processor=imputer,
322
+ id_col='StudyId',
323
+ target='IsExac')
324
+ #########################################
325
+ # Save final data
326
+ #########################################
327
+
328
+ # Train data
329
+ os.makedirs(output_data_dir, exist_ok=True)
330
+ train_data_imputed.to_pickle(os.path.join(output_data_dir, 'train_data_cv.pkl'))
331
+ print('Final train data saved (CV)')
training/tests/__init__.py ADDED
File without changes
training/tests/test_apply_logic_response_criterion.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the apply_logic_response_criterion function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def exacerbation_event():
11
+ """Dataframe index (27) of the exacerbation event of interest."""
12
+ return 27
13
+
14
+
15
+ @pytest.fixture
16
+ def first_pro_response():
17
+ """Dataframe index (8) of the first weekly PRO response."""
18
+ return 8
19
+
20
+
21
+ @pytest.fixture
22
+ def second_pro_response(first_pro_response):
23
+ """Dataframe index of the second weekly PRO response. Seven days after first."""
24
+ return first_pro_response + 7
25
+
26
+
27
+ @pytest.fixture
28
+ def input_df(exacerbation_event):
29
+ """Sample input dataframe template - specific cases to be added in each test.
30
+
31
+ This initial dataframe has no PRO responses between the initial exac and the event of
32
+ interest with DaysSinceLastExac=25. Interim PRO responses should be added in tests.
33
+ """
34
+ df = pd.DataFrame({'PatientId': ['1'] * 31,
35
+ 'DateOfEvent': pd.date_range('2022-01-01', '2022-01-31'),
36
+ 'Q5Answered': [0] * 31,
37
+ 'NegativeQ5': [np.nan] * 31,
38
+ 'IsExac': [0] * 31,
39
+ 'DaysSinceLastExac': [-1, -1, -1] + list(np.arange(1, 26)) +
40
+ list(np.arange(1, 4))})
41
+ # Add initial event to simulate DaysSinceLastExac restart from 1
42
+ df.loc[2, 'Q5Answered'] = 1
43
+ df.loc[2, 'NegativeQ5'] = 0
44
+ df.loc[2, 'IsExac'] = 1
45
+ # Add event of interest (DaysSinceLastExac = 25)
46
+ df.loc[exacerbation_event, 'Q5Answered'] = 1
47
+ df.loc[exacerbation_event, 'NegativeQ5'] = 0
48
+ df.loc[exacerbation_event, 'IsExac'] = 1
49
+
50
+ # Add a negative response after the event of interest (should not be counted)
51
+ df.loc[exacerbation_event + 2, 'Q5Answered'] = 1
52
+ df.loc[exacerbation_event + 2, 'NegativeQ5'] = 1
53
+ return df
54
+
55
+
56
+ def test_output_equals_expected_criterion_failed(input_df, exacerbation_event):
57
+ """Test output is as expected for failed LOGIC response criterion."""
58
+ # Output should be same as input with additional RemoveExac column
59
+ expected_df = input_df.copy()
60
+ # Insert RemoveExac column with exac flagged for removal
61
+ expected_df['RemoveExac'] = np.nan
62
+ expected_df.loc[exacerbation_event, 'RemoveExac'] = 1
63
+ output_df = copd.apply_logic_response_criterion(input_df)
64
+ pd.testing.assert_frame_equal(output_df, expected_df)
65
+
66
+
67
+ def test_output_equals_expected_criterion_passed(
68
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
69
+ """Test output is as expected for passed LOGIC response criterion."""
70
+ # Add PRO responses needed to pass LOGIC criterion
71
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
72
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
73
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
74
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
75
+
76
+ # Output should be same as input with additional RemoveExac column
77
+ expected_df = input_df.copy()
78
+ # Insert RemoveExac column with exac flagged for removal
79
+ expected_df['RemoveExac'] = np.nan
80
+ expected_df.loc[exacerbation_event, 'RemoveExac'] = 0
81
+ output_df = copd.apply_logic_response_criterion(input_df)
82
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_bin_numeric_column.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the bin_numeric_column function."""
2
+ import copd
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+
7
+ def test_binned_ages():
8
+ """Test output is as expected for typical age binning values."""
9
+ age_bins = [0, 50, 60, 70, 80, np.inf]
10
+ labels = ['<50', '50-59', '60-69', '70-79', '80+']
11
+ df = pd.DataFrame({'Age': [10, 49, 50, 55, 59, 60, 65, 69, 70, 75, 79, 80, 85, 100]})
12
+ output = copd.bin_numeric_column(col=df['Age'], bins=age_bins, labels=labels)
13
+ assert list(output.values) == ['<50', '<50', '50-59', '50-59', '50-59', '60-69',
14
+ '60-69', '60-69', '70-79', '70-79', '70-79', '80+',
15
+ '80+', '80+']
16
+
17
+
18
+ def test_binned_days_since():
19
+ """Test output is as expected for typical days since last exac binning."""
20
+ exac_bins = [-1, 0, 21, 90, 180, np.inf]
21
+ labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
22
+ df = pd.DataFrame({'DaysSince': [-1, 0, 10, 21, 25, 89, 90, 150, 179, 180, 200]})
23
+ output = copd.bin_numeric_column(col=df['DaysSince'], bins=exac_bins, labels=labels)
24
+ assert list(output) == ['None', '<21 days', '<21 days', '21 - 89 days',
25
+ '21 - 89 days', '21 - 89 days', '90 - 179 days',
26
+ '90 - 179 days', '90 - 179 days', '>= 180 days',
27
+ '>= 180 days']
28
+
29
+
30
+ def test_binned_comorbidities():
31
+ """Test output is as expected for typical comorbidity count binning."""
32
+ comorb_bins = [0, 1, 3, np.inf]
33
+ labels = ['None', '1-2', '3+']
34
+ df = pd.DataFrame({'Comorbs': [0, 1, 2, 3, 4, 5]})
35
+ output = copd.bin_numeric_column(col=df['Comorbs'], bins=comorb_bins, labels=labels)
36
+ assert list(output) == ['None', '1-2', '1-2', '3+', '3+', '3+']
training/tests/test_calculate_days_since_last_event.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the calculate_days_since_last_event function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+
6
+
7
+ def test_output_equals_expected_exac():
8
+ """Compare the output and expected dataframes."""
9
+ input_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
10
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
11
+
12
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
13
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
14
+ 'DaysSinceLastExac': [-1, -1, -1, 1, 2, 3, 1, 1, 2, 3, 4,
15
+ 5, 6, 7, 8]})
16
+
17
+ output_df = copd.calculate_days_since_last_event(
18
+ df=input_df, event_col='IsExac', output_col='DaysSinceLastExac')
19
+ pd.testing.assert_frame_equal(output_df, expected_df)
20
+
21
+
22
+ def test_output_equals_expected_rescue_meds():
23
+ """Compare the output and expected dataframes."""
24
+ input_df = pd.DataFrame({
25
+ 'Date': pd.date_range('2022-02-01', '2022-02-15'),
26
+ 'IsRescueMedExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]})
27
+
28
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-02-01', '2022-02-15'),
29
+ 'IsRescueMedExac': [
30
+ 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
31
+ 'DaysSinceLastRescueMeds': [
32
+ -1, -1, -1, 1, 2, 3, 1, 1, 2, 3, 4, 5, 1, 2, 3]})
33
+
34
+ output_df = copd.calculate_days_since_last_event(
35
+ df=input_df, event_col='IsRescueMedExac', output_col='DaysSinceLastRescueMeds')
36
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_define_hospital admission.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the define_hospital_admission function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("event",
9
+ ['Hospital admission - emergency, COPD related',
10
+ 'Hospital admission - emergency, COPD unrelated'])
11
+ def test_admission_event(event):
12
+ """Test patient event definitions for hospital admissions."""
13
+ assert copd.define_hospital_admission(
14
+ pd.Series(event)) == 1
15
+
16
+
17
+ @pytest.mark.parametrize("event",
18
+ ['Death',
19
+ 'NHS 24 review - emergency, COPD related',
20
+ 'Exacerbation - self-managed with rescue pack',
21
+ 'GP review - emergency, COPD related',
22
+ 'Emergency department attendance, COPD related',
23
+ 'Exacerbation - started abs/steroid by clinical team'])
24
+ def test_non_admission_event(event):
25
+ """Test patient event definitions for non hospital admission events."""
26
+ assert copd.define_hospital_admission(
27
+ pd.Series(event)) == 0
training/tests/test_define_service_exac_event.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the define_service_exac_event function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("event",
9
+ ['Hospital admission - emergency, COPD related',
10
+ 'GP review - emergency, COPD related',
11
+ 'Emergency department attendance, COPD related',
12
+ 'Exacerbation - started abs/steroid by clinical team'])
13
+ def test_positive_event_no_community(event):
14
+ """Test patient event definitions for COPD exacerbations - no community events."""
15
+ assert copd.define_service_exac_event(events=pd.Series(event)) == 1
16
+
17
+
18
+ @pytest.mark.parametrize("event",
19
+ ['Hospital admission - emergency, COPD unrelated',
20
+ 'Death',
21
+ 'NHS 24 review - emergency, COPD related',
22
+ 'Exacerbation - self-managed with rescue pack'])
23
+ def test_negative_event_no_community(event):
24
+ """Test patient event definitions for non-exac events - no community events."""
25
+ assert copd.define_service_exac_event(events=pd.Series(event)) == 0
26
+
27
+
28
+ @pytest.mark.parametrize("event",
29
+ ['Hospital admission - emergency, COPD related',
30
+ 'GP review - emergency, COPD related',
31
+ 'Emergency department attendance, COPD related',
32
+ 'Exacerbation - started abs/steroid by clinical team',
33
+ 'Exacerbation - self-managed with rescue pack'])
34
+ def test_positive_event_with_community(event):
35
+ """Test patient event definitions for COPD exacerbations - with community events."""
36
+ assert copd.define_service_exac_event(events=pd.Series(event),
37
+ include_community=(True)) == 1
38
+
39
+
40
+ @pytest.mark.parametrize("event",
41
+ ['Hospital admission - emergency, COPD unrelated',
42
+ 'Death',
43
+ 'NHS 24 review - emergency, COPD related'])
44
+ def test_negative_event_with_community(event):
45
+ """Test patient event definitions for non-exac events - with community events."""
46
+ assert copd.define_service_exac_event(events=pd.Series(event),
47
+ include_community=(True)) == 0
training/tests/test_extract_clinician_verified_exacerbations.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the extract_clinician_verified_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data.
12
+
13
+ Input data covers the following cases:
14
+ 1. Non-verified events (input rows 0 and 3)
15
+ 2. Verified exacerbation with known date (row 1)
16
+ 3. Verified exacerbation with unknown date (row 2)
17
+ """
18
+ return pd.DataFrame({'StudyId': [1, '2a', 1, 4],
19
+ 'Exacerbation confirmed': [0, 1, 1, 0],
20
+ 'DateRecorded': pd.to_datetime(['2022-01-03', '2022-01-05',
21
+ '2022-01-06', '2022-01-09']),
22
+ 'New Date': [np.nan, '2022-01-05', np.nan, np.nan],
23
+ 'Date changed': [np.nan, 1, 0, np.nan],
24
+ 'Extra column': [1, 3, 'a', '4']})
25
+
26
+
27
+ @pytest.fixture
28
+ def expected_df():
29
+ """Define expected output dataframe."""
30
+ return pd.DataFrame({'StudyId': ['2a', 1],
31
+ 'DateOfEvent': pd.to_datetime(pd.Series(['2022-01-05',
32
+ '2022-01-06']),
33
+ utc=True).dt.normalize(),
34
+ 'IsCommExac': [1, 1],
35
+ 'ExacDateUnknown': [0, 1]})
36
+
37
+
38
+ def test_output_equals_expected(input_df, expected_df):
39
+ """Test output is as expected."""
40
+ output_df = copd.extract_clinician_verified_exacerbations(input_df).reset_index(
41
+ drop=True)
42
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_filter_symptom_diary.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the filter_symptom_diary function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df():
10
+ """Sample input data."""
11
+ return pd.DataFrame({'PatientId': [1, '2a', 1, '2a', 3, 4, 4, 5, 5, 4],
12
+ 'SubmissionTime': pd.date_range('2022-01-01', '2022-01-10')})
13
+
14
+
15
+ def test_output_no_date_cutoff(input_df):
16
+ """Test output is as expected when called without a date cut off."""
17
+ output_df = copd.filter_symptom_diary(df=input_df, patients=[1, '2a', 3])
18
+ expected_df = pd.DataFrame({'PatientId': [1, '2a', 1, '2a', 3],
19
+ 'SubmissionTime':
20
+ pd.date_range('2022-01-01', '2022-01-05', tz='utc')})
21
+ pd.testing.assert_frame_equal(output_df, expected_df)
22
+
23
+
24
+ def test_output_with_date_cutoff(input_df):
25
+ """Test output is as expected when called with a date cut off."""
26
+ output_df = copd.filter_symptom_diary(df=input_df, patients=[1, '2a', 3],
27
+ date_cutoff='2022-01-03').reset_index(drop=True)
28
+ expected_df = pd.DataFrame({'PatientId': [1, '2a', 3],
29
+ 'SubmissionTime':
30
+ pd.date_range('2022-01-03', '2022-01-05', tz='utc')})
31
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_get_logic_exacerbation_indices.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the get_logic_exacerbation_indices function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ def test_returns_empty_list_for_no_exacs():
10
+ """Test output for input with no exacerbations."""
11
+ input_df = pd.DataFrame({'DaysSinceLastExac': np.arange(15, 20),
12
+ 'IsExac': [0, 0, 0, 0, 0]})
13
+ output_list = copd.get_logic_exacerbation_indices(input_df)
14
+ assert not output_list
15
+
16
+
17
+ @pytest.fixture
18
+ def input_df_no_relevant_exacs():
19
+ """Sample input data containing no relevant exacerbations."""
20
+ return pd.DataFrame({'DaysSinceLastExac': np.arange(40, 45),
21
+ 'IsExac': [0, 0, 1, 0, 0]})
22
+
23
+
24
+ def test_returns_empty_list_for_no_relevant_exacs_default(input_df_no_relevant_exacs):
25
+ """Test output for input with no relevant exacerbations. Default options."""
26
+ output_list = copd.get_logic_exacerbation_indices(input_df_no_relevant_exacs)
27
+ assert not output_list
28
+
29
+
30
+ def test_returns_empty_list_for_no_relevant_exacs_non_default(input_df_no_relevant_exacs):
31
+ """Test output for input with no relevant exacerbations. Specified time window."""
32
+ output_list = copd.get_logic_exacerbation_indices(input_df_no_relevant_exacs,
33
+ minimum_period=20,
34
+ maximum_period=38)
35
+ assert not output_list
36
+
37
+
38
+ @pytest.fixture
39
+ def input_df_with_exacs():
40
+ """Sample input data containing exacerbations."""
41
+ return pd.DataFrame({'DaysSinceLastExac': [15, 20, 42, 37, 22, 18],
42
+ 'IsExac': [1, 0, 1, 1, 1, 0]})
43
+
44
+
45
+ def test_returns_relevant_exacs_default(input_df_with_exacs):
46
+ """Test output for input with relevant exacerbations. Default options."""
47
+ output_list = copd.get_logic_exacerbation_indices(input_df_with_exacs)
48
+ assert output_list == [0, 4]
49
+
50
+
51
+ def test_returns_relevant_exacs_non_default(input_df_with_exacs):
52
+ """Test output for input with relevant exacerbations. Specified time window."""
53
+ output_list = copd.get_logic_exacerbation_indices(input_df_with_exacs,
54
+ minimum_period=20,
55
+ maximum_period=38)
56
+ assert output_list == [3, 4]
training/tests/test_get_rescue_med_pro_responses.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the get_rescue_med_pro_responses function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data."""
12
+ return pd.DataFrame({'PatientId': [1, '2a', 1],
13
+ 'SymptomDiaryQ5': [0, 1, np.nan]})
14
+
15
+
16
+ @pytest.fixture
17
+ def expected_df():
18
+ """Define expected output dataframe."""
19
+ return pd.DataFrame({'PatientId': [1, '2a'],
20
+ 'SymptomDiaryQ5': [0, 1],
21
+ 'Q5Answered': [1, 1],
22
+ 'NegativeQ5': [1, 0],
23
+ 'IsCommExac': [0, 1]})
24
+
25
+
26
+ def test_output_equals_expected(input_df, expected_df):
27
+ """Test output is as expected."""
28
+ output_df = copd.get_rescue_med_pro_responses(input_df)
29
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_logic_consecutive_negative_responses.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the logic_consecutive_negative_responses function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def exacerbation_event():
11
+ """Dataframe index (27) of the exacerbation event of interest."""
12
+ return 27
13
+
14
+
15
+ @pytest.fixture
16
+ def first_pro_response():
17
+ """Dataframe index (8) of the first weekly PRO response."""
18
+ return 8
19
+
20
+
21
+ @pytest.fixture
22
+ def second_pro_response(first_pro_response):
23
+ """Dataframe index of the second weekly PRO response. Seven days after first."""
24
+ return first_pro_response + 7
25
+
26
+
27
+ @pytest.fixture
28
+ def third_pro_response(second_pro_response):
29
+ """Dataframe index of the third weekly PRO response. Seven days after second."""
30
+ return second_pro_response + 7
31
+
32
+
33
+ @pytest.fixture
34
+ def input_df(exacerbation_event):
35
+ """Sample input dataframe template - specific cases to be added in each test.
36
+
37
+ This initial dataframe has no PRO responses between the initial exacerbation at index
38
+ 2 and the event of interest with DaysSinceLastExac=25 at index exacerbation_event (set
39
+ to 27). Interim PRO responses should be added in tests. Each row is a different day
40
+ (in chronological order). Add/subtract N from exacerbation_event to refer to N days
41
+ before or after the event by the dataframe index, e.g. exacerbation_event - 7 refers
42
+ to the day a week prior.
43
+ """
44
+ df = pd.DataFrame({'PatientId': ['1'] * 31,
45
+ 'DateOfEvent': pd.date_range('2022-01-01', '2022-01-31'),
46
+ 'Q5Answered': [0] * 31,
47
+ 'NegativeQ5': [np.nan] * 31,
48
+ 'DaysSinceLastExac': [-1, -1, -1] + list(np.arange(1, 26)) +
49
+ list(np.arange(1, 4))})
50
+ # Add initial event to simulate DaysSinceLastExac restart from 1
51
+ df.loc[2, 'Q5Answered'] = 1
52
+ df.loc[2, 'NegativeQ5'] = 0
53
+ # Add event of interest (DaysSinceLastExac = 25)
54
+ df.loc[exacerbation_event, 'Q5Answered'] = 1
55
+ df.loc[exacerbation_event, 'NegativeQ5'] = 0
56
+
57
+ # Add a negative response 2 days after the event of interest (should not be counted)
58
+ df.loc[exacerbation_event + 2, 'Q5Answered'] = 1
59
+ df.loc[exacerbation_event + 2, 'NegativeQ5'] = 1
60
+ return df
61
+
62
+
63
+ def test_returns_one_when_no_responses(input_df, exacerbation_event):
64
+ """Verify returns 1 (flag for removal) for no interim PRO responses."""
65
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
66
+
67
+
68
+ def test_returns_one_too_few_responses(input_df, exacerbation_event):
69
+ """Verify returns 1 (flag for removal) for too few interim PRO responses."""
70
+ # Add a single negative response 7 days before the exacerbation event. Should fail PRO
71
+ # LOGIC because the negative response at index 29 is after the event of interest.
72
+ input_df.loc[exacerbation_event - 7, 'Q5Answered'] = 1
73
+ input_df.loc[exacerbation_event - 7, 'NegativeQ5'] = 1
74
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
75
+
76
+
77
+ def test_returns_one_too_few_negative_responses(
78
+ input_df, exacerbation_event, second_pro_response, third_pro_response):
79
+ """Verify returns 1 (flag for removal) for too few interim PRO responses."""
80
+ # Add a positive response and a single negative response. Should return one because
81
+ # the response at index 29 is after the period of interest.
82
+
83
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
84
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 0
85
+
86
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
87
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
88
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
89
+
90
+
91
+ def test_returns_one_too_few_consecutive_negative_responses_missing(
92
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
93
+ third_pro_response):
94
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
95
+
96
+ Input has a missing response between the two negative responses.
97
+ """
98
+ # Add negative responses at indices 8 and 22 (missing response at 15)
99
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
100
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
101
+
102
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
103
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
104
+
105
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
106
+
107
+
108
+ def test_returns_one_too_few_consecutive_negative_responses_positive(
109
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
110
+ third_pro_response):
111
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
112
+
113
+ Input has a positive response between the two negative responses.
114
+ """
115
+ # Add negative responses at indices 8 and 22, and a positive response at 15
116
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
117
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
118
+
119
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
120
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 0
121
+
122
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
123
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
124
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 1
125
+
126
+
127
+ def test_returns_zero_enough_consecutive_negative_responses_default(
128
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
129
+ """Verify returns 0 (pass LOGIC criterion) for required consecutive -ve PRO responses.
130
+
131
+ Input has two consecutive negative responses. Should return 1 with default options.
132
+ """
133
+ # Add negative responses at indices 8 and 15
134
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
135
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
136
+
137
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
138
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
139
+ assert copd.logic_consecutive_negative_responses(input_df, exacerbation_event) == 0
140
+
141
+
142
+ def test_returns_one_too_few_consecutive_negative_responses_non_default(
143
+ input_df, exacerbation_event, first_pro_response, second_pro_response):
144
+ """Verify returns 1 (flag for removal) for too few consecutive -ve PRO responses.
145
+
146
+ Input has two consecutive negative responses. Should return 0 with N=3.
147
+ """
148
+ # Add negative responses at indices 8 and 15
149
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
150
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
151
+
152
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
153
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
154
+ assert copd.logic_consecutive_negative_responses(
155
+ input_df, exacerbation_event, N=3) == 1
156
+
157
+
158
+ def test_returns_zero_too_few_consecutive_negative_responses_non_default(
159
+ input_df, exacerbation_event, first_pro_response, second_pro_response,
160
+ third_pro_response):
161
+ """Verify returns 0 (pass LOGIC criterion) for required consecutive -ve PRO responses.
162
+
163
+ Input has three consecutive negative responses. Should return 0 with N=3
164
+ """
165
+ # Add negative responses at indices 8, 15, and 22
166
+ input_df.loc[first_pro_response, 'Q5Answered'] = 1
167
+ input_df.loc[first_pro_response, 'NegativeQ5'] = 1
168
+
169
+ input_df.loc[second_pro_response, 'Q5Answered'] = 1
170
+ input_df.loc[second_pro_response, 'NegativeQ5'] = 1
171
+
172
+ input_df.loc[third_pro_response, 'Q5Answered'] = 1
173
+ input_df.loc[third_pro_response, 'NegativeQ5'] = 1
174
+ assert copd.logic_consecutive_negative_responses(
175
+ input_df, exacerbation_event, N=3) == 0
training/tests/test_minimum_period_between_exacerbations.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the minimum_period_between_exacerbations function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.mark.parametrize("input_values,expected_output",
9
+ [(pd.DataFrame({'DaysSinceLastExac': [-1]}), 0),
10
+ (pd.DataFrame({'DaysSinceLastExac': [7]}), 1),
11
+ (pd.DataFrame({'DaysSinceLastExac': [14]}), 1),
12
+ (pd.DataFrame({'DaysSinceLastExac': [20]}), 0)])
13
+ def test_threshold_equals_default(input_values, expected_output):
14
+ """Test output for a variety of input values.
15
+
16
+ Test cases cover:
17
+ 1. No previous exacerbation
18
+ 2. Very recent exac
19
+ 3. Exac on the threshold value (should count as too recent)
20
+ 4. Previous non-recent exac
21
+ for the default threshold of 14 days
22
+ """
23
+ assert copd.minimum_period_between_exacerbations(input_values) == expected_output
24
+
25
+
26
+ @pytest.mark.parametrize("input_values,expected_output",
27
+ [(pd.DataFrame({'DaysSinceLastExac': [-1]}), 0),
28
+ (pd.DataFrame({'DaysSinceLastExac': [6]}), 1),
29
+ (pd.DataFrame({'DaysSinceLastExac': [7]}), 1),
30
+ (pd.DataFrame({'DaysSinceLastExac': [14]}), 0)])
31
+ def test_threshold_equals_seven(input_values, expected_output):
32
+ """Test output for a variety of input values.
33
+
34
+ Test cases cover:
35
+ 1. No previous exacerbation
36
+ 2. Very recent exac
37
+ 3. Exac on the threshold value (should count as too recent)
38
+ 4. Previous non-recent exac
39
+ for a threshold of 7 days.
40
+ """
41
+ assert copd.minimum_period_between_exacerbations(
42
+ input_values, minimum_days=7) == expected_output
training/tests/test_remove_data_between_exacerbations.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the remove_data_between_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data including an exacerbation flagged for removal."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
14
+ 'DaysSinceLastExac': [-1, -1, 1, 2, 3, 4, 5, 1, 2, 1],
15
+ 'RemoveExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]})
16
+
17
+
18
+ @pytest.fixture
19
+ def expected_df():
20
+ """Define expected output dataframe."""
21
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
22
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
23
+ 'DaysSinceLastExac': [-1, -1, 1, 2, 3, 4, 5, 1, 2, 1],
24
+ 'RemoveExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
25
+ 'RemoveRow': [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan,
26
+ np.nan, np.nan]})
27
+
28
+
29
+ def test_output_equals_expected(input_df, expected_df):
30
+ """Test output is as expected."""
31
+ output_df = copd.remove_data_between_exacerbations(input_df)
32
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_remove_unknown_date_exacerbations.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the remove_unknown_date_exacerbations function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data including an exacerbation with an uncertain date."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
14
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]})
15
+
16
+
17
+ def test_check_correct_rows_flagged_default(input_df):
18
+ """Check the correct rows are flagged for removal using default option (7 days)."""
19
+ output_df = copd.remove_unknown_date_exacerbations(input_df)
20
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
21
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
22
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
23
+ 'RemoveRow': [np.nan, np.nan, 1, 1, 1, 1, 1, 1, 1,
24
+ np.nan]})
25
+ pd.testing.assert_frame_equal(output_df, expected_df)
26
+
27
+
28
+ def test_check_correct_rows_flagged_non_default(input_df):
29
+ """Check the correct rows are flagged for removal when specifying 5 days."""
30
+ output_df = copd.remove_unknown_date_exacerbations(input_df, days_to_remove=5)
31
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
32
+ 'IsExac': [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
33
+ 'ExacDateUnknown': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
34
+ 'RemoveRow': [np.nan, np.nan, np.nan, np.nan, 1, 1, 1, 1,
35
+ 1, np.nan]})
36
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_rolling_mean_previous_period.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the rolling_mean_previous_period function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import numpy as np
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df_single_patient():
11
+ """Sample daily input data including PRO responses for a single patient."""
12
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
13
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
14
+ 'Q1': [5, 6, 1, np.nan, 0, 4, 3, 7, np.nan, 1]})
15
+
16
+
17
+ def test_seven_day_window_single_patient_columns(input_df_single_patient):
18
+ """Compare the output and expected columns for a single patient ID.
19
+
20
+ Uses a seven day rolling window for the mean.
21
+ """
22
+ output_df = copd.rolling_mean_previous_period(
23
+ df=input_df_single_patient, cols='Q1', id_col='StudyId', date_col='Date',
24
+ window=7)
25
+ # pd.testing.assert_frame_equal(output_df, expected_df)
26
+ assert set(output_df.columns) == set(['Date', 'StudyId', 'Q1_ave'])
27
+
28
+
29
+ def test_seven_day_window_single_patient_values(input_df_single_patient):
30
+ """Compare the output and expected dataframes for a single patient ID.
31
+
32
+ Uses a seven day rolling window for the sum.
33
+ """
34
+ expected_df = pd.DataFrame(
35
+ {'Date': pd.date_range('2022-01-01', '2022-01-10'),
36
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
37
+ 'Q1_ave': [np.nan, 5.0, 5.5, 4.0, 4.0, 3.0, 3.2, 3.16666667, 3.5, 3.0]})
38
+
39
+ output_df = copd.rolling_mean_previous_period(
40
+ df=input_df_single_patient, cols='Q1', id_col='StudyId', date_col='Date',
41
+ window=7)
42
+ pd.testing.assert_frame_equal(
43
+ output_df[['Date', 'StudyId', 'Q1_ave']],
44
+ expected_df[['Date', 'StudyId', 'Q1_ave']])
45
+
46
+
47
+ @pytest.fixture
48
+ def input_df_several_patients_two_columns():
49
+ """Sample daily input data including PRO responses for three patients."""
50
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
51
+ 'StudyId': [1, 2, 1, 1, 2, 1, 2, '3', 1, '3'],
52
+ 'Q1': [5, 6, 1, np.nan, 0, 4, 3, 7, np.nan, 1],
53
+ 'Q2': [-5, -6, -1, np.nan, 0, -4, -3, -7, np.nan, -1]})
54
+
55
+
56
+ @pytest.fixture
57
+ def expected_df_several_patients_two_columns():
58
+ """Create expected output df including mean PRO responses for three patients."""
59
+ # Create expected output dataframe (daily records for each patient with rolling means)
60
+ patient1 = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-09'),
61
+ 'StudyId': [1] * 9,
62
+ 'Q1_ave': [np.nan, 5.0, 5.0, 3.0, 1.0, 1.0, 4.0, 4.0, 4.0],
63
+ 'Q2_ave': [np.nan, -5.0, -5.0, -3.0, -1.0, -1.0, -4.0, -4.0,
64
+ -4.0]})
65
+ patient2 = pd.DataFrame({'Date': pd.date_range('2022-01-02', '2022-01-07'),
66
+ 'StudyId': [2] * 6,
67
+ 'Q1_ave': [np.nan, 6.0, 6.0, 6.0, 0.0, 0.0],
68
+ 'Q2_ave': [np.nan, -6.0, -6.0, -6.0, 0.0, 0.0]})
69
+ patient3 = pd.DataFrame({'Date': pd.date_range('2022-01-08', '2022-01-10'),
70
+ 'StudyId': ['3'] * 3,
71
+ 'Q1_ave': [np.nan, 7.0, 7.0],
72
+ 'Q2_ave': [np.nan, -7.0, -7.0]})
73
+ # Combine individual patient series into one df
74
+ expected_df = pd.concat([patient1, patient2, patient3]).reset_index(drop=True)
75
+ return expected_df
76
+
77
+
78
+ def test_three_day_window_several_patients_columns(
79
+ input_df_several_patients_two_columns, expected_df_several_patients_two_columns):
80
+ """Compare the output and expected columns for three patient IDs and two mean columns.
81
+
82
+ Uses a three day rolling window for the mean.
83
+ """
84
+ output_df = copd.rolling_mean_previous_period(
85
+ df=input_df_several_patients_two_columns, cols=['Q1', 'Q2'], id_col='StudyId',
86
+ date_col='Date', window=3)
87
+ assert set(output_df.columns) == set(expected_df_several_patients_two_columns.columns)
88
+
89
+
90
+ def test_three_day_window_several_patients_values(
91
+ input_df_several_patients_two_columns, expected_df_several_patients_two_columns):
92
+ """Compare the output and expected dataframes for three patient IDs.
93
+
94
+ Uses a three day rolling window for the mean.
95
+ """
96
+ output_df = copd.rolling_mean_previous_period(
97
+ df=input_df_several_patients_two_columns, cols=['Q1', 'Q2'], id_col='StudyId',
98
+ date_col='Date', window=3)
99
+ pd.testing.assert_frame_equal(
100
+ expected_df_several_patients_two_columns[['Date', 'StudyId', 'Q1_ave', 'Q2_ave']],
101
+ output_df[['Date', 'StudyId', 'Q1_ave', 'Q2_ave']], check_like=True)
training/tests/test_rolling_sum_previous_period.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the rolling_sum_previous_period function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df_single_patient():
10
+ """Sample daily input data including exacerbations for a single patient."""
11
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
12
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
13
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
14
+
15
+
16
+ def test_seven_day_window_single_patient(input_df_single_patient):
17
+ """Compare the output and expected dataframes for a single patient ID.
18
+
19
+ Uses a seven day rolling window for the sum.
20
+ """
21
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
22
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
23
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
24
+ 'ExacsPrevPeriod': [
25
+ 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 2, 2, 1, 0]})
26
+
27
+ output_df = copd.rolling_sum_previous_period(
28
+ df=input_df_single_patient, col='IsExac', output_col='ExacsPrevPeriod',
29
+ id_col='StudyId', date_col='Date', window=7)
30
+ pd.testing.assert_frame_equal(output_df, expected_df)
31
+
32
+
33
+ def test_three_day_window_single_patient(input_df_single_patient):
34
+ """Compare the output and expected dataframes for a single patient ID.
35
+
36
+ Uses a three day rolling window for the sum.
37
+ """
38
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
39
+ 'StudyId': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
40
+ 'IsExac': [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
41
+ 'ExacsPrevPeriod': [
42
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0]})
43
+
44
+ output_df = copd.rolling_sum_previous_period(
45
+ df=input_df_single_patient, col='IsExac', output_col='ExacsPrevPeriod',
46
+ id_col='StudyId', date_col='Date', window=3)
47
+ pd.testing.assert_frame_equal(output_df, expected_df)
48
+
49
+
50
+ @pytest.fixture
51
+ def input_df_several_patients():
52
+ """Sample daily input data including exacerbations for a single patient."""
53
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
54
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
55
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
56
+
57
+
58
+ def test_seven_day_window_several_patients(input_df_several_patients):
59
+ """Compare the output and expected dataframes for three patient IDs.
60
+
61
+ Uses a seven day rolling window for the sum.
62
+ """
63
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
64
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
65
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
66
+ 'ExacsPrevPeriod': [
67
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 1, 1, 0, 0]})
68
+
69
+ output_df = copd.rolling_sum_previous_period(
70
+ df=input_df_several_patients, col='IsExac', output_col='ExacsPrevPeriod',
71
+ id_col='StudyId', date_col='Date', window=7)
72
+ pd.testing.assert_frame_equal(output_df, expected_df)
73
+
74
+
75
+ def test_three_day_window_several_patients(input_df_several_patients):
76
+ """Compare the output and expected dataframes for three patient IDs.
77
+
78
+ Uses a three day rolling window for the sum.
79
+ """
80
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-15'),
81
+ 'StudyId': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1],
82
+ 'IsExac': [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
83
+ 'ExacsPrevPeriod': [
84
+ 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]})
85
+
86
+ output_df = copd.rolling_sum_previous_period(
87
+ df=input_df_several_patients, col='IsExac', output_col='ExacsPrevPeriod',
88
+ id_col='StudyId', date_col='Date', window=3)
89
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_set_prediction_window.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the set_prediction_window function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+ import pytest
6
+
7
+
8
+ @pytest.fixture
9
+ def input_df():
10
+ """Sample input data including an exacerbation."""
11
+ return pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
12
+ 'IsExac': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]})
13
+
14
+
15
+ def test_check_correct_three_day_window_set(input_df):
16
+ """Check the correct rows are set to exacerbations for a three day window."""
17
+ output_df = copd.set_prediction_window(df=input_df, prediction_window=3)
18
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
19
+ 'IsExac': [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]})
20
+ pd.testing.assert_frame_equal(output_df, expected_df)
21
+
22
+
23
+ def test_check_correct_five_day_window_set(input_df):
24
+ """Check the correct rows are set to exacerbations for a five day window."""
25
+ output_df = copd.set_prediction_window(df=input_df, prediction_window=5)
26
+ expected_df = pd.DataFrame({'Date': pd.date_range('2022-01-01', '2022-01-10'),
27
+ 'IsExac': [0, 0, 1, 1, 1, 1, 1, 0, 0, 0]})
28
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_set_pro_exac_dates.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the set_pro_exac_dates function."""
2
+
3
+ import copd
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def input_df():
11
+ """Sample input data.
12
+
13
+ Input data covers the following cases:
14
+ 1. Duplicate non-exacerbation response (input rows 0 and 3)
15
+ 2. Exacerbation with known date (row 1)
16
+ 3. Exacerbation with unknown date (row 2)
17
+ """
18
+ return pd.DataFrame({'PatientId': [1, 2, 3, 1],
19
+ 'SymptomDiaryQ11a': [1, 2, np.nan, 1],
20
+ 'SymptomDiaryQ11b': [np.nan, pd.to_datetime('2022-01-01'),
21
+ np.nan, np.nan],
22
+ 'SubmissionTime': pd.to_datetime(['2022-01-03', '2022-01-05',
23
+ '2022-01-06', '2022-01-03']),
24
+ 'IsCommExac': [1, 1, 0, 1]})
25
+
26
+
27
+ @pytest.fixture
28
+ def expected_df():
29
+ """Define expected output dataframe."""
30
+ return pd.DataFrame({'PatientId': [2, 3, 1],
31
+ 'SymptomDiaryQ11a': [2, np.nan, 1],
32
+ 'SymptomDiaryQ11b': [pd.to_datetime('2022-01-01'), np.nan,
33
+ np.nan],
34
+ 'SubmissionTime': pd.to_datetime(['2022-01-05', '2022-01-06',
35
+ '2022-01-03']),
36
+ 'IsCommExac': [1, 0, 1],
37
+ 'DateOfEvent': pd.to_datetime(['2022-01-01', '2022-01-06',
38
+ '2022-01-03'], utc=True
39
+ ).normalize(),
40
+ 'ExacDateUnknown': [0, 0, 1]})
41
+
42
+
43
+ def test_output_equals_expected(input_df, expected_df):
44
+ """Test output is as expected."""
45
+ output_df = copd.set_pro_exac_dates(input_df).reset_index(drop=True)
46
+ pd.testing.assert_frame_equal(output_df, expected_df)
training/tests/test_triple_inhaler_therapy_service.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the triple_inhaler_therapy_service function."""
2
+ import copd
3
+ import pandas as pd
4
+
5
+
6
+ def test_returns_zero_single_therapy_sitt():
7
+ """Check output for single inhaler types. Single Inhaler Triple Therapy only."""
8
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
9
+ 'InhalerType': ['LAMA', 'LABA', 'LAMA', 'LABA']})
10
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
11
+ 'TripleTherapy': [0, 0, 0]})
12
+ output_df = copd.triple_inhaler_therapy_service(
13
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
14
+ pd.testing.assert_frame_equal(expected_df, output_df)
15
+
16
+
17
+ def test_returns_zero_single_therapy_mitt():
18
+ """Check output for single inhaler types. Includes Multiple Inhaler Triple Therapy."""
19
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
20
+ 'InhalerType': ['LAMA', 'LABA', 'LAMA', 'LABA']})
21
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
22
+ 'TripleTherapy': [0, 0, 0]})
23
+ output_df = copd.triple_inhaler_therapy_service(
24
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
25
+ pd.testing.assert_frame_equal(expected_df, output_df)
26
+
27
+
28
+ def test_returns_zero_double_therapy_sitt():
29
+ """Check output for double inhaler types. Single Inhaler Triple Therapy only."""
30
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
31
+ 'InhalerType': ['LABA-LAMA', 'LABA-ICS', 'LABA-LAMA',
32
+ 'LABA-ICS']})
33
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
34
+ 'TripleTherapy': [0, 0, 0]})
35
+ output_df = copd.triple_inhaler_therapy_service(
36
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
37
+ pd.testing.assert_frame_equal(expected_df, output_df)
38
+
39
+
40
+ def test_returns_zero_double_therapy_mitt():
41
+ """Check output for double inhaler types. Includes Multiple Inhaler Triple Therapy."""
42
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
43
+ 'InhalerType': ['LABA-LAMA', 'LABA-ICS', 'LABA-LAMA',
44
+ 'LABA-ICS']})
45
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
46
+ 'TripleTherapy': [0, 0, 0]})
47
+ output_df = copd.triple_inhaler_therapy_service(
48
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
49
+ pd.testing.assert_frame_equal(expected_df, output_df)
50
+
51
+
52
+ def test_returns_one_triple_therapy_sitt():
53
+ """Check output for triple inhaler types. Single Inhaler Triple Therapy only."""
54
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1],
55
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
56
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS']})
57
+ expected_df = pd.DataFrame({'Id': [1, 2, '3'],
58
+ 'TripleTherapy': [1, 1, 1]})
59
+ output_df = copd.triple_inhaler_therapy_service(
60
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
61
+ pd.testing.assert_frame_equal(expected_df, output_df)
62
+
63
+
64
+ def test_returns_zero_triple_therapy_sitt():
65
+ """Check output for triple inhaler types. Single Inhaler Triple Therapy only.
66
+
67
+ Input df includes SITT and also a patient with a valid MITT combination. Should return
68
+ zero for that patient as SITT only is required.
69
+ """
70
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1, 4, 4],
71
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
72
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS', 'LAMA',
73
+ 'LABA-ICS']})
74
+ expected_df = pd.DataFrame({'Id': [1, 2, 4, '3'],
75
+ 'TripleTherapy': [1, 1, 0, 1]})
76
+ output_df = copd.triple_inhaler_therapy_service(
77
+ df=input_df, id_col='Id', inhaler_col='InhalerType')
78
+ pd.testing.assert_frame_equal(expected_df, output_df)
79
+
80
+
81
+ def test_returns_one_triple_therapy_mitt():
82
+ """Check output for triple inhaler types. Includes Multiple Inhaler Triple Therapy.
83
+
84
+ Input df includes SITT and also a patient with a valid MITT combination. Should return
85
+ one for all patients.
86
+ """
87
+ input_df = pd.DataFrame({'Id': [1, 2, '3', 1, 4, 4],
88
+ 'InhalerType': ['LABA-LAMA', 'LAMA +LABA-ICS',
89
+ 'LABA-LAMA-ICS', 'LAMA +LABA-ICS', 'LAMA',
90
+ 'LABA-ICS']})
91
+ expected_df = pd.DataFrame({'Id': [1, 2, 4, '3'],
92
+ 'TripleTherapy': [1, 1, 1, 1]})
93
+ output_df = copd.triple_inhaler_therapy_service(
94
+ df=input_df, id_col='Id', inhaler_col='InhalerType', include_mitt=True)
95
+ pd.testing.assert_frame_equal(expected_df, output_df)
training/tests/test_unit_lookup.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the unit_lookup function."""
2
+
3
+ import copd
4
+ import pandas as pd
5
+
6
+
7
+ def test_unit_lookup_defined():
8
+ """Test unit lookup output for all defined unit codes."""
9
+ assert copd.unit_lookup(pd.Series(0)) == 'Count'
10
+ assert copd.unit_lookup(pd.Series(1)) == 'CountPerSecond'
11
+ assert copd.unit_lookup(pd.Series(2)) == 'InternationalUnit'
12
+ assert copd.unit_lookup(pd.Series(3)) == 'Joule'
13
+ assert copd.unit_lookup(pd.Series(4)) == 'Kelvin'
14
+ assert copd.unit_lookup(pd.Series(5)) == 'Kilogram'
15
+ assert copd.unit_lookup(pd.Series(6)) == 'KilogramPerLiter'
16
+ assert copd.unit_lookup(pd.Series(7)) == 'KilogramPerSquareMeter'
17
+ assert copd.unit_lookup(pd.Series(8)) == 'Liter'
18
+ assert copd.unit_lookup(pd.Series(9)) == 'LiterPerKilogramSecond'
19
+ assert copd.unit_lookup(pd.Series(10)) == 'LiterPerSecond'
20
+ assert copd.unit_lookup(pd.Series(11)) == 'Meter'
21
+ assert copd.unit_lookup(pd.Series(12)) == 'Pascal'
22
+ assert copd.unit_lookup(pd.Series(13)) == 'Percent'
23
+ assert copd.unit_lookup(pd.Series(14)) == 'Second'
24
+ assert copd.unit_lookup(pd.Series(15)) == 'Siemen'
25
+
26
+
27
+ def test_unit_lookup_undefined():
28
+ """Test unit lookup output for undefined unit code and other input."""
29
+ assert copd.unit_lookup(pd.Series(16)) == 'Undefined'
30
+ assert copd.unit_lookup(pd.Series(42)) == 'Undefined'
31
+ assert copd.unit_lookup(pd.Series('A')) == 'Undefined'
training/train_test_split.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Splits the model C cohort and patient days into stratified train and test sets.
2
+
3
+ The train set retains these characteristics of the full data set:
4
+ Exac days to non-exac days ratio (within 5%). Individual patients can only appear in
5
+ either train or test.
6
+ Sex ratio (within 0.05)
7
+ Age distribution (minimum p-value for Kolmogorov-Smirnov test=0.9)
8
+
9
+ This script also splits the train data into balanced folds for cross-validation. Patient
10
+ IDs for train, test and all data folds are stored for use in subsequent scripts.
11
+
12
+ All data sets are divided into train and test and stored in separate folders.
13
+ """
14
+
15
+ import numpy as np
16
+ import os
17
+ import pandas as pd
18
+ import pickle
19
+
20
+ from lenusml import splits
21
+
22
+ data_dir = '<YOUR_DATA_PATH>/copd-dataset/'
23
+ output_train_data_dir = '<YOUR_DATA_PATH>/train_data'
24
+ output_test_data_dir = '<YOUR_DATA_PATH>/test_data'
25
+ cohort_info_dir = '../data/cohort_info/'
26
+
27
+ save_cohort_info = True
28
+
29
+ data = pd.read_pickle(os.path.join(data_dir, 'exac_data.pkl'))
30
+
31
+ ##########################################
32
+ # Prepare demographic info for splitting
33
+ ##########################################
34
+ # Calculate decimal age on DateOfEvent
35
+ data['DateOfBirth'] = pd.to_datetime(data['DateOfBirth'], utc=True)
36
+
37
+
38
+ def calculate_age_decimal(dob, date):
39
+ age = date - dob
40
+ decimal_age = (age.days + age.seconds / 86400.0) / 365.2425
41
+ return decimal_age
42
+
43
+
44
+ data['Age'] = data.apply(lambda x: calculate_age_decimal(
45
+ x['DateOfBirth'], x['DateOfEvent']), axis=1)
46
+
47
+ data = data.drop(columns=['DateOfBirth'])
48
+
49
+ ##########################################
50
+ # Merge with COPD status and inhaler data
51
+ ##########################################
52
+ patient_details = pd.read_csv(os.path.join(data_dir, 'CopdDatasetPatientDetails.txt'),
53
+ usecols=['StudyId', 'CopdStatusDetailsId'],
54
+ delimiter="|")
55
+
56
+ copd_status = pd.read_csv(os.path.join(data_dir, 'CopdDatasetCopdStatusDetails.txt'),
57
+ usecols=['Id', 'SmokingStatus', 'RequiredAcuteNIV',
58
+ 'RequiredICUAdmission',
59
+ 'LungFunction_FEV1PercentPredicted',
60
+ 'LabsHighestEosinophilCount'],
61
+ delimiter="|")
62
+
63
+ # Strip out % signs from spirometry and convert to float
64
+ copd_status['LungFunction_FEV1PercentPredicted'] = copd_status[
65
+ 'LungFunction_FEV1PercentPredicted'].str.strip('%').astype('float')
66
+
67
+ patient_details = patient_details.merge(
68
+ copd_status, left_on='CopdStatusDetailsId', right_on='Id',
69
+ how='left').drop(columns=['CopdStatusDetailsId', 'Id'])
70
+ data = data.merge(patient_details, on='StudyId', how='left')
71
+
72
+ #################################
73
+ # Define train and test cohorts
74
+ #################################
75
+
76
+ print('Split data into train and test')
77
+ # Set the class ratio tolerance to 5% of the data class ratio
78
+ class_ratio_tolerance = 0.05 * data.IsExac.value_counts(normalize=True)[0] /\
79
+ data.IsExac.value_counts(normalize=True)[1]
80
+ print("Class ratio tolerance: ", class_ratio_tolerance)
81
+ # Set the sex ratio tolerance to 5% of the data class ratio
82
+ sex_ratio_tolerance = 0.05 * data.Sex.value_counts(normalize=True)['M'] /\
83
+ data.Sex.value_counts(normalize=True)['F']
84
+ print("Sex ratio tolerance: ", sex_ratio_tolerance)
85
+
86
+ train_data, test_data, train_ids, test_ids = splits.train_test_stratified_class_sex(
87
+ data=data, id_column='StudyId', class_column='IsExac', sex_column='Sex',
88
+ train_proportion=0.85,
89
+ proportion_tolerance=0.05, class_ratio_tolerance=class_ratio_tolerance,
90
+ sex_ratio_tolerance=sex_ratio_tolerance, random_seed=42)
91
+
92
+ #################################
93
+ # Create cross validation folds
94
+ #################################
95
+ fold_proportions, fold_class_ratios, fold_patients = splits.group_kfold_class_balanced(
96
+ data=train_data, id_column='StudyId', class_column='IsExac', K=5,
97
+ fold_proportion_tolerance=0.05,
98
+ fold_class_ratio_tolerance=class_ratio_tolerance, random_seed=42)
99
+ if save_cohort_info:
100
+ os.makedirs(cohort_info_dir, exist_ok=True)
101
+ with open(os.path.join(cohort_info_dir, "test_ids.pkl"), 'wb') as f:
102
+ pickle.dump(list(test_ids), f)
103
+
104
+ with open(os.path.join(cohort_info_dir, "train_ids.pkl"), 'wb') as f:
105
+ pickle.dump(list(train_ids), f)
106
+ print('Train and test patient IDs saved')
107
+
108
+ with open(os.path.join(cohort_info_dir, "fold_proportions.pkl"), 'wb') as f:
109
+ pickle.dump(list(fold_proportions), f)
110
+
111
+ with open(os.path.join(cohort_info_dir, "fold_class_ratios.pkl"), 'wb') as f:
112
+ pickle.dump(list(fold_class_ratios), f)
113
+
114
+ np.save(os.path.join(cohort_info_dir, 'fold_patients.npy'), fold_patients,
115
+ allow_pickle=True)
116
+ print('Cross validation fold information saved')
117
+
118
+ ###############################
119
+ # Save train and test sets
120
+ ###############################
121
+
122
+ # Create the output directories
123
+ os.makedirs(output_train_data_dir, exist_ok=True)
124
+ os.makedirs(output_test_data_dir, exist_ok=True)
125
+
126
+ # Save exac and patient details info
127
+ train_data.to_pickle(os.path.join(output_train_data_dir, 'train_data.pkl'))
128
+ test_data.to_pickle(os.path.join(output_test_data_dir, 'test_data.pkl'))
129
+ print('Patient details/exac data saved')