"""
Derive features PRO responses for 2 models:
Parallel model 1: uses both hospital and community exacerbation events
Parallel model 2: uses only hospital exacerbation events
"""
import numpy as np
import pandas as pd
import sys
import os
import re
from collections import defaultdict
import yaml
def calc_total_pro_engagement(pro_df, pro_name):
    """
    Calculate PRO engagement per patient across their entire time in the service.

    Engagement is the number of submitted responses divided by the time in
    service, expressed in the PRO's expected submission cadence (months for
    EQ5D, weeks for MRC, days for CAT/SymptomDiary).

    Args:
        pro_df (pd.DataFrame): dataframe containing, per response row, the
            onboarding date (``FirstSubmissionDate``) and the latest
            prediction date (``LatestPredictionDate``).
        pro_name (str): name of the PRO. One of "EQ5D", "MRC", "CAT",
            "SymptomDiary".

    Returns:
        pd.DataFrame: the input dataframe with additional columns stating the
        time in service and the total engagement for each patient across the
        service.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Numpy timedelta unit matching each PRO's submission cadence. Previously
    # an unknown pro_name crashed later with an unbound-variable NameError;
    # fail fast with a clear message instead.
    date_units = {"EQ5D": "M", "MRC": "W", "CAT": "D", "SymptomDiary": "D"}
    try:
        date_unit = date_units[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["TimeInService"] = np.floor(
        (pro_df.LatestPredictionDate - pro_df.FirstSubmissionDate)
        / np.timedelta64(1, date_unit)
    )
    # PRO engagement for the total time in service: one row per response, so
    # counting rows per StudyId gives the number of responses.
    pro_response_count = pro_df.groupby("StudyId").count()[["PatientId"]].reset_index()
    pro_response_count = pro_response_count.rename(
        columns={"PatientId": "Response" + pro_name}
    )
    pro_df = pro_df.merge(pro_response_count, on="StudyId", how="left")
    pro_df["TotalEngagement" + pro_name] = round(
        pro_df["Response" + pro_name] / pro_df["TimeInService"], 2
    )
    return pro_df
def calc_pro_engagement_in_time_window(pro_df, pro_name, time_window, data):
    """
    Calculate PRO engagement per patient across a specified time window. The
    time window is in months, and consists of the specified time period prior
    to IndexDate.

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO
            response submission dates.
        pro_name (str): name of the PRO. One of "EQ5D", "MRC", "CAT",
            "SymptomDiary".
        time_window (int): number of months in which to calculate PRO
            engagement.
        data (pd.DataFrame): main dataframe with StudyId/IndexDate pairs.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Expected submissions per month for each PRO (EQ5D monthly, MRC weekly,
    # CAT/SymptomDiary daily). Previously an unknown pro_name crashed later
    # with an unbound-variable NameError; fail fast instead.
    units_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = units_per_month[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    pro_engagement = pro_df.copy()
    pro_engagement["TimeSinceSubmission"] = (
        pro_engagement["IndexDate"] - pro_engagement["SubmissionTime"]
    ).dt.days
    # Only include PRO responses within the specified time window
    pro_engagement = pro_engagement[
        pro_engagement["TimeSinceSubmission"].between(
            0, (time_window * 30), inclusive="both"
        )
    ]
    # Calculate number of PRO responses within specified time window (one row
    # per response, so counting rows per group gives the response count)
    pro_engagement = (
        pro_engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    pro_engagement = pro_engagement.rename(
        columns={"PatientId": "ResponseCountTW" + str(time_window)}
    )
    pro_engagement["Engagement" + pro_name + "TW" + str(time_window)] = round(
        pro_engagement["ResponseCountTW" + str(time_window)]
        / (time_window * unit_val),
        2,
    )
    pro_engagement = data[["StudyId", "IndexDate"]].merge(
        pro_engagement, on=["StudyId", "IndexDate"], how="left"
    )
    # Fill N/As with 0 as no engagement was observed for those patients
    pro_engagement = pro_engagement.fillna(0)
    return pro_engagement
def calc_pro_engagement_at_specific_month(pro_df, pro_name, month_num, data):
    """
    Calculate PRO engagement per patient for one specific 30-day "month"
    before the index date (month 1 is the 30 days immediately preceding
    IndexDate, month 2 the 30 days before that, and so on).

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO
            response submission dates.
        pro_name (str): name of the PRO. One of "EQ5D", "MRC", "CAT",
            "SymptomDiary".
        month_num (int): which 30-day month before the index date to use.
        data (pd.DataFrame): main dataframe with StudyId/IndexDate pairs.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Expected submissions per month for each PRO. Previously an unknown
    # pro_name crashed later with an unbound-variable NameError.
    units_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = units_per_month[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    pro_engagement = pro_df.copy()
    pro_engagement["TimeSinceSubmission"] = (
        pro_engagement["IndexDate"] - pro_engagement["SubmissionTime"]
    ).dt.days
    # Only include PRO responses for the month specified.
    # NOTE(review): both bounds are inclusive, so a response exactly on a
    # 30-day boundary is counted in two adjacent months - confirm intended.
    months_between_index_and_specified = month_num - 1
    pro_engagement = pro_engagement[
        pro_engagement["TimeSinceSubmission"].between(
            (months_between_index_and_specified * 30),
            (month_num * 30),
            inclusive="both",
        )
    ]
    # Calculate number of PRO responses within specified time window
    pro_engagement = (
        pro_engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    pro_engagement = pro_engagement.rename(
        columns={"PatientId": "ResponseCountMonth" + str(month_num)}
    )
    pro_engagement["Engagement" + pro_name + "Month" + str(month_num)] = round(
        pro_engagement["ResponseCountMonth" + str(month_num)] / (1 * unit_val),
        2,
    )
    pro_engagement = data[["StudyId", "IndexDate"]].merge(
        pro_engagement, on=["StudyId", "IndexDate"], how="left"
    )
    # Fill N/As with 0 as no engagement was observed for those patients
    pro_engagement = pro_engagement.fillna(0)
    return pro_engagement
def calc_last_pro_score(pro_df, pro_name):
    """
    Calculate the most recent PRO response before the index date.

    The latest PRO score must fall within 365 days of the index date (the
    previous in-code comments said "2 months", but the implemented threshold
    is and remains 365 days - kept to preserve behaviour).

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO
            response submission dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe (restricted to pre-index rows) with
        additional "Latest*" columns stating the latest PRO score for each
        PRO question.
    """
    # Days between submission and index date; keep only pre-index responses
    pro_df["TimeSinceSubmission"] = (
        pro_df["IndexDate"] - pro_df["SubmissionTime"]
    ).dt.days
    pro_df = pro_df[pro_df["TimeSinceSubmission"] > 0]
    # Most recent submission per (StudyId, IndexDate)
    pro_df = pro_df.sort_values(
        by=["StudyId", "IndexDate", "TimeSinceSubmission"], ascending=True
    )
    latest_pro = pro_df.drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
    # Ensure that the latest PRO score is within 365 days of the index date
    latest_pro = latest_pro[latest_pro["TimeSinceSubmission"] <= 365]
    # Select the question columns plus identifier/score columns
    question_cols = latest_pro.columns[
        latest_pro.columns.str.startswith(pro_name)
    ].tolist()
    question_cols.extend(
        ["StudyId", "IndexDate", "Score", "SubmissionTime", "TimeSinceSubmission"]
    )
    latest_pro = latest_pro[question_cols]
    # Add "Latest" prefix to question columns
    cols_to_rename = latest_pro.columns[
        ~latest_pro.columns.isin(["StudyId", "IndexDate", "Score", "SubmissionTime"])
    ]
    latest_pro = latest_pro.rename(
        columns=dict(zip(cols_to_rename, "Latest" + cols_to_rename))
    )
    # Rename columns where prefix not added
    latest_pro = latest_pro.rename(
        columns={
            "Score": "Latest" + pro_name + "Score",
            "SubmissionTime": "LatestPRODate",
        }
    )
    pro_df = pro_df.merge(latest_pro, on=["StudyId", "IndexDate"], how="left")
    return pro_df
def calc_pro_score_prior_to_latest(pro_df, pro_name, time_prior_to_latest=60):
    """
    Find the PRO score submitted immediately before the latest PRO score.

    Args:
        pro_df (pd.DataFrame): dataframe containing the latest PRO score
            (``LatestPRODate``) and PRO response submission dates.
        pro_name (str): name of the PRO.
        time_prior_to_latest (int, optional): maximum gap, in days, between
            the previous score and the latest one. Defaults to 60 days
            (two months).

    Returns:
        pd.DataFrame: the input dataframe with additional "Prev*" columns
        holding the score closest in time before the latest PRO score.
    """
    # Keep only submissions strictly before the latest one, then retain the
    # most recent of those per (StudyId, IndexDate).
    earlier = pro_df.copy()
    earlier = earlier.loc[earlier["SubmissionTime"] < earlier["LatestPRODate"]]
    earlier = earlier.sort_values(
        by=["StudyId", "IndexDate", "SubmissionTime"], ascending=[True, True, False]
    ).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
    # Discard candidates further than time_prior_to_latest days from the
    # latest submission
    earlier["TimeSinceLatestPRODate"] = (
        earlier["LatestPRODate"] - earlier["SubmissionTime"]
    ).dt.days
    earlier = earlier.loc[
        earlier["TimeSinceLatestPRODate"] <= time_prior_to_latest
    ]
    # Prefix the question columns with "Prev"
    question_cols = [col for col in earlier if col.startswith(pro_name)]
    earlier = earlier.rename(columns={col: "Prev" + col for col in question_cols})
    # Keep the identifiers, the raw score and every prefixed column
    earlier = earlier[["StudyId", "IndexDate", "Score"]].join(
        earlier.filter(regex="^Prev")
    )
    earlier = earlier.rename(columns={"Score": "Prev" + pro_name + "Score"})
    return pro_df.merge(earlier, on=["StudyId", "IndexDate"], how="left")
def define_mapping_for_calcs(pro_name, questions, prefixes):
    """
    Build the column mapping used when deriving differences between PRO
    responses.

    Args:
        pro_name (str): name of the PRO.
        questions (list): question names of the PRO.
        prefixes (list): prefixes identifying which columns take part in the
            calculations. The possible prefixes are: 'Avg', 'Prev',
            'LongerAvg', 'WeekPrevAvg'.

    Returns:
        dict: maps the reference column of each question to the list of
        columns it is compared against.
    """
    mapping = defaultdict(list)
    for question in questions:
        # Monthly/weekly PROs key off the latest score; daily PROs key off
        # the weekly average.
        if pro_name in ("EQ5D", "MRC"):
            reference = "Latest" + pro_name + question
        if pro_name in ("CAT", "SymptomDiary"):
            reference = "WeekAvg" + pro_name + question
        mapping[reference].extend(
            prefix + pro_name + question for prefix in prefixes
        )
    return mapping
def calc_pro_average(pro_df, pro_name, time_window=None, avg_period=None):
    """
    Calculate the PRO average before the latest PRO score and within a
    specified time window.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO
            submission dates.
        pro_name (str): name of the PRO.
        time_window (int, optional): time window (in months) used for
            calculating the average of PRO responses. Required when
            ``avg_period`` is None or "LongerAvg". Defaults to None.
        avg_period (str, optional): averaging scheme: None (time-window
            average before the latest score), "WeeklyAvg", "WeekPrevAvg" or
            "LongerAvg". Defaults to None.

    Returns:
        pd.DataFrame: the input dataframe with additional prefixed columns
        containing the calculated averages.

    Raises:
        ValueError: if ``avg_period`` is not one of the supported values.
    """
    # Drop any stale average columns from a previous call so the merge below
    # cannot create duplicate/suffixed columns
    pro_df = pro_df.loc[
        :,
        ~(
            pro_df.columns.str.startswith("Avg")
            | pro_df.columns.str.startswith("Longer")
        ),
    ]
    if avg_period is None:
        prefix = "Avg"
        pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(months=time_window)
        avg_pro = pro_df[
            (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
            & (pro_df["SubmissionTime"] < pro_df["LatestPRODate"])
        ]
    else:
        pro_df["WeekStartDate"] = pro_df["IndexDate"] - pd.DateOffset(weeks=1)
        pro_df["WeekPrevStartDate"] = pro_df["WeekStartDate"] - pd.DateOffset(weeks=1)
        # When looking at daily PROs, three averages are calculated:
        # The weekly average is the average of PRO scores in the week prior
        # to IndexDate
        if avg_period == "WeeklyAvg":
            prefix = "WeekAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekStartDate"])
                & (pro_df["SubmissionTime"] <= pro_df["IndexDate"])
            ]
        # The weekly previous average is the average of PRO scores in the
        # week prior to the WeeklyAvg. This is needed to calculate the
        # difference of scores between the most recent week and the week
        # before that
        elif avg_period == "WeekPrevAvg":
            prefix = "WeekPrevAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekPrevStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
        # Longer average calculated is the time window specified prior to the
        # WeekStartDate
        elif avg_period == "LongerAvg":
            prefix = "LongerAvg"
            pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
                months=time_window
            )
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
        else:
            # Previously an unsupported avg_period surfaced as a NameError
            # much further down; fail fast with a clear message instead.
            raise ValueError("Unsupported avg_period: " + repr(avg_period))
    # Select specific columns
    cols_required = avg_pro.columns[avg_pro.columns.str.startswith(pro_name)].tolist()
    cols_required.extend(["StudyId", "IndexDate", "Score"])
    avg_pro = avg_pro[cols_required]
    # Calculate average pro scores
    avg_pro = avg_pro.groupby(["StudyId", "IndexDate"]).mean().reset_index()
    # Add prefix to question columns
    cols_to_rename = avg_pro.columns[
        ~avg_pro.columns.isin(["StudyId", "IndexDate", "Score"])
    ]
    avg_pro = avg_pro.rename(columns=dict(zip(cols_to_rename, prefix + cols_to_rename)))
    # Rename columns where prefix not added
    avg_pro = avg_pro.rename(columns={"Score": prefix + pro_name + "Score"})
    # Merge average PRO with rest of the df
    pro_df = pro_df.merge(avg_pro, on=["StudyId", "IndexDate"], how="left")
    return pro_df
def calc_diff_pro_scores(pro_df, pro_name, latest_pro, other_pro, time_window=None):
    """
    Add a column with the difference between two PRO score columns.

    The new column is assigned on ``pro_df`` in place and the dataframe is
    also returned.

    Args:
        pro_df (pd.DataFrame): dataframe containing the columns required for
            the calculation.
        pro_name (str): name of the PRO.
        latest_pro (str): column holding the latest PRO response for PROs
            EQ5D and MRC, and the latest week average for CAT/SymptomDiary.
        other_pro (str): column holding the value subtracted from
            ``latest_pro``.
        time_window (int, optional): time window (in months) appended to the
            output column name when given.

    Returns:
        pd.DataFrame: the input dataframe with the additional difference
        column.
    """
    # Split the column name into capitalised groups, e.g.
    # "WeekAvgCATQ1" -> ["Week", "Avg", "C", "A", "T", "Q1"]
    name_parts = re.findall(r"[A-Z][^A-Z]*", latest_pro)
    # Daily PROs carry a two-group prefix ("WeekAvg"); the others one group
    drop_count = 2 if pro_name in ["CAT", "SymptomDiary"] else 1
    base_name = "".join(name_parts[drop_count:])
    if time_window is None:
        diff_col = "DiffLatestPrev" + base_name
    else:
        diff_col = "DiffLatestAvg" + base_name + "TW" + str(time_window)
    pro_df[diff_col] = pro_df[latest_pro] - pro_df[other_pro]
    return pro_df
def calc_variation(pro_df, pro_name):
    """
    Add the variation (sample standard deviation) of PRO responses over the
    month before the index date.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO
            submission dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe with additional "Var*" columns
        containing the calculated variance.
    """
    # Days between submission and index date (reuse the column if a previous
    # step already derived it)
    if "TimeSinceSubmission" not in pro_df:
        pro_df["TimeSinceSubmission"] = (
            pro_df["IndexDate"] - pro_df["SubmissionTime"]
        ).dt.days
    # Restrict to submissions in the 30 days before the index date
    recent = pro_df[
        (pro_df["TimeSinceSubmission"] > 0) & (pro_df["TimeSinceSubmission"] <= 30)
    ]
    # Question columns plus identifiers and the overall score
    keep_cols = [col for col in recent.columns if col.startswith(pro_name)]
    keep_cols += ["StudyId", "IndexDate", "Score"]
    recent = recent[keep_cols]
    # Sample standard deviation per (StudyId, IndexDate)
    spread = recent.groupby(["StudyId", "IndexDate"]).std().reset_index()
    # Prefix every derived column with "Var"
    renamed = {
        col: "Var" + col
        for col in spread.columns
        if col not in ("StudyId", "IndexDate", "Score")
    }
    renamed["Score"] = "Var" + pro_name + "Score"
    spread = spread.rename(columns=renamed)
    # Merge back to main df
    return pro_df.merge(spread, on=["StudyId", "IndexDate"], how="left")
def calc_sum_binary_vals(pro_df, binary_cols, time_window=1):
    """
    For SymptomDiary questions that contain binary values, sum the values
    over a time window ending at the index date.

    Note: a ``TimeWindowStartDate`` column is assigned on ``pro_df`` in
    place; callers may rely on this side effect.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO
            submission dates.
        binary_cols (list): column names for which the sum of binary values
            is calculated.
        time_window (int, optional): window length in months. Defaults to 1.

    Returns:
        pd.DataFrame: one row per (StudyId, IndexDate) with the summed
        values in "Sum<col>TW<time_window>" columns.
    """
    # Window start date (assigned in place on purpose - see note above)
    pro_df["TimeWindowStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
        months=time_window
    )
    # Only entries between the window start date and the index date count
    in_window = (pro_df["SubmissionTime"] >= pro_df["TimeWindowStartDate"]) & (
        pro_df["SubmissionTime"] <= pro_df["IndexDate"]
    )
    summed = pro_df.loc[in_window].groupby(["StudyId", "IndexDate"])[binary_cols].sum()
    # "Sum<col>TW<n>" naming
    summed = summed.add_prefix("Sum").add_suffix("TW" + str(time_window))
    return summed.reset_index()
def scale_sum_to_response_rate(pro_df, sum, engagement_rate):
    """
    Scale a binary-value sum (from calc_sum_binary_vals) by the response
    rate to obtain a feature that is comparable between patients.

    The scaled column is assigned on ``pro_df`` in place and the same
    dataframe is returned.

    Args:
        pro_df (pd.DataFrame): dataframe containing the columns for scaling
            the sum by the engagement rate.
        sum (str): column name containing the sum of the binary values.
            (Shadows the builtin ``sum``; name kept for caller
            compatibility.)
        engagement_rate (str): column name containing the response rate.

    Returns:
        pd.DataFrame: the input dataframe with the additional scaled column.
    """
    scaled_col = "Scaled" + sum
    pro_df[scaled_col] = pro_df[sum].div(pro_df[engagement_rate])
    return pro_df
# ---------------------------------------------------------------------------
# Script setup: configuration, logging and cohort data
# ---------------------------------------------------------------------------
with open("./training/config.yaml", "r") as config:
    config = yaml.safe_load(config)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]

# Setup log file.
# NOTE(review): sys.stdout stays redirected for the rest of the run and the
# log handle is never closed/restored - acceptable for a one-shot script.
log = open("./training/logging/process_pros_" + model_type + ".log", "w")
sys.stdout = log

# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]

# Load cohort data
if data_to_process == "forward_val":
    data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
    patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
else:
    data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl")
    patient_details = pd.read_pickle("./data/patient_details.pkl")

data = data[["StudyId", "IndexDate"]]
# One row per (StudyId, IndexDate) with onboarding and latest prediction dates
patient_details = data.merge(
    patient_details[["StudyId", "FirstSubmissionDate", "LatestPredictionDate"]],
    on="StudyId",
    how="left",
)

# Calculate the lookback start date. Will need this to aggregate data for
# model features
data["LookbackStartDate"] = data["IndexDate"] - pd.DateOffset(
    days=config["model_settings"]["lookback_period"]
)
############################################
# Monthly PROs - EQ5D
############################################
pro_eq5d = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_eq5d"], delimiter="|")
pro_eq5d = pro_eq5d.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# EQ5DQ6 is a much less structured question compared to the other questions
# in EQ5D. A new score is calculated using only EQ5DQ1-Q5 to prevent Q6
# affecting the score
pro_eq5d["EQ5DScoreWithoutQ6"] = pro_eq5d.loc[:, "EQ5DQ1":"EQ5DQ5"].sum(axis=1)

# Calculate engagement over service
pro_eq5d = calc_total_pro_engagement(pro_eq5d, "EQ5D")

# Calculate engagement for a time window of 1 month (time window chosen based
# on signal output observed from results of feature_eng_multiple_testing)
pro_eq5d_engagement = calc_pro_engagement_in_time_window(
    pro_eq5d, "EQ5D", time_window=1, data=data
)
pro_eq5d = pro_eq5d.merge(pro_eq5d_engagement, on=["StudyId", "IndexDate"], how="left")

# Calculate last PRO score
pro_eq5d = calc_last_pro_score(pro_eq5d, "EQ5D")

# Mapping to calculate the difference between the latest PRO scores and the
# average PRO score
question_names_eq5d = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Score", "ScoreWithoutQ6"]
mapping_eq5d = define_mapping_for_calcs("EQ5D", question_names_eq5d, prefixes=["Avg"])

# Calculate average PRO score for a time window of 1 month prior to
# IndexDate, ignoring the latest PRO score
pro_eq5d = calc_pro_average(pro_eq5d, "EQ5D", time_window=1)
# calc_diff_pro_scores assigns the diff columns on pro_eq5d in place, so the
# return value does not need to be captured
for key in mapping_eq5d:
    calc_diff_pro_scores(pro_eq5d, "EQ5D", key, mapping_eq5d[key][0], time_window=1)

# Calculate variation of scores across 1 month
pro_eq5d = calc_variation(pro_eq5d, "EQ5D")

# Remove unwanted columns and duplicates, keeping one feature row per
# (StudyId, IndexDate)
pro_eq5d = pro_eq5d.loc[
    :,
    ~(
        pro_eq5d.columns.str.startswith("Avg")
        | pro_eq5d.columns.str.startswith("EQ5D")
        | pro_eq5d.columns.str.startswith("Response")
    ),
]
pro_eq5d = pro_eq5d.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_eq5d = pro_eq5d.drop_duplicates()
############################################
# Weekly PROs - MRC
############################################
pro_mrc = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_mrc"], delimiter="|")
pro_mrc = pro_mrc.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Calculate engagement over service
pro_mrc = calc_total_pro_engagement(pro_mrc, "MRC")

# Calculate engagement for a time window of 1 month
pro_mrc_engagement = calc_pro_engagement_in_time_window(
    pro_mrc, "MRC", time_window=1, data=data
)
pro_mrc = pro_mrc.merge(pro_mrc_engagement, on=["StudyId", "IndexDate"], how="left")

# Calculate last PRO score
pro_mrc = calc_last_pro_score(pro_mrc, "MRC")

# Mapping to calculate the difference between the latest PRO scores and the
# average PRO score (MRC has a single question)
question_names_mrc = ["Q1"]
mapping_mrc = define_mapping_for_calcs("MRC", question_names_mrc, prefixes=["Avg"])

# Calculate average PRO score for a time window of 1 month prior to
# IndexDate, ignoring the latest PRO score
pro_mrc = calc_pro_average(pro_mrc, "MRC", time_window=1)
# calc_diff_pro_scores assigns the diff columns on pro_mrc in place
for key in mapping_mrc:
    calc_diff_pro_scores(pro_mrc, "MRC", key, mapping_mrc[key][0], time_window=1)

# Calculate variation of scores across 1 month
pro_mrc = calc_variation(pro_mrc, "MRC")

# Remove unwanted columns and duplicates
pro_mrc = pro_mrc.loc[
    :,
    ~(
        pro_mrc.columns.str.startswith("Avg")
        | pro_mrc.columns.str.startswith("MRC")
        | pro_mrc.columns.str.startswith("Response")
    ),
]
pro_mrc = pro_mrc.drop(
    columns=[
        "SubmissionTime",
        "Score",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_mrc = pro_mrc.drop_duplicates()
############################################
# Daily PROs - CAT
############################################
# pro_cat_full is kept un-merged because the categorical SymptomDiary section
# below reuses CATQ5 from it
pro_cat_full = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_cat"], delimiter="|")
pro_cat = pro_cat_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Calculate engagement over service
pro_cat = calc_total_pro_engagement(pro_cat, "CAT")

# Calculate engagement for a time window of 1 month
pro_cat_engagement = calc_pro_engagement_in_time_window(
    pro_cat, "CAT", time_window=1, data=data
)
pro_cat = pro_cat.merge(pro_cat_engagement, on=["StudyId", "IndexDate"], how="left")

# Calculate engagement in each of the three 30-day months prior to the index
# date
pro_cat_month1 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=1, data=data
)
pro_cat_month2 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=2, data=data
)
pro_cat_month3 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=3, data=data
)
pro_cat = pro_cat.merge(pro_cat_month1, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month2, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month3, on=["StudyId", "IndexDate"], how="left")

# Month-on-month engagement deltas
pro_cat["EngagementDiffMonth1and2"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth2"]
)
pro_cat["EngagementDiffMonth1and3"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth3"]
)

# Calculate PRO average for the week before the index date
pro_cat = calc_pro_average(pro_cat, "CAT", avg_period="WeeklyAvg")

# Calculate variation of scores across 1 month
pro_cat = calc_variation(pro_cat, "CAT")

# Remove unwanted columns and duplicates
pro_cat = pro_cat.loc[
    :,
    ~(
        pro_cat.columns.str.startswith("CAT")
        | pro_cat.columns.str.startswith("Response")
    ),
]
pro_cat = pro_cat.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "TimeInService",
        "WeekStartDate",
        "WeekPrevStartDate",
    ]
)
pro_cat = pro_cat.drop_duplicates()
############################################
# Daily PROs - Symptom Diary
############################################
# Symptom diary has some questions that are numeric and some that are
# categorical
pro_sd_full = pd.read_csv(
    config["inputs"]["raw_data_paths"]["pro_symptom_diary"], delimiter="|"
)
pro_sd = pro_sd_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Calculate engagement over service
pro_sd = calc_total_pro_engagement(pro_sd, "SymptomDiary")
pro_sd_engagement = pro_sd[
    ["StudyId", "PatientId", "IndexDate", "TotalEngagementSymptomDiary"]
]

# Calculate engagement for 1 month prior to IndexDate
pro_sd_engagement_tw = calc_pro_engagement_in_time_window(
    pro_sd, "SymptomDiary", time_window=1, data=data
)
pro_sd_engagement = pro_sd_engagement.merge(
    pro_sd_engagement_tw, on=["StudyId", "IndexDate"], how="left"
)
pro_sd_engagement = pro_sd_engagement.drop_duplicates()

###############################
# Categorical questions
# (Q8, Q9, Q10)
###############################
pro_cat_q5 = pro_cat_full[["StudyId", "SubmissionTime", "CATQ5"]]
pro_sd_categ = pro_sd_full[
    [
        "StudyId",
        "SubmissionTime",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "SymptomDiaryQ10",
        "Score",
    ]
]

# Split timestamp column into separate date and time columns as same day
# entries in CAT and SymptomDiary have different timestamps.
# NOTE(review): pro_cat_q5/pro_sd_categ are column slices of the full frames,
# so assigning into them may raise pandas' SettingWithCopyWarning - consider
# adding .copy() to the slices above.
for df in [pro_cat_q5, pro_sd_categ]:
    df["Date"] = (pd.to_datetime(df["SubmissionTime"], utc=True)).dt.date

pro_sd_cat = pro_sd_categ.merge(pro_cat_q5, on=["StudyId", "Date"], how="outer")

# If CATQ5 is a 0, then Symptom Diary questions 8, 9 and 10 don't get asked.
# Add this as an option to the columns. There are some cases where patients
# have a 0 in CATQ5 but have also answered Symptom Diary questions 8, 9, and
# 10 - keep these answers as is.
for col in ["SymptomDiaryQ8", "SymptomDiaryQ9", "SymptomDiaryQ10"]:
    pro_sd_cat[col] = np.where(
        (pro_sd_cat["CATQ5"] == 0) & (pro_sd_cat[col].isna()),
        "Question Not Asked",
        pro_sd_cat[col],
    )

# Calculate the most recent score for SymptomDiary categorical questions.
# The outer merge above produced SubmissionTime_x (SymptomDiary) and
# SubmissionTime_y (CAT); the SymptomDiary one becomes SubmissionTime.
pro_sd_cat = pro_sd_cat.merge(data[["StudyId", "IndexDate"]], on="StudyId", how="inner")
pro_sd_cat = pro_sd_cat.rename(columns={"SubmissionTime_x": "SubmissionTime"})
pro_sd_cat["SubmissionTime"] = pd.to_datetime(pro_sd_cat["SubmissionTime"], utc=True)
pro_sd_cat = calc_last_pro_score(pro_sd_cat, "SymptomDiary")
pro_sd_cat = pro_sd_cat.drop(
    columns=[
        "SubmissionTime",
        "SubmissionTime_y",
        "CATQ5",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "Date",
        "SymptomDiaryQ10",
        "Score",
        "LatestSymptomDiaryScore",
        "LatestPRODate",
        "TimeSinceSubmission",
    ]
)
pro_sd_cat = pro_sd_cat.drop_duplicates()
###############################
# Numeric questions
# (Q1, Q2)
# Q3 included for comparison
###############################
# Calculate PRO average for the week before the index date
pro_sd_numeric = pro_sd[
    [
        "StudyId",
        "PatientId",
        "IndexDate",
        "SubmissionTime",
        "Score",
        "SymptomDiaryQ1",
        "SymptomDiaryQ2",
        "SymptomDiaryQ3",
    ]
]
pro_sd_numeric = calc_pro_average(
    pro_sd_numeric, "SymptomDiary", avg_period="WeeklyAvg"
)

# Calculate variation of scores across 1 month
pro_sd_numeric = calc_variation(pro_sd_numeric, "SymptomDiary")

###############################
# Binary questions
# (Q3)
###############################
# Calculate sum of binary values for a time window of 1 month
sd_sum_all = pro_sd_numeric[["StudyId", "IndexDate"]]
sd_sum_all = sd_sum_all.drop_duplicates()
# NOTE: calc_sum_binary_vals also adds a TimeWindowStartDate column to
# pro_sd_numeric in place; that column is dropped further below
sd_sum = calc_sum_binary_vals(
    pro_sd_numeric, binary_cols=["SymptomDiaryQ3"], time_window=1
)
sd_sum_all = sd_sum_all.merge(sd_sum, on=["StudyId", "IndexDate"], how="left")

# Scale sums by how often patients responded
sd_sum_all = sd_sum_all.merge(
    pro_sd_engagement, on=["StudyId", "IndexDate"], how="left"
)
mapping_scaling = {"SumSymptomDiaryQ3TW1": "EngagementSymptomDiaryTW1"}
# scale_sum_to_response_rate assigns the scaled column on sd_sum_all in
# place, so the return value does not need to be captured
for key in mapping_scaling:
    scale_sum_to_response_rate(sd_sum_all, key, mapping_scaling[key])

# Combine numeric, categorical and binary dfs
pro_sd_all = pro_sd_numeric.merge(
    sd_sum_all, on=["StudyId", "PatientId", "IndexDate"], how="left"
)
pro_sd_all = pro_sd_all.merge(pro_sd_cat, on=["StudyId", "IndexDate"], how="left")

# Remove unwanted columns from numeric df
pro_sd_all = pro_sd_all.loc[
    :,
    ~(
        pro_sd_all.columns.str.startswith("Symptom")
        | pro_sd_all.columns.str.startswith("Sum")
        | pro_sd_all.columns.str.startswith("Response")
    ),
]
pro_sd_all = pro_sd_all.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "TimeWindowStartDate",
        "WeekStartDate",
        "WeekPrevStartDate",
        "TimeSinceSubmission",
    ]
)
pro_sd_all = pro_sd_all.drop_duplicates()
# Combine pros into one df
pro_df = pro_eq5d.merge(pro_mrc, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_cat, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_sd_all, on=["StudyId", "PatientId", "IndexDate"], how="left")

###############################
# Map some categorical features
###############################
# NOTE(review): .str.replace performs substring replacement, and on pandas
# versions before 2.0 the default was regex=True (where "." matches any
# character) - confirm the installed pandas treats these patterns as intended.
# Replace SDQ8 with strings for phlegm difficulty
q8_dict = {
    "1.0": "Not difficult",
    "2.0": "A little difficult",
    "3.0": "Quite difficult",
    "4.0": "Very difficult",
}
for key in q8_dict:
    pro_df["LatestSymptomDiaryQ8"] = pro_df["LatestSymptomDiaryQ8"].str.replace(
        key, q8_dict[key]
    )

# Replace SDQ9 with strings for phlegm consistency
q9_dict = {
    "1.0": "Watery",
    "2.0": "Sticky liquid",
    "3.0": "Semi-solid",
    "4.0": "Solid",
}
for key in q9_dict:
    pro_df["LatestSymptomDiaryQ9"] = pro_df["LatestSymptomDiaryQ9"].str.replace(
        key, q9_dict[key]
    )

# Replace SDQ10 with strings for phlegm colour
q10_dict = {
    "1.0": "White",
    "2.0": "Yellow",
    "3.0": "Green",
    "4.0": "Dark green",
}
for key in q10_dict:
    pro_df["LatestSymptomDiaryQ10"] = pro_df["LatestSymptomDiaryQ10"].str.replace(
        key, q10_dict[key]
    )

# Drop identifier/helper columns that are not model features. The _x/_y
# variants come from merging several PRO frames that each carry a
# LatestTimeSinceSubmission column.
pro_df = pro_df.drop(
    columns=[
        "PatientId",
        "LatestTimeSinceSubmission",
        "LatestTimeSinceSubmission_x",
        "LatestTimeSinceSubmission_y",
    ]
)

# Save data
os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
if data_to_process == "forward_val":
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_forward_val_" + model_type + ".pkl",
        )
    )
else:
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_" + model_type + ".pkl",
        )
    )
|