| |
| |
| """ |
| Last Modified on Sep 20, 2025 |
| @purpose: This Code produces a stratified random sample of a large |
| dataset with a nominal target and a nominal predictor |
| @notes: Uses sci-kit learn train_test_split with stratify option |
| @data: CellphoneActivity.csv with n=165,633 cases and k=19 columns |
| Saves CellphoneActivity_StratifiedRS.csv with n=33127 (20% of full data) |
| @email: ejones@tamu.edu |
| """ |
| |
| RED = "\033[38;5;197m" |
| GOLD = "\033[38;5;185m" |
| TEAL = "\033[38;5;50m" |
| GREEN = "\033[38;5;82m" |
| RESET = "\033[0m" |
|
|
| from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode |
| from sklearn.model_selection import train_test_split |
| import warnings |
| import pandas as pd |
|
|
| warnings.filterwarnings("ignore", |
| category=UserWarning) |
| data_map = { |
| 'activity':[DT.Nominal, ('sitting', 'sittingdown', |
| 'standing', 'standingup', 'walking')], |
| 'user': [DT.Nominal,('debora', 'jose_carlos', 'katia', 'wallace')], |
| 'gender': [DT.Binary, ('Man', 'Woman')], |
| 'age': [DT.Interval,(20, 80)], |
| 'height': [DT.Interval,(1.5, 1.75)], |
| 'weight': [DT.Interval,(50, 85)], |
| 'BMI': [DT.Interval,(20, 30)], |
| 'x1': [DT.Interval,(-750, +750)], |
| 'y1': [DT.Interval,(-750, +750)], |
| 'z1': [DT.Interval,(-750, +750)], |
| 'x2': [DT.Interval,(-750, +750)], |
| 'y2': [DT.Interval,(-750, +750)], |
| 'z2': [DT.Interval,(-750, +750)], |
| 'x3': [DT.Interval,(-750, +750)], |
| 'y3': [DT.Interval,(-750, +750)], |
| 'z3': [DT.Interval,(-750, +750)], |
| 'x4': [DT.Interval,(-750, +750)], |
| 'y4': [DT.Interval,(-750, +750)], |
| 'z4': [DT.Interval,(-750, +750)] |
| } |
| target = "activity" |
| check_var = ["weight", "x1", "y1", "z1"] |
| size = 0.2 |
| file_in = "../data/CellphoneActivity.csv" |
| file_out = "../data/CellphoneActivity_StratifiedRS.csv" |
| |
# Columns whose joint category combinations define the stratification cells:
# every Nominal or Binary column in the metadata map.
categorical_list = [
    col for col, (dtype, _levels) in data_map.items()
    if dtype == DT.Nominal or dtype == DT.Binary
]
print("")
print(f"{RED}--> READING ORIGINAL DATASET: {file_in} {RESET}")
# index_col=None: treat every CSV column as data; a fresh RangeIndex is created.
df = pd.read_csv(file_in, index_col=None)
|
|
# Draw the stratified sample: Xt is the `size` fraction (20%) of the rows,
# stratified on the joint combination of all categorical columns so the
# sample preserves the full data's category proportions.  Xv (the remainder)
# is discarded.  random_state makes the draw reproducible.
Xt, Xv = train_test_split(df, train_size=size, stratify=df[categorical_list],
                          random_state=12345)
print(f"{RED}--> SAVING RANDOM STRATIFIED SAMPLE: {file_out} {RESET}\n")
# index=False keeps the saved CSV to the original 19 data columns; writing the
# default index would add a spurious "Unnamed: 0" column when the file is re-read.
Xt.to_csv(file_out, index=False)
|
|
# Re-read the saved sample so the comparison below uses exactly what was written.
df2 = pd.read_csv(file_out)
target1 = df[target].value_counts()    # class counts in the full data
target2 = df2[target].value_counts()   # class counts in the sample
banner = f"{TEAL}" + "=" * 39 + f"{RESET}"
print(banner)
print(f"{GREEN}" + "*" * 2, "COMPARING ORIGINAL TO SAMPLE DATA",
      "*" * 2 + f"{RESET}")
print(banner)
print()
|
|
| print(f"{TEAL}"+36*"="+f"{RESET}") |
| print(f"{GREEN}SAMPLE SIZES{RESET}") |
| print(f"{GREEN}TARGET ORI OUT RATIO{RESET}") |
| print(f"{TEAL}"+36*"-"+f"{RESET}") |
| for target_val in target1.index: |
| ori = target1[target_val] |
| out = target2[target_val] |
| ratio = ori/out |
| print(f"{target_val:15s}{ori: 7.0f}{out: 7.0f}{ratio: 7.1f}") |
|
|
# For each check variable, compare per-class means between the full data and
# the sample; a ratio near 1.00 indicates the stratified sample is representative.
for var in check_var:
    print("")
    full_means = df.groupby(target)[var].mean()
    samp_means = df2.groupby(target)[var].mean()
    print(f"{TEAL}" + "=" * 36 + f"{RESET}")
    print(f"{GREEN}CHECK AVERAGES FOR {var.upper():5s}{RESET}")
    print(f"{GREEN}TARGET ORI OUT RATIO{RESET}")
    print(f"{TEAL}" + "-" * 36 + f"{RESET}")
    for category, mean_full in full_means.items():
        mean_samp = samp_means[category]
        print(f"{category:15s}{mean_full: 7.2f}{mean_samp: 7.2f}"
              f"{mean_full / mean_samp: 7.2f}")