| """ |
| @Data: Medical Diagnosis Dataset |
| @Date: Oct 2025 |
| @Author: eJones |
| @Email: ejones@tamu.edu |
| """ |
| |
| RED = "\033[38;5;197m"; GOLD = "\033[38;5;185m"; TEAL = "\033[38;5;50m" |
| GREEN = "\033[38;5;82m"; RESET = "\033[0m" |
|
|
| |
| import pandas as pd |
| import numpy as np |
| from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode |
| from AdvancedAnalytics.Forest import forest_classifier |
|
|
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.model_selection import train_test_split, GridSearchCV |
| from sklearn.model_selection import cross_validate |
| from sklearn.metrics import accuracy_score |
| import warnings, time |
|
|
def print_boundary(lbl, b_width=60, boundary=True):
    """Print *lbl* centered between '*' fills, optionally framed by '=' rules.

    lbl      -- section label to display
    b_width  -- total width of the '=' rule (default 60)
    boundary -- when True, print an '=' rule above and below the label line
    """
    print("")
    # Split the leftover width around the label; when the split is uneven,
    # the left side receives the extra character.
    total = b_width - len(lbl) - 2
    right = total // 2
    left = total - right
    rule = "=" * b_width
    if boundary:
        print(f"{TEAL}", rule, f"{RESET}")
    print(f"{GREEN}", "*" * left, lbl, "*" * right, f"{RESET}")
    if boundary:
        print(f"{TEAL}", rule, f"{RESET}")
|
|
def print_acc_ratio(scores, n):
    """Print a k-fold CV misclassification summary from cross_validate scores.

    scores -- dict returned by sklearn.model_selection.cross_validate with
              return_train_score=True; keys 'train_score' and 'test_score'
              hold per-fold accuracy arrays
    n      -- total number of cases, used to report equivalent split sizes
    """
    n_folds = len(scores["train_score"])
    train_misc = (1.0 - scores["train_score"])
    train_smisc = 2.0*(1.0 - scores["train_score"]).std()
    val_misc = (1.0 - scores["test_score"])
    val_smisc = 2.0*(1.0 - scores["test_score"]).std()
    # Per-fold validation/train misclassification ratio; a fold with zero
    # training error but nonzero validation error is reported as inf, and a
    # fold with zero error on both sides counts as a neutral ratio of 1.
    ratio_misc = np.zeros(n_folds)
    for i in range(0, n_folds):
        if train_misc[i] > 0:
            ratio_misc[i] = val_misc[i] / train_misc[i]
        elif val_misc[i] > 0:
            ratio_misc[i] = np.inf
        else:
            ratio_misc[i] = 1.0
    # std() of an array containing inf yields nan (with a RuntimeWarning),
    # it does not raise -- so the original bare `except:` was dead code.
    # Suppress the FP warning and test the result explicitly instead.
    with np.errstate(invalid='ignore'):
        s_ratio = 2.0*ratio_misc.std()
    train_misc = train_misc.mean()
    val_misc = val_misc.mean()
    ratio = val_misc/train_misc if train_misc > 0 else np.inf
    print(f"{TEAL}\n")
    print(f" ====== {n_folds:.0f}-Fold Cross Validation =======")
    print(f" Train Avg. MISC..... {train_misc:.4f} +/-{train_smisc:.4f}")
    print(f" Test Avg. MISC..... {val_misc:.4f} +/-{val_smisc:.4f}")
    # BUG FIX: the original used `s_ratio == np.nan`, which is always False
    # because NaN never compares equal to anything; use np.isnan/np.isinf.
    if np.isnan(s_ratio) or np.isinf(s_ratio):
        print(f" Mean Misc Ratio..... {ratio:.4f}")
    else:
        print(f" Mean Misc Ratio..... {ratio:.4f} +/-{s_ratio:.4f}")
    print(" ", 39*"=", f"{RESET}")
    # Report the equivalent train/validation case counts per fold.
    n_v = n*(1.0/n_folds)
    n_t = n - n_v
    print(f"Equivalent to {n_folds:.0f} splits each with " +
          f"{n_t:.0f}/{n_v:.0f} Cases")
|
|
def print_summary(train_acc, val_acc):
    """Print a colorized train/validation accuracy and misclassification table.

    Ratios below 1.2 print in green (acceptable overfit), otherwise in red.
    """
    train_misc = 1.0 - train_acc
    val_misc = 1.0 - val_acc
    # Accuracy ratio is train over validation; misclassification ratio is
    # validation over train. Degenerate denominators map to inf, and the
    # zero-error-on-both-sides case counts as a neutral ratio of 1.
    ratio_acc = np.inf if not val_acc > 0 else train_acc / val_acc
    if train_misc > 0:
        ratio_misc = val_misc / train_misc
    else:
        ratio_misc = np.inf if val_misc > 0 else 1.0

    print(f"{GREEN}{'TRAIN':>28s} {'VALIDATION':>11s} {'RATIO':>7s}")
    color = GREEN if ratio_acc < 1.2 else RED
    print(f"{GREEN} {'ACCURACY':.<20s}{GOLD}{train_acc:>7.4f}",
          f" {val_acc:>7.4f} {color}{ratio_acc:>7.4f}{RESET}")
    color = GREEN if ratio_misc < 1.2 else RED
    print(f"{GREEN} {'MISCLASSIFICATION':.<20s}{GOLD}{train_misc:>7.4f}",
          f" {val_misc:>7.4f} {color}{ratio_misc:>7.4f}{RESET}")
    print(f"{TEAL}", "-"*47, f"{RESET}")
|
|
def tree_selection(X, y, threshold=0.9):
    """Select the smallest feature set whose cumulative decision-tree
    importance reaches *threshold*.

    X         -- feature DataFrame
    y         -- target series/array
    threshold -- cumulative-importance cutoff in (0, 1]; any value >= 1.0
                 disables selection and returns X unchanged
    Returns X restricted to the selected feature columns.
    """
    if threshold >= 1.0:
        return X
    # A single decision tree supplies the importance ranking.
    dt_selector = DecisionTreeClassifier(random_state=42).fit(X, y)

    feature_importance = dt_selector.feature_importances_
    feature_name = X.columns

    importance_df = pd.DataFrame({
        'feature': feature_name,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    importance_df['cumulative_importance'] = importance_df['importance'].cumsum()

    print(f"\n{GREEN} Feature Importance Analysis{GOLD}")
    print(f"{'='*51}")
    print(f"{'Feature':<27} {'Importance':<12} {'Cumulative':<12}")
    print(f"{'-'*51}")
    lne = False
    for idx, row in importance_df.iterrows():
        print(f"{row['feature']:.<29} {row['importance']:<12.4f} ",
              f"{row['cumulative_importance']:<12.4f}")
        # Draw a dashed separator once, right after the threshold crossing.
        if row['cumulative_importance'] > threshold and not lne:
            print(f"{RED}{25*'- '}{GOLD}"); lne = True
    print(f"{'='*51}")

    cumulative_threshold = threshold
    selected_mask = importance_df['cumulative_importance'] >= cumulative_threshold

    if selected_mask.any():
        # idxmax() yields the label of the first row at/past the cutoff;
        # .loc[:first_idx] keeps every row up to and including it (labels
        # are unique here, so label slicing is effectively positional).
        first_idx = selected_mask.idxmax()
        selected_features = importance_df.loc[:first_idx, 'feature'].tolist()
        threshold_reached = importance_df.loc[first_idx, 'cumulative_importance']
    else:
        # Cannot reach the cutoff -- keep every feature.
        selected_features = importance_df['feature'].tolist()
        threshold_reached = importance_df['cumulative_importance'].max()
        # BUG FIX: this message hard-coded "90%" even though the cutoff is
        # a parameter; report the actual threshold value.
        print(f"\n{RED}No feature combination reaches {threshold:.0%} importance. ",
              f"Using all features (cumulative: {threshold_reached:.1%})")

    print(f"\n{GREEN}Selected {len(selected_features)} features accounting for ",
          f"{threshold_reached:.0%} of importance")

    X_selected = X[selected_features]
    print(f"{GREEN}Feature selection complete - reduction: ",
          f"{RED}{X.shape[1]} -→ {X_selected.shape[1]} {GREEN}features{RESET}")

    return X_selected
| """ =========================================================== """ |
| lbl = "Step 1: Reading Medical Diagnosis Data" |
| print_boundary(lbl) |
| df = pd.read_csv('../data/medical_diagnosis_data.csv') |
|
|
| print(f"{GOLD}Data loaded: {df.shape[0]} ", |
| f"observations and {df.shape[1]} columns.{RESET}") |
|
|
| |
| print(f"\n{GOLD}First 5 rows of the data:{RESET}") |
| print(df.head()) |
|
|
| |
| lbl = "Step 2: Data Map and ReplaceImputeEncode Processing" |
| print_boundary(lbl) |
|
|
| data_map = { |
| "patient_id": [DT.ID, ("")], |
| |
| "age": [DT.Interval, (18, 85)], |
| "bmi": [DT.Interval, (18.0, 40.0)], |
| "cholesterol": [DT.Interval, (120, 350)], |
| "stress_level": [DT.Interval, (1, 10)], |
| "sleep_hours": [DT.Interval, (4.0, 11.0)], |
| |
| "blood_pressure": [DT.Nominal, ("Normal", "Elevated", "High_Stage1", |
| "High_Stage2")], |
| "exercise_freq": [DT.Nominal, ("none", "light", "moderate", "intense")], |
| |
| "smoker": [DT.Binary, (0, 1)], |
| "family_history": [DT.Binary, (0, 1)], |
| "gender": [DT.Binary, (0, 1)], |
| |
| "has_disease": [DT.Binary, (0, 1)] |
| } |
|
|
| print(f"{GOLD}") |
| print(15*"=", "DATA MAP", 15*"=") |
| lk = len(max(data_map, key=len)) + 1 |
| ignored = 0 |
| for col, (dt_type, valid_values) in data_map.items(): |
| if dt_type.name == "ID" or dt_type.name == "Ignore": |
| ignored += 1 |
| print(f" {TEAL}{col:.<{lk}s} {GOLD}{dt_type.name:9s}{GREEN}{valid_values}") |
| print(f"{GOLD} === Data Map has{RED}", len(data_map)-ignored, |
| f"{GOLD}attribute columns", 3*"=",f"{RESET}") |
|
|
| |
| |
| target = "has_disease" |
| print(f"{GOLD}Target variable: {target}{RESET}") |
|
|
| rie = ReplaceImputeEncode(data_map=data_map, |
| interval_scale=None, |
| no_impute=[target], |
| binary_encoding ="one-hot", |
| nominal_encoding="one-hot", |
| drop=False, |
| display=True) |
|
|
| |
| encoded_df = rie.fit_transform(df) |
| print(f"\n{RED}encoded_df{RESET}:", |
| f"{encoded_df.shape[0]} cases and", |
| f"{encoded_df.shape[1]} columns,\n", |
| " including targets.") |
| print(f"{RESET}") |
|
|
| print(f"\n{GOLD}Preprocessing complete. Ready for next step.{RESET}") |
|
|
| |
| lbl = "Step 3: Kitchen Sink Random Forest (Default Parameters)" |
| print_boundary(lbl) |
|
|
| y = encoded_df[target] |
| X = encoded_df.drop(target, axis=1) |
|
|
| |
| print(f"{GOLD}Fitting kitchen sink random forest using entire dataset") |
| kitchen_sink_forest = RandomForestClassifier(random_state=42) |
| kitchen_sink_forest = kitchen_sink_forest.fit(X, y) |
| forest_classifier.display_metrics(kitchen_sink_forest, X, y) |
| print(f"{RED}'Overfitting?'{RESET}") |
|
|
| |
| lbl = "70/30 Holdout Validation of Kitchen Sink Forest" |
| print_boundary(lbl) |
|
|
| |
| X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, |
| stratify=y, random_state=42) |
|
|
| |
| kitchen_sink_forest_cv = RandomForestClassifier(random_state=42) |
| kitchen_sink_forest_cv = kitchen_sink_forest_cv.fit(X_train, y_train) |
|
|
| |
| print(f"{GOLD}") |
| forest_classifier.display_split_metrics(kitchen_sink_forest_cv, |
| X_train, y_train, X_val, y_val) |
|
|
| |
| train_pred = kitchen_sink_forest_cv.predict(X_train) |
| val_pred = kitchen_sink_forest_cv.predict(X_val) |
| train_acc = accuracy_score(y_train, train_pred) |
| val_acc = accuracy_score(y_val, val_pred) |
|
|
| lbl = "Kitchen Sink 70/30 Validation" |
| print_boundary(lbl, 47) |
| print_summary(train_acc, val_acc) |
|
|
| |
| print(f"{GOLD}\nTop 10 Feature Importance (from training data):") |
| forest_classifier.display_importance(kitchen_sink_forest_cv, X.columns, |
| top='all', plot=True) |
| print(f"{RESET}") |
|
|
| |
| lbl = "Step 4.1: Feature Selection using Decision Tree Importance" |
| print_boundary(lbl) |
| threshold = 0.9 |
| X_selected = tree_selection(X, y, threshold) |
| selected_features = X_selected.columns |
|
|
| |
| lbl = "Step 4.2: Case Reduction using Stratified Random Sample" |
| print_boundary(lbl) |
| |
| train_size = 0.5 |
| if train_size < 1.0: |
| X_train, X_val, y_train, y_val = \ |
| train_test_split(X_selected, y, train_size=train_size, |
| stratify=y, random_state=123) |
| else: |
| X_train = X |
| y_train = y |
| |
| |
| lbl = "Step 4.3: Random Forest Hyperparameter Optimization" |
| print_boundary(lbl) |
| |
| param_grid = { |
| 'n_estimators': [50, 100], |
| 'criterion': ['gini', 'entropy'], |
| 'max_depth': [6, 7, 9, 10, None], |
| 'min_samples_split': [14, 28, 32, 34, 36, 38, 40, 42], |
| 'min_samples_leaf': [14, 16, 17, 20, 22 ], |
| 'max_features': ['sqrt', 4, 5, 7, 8, 9, None] |
| } |
| """ |
| |
| Grid Search: 5600 parameter combinations with |
| 4-fold CV requires 22400 total fits. |
| |
Hyperparameter optimization uses only the top 90% of
features and randomly selected 50% of data.
| |
| Parallel processing using n_jobs=-1 |
| |
| Starting grid search... |
| Grid search completed in 88.8 seconds |
| Average time per parameter combination: 0.02 seconds |
| |
| =============================================== |
| *********** Optimum Hyperparameters *********** |
| =============================================== |
| criterion........... entropy |
| max_depth........... 7 |
| max_features........ 4 |
| min_samples_leaf.... 14 |
| min_samples_split... 40 |
| n_estimators........ 50 |
| |
| =============================================== |
| ****** Optimum Forest Performance Metrics ***** |
| =============================================== |
| TRAIN VALIDATION RATIO |
| ACCURACY............ 0.8064 0.7685 1.0493 |
| MISCLASSIFICATION... 0.1936 0.2315 1.1957 |
| ----------------------------------------------- |
| """ |
| lbl = "Hyperparameters" |
| print_boundary(lbl, 47) |
| for parm in param_grid: |
| print(f"{GREEN} {parm:.<20s}{GOLD}{param_grid[parm][0:]}{RESET}") |
|
|
| |
| total_combinations = 1 |
| for param_list in param_grid.values(): |
| total_combinations *= len(param_list) |
| total_fits = total_combinations * 4 |
|
|
| njobs = -1 |
| print(f"\n{GOLD}Grid Search: {total_combinations} parameter combinations with") |
| print(f"4-fold CV requires {total_fits} total fits.\n") |
| t_size = 100*train_size; t = 100*threshold |
| print(f"Hyperparameter optimization uses only the top {t}% of\n", |
| f"features and randomly selected {t_size}% of data.\n") |
| print(f"{GOLD}Parallel processing using {RED}n_jobs={njobs}") |
|
|
| |
| start_time = time.time() |
| print(f"\n{GREEN}Starting grid search...{RESET}") |
| rf = RandomForestClassifier(random_state=42) |
| |
| grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, |
| cv=4, scoring='accuracy', return_train_score=True, |
| n_jobs=njobs).fit(X_train, y_train) |
| end_time = time.time() |
| elapsed_time = end_time - start_time |
| print(f"{GOLD}Grid search completed in {elapsed_time:.1f} seconds") |
| print(f"Average time per parameter combination: ", |
| f"{elapsed_time/total_combinations:.2f} seconds{RESET}") |
|
|
| lbl = "Optimum Hyperparameters" |
| print_boundary(lbl, 47) |
| |
| max_param_len = max(len(str(val)) for val in grid_search.best_params_.values()) |
| sze = max_param_len + 3 |
| for parm in grid_search.best_params_: |
| parameter = str(grid_search.best_params_[parm]) |
| print(f"{GREEN} {parm:.<20s}{GOLD}{parameter:>{sze}s}{RESET}") |
|
|
| best_idx = np.argmin(grid_search.cv_results_['rank_test_score']) |
| val_acc = grid_search.cv_results_['mean_test_score'][best_idx] |
| train_acc = grid_search.cv_results_['mean_train_score'][best_idx] |
|
|
| lbl = "Optimum Forest Performance Metrics" |
| print_boundary(lbl, 47) |
| print_summary(train_acc, val_acc) |
|
|
| lbl = "Best Random Forest Importance" |
| print_boundary(lbl, 47) |
|
|
| best_forest = grid_search.best_estimator_ |
| importance = best_forest.feature_importances_ |
| feature = X_train.columns |
| data = {'feature': feature, 'importance':importance} |
| df = pd.DataFrame(data) |
| df = df.sort_values(by='importance', ascending=False) |
| df['cumulative'] = df['importance'].cumsum() |
| print(df.to_string(index=False)) |
|
|
| |
| lbl = "Step 5: Holdout Validation (Selected Features)" |
| print_boundary(lbl) |
|
|
| Xt_full, Xv_full, yt, yv = train_test_split(X, y, test_size=0.3, |
| stratify=y, random_state=42) |
| |
| Xt = Xt_full[selected_features] |
| Xv = Xv_full[selected_features] |
|
|
| |
| hold_out_forest = best_forest |
| hold_out_forest = hold_out_forest.fit(Xt, yt) |
| print(f"{GOLD}") |
| forest_classifier.display_split_metrics(hold_out_forest, Xt, yt, Xv, yv) |
| print(f"{RESET}") |
| |
| train_pred = hold_out_forest.predict(Xt) |
| val_pred = hold_out_forest.predict(Xv) |
| train_acc = accuracy_score(yt, train_pred) |
| val_acc = accuracy_score(yv, val_pred) |
| train_misc = 1.0 - train_acc |
| val_misc = 1.0 - val_acc |
| ratio_acc = train_acc / val_acc if val_acc > 0 else np.inf |
| ratio_misc = val_misc / train_misc if train_misc > 0 else np.inf |
|
|
| lbl = "Holdout Validation Performance Summary (Selected Features)" |
| print_boundary(lbl, 47) |
| print_summary(train_acc, val_acc) |
|
|
| |
| print(f"{GOLD}\nFeature Importance (optimized model - selected features):") |
| forest_classifier.display_importance(hold_out_forest, selected_features, |
| top='all', plot=True) |
|
|
| |
| lbl = "Step 6: K-Fold Cross-Validation (Selected Features)" |
| print_boundary(lbl) |
|
|
| warnings.filterwarnings('ignore', category=RuntimeWarning) |
| n = X_selected.shape[0] |
| best_val_acc = 0 |
| for k in range(2, 11): |
| scores = cross_validate(best_forest, X_selected, y, scoring='accuracy', |
| cv=k, return_train_score=True) |
| |
| train_acc = scores["train_score"].mean() |
| val_acc = scores["test_score"].mean() |
|
|
| print_acc_ratio(scores, n) |
| if val_acc > best_val_acc: |
| best_k = k |
| best_train_acc = train_acc |
| best_val_acc = val_acc |
|
|
| print(f"\n{GOLD} Best K :", |
| f"{RED}{best_k}-Fold{GOLD}") |
| lbl = "K-Fold Cross-Validation Performance Summary (Selected Features)" |
| print_boundary(lbl, 47) |
| print_summary(best_train_acc, best_val_acc) |
| lbl = "AI Development Productivity Random Forest Analysis Complete" |
| print_boundary(lbl) |
| """ |
| Model Metrics.......... Training Validation |
| Observations........... 1983 851 |
| Features............... 8 8 |
| Maximum Tree Depth..... 7 7 |
| Minimum Leaf Size...... 14 14 |
| Minimum split Size..... 40 40 |
| Mean Absolute Error.... 0.3084 0.3345 |
| Avg Squared Error...... 0.1394 0.1672 |
| Accuracy............... 0.7948 0.7450 |
| Precision.................. 0.8437 0.7751 |
| Recall (Sensitivity)....... 0.7238 0.6894 |
| Specificity................ 0.8658 0.8005 |
| F1-score................... 0.7792 0.7298 |
| Total Misclassifications... 407 217 |
| MISC (Misclassification)... 20.5% 25.5% |
| class 0............... 13.4% 20.0% |
| class 1............... 27.6% 31.1% |
| =============================================== |
| Holdout Validation Performance Summary (Selected Features) |
| =============================================== |
| TRAIN VALIDATION RATIO |
| ACCURACY............ 0.7948 0.7450 1.0668 |
| MISCLASSIFICATION... 0.2052 0.2550 1.2424 |
| ----------------------------------------------- |
| |
| Best K : 9-Fold |
| |
| =============================================== |
| K-Fold Cross-Validation Performance Summary (Selected Features) |
| =============================================== |
| TRAIN VALIDATION RATIO |
| ACCURACY............ 0.7914 0.7583 1.0437 |
| MISCLASSIFICATION... 0.2086 0.2417 1.1588 |
| ----------------------------------------------- |
| """ |
|
|