""" @Data: Medical Diagnosis Dataset @Date: Oct 2025 @Author: eJones @Email: ejones@tamu.edu """ # ANSI color codes for output formatting RED = "\033[38;5;197m"; GOLD = "\033[38;5;185m"; TEAL = "\033[38;5;50m" GREEN = "\033[38;5;82m"; RESET = "\033[0m" # Import required packages import pandas as pd import numpy as np from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode from AdvancedAnalytics.Forest import forest_classifier from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.model_selection import cross_validate from sklearn.metrics import accuracy_score import warnings, time def print_boundary(lbl, b_width=60, boundary=True): """Print formatted section boundary with label""" print("") margin = b_width - len(lbl) - 2 lmargin = int(margin/2) rmargin = lmargin if lmargin+rmargin < margin: lmargin += 1 if boundary: print(f"{TEAL}", "="*b_width, f"{RESET}") print(f"{GREEN}", lmargin*"*", lbl, rmargin*"*", f"{RESET}") if boundary: print(f"{TEAL}", "="*b_width, f"{RESET}") def print_acc_ratio(scores, n): n_folds = len(scores["train_score"]) train_misc = (1.0 - scores["train_score"]) train_smisc = 2.0*(1.0 - scores["train_score"]).std() val_misc = (1.0 - scores["test_score"]) val_smisc = 2.0*(1.0 - scores["test_score"]).std() ratio_misc = np.zeros(n_folds) for i in range(0, n_folds): if train_misc[i]>0: ratio_misc[i] = val_misc[i] / train_misc[i] elif val_misc[i]>0: ratio_misc[i] = np.inf else: ratio_misc[i] = 1.0 try: s_ratio = 2.0*ratio_misc.std() except: s_ratio = np.nan train_misc = train_misc.mean() val_misc = val_misc.mean() ratio = val_misc/train_misc if train_misc>0 else np.inf print(f"{TEAL}\n") print(f" ====== {n_folds:.0f}-Fold Cross Validation =======") print(f" Train Avg. MISC..... {train_misc:.4f} +/-{train_smisc:.4f}") print(f" Test Avg. MISC..... 
{val_misc:.4f} +/-{val_smisc:.4f}") if s_ratio == np.nan or s_ratio == np.inf: print(f" Mean Misc Ratio..... {ratio:.4f}") else: print(f" Mean Misc Ratio..... {ratio:.4f} +/-{s_ratio:.4f}") print(" ", 39*"=", f"{RESET}") n_v = n*(1.0/n_folds) n_t = n - n_v print(f"Equivalent to {n_folds:.0f} splits each with "+ f"{n_t:.0f}/{n_v:.0f} Cases") def print_summary(train_acc, val_acc): train_misc = 1.0 - train_acc val_misc = 1.0 - val_acc ratio_acc = train_acc / val_acc if val_acc>0 else np.inf if train_misc>0: ratio_misc = val_misc / train_misc elif val_misc>0: ratio_misc = np.inf else: ratio_misc = 1.0 #print accuracy and misclassification summary for train/validation print(f"{GREEN}{'TRAIN':>28s} {'VALIDATION':>11s} {'RATIO':>7s}") if ratio_acc < 1.2: color = GREEN else: color = RED print(f"{GREEN} {'ACCURACY':.<20s}{GOLD}{train_acc:>7.4f}", f" {val_acc:>7.4f} {color}{ratio_acc:>7.4f}{RESET}") if ratio_misc < 1.2: color = GREEN else: color = RED print(f"{GREEN} {'MISCLASSIFICATION':.<20s}{GOLD}{train_misc:>7.4f}", f" {val_misc:>7.4f} {color}{ratio_misc:>7.4f}{RESET}") print(f"{TEAL}","-"*47, f"{RESET}") def tree_selection(X, y, threshold=0.9): if threshold >= 1.0: return X dt_selector = DecisionTreeClassifier(random_state=42).fit(X, y) # Get feature importances feature_importance = dt_selector.feature_importances_ feature_name = X.columns # Create a DataFrame for easier manipulation importance_df = pd.DataFrame({ 'feature': feature_name, 'importance': feature_importance }).sort_values('importance', ascending=False) # Calculate cumulative importance importance_df['cumulative_importance'] = importance_df['importance'].cumsum() print(f"\n{GREEN} Feature Importance Analysis{GOLD}") print(f"{'='*51}") print(f"{'Feature':<27} {'Importance':<12} {'Cumulative':<12}") print(f"{'-'*51}") lne = False for idx, row in importance_df.iterrows(): print(f"{row['feature']:.<29} {row['importance']:<12.4f} ", f"{row['cumulative_importance']:<12.4f}") if row['cumulative_importance'] > 
threshold and not lne: print(f"{RED}{25*'- '}{GOLD}"); lne = True print(f"{'='*51}") # Select features that account for at least threshold of total importance cumulative_threshold = threshold selected_mask = importance_df['cumulative_importance'] >= cumulative_threshold if selected_mask.any(): # Find the first feature that makes cumulative importance >= 90% first_idx = selected_mask.idxmax() selected_features = importance_df.loc[:first_idx, 'feature'].tolist() threshold_reached = importance_df.loc[first_idx, 'cumulative_importance'] else: # If no combination reaches 90%, take all features selected_features = importance_df['feature'].tolist() threshold_reached = importance_df['cumulative_importance'].max() print(f"\n{RED}No feature combination reaches 90% importance. ", f"Using all features (cumulative: {threshold_reached:.1%})") print(f"\n{GREEN}Selected {len(selected_features)} features accounting for ", f"{threshold_reached:.0%} of importance") # Reduce feature set to selected features X_selected = X[selected_features] print(f"{GREEN}Feature selection complete - reduction: ", f"{RED}{X.shape[1]} -→ {X_selected.shape[1]} {GREEN}features{RESET}") return X_selected """ =========================================================== """ lbl = "Step 1: Reading Medical Diagnosis Data" print_boundary(lbl) df = pd.read_csv('../data/medical_diagnosis_data.csv') print(f"{GOLD}Data loaded: {df.shape[0]} ", f"observations and {df.shape[1]} columns.{RESET}") # Display first few rows to verify data structure print(f"\n{GOLD}First 5 rows of the data:{RESET}") print(df.head()) # Step 2: Create Data Map and Apply ReplaceImputeEncode lbl = "Step 2: Data Map and ReplaceImputeEncode Processing" print_boundary(lbl) data_map = { "patient_id": [DT.ID, ("")], # Interval features "age": [DT.Interval, (18, 85)], "bmi": [DT.Interval, (18.0, 40.0)], "cholesterol": [DT.Interval, (120, 350)], "stress_level": [DT.Interval, (1, 10)], "sleep_hours": [DT.Interval, (4.0, 11.0)], # Nominal features 
"blood_pressure": [DT.Nominal, ("Normal", "Elevated", "High_Stage1", "High_Stage2")], "exercise_freq": [DT.Nominal, ("none", "light", "moderate", "intense")], # Binary features "smoker": [DT.Binary, (0, 1)], "family_history": [DT.Binary, (0, 1)], "gender": [DT.Binary, (0, 1)], # Target variable "has_disease": [DT.Binary, (0, 1)] #cardiovascular condition } print(f"{GOLD}") print(15*"=", "DATA MAP", 15*"=") lk = len(max(data_map, key=len)) + 1 ignored = 0 for col, (dt_type, valid_values) in data_map.items(): if dt_type.name == "ID" or dt_type.name == "Ignore": ignored += 1 print(f" {TEAL}{col:.<{lk}s} {GOLD}{dt_type.name:9s}{GREEN}{valid_values}") print(f"{GOLD} === Data Map has{RED}", len(data_map)-ignored, f"{GOLD}attribute columns", 3*"=",f"{RESET}") # Step 3: Apply RIE preprocessing # Set target variable target = "has_disease" print(f"{GOLD}Target variable: {target}{RESET}") rie = ReplaceImputeEncode(data_map=data_map, interval_scale=None, # No standardization no_impute=[target], # Don't impute target binary_encoding ="one-hot", nominal_encoding="one-hot", drop=False, # Keep all encoded columns display=True) # Transform the data encoded_df = rie.fit_transform(df) print(f"\n{RED}encoded_df{RESET}:", f"{encoded_df.shape[0]} cases and", f"{encoded_df.shape[1]} columns,\n", " including targets.") print(f"{RESET}") print(f"\n{GOLD}Preprocessing complete. 
Ready for next step.{RESET}") # Step 3: Kitchen Sink Random Forest Evaluation lbl = "Step 3: Kitchen Sink Random Forest (Default Parameters)" print_boundary(lbl) y = encoded_df[target] X = encoded_df.drop(target, axis=1) # First show the overfitting on full data print(f"{GOLD}Fitting kitchen sink random forest using entire dataset") kitchen_sink_forest = RandomForestClassifier(random_state=42) # Defaults kitchen_sink_forest = kitchen_sink_forest.fit(X, y) forest_classifier.display_metrics(kitchen_sink_forest, X, y) print(f"{RED}'Overfitting?'{RESET}") # Now evaluate with proper holdout validation lbl = "70/30 Holdout Validation of Kitchen Sink Forest" print_boundary(lbl) # Split the data 70/30 X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42) # Fit kitchen sink forest on training data only kitchen_sink_forest_cv = RandomForestClassifier(random_state=42) # Defaults kitchen_sink_forest_cv = kitchen_sink_forest_cv.fit(X_train, y_train) # Evaluate using AdvancedAnalytics display_split_metrics print(f"{GOLD}") forest_classifier.display_split_metrics(kitchen_sink_forest_cv, X_train, y_train, X_val, y_val) # Calculate and display accuracy ratio and misclassification ratio train_pred = kitchen_sink_forest_cv.predict(X_train) val_pred = kitchen_sink_forest_cv.predict(X_val) train_acc = accuracy_score(y_train, train_pred) val_acc = accuracy_score(y_val, val_pred) lbl = "Kitchen Sink 70/30 Validation" print_boundary(lbl, 47) print_summary(train_acc, val_acc) #train/validation summary # Show feature importance from the properly trained model print(f"{GOLD}\nTop 10 Feature Importance (from training data):") forest_classifier.display_importance(kitchen_sink_forest_cv, X.columns, top='all', plot=True) print(f"{RESET}") # Step 4.1: Feature Selection using Decision Tree Importance lbl = "Step 4.1: Feature Selection using Decision Tree Importance" print_boundary(lbl) threshold = 0.9 #Select top 90% of Important Features X_selected = 
tree_selection(X, y, threshold) selected_features = X_selected.columns # Step 4,2: Case Reduction - Construct Stratified Random Sample lbl = "Step 4.2: Case Reduction using Stratified Random Sample" print_boundary(lbl) #Using different random_state (123) than Step 3 (42) to get different split train_size = 0.5 if train_size < 1.0: X_train, X_val, y_train, y_val = \ train_test_split(X_selected, y, train_size=train_size, stratify=y, random_state=123) else: X_train = X y_train = y # Step 5: Random Forest Hyperparameter Optimization using selected features lbl = "Step 4.3: Random Forest Hyperparameter Optimization" print_boundary(lbl) #n=2834; 50% N = 1417; 0.5%xN = 7 or 14; use 14 since valdation users N=2834 param_grid = { 'n_estimators': [50, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [6, 7, 9, 10, None], 'min_samples_split': [14, 28, 32, 34, 36, 38, 40, 42], 'min_samples_leaf': [14, 16, 17, 20, 22 ], 'max_features': ['sqrt', 4, 5, 7, 8, 9, None] } """ Grid Search: 5600 parameter combinations with 4-fold CV requires 22400 total fits. Hyperparameter optimization uses only the top 0.9% of features and randomly selected 0.5% of data. Parallel processing using n_jobs=-1 Starting grid search... Grid search completed in 88.8 seconds Average time per parameter combination: 0.02 seconds =============================================== *********** Optimum Hyperparameters *********** =============================================== criterion........... entropy max_depth........... 7 max_features........ 4 min_samples_leaf.... 14 min_samples_split... 40 n_estimators........ 50 =============================================== ****** Optimum Forest Performance Metrics ***** =============================================== TRAIN VALIDATION RATIO ACCURACY............ 0.8064 0.7685 1.0493 MISCLASSIFICATION... 
0.1936 0.2315 1.1957 ----------------------------------------------- """ lbl = "Hyperparameters" print_boundary(lbl, 47) for parm in param_grid: print(f"{GREEN} {parm:.<20s}{GOLD}{param_grid[parm][0:]}{RESET}") # Calculate and display grid search information total_combinations = 1 for param_list in param_grid.values(): total_combinations *= len(param_list) total_fits = total_combinations * 4 # cv=4 njobs = -1 print(f"\n{GOLD}Grid Search: {total_combinations} parameter combinations with") print(f"4-fold CV requires {total_fits} total fits.\n") t_size = 100*train_size; t = 100*threshold print(f"Hyperparameter optimization uses only the top {t}% of\n", f"features and randomly selected {t_size}% of data.\n") print(f"{GOLD}Parallel processing using {RED}n_jobs={njobs}") # Start timing and run grid search start_time = time.time() print(f"\n{GREEN}Starting grid search...{RESET}") rf = RandomForestClassifier(random_state=42) #Grid Search using only selected features grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=4, scoring='accuracy', return_train_score=True, n_jobs=njobs).fit(X_train, y_train) end_time = time.time() elapsed_time = end_time - start_time print(f"{GOLD}Grid search completed in {elapsed_time:.1f} seconds") print(f"Average time per parameter combination: ", f"{elapsed_time/total_combinations:.2f} seconds{RESET}") lbl = "Optimum Hyperparameters" print_boundary(lbl, 47) # Find the longest parameter value for consistent formatting max_param_len = max(len(str(val)) for val in grid_search.best_params_.values()) sze = max_param_len + 3 for parm in grid_search.best_params_: parameter = str(grid_search.best_params_[parm]) print(f"{GREEN} {parm:.<20s}{GOLD}{parameter:>{sze}s}{RESET}") best_idx = np.argmin(grid_search.cv_results_['rank_test_score']) val_acc = grid_search.cv_results_['mean_test_score'][best_idx] train_acc = grid_search.cv_results_['mean_train_score'][best_idx] lbl = "Optimum Forest Performance Metrics" print_boundary(lbl, 47) 
print_summary(train_acc, val_acc)

lbl = "Best Random Forest Importance"
print_boundary(lbl, 47)
best_forest = grid_search.best_estimator_
importance = best_forest.feature_importances_
feature = X_train.columns
# FIX: was assigned to 'df', silently shadowing the loaded dataset;
# use a distinct name for the importance table.
importance_table = pd.DataFrame({'feature': feature,
                                 'importance': importance})
importance_table = importance_table.sort_values(by='importance',
                                                ascending=False)
importance_table['cumulative'] = importance_table['importance'].cumsum()
print(importance_table.to_string(index=False))

# Step 5: Holdout Validation (70/30 split) - Selected Features
lbl = "Step 5: Holdout Validation (Selected Features)"
print_boundary(lbl)
Xt_full, Xv_full, yt, yv = train_test_split(X, y, test_size=0.3,
                                            stratify=y, random_state=42)
# Use only selected features for training
Xt = Xt_full[selected_features]
Xv = Xv_full[selected_features]
# Train optimized model on the final training set.
# NOTE(review): 'hold_out_forest' aliases (does not copy) best_forest,
# so this .fit() also refits best_forest in place. Step 6 is unaffected
# because cross_validate clones its estimator, but clone() would be
# safer if best_forest's grid-search fit must be preserved.
hold_out_forest = best_forest
hold_out_forest = hold_out_forest.fit(Xt, yt)
print(f"{GOLD}")
forest_classifier.display_split_metrics(hold_out_forest, Xt, yt, Xv, yv)
print(f"{RESET}")

# Calculate final performance metrics
train_pred = hold_out_forest.predict(Xt)
val_pred = hold_out_forest.predict(Xv)
train_acc = accuracy_score(yt, train_pred)
val_acc = accuracy_score(yv, val_pred)
# FIX: removed dead locals (train_misc, val_misc, ratio_acc,
# ratio_misc) — print_summary() recomputes all of them itself.
lbl = "Holdout Validation Performance Summary (Selected Features)"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc)

# Show feature importance from the optimized model (selected features)
print(f"{GOLD}\nFeature Importance (optimized model - selected features):")
forest_classifier.display_importance(hold_out_forest, selected_features,
                                     top='all', plot=True)

# Step 6: K-Fold Cross Validation - Selected Features
lbl = "Step 6: K-Fold Cross-Validation (Selected Features)"
print_boundary(lbl)
warnings.filterwarnings('ignore', category=RuntimeWarning)
n = X_selected.shape[0]
# FIX: initialize all 'best_*' trackers so they are bound even in the
# degenerate case where no fold count yields val_acc > 0.
best_val_acc = 0
best_train_acc = 0
best_k = 2
for k in range(2, 11):   # Test 2-fold through 10-fold CV
    scores = cross_validate(best_forest, X_selected, y,
                            scoring='accuracy', cv=k,
                            return_train_score=True)
    # Calculate mean metrics across the k folds
    train_acc = scores["train_score"].mean()
    val_acc = scores["test_score"].mean()
    print_acc_ratio(scores, n)
    # Track the fold count with the best mean validation accuracy
    if val_acc > best_val_acc:
        best_k = k
        best_train_acc = train_acc
        best_val_acc = val_acc

print(f"\n{GOLD} Best K :", f"{RED}{best_k}-Fold{GOLD}")
lbl = "K-Fold Cross-Validation Performance Summary (Selected Features)"
print_boundary(lbl, 47)
print_summary(best_train_acc, best_val_acc)

# FIX: closing banner said "AI Development Productivity" — a
# copy-paste leftover from a different analysis; this file analyzes
# the Medical Diagnosis dataset.
lbl = "Medical Diagnosis Random Forest Analysis Complete"
print_boundary(lbl)
"""
Model Metrics..........   Training  Validation
Observations...........       1983         851
Features...............          8           8
Maximum Tree Depth.....          7           7
Minimum Leaf Size......         14          14
Minimum split Size.....         40          40
Mean Absolute Error....     0.3084      0.3345
Avg Squared Error......     0.1394      0.1672
Accuracy...............     0.7948      0.7450
Precision..................  0.8437     0.7751
Recall (Sensitivity).......  0.7238     0.6894
Specificity................  0.8658     0.8005
F1-score...................  0.7792     0.7298
Total Misclassifications...     407        217
MISC (Misclassification)...   20.5%      25.5%
     class 0...............   13.4%      20.0%
     class 1...............   27.6%      31.1%

 ===============================================
 Holdout Validation Performance Summary (Selected Features)
 ===============================================
                       TRAIN  VALIDATION   RATIO
  ACCURACY............0.7948      0.7450  1.0668
  MISCLASSIFICATION...0.2052      0.2550  1.2424
 -----------------------------------------------

 Best K : 9-Fold
 ===============================================
 K-Fold Cross-Validation Performance Summary (Selected Features)
 ===============================================
                       TRAIN  VALIDATION   RATIO
  ACCURACY............0.7914      0.7583  1.0437
  MISCLASSIFICATION...0.2086      0.2417  1.1588
 -----------------------------------------------
"""