# dr_jones / Random_Forest / BinaryRandomForest_Template3.py
# (uploaded by anly656 — "Upload 50 files", commit 8643b59 verified)
"""
@Data: Medical Diagnosis Dataset
@Date: Oct 2025
@Author: eJones
@Email: ejones@tamu.edu
"""
# ANSI color codes for output formatting (256-color escape sequences;
# RESET restores the terminal default)
RED = "\033[38;5;197m"; GOLD = "\033[38;5;185m"; TEAL = "\033[38;5;50m"
GREEN = "\033[38;5;82m"; RESET = "\033[0m"
# Import required packages
import pandas as pd
import numpy as np
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
from AdvancedAnalytics.Forest import forest_classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
import warnings, time
def print_boundary(lbl, b_width=60, boundary=True):
    """Print a section label centered between '*' padding, optionally
    framed above and below by a b_width-wide line of '=' characters."""
    print("")
    pad = b_width - len(lbl) - 2
    right = pad // 2
    left = pad - right  # odd padding puts the extra '*' on the left
    rule = " ".join([TEAL, "=" * b_width, RESET])
    if boundary:
        print(rule)
    print(" ".join([GREEN, "*" * left, lbl, "*" * right, RESET]))
    if boundary:
        print(rule)
def print_acc_ratio(scores, n):
    """Summarize k-fold cross-validation misclassification rates.

    Prints mean train/validation misclassification (+/- 2 standard
    deviations), their ratio, and the equivalent train/validation split
    sizes, from a sklearn ``cross_validate`` result dict.

    Parameters
    ----------
    scores : dict with "train_score" and "test_score" numpy arrays of
        per-fold accuracies (requires return_train_score=True).
    n : int, total number of cases (used to report per-fold split sizes).
    """
    n_folds = len(scores["train_score"])
    train_misc = (1.0 - scores["train_score"])   # per-fold train misclassification
    train_smisc = 2.0*train_misc.std()
    val_misc = (1.0 - scores["test_score"])      # per-fold validation misclassification
    val_smisc = 2.0*val_misc.std()
    # Per-fold validation/train misclassification ratio, guarding divide-by-zero
    ratio_misc = np.zeros(n_folds)
    for i in range(0, n_folds):
        if train_misc[i]>0:
            ratio_misc[i] = val_misc[i] / train_misc[i]
        elif val_misc[i]>0:
            ratio_misc[i] = np.inf
        else:
            ratio_misc[i] = 1.0
    # std() never raises; it yields nan when any fold ratio is inf
    s_ratio = 2.0*ratio_misc.std()
    train_misc = train_misc.mean()
    val_misc = val_misc.mean()
    ratio = val_misc/train_misc if train_misc>0 else np.inf
    print(f"{TEAL}\n")
    print(f" ====== {n_folds:.0f}-Fold Cross Validation =======")
    print(f" Train Avg. MISC..... {train_misc:.4f} +/-{train_smisc:.4f}")
    print(f" Test Avg. MISC..... {val_misc:.4f} +/-{val_smisc:.4f}")
    # BUG FIX: 's_ratio == np.nan' is always False (NaN never compares
    # equal), so "+/-nan" was printed; use np.isnan/np.isinf instead.
    if np.isnan(s_ratio) or np.isinf(s_ratio):
        print(f" Mean Misc Ratio..... {ratio:.4f}")
    else:
        print(f" Mean Misc Ratio..... {ratio:.4f} +/-{s_ratio:.4f}")
    print(" ", 39*"=", f"{RESET}")
    n_v = n*(1.0/n_folds)
    n_t = n - n_v
    print(f"Equivalent to {n_folds:.0f} splits each with "+
          f"{n_t:.0f}/{n_v:.0f} Cases")
def print_summary(train_acc, val_acc):
    """Print a color-coded accuracy/misclassification table comparing
    train vs. validation; ratios of 1.2 or more are highlighted in red."""
    train_misc = 1.0 - train_acc
    val_misc = 1.0 - val_acc
    ratio_acc = np.inf if not val_acc > 0 else train_acc / val_acc
    if train_misc > 0:
        ratio_misc = val_misc / train_misc
    else:
        # No training errors: inf if validation errs at all, else 1.0
        ratio_misc = np.inf if val_misc > 0 else 1.0
    # Header row followed by the two metric rows
    print(f"{GREEN}{'TRAIN':>28s} {'VALIDATION':>11s} {'RATIO':>7s}")
    color = GREEN if ratio_acc < 1.2 else RED
    print(f"{GREEN} {'ACCURACY':.<20s}{GOLD}{train_acc:>7.4f}",
          f" {val_acc:>7.4f} {color}{ratio_acc:>7.4f}{RESET}")
    color = GREEN if ratio_misc < 1.2 else RED
    print(f"{GREEN} {'MISCLASSIFICATION':.<20s}{GOLD}{train_misc:>7.4f}",
          f" {val_misc:>7.4f} {color}{ratio_misc:>7.4f}{RESET}")
    print(f"{TEAL}","-"*47, f"{RESET}")
def tree_selection(X, y, threshold=0.9):
    """Select the smallest feature set whose decision-tree importance
    cumulatively reaches ``threshold``.

    A single decision tree is fit only to rank features; features are
    kept in descending importance order up to and including the first
    one whose cumulative importance reaches the threshold.

    Parameters
    ----------
    X : pd.DataFrame of candidate features.
    y : target vector aligned with X.
    threshold : float in (0, 1]; cumulative-importance cutoff.  Values
        >= 1.0 disable selection and return X unchanged.

    Returns
    -------
    pd.DataFrame restricted to the selected feature columns.
    """
    if threshold >= 1.0:
        return X
    dt_selector = DecisionTreeClassifier(random_state=42).fit(X, y)
    # Rank features by importance and accumulate their shares
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': dt_selector.feature_importances_
    }).sort_values('importance', ascending=False)
    importance_df['cumulative_importance'] = importance_df['importance'].cumsum()
    # Display the ranked importance table
    print(f"\n{GREEN} Feature Importance Analysis{GOLD}")
    print(f"{'='*51}")
    print(f"{'Feature':<27} {'Importance':<12} {'Cumulative':<12}")
    print(f"{'-'*51}")
    lne = False
    for idx, row in importance_df.iterrows():
        print(f"{row['feature']:.<29} {row['importance']:<12.4f} ",
              f"{row['cumulative_importance']:<12.4f}")
        # Draw a dashed cutoff line the first time the threshold is crossed
        if row['cumulative_importance'] > threshold and not lne:
            print(f"{RED}{25*'- '}{GOLD}"); lne = True
    print(f"{'='*51}")
    # Select features that account for at least threshold of total importance
    selected_mask = importance_df['cumulative_importance'] >= threshold
    if selected_mask.any():
        # First row where cumulative importance reaches the threshold;
        # loc slices the (unique-label) sorted frame up to that row.
        first_idx = selected_mask.idxmax()
        selected_features = importance_df.loc[:first_idx, 'feature'].tolist()
        threshold_reached = importance_df.loc[first_idx, 'cumulative_importance']
    else:
        # Importances sum to 1.0, so this triggers only for threshold > 1 - eps
        selected_features = importance_df['feature'].tolist()
        threshold_reached = importance_df['cumulative_importance'].max()
        # BUG FIX: message previously hard-coded "90%"; report the actual threshold
        print(f"\n{RED}No feature combination reaches {threshold:.0%} importance. ",
              f"Using all features (cumulative: {threshold_reached:.1%})")
    print(f"\n{GREEN}Selected {len(selected_features)} features accounting for ",
          f"{threshold_reached:.0%} of importance")
    # Reduce feature set to selected features
    X_selected = X[selected_features]
    # BUG FIX: replaced garbled "-→" with "->" in the reduction message
    print(f"{GREEN}Feature selection complete - reduction: ",
          f"{RED}{X.shape[1]} -> {X_selected.shape[1]} {GREEN}features{RESET}")
    return X_selected
""" =========================================================== """
lbl = "Step 1: Reading Medical Diagnosis Data"
print_boundary(lbl)
df = pd.read_csv('../data/medical_diagnosis_data.csv')
print(f"{GOLD}Data loaded: {df.shape[0]} ",
f"observations and {df.shape[1]} columns.{RESET}")
# Display first few rows to verify data structure
print(f"\n{GOLD}First 5 rows of the data:{RESET}")
print(df.head())
# Step 2: Create Data Map and Apply ReplaceImputeEncode
lbl = "Step 2: Data Map and ReplaceImputeEncode Processing"
print_boundary(lbl)
data_map = {
"patient_id": [DT.ID, ("")],
# Interval features
"age": [DT.Interval, (18, 85)],
"bmi": [DT.Interval, (18.0, 40.0)],
"cholesterol": [DT.Interval, (120, 350)],
"stress_level": [DT.Interval, (1, 10)],
"sleep_hours": [DT.Interval, (4.0, 11.0)],
# Nominal features
"blood_pressure": [DT.Nominal, ("Normal", "Elevated", "High_Stage1",
"High_Stage2")],
"exercise_freq": [DT.Nominal, ("none", "light", "moderate", "intense")],
# Binary features
"smoker": [DT.Binary, (0, 1)],
"family_history": [DT.Binary, (0, 1)],
"gender": [DT.Binary, (0, 1)],
# Target variable
"has_disease": [DT.Binary, (0, 1)] #cardiovascular condition
}
print(f"{GOLD}")
print(15*"=", "DATA MAP", 15*"=")
lk = len(max(data_map, key=len)) + 1
ignored = 0
for col, (dt_type, valid_values) in data_map.items():
if dt_type.name == "ID" or dt_type.name == "Ignore":
ignored += 1
print(f" {TEAL}{col:.<{lk}s} {GOLD}{dt_type.name:9s}{GREEN}{valid_values}")
print(f"{GOLD} === Data Map has{RED}", len(data_map)-ignored,
f"{GOLD}attribute columns", 3*"=",f"{RESET}")
# Step 3: Apply RIE preprocessing
# Set target variable
target = "has_disease"
print(f"{GOLD}Target variable: {target}{RESET}")
rie = ReplaceImputeEncode(data_map=data_map,
interval_scale=None, # No standardization
no_impute=[target], # Don't impute target
binary_encoding ="one-hot",
nominal_encoding="one-hot",
drop=False, # Keep all encoded columns
display=True)
# Transform the data
encoded_df = rie.fit_transform(df)
print(f"\n{RED}encoded_df{RESET}:",
f"{encoded_df.shape[0]} cases and",
f"{encoded_df.shape[1]} columns,\n",
" including targets.")
print(f"{RESET}")
print(f"\n{GOLD}Preprocessing complete. Ready for next step.{RESET}")
# Step 3: Kitchen Sink Random Forest Evaluation
# "Kitchen sink" = all features with all-default hyperparameters; fitting
# and scoring on the same full dataset is shown first to illustrate
# apparent (over-optimistic) performance.
lbl = "Step 3: Kitchen Sink Random Forest (Default Parameters)"
print_boundary(lbl)
y = encoded_df[target]
X = encoded_df.drop(target, axis=1)
# First show the overfitting on full data
print(f"{GOLD}Fitting kitchen sink random forest using entire dataset")
kitchen_sink_forest = RandomForestClassifier(random_state=42) # Defaults
kitchen_sink_forest = kitchen_sink_forest.fit(X, y)
forest_classifier.display_metrics(kitchen_sink_forest, X, y)
print(f"{RED}'Overfitting?'{RESET}")
# Now evaluate with proper holdout validation
lbl = "70/30 Holdout Validation of Kitchen Sink Forest"
print_boundary(lbl)
# Split the data 70/30, stratified to preserve class proportions
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                  stratify=y, random_state=42)
# Fit kitchen sink forest on training data only
kitchen_sink_forest_cv = RandomForestClassifier(random_state=42) # Defaults
kitchen_sink_forest_cv = kitchen_sink_forest_cv.fit(X_train, y_train)
# Evaluate using AdvancedAnalytics display_split_metrics
print(f"{GOLD}")
forest_classifier.display_split_metrics(kitchen_sink_forest_cv,
                                        X_train, y_train, X_val, y_val)
# Calculate and display accuracy ratio and misclassification ratio
train_pred = kitchen_sink_forest_cv.predict(X_train)
val_pred = kitchen_sink_forest_cv.predict(X_val)
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
lbl = "Kitchen Sink 70/30 Validation"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc) # train/validation summary
# Show feature importance from the properly trained model
print(f"{GOLD}\nTop 10 Feature Importance (from training data):")
forest_classifier.display_importance(kitchen_sink_forest_cv, X.columns,
                                     top='all', plot=True)
print(f"{RESET}")
# Step 4.1: Feature Selection using Decision Tree Importance
lbl = "Step 4.1: Feature Selection using Decision Tree Importance"
print_boundary(lbl)
threshold = 0.9 # Select features covering the top 90% of importance
X_selected = tree_selection(X, y, threshold)
selected_features = X_selected.columns
# Step 4.2: Case Reduction - Construct Stratified Random Sample
lbl = "Step 4.2: Case Reduction using Stratified Random Sample"
print_boundary(lbl)
# Using different random_state (123) than Step 3 (42) to get different split
train_size = 0.5
if train_size < 1.0:
    # Keep half the cases (stratified) to speed up the grid search below
    X_train, X_val, y_train, y_val = \
        train_test_split(X_selected, y, train_size=train_size,
                         stratify=y, random_state=123)
else:
    # NOTE(review): this branch uses the full X rather than X_selected —
    # confirm whether bypassing feature selection here is intentional.
    X_train = X
    y_train = y
# Step 4.3: Random Forest Hyperparameter Optimization using selected features
lbl = "Step 4.3: Random Forest Hyperparameter Optimization"
print_boundary(lbl)
# n=2834; 50% N = 1417; 0.5%xN = 7 or 14; use 14 since validation uses N=2834
# Candidate hyperparameter grid for GridSearchCV (exhaustive search)
param_grid = {
    'n_estimators': [50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [6, 7, 9, 10, None],
    'min_samples_split': [14, 28, 32, 34, 36, 38, 40, 42],
    'min_samples_leaf': [14, 16, 17, 20, 22 ],
    'max_features': ['sqrt', 4, 5, 7, 8, 9, None]
}
# Reference transcript of a previous run (kept for comparison):
"""
Grid Search: 5600 parameter combinations with
4-fold CV requires 22400 total fits.
Hyperparameter optimization uses only the top 0.9% of
features and randomly selected 0.5% of data.
Parallel processing using n_jobs=-1
Starting grid search...
Grid search completed in 88.8 seconds
Average time per parameter combination: 0.02 seconds
===============================================
*********** Optimum Hyperparameters ***********
===============================================
criterion........... entropy
max_depth........... 7
max_features........ 4
min_samples_leaf.... 14
min_samples_split... 40
n_estimators........ 50
===============================================
****** Optimum Forest Performance Metrics *****
===============================================
TRAIN VALIDATION RATIO
ACCURACY............ 0.8064 0.7685 1.0493
MISCLASSIFICATION... 0.1936 0.2315 1.1957
-----------------------------------------------
"""
lbl = "Hyperparameters"
print_boundary(lbl, 47)
# List every candidate value for each hyperparameter
for parm in param_grid:
    print(f"{GREEN} {parm:.<20s}{GOLD}{param_grid[parm][0:]}{RESET}")
# Calculate and display grid search information
total_combinations = 1
for param_list in param_grid.values():
    total_combinations *= len(param_list)
total_fits = total_combinations * 4 # cv=4
njobs = -1  # use all available cores
print(f"\n{GOLD}Grid Search: {total_combinations} parameter combinations with")
print(f"4-fold CV requires {total_fits} total fits.\n")
t_size = 100*train_size; t = 100*threshold
print(f"Hyperparameter optimization uses only the top {t}% of\n",
      f"features and randomly selected {t_size}% of data.\n")
print(f"{GOLD}Parallel processing using {RED}n_jobs={njobs}")
# Start timing and run grid search
start_time = time.time()
print(f"\n{GREEN}Starting grid search...{RESET}")
rf = RandomForestClassifier(random_state=42)
# Grid Search using only selected features (X_train from Step 4.2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=4, scoring='accuracy', return_train_score=True,
                           n_jobs=njobs).fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"{GOLD}Grid search completed in {elapsed_time:.1f} seconds")
print(f"Average time per parameter combination: ",
      f"{elapsed_time/total_combinations:.2f} seconds{RESET}")
lbl = "Optimum Hyperparameters"
print_boundary(lbl, 47)
# Find the longest parameter value for consistent formatting
max_param_len = max(len(str(val)) for val in grid_search.best_params_.values())
sze = max_param_len + 3
for parm in grid_search.best_params_:
    parameter = str(grid_search.best_params_[parm])
    print(f"{GREEN} {parm:.<20s}{GOLD}{parameter:>{sze}s}{RESET}")
# rank_test_score == 1 marks the best parameter combination
best_idx = np.argmin(grid_search.cv_results_['rank_test_score'])
val_acc = grid_search.cv_results_['mean_test_score'][best_idx]
train_acc = grid_search.cv_results_['mean_train_score'][best_idx]
lbl = "Optimum Forest Performance Metrics"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc)
lbl = "Best Random Forest Importance"
print_boundary(lbl, 47)
best_forest = grid_search.best_estimator_
importance = best_forest.feature_importances_
feature = X_train.columns
data = {'feature': feature, 'importance':importance}
# NOTE(review): rebinding df here clobbers the raw dataset loaded in
# Step 1 — confirm the original data frame is no longer needed.
df = pd.DataFrame(data)
df = df.sort_values(by='importance', ascending=False)
df['cumulative'] = df['importance'].cumsum()
print(df.to_string(index=False))
# Step 5: Holdout Validation (70/30 split) - Selected Features
lbl = "Step 5: Holdout Validation (Selected Features)"
print_boundary(lbl)
# Same split seed (42) as Step 3, so results are comparable to the
# kitchen-sink 70/30 validation
Xt_full, Xv_full, yt, yv = train_test_split(X, y, test_size=0.3,
                                            stratify=y, random_state=42)
# Use only selected features for training
Xt = Xt_full[selected_features]
Xv = Xv_full[selected_features]
# Train optimized model on final training set
hold_out_forest = best_forest
hold_out_forest = hold_out_forest.fit(Xt, yt)
print(f"{GOLD}")
forest_classifier.display_split_metrics(hold_out_forest, Xt, yt, Xv, yv)
print(f"{RESET}")
# Calculate final performance metrics
train_pred = hold_out_forest.predict(Xt)
val_pred = hold_out_forest.predict(Xv)
train_acc = accuracy_score(yt, train_pred)
val_acc = accuracy_score(yv, val_pred)
train_misc = 1.0 - train_acc
val_misc = 1.0 - val_acc
# Guard divide-by-zero when computing overfitting ratios
ratio_acc = train_acc / val_acc if val_acc > 0 else np.inf
ratio_misc = val_misc / train_misc if train_misc > 0 else np.inf
lbl = "Holdout Validation Performance Summary (Selected Features)"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc)
# Show feature importance from the optimized model (selected features only)
print(f"{GOLD}\nFeature Importance (optimized model - selected features):")
forest_classifier.display_importance(hold_out_forest, selected_features,
                                     top='all', plot=True)
# Step 6: K-Fold Cross Validation - Selected Features
lbl = "Step 6: K-Fold Cross-Validation (Selected Features)"
print_boundary(lbl)
# Suppress RuntimeWarnings (e.g. from inf/nan ratio arithmetic)
warnings.filterwarnings('ignore', category=RuntimeWarning)
n = X_selected.shape[0]
best_val_acc = 0
for k in range(2, 11): # Test 2-fold through 10-fold CV
    scores = cross_validate(best_forest, X_selected, y, scoring='accuracy',
                            cv=k, return_train_score=True)
    # Calculate metrics
    train_acc = scores["train_score"].mean()
    val_acc = scores["test_score"].mean()
    print_acc_ratio(scores, n)
    # Track the k with the best mean validation accuracy
    if val_acc > best_val_acc:
        best_k = k
        best_train_acc = train_acc
        best_val_acc = val_acc
print(f"\n{GOLD} Best K :",
      f"{RED}{best_k}-Fold{GOLD}")
lbl = "K-Fold Cross-Validation Performance Summary (Selected Features)"
print_boundary(lbl, 47)
print_summary(best_train_acc, best_val_acc)
lbl = "AI Development Productivity Random Forest Analysis Complete"
print_boundary(lbl)
# Reference transcript of a previous run (kept for comparison):
"""
Model Metrics.......... Training Validation
Observations........... 1983 851
Features............... 8 8
Maximum Tree Depth..... 7 7
Minimum Leaf Size...... 14 14
Minimum split Size..... 40 40
Mean Absolute Error.... 0.3084 0.3345
Avg Squared Error...... 0.1394 0.1672
Accuracy............... 0.7948 0.7450
Precision.................. 0.8437 0.7751
Recall (Sensitivity)....... 0.7238 0.6894
Specificity................ 0.8658 0.8005
F1-score................... 0.7792 0.7298
Total Misclassifications... 407 217
MISC (Misclassification)... 20.5% 25.5%
class 0............... 13.4% 20.0%
class 1............... 27.6% 31.1%
===============================================
Holdout Validation Performance Summary (Selected Features)
===============================================
TRAIN VALIDATION RATIO
ACCURACY............ 0.7948 0.7450 1.0668
MISCLASSIFICATION... 0.2052 0.2550 1.2424
-----------------------------------------------
Best K : 9-Fold
===============================================
K-Fold Cross-Validation Performance Summary (Selected Features)
===============================================
TRAIN VALIDATION RATIO
ACCURACY............ 0.7914 0.7583 1.0437
MISCLASSIFICATION... 0.2086 0.2417 1.1588
-----------------------------------------------
"""