# dr_jones / Random_Forest / BinaryRandomForest_Template2.py
# (Repository upload header removed: "anly656 — Upload 50 files — 8643b59 verified";
#  the raw text was not valid Python and prevented the file from parsing.)
"""
@Code: HW7_Improved.py - Customer Churn Random Forest Analysis (Improved V2)
Step-by-step development of random forest model using AdvancedAnalytics
IMPROVED v1: Uses training data only for hyperparameter optimization
IMPROVED v2: Feature selection using decision tree importance scores
@Data: customer_churn_data.csv
@Date: Oct 2025
@Course: Anly 656
@Author: eJones
"""
# ANSI 256-color escape codes used throughout for colored console output;
# RESET restores the terminal's default style after each colored segment
RED = "\033[38;5;197m"; GOLD = "\033[38;5;185m"; TEAL = "\033[38;5;50m"
GREEN = "\033[38;5;82m"; RESET = "\033[0m"
# Import required packages
import pandas as pd
import numpy as np
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
from AdvancedAnalytics.Forest import forest_classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
import warnings, time
def print_boundary(lbl, b_width=60, boundary=True):
    """Print a labeled section divider, optionally framed by '=' rules.

    Parameters
    ----------
    lbl : str
        Label text, centered between runs of asterisks.
    b_width : int, default 60
        Width of the '=' rule lines and target width of the label line.
    boundary : bool, default True
        When True, print '=' rules above and below the label line.
    """
    print("")
    # Split the leftover width between left/right asterisk padding;
    # the extra character (odd leftover) goes on the left.
    pad = b_width - len(lbl) - 2
    right = pad // 2
    left = pad - right
    if boundary:
        print(f"{TEAL}", "=" * b_width, f"{RESET}")
    print(f"{GREEN}", "*" * left, lbl, "*" * right, f"{RESET}")
    if boundary:
        print(f"{TEAL}", "=" * b_width, f"{RESET}")
def print_acc_ratio(scores, n):
    """Print a K-fold cross-validation misclassification summary.

    Parameters
    ----------
    scores : dict
        Output of sklearn's cross_validate with return_train_score=True;
        must contain per-fold accuracy arrays "train_score" and "test_score".
    n : int
        Total number of cases; used to report the equivalent
        train/validation split sizes for each fold.
    """
    n_folds = len(scores["train_score"])
    train_misc = (1.0 - scores["train_score"])
    # "+/-" bands are reported as 2 standard deviations across folds
    train_smisc = 2.0*(1.0 - scores["train_score"]).std()
    val_misc = (1.0 - scores["test_score"])
    val_smisc = 2.0*(1.0 - scores["test_score"]).std()
    ratio_misc = np.zeros(n_folds)
    for i in range(0, n_folds):
        if train_misc[i] > 0:
            ratio_misc[i] = val_misc[i] / train_misc[i]
        elif val_misc[i] > 0:
            # Zero training error but nonzero validation error: unbounded ratio
            ratio_misc[i] = np.inf
        else:
            ratio_misc[i] = 1.0
    try:
        # std() over folds containing inf yields NaN (with a RuntimeWarning)
        s_ratio = 2.0*ratio_misc.std()
    except Exception:   # FIX: was a bare except; keep the best-effort fallback
        s_ratio = np.nan
    train_misc = train_misc.mean()
    val_misc = val_misc.mean()
    ratio = val_misc/train_misc if train_misc > 0 else np.inf
    print(f"{TEAL}\n")
    print(f" ====== {n_folds:.0f}-Fold Cross Validation =======")
    print(f" Train Avg. MISC..... {train_misc:.4f} +/-{train_smisc:.4f}")
    print(f" Test Avg. MISC..... {val_misc:.4f} +/-{val_smisc:.4f}")
    # FIX: the original tested `s_ratio == np.nan`, which is ALWAYS False
    # (NaN never compares equal to anything), so the +/- band printed even
    # when s_ratio was NaN/inf. Use np.isnan/np.isinf instead.
    if np.isnan(s_ratio) or np.isinf(s_ratio):
        print(f" Mean Misc Ratio..... {ratio:.4f}")
    else:
        print(f" Mean Misc Ratio..... {ratio:.4f} +/-{s_ratio:.4f}")
    print(" ", 39*"=", f"{RESET}")
    n_v = n*(1.0/n_folds)
    n_t = n - n_v
    print(f"Equivalent to {n_folds:.0f} splits each with "+
          f"{n_t:.0f}/{n_v:.0f} Cases")
def print_summary(train_acc, val_acc):
    """Print a train/validation accuracy and misclassification table.

    A ratio of 1.2 or more is highlighted in red as a potential
    overfitting signal; smaller ratios print in green.

    Parameters
    ----------
    train_acc, val_acc : float
        Accuracies on the training and validation partitions.
    """
    train_misc = 1.0 - train_acc
    val_misc = 1.0 - val_acc
    if val_acc > 0:
        ratio_acc = train_acc / val_acc
    else:
        ratio_acc = np.inf
    if train_misc > 0:
        ratio_misc = val_misc / train_misc
    elif val_misc > 0:
        ratio_misc = np.inf
    else:
        ratio_misc = 1.0
    # Header row followed by one line each for accuracy and misclassification
    print(f"{GREEN}{'TRAIN':>28s} {'VALIDATION':>11s} {'RATIO':>7s}")
    acc_color = GREEN if ratio_acc < 1.2 else RED
    print(f"{GREEN} {'ACCURACY':.<20s}{GOLD}{train_acc:>7.4f}",
          f" {val_acc:>7.4f} {acc_color}{ratio_acc:>7.4f}{RESET}")
    misc_color = GREEN if ratio_misc < 1.2 else RED
    print(f"{GREEN} {'MISCLASSIFICATION':.<20s}{GOLD}{train_misc:>7.4f}",
          f" {val_misc:>7.4f} {misc_color}{ratio_misc:>7.4f}{RESET}")
    print(f"{TEAL}", "-"*47, f"{RESET}")
def tree_selection(X, y, threshold=0.9):
    """Select the smallest prefix of importance-ranked features whose
    cumulative decision-tree importance reaches `threshold`.

    A DecisionTreeClassifier(random_state=42) is fit on (X, y); features
    are ranked by importance, a formatted importance table is printed,
    and the top-ranked features are kept until cumulative importance
    first reaches the threshold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; column names are used for ranking and selection.
    y : array-like
        Target vector.
    threshold : float, default 0.9
        Fraction of total importance the selected features must cover.

    Returns
    -------
    pandas.DataFrame
        X restricted to the selected feature columns.
    """
    dt_selector = DecisionTreeClassifier(random_state=42).fit(X, y)
    # Get feature importances
    feature_importance = dt_selector.feature_importances_
    feature_name = X.columns
    # Create a DataFrame for easier manipulation
    importance_df = pd.DataFrame({
        'feature': feature_name,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)
    # Calculate cumulative importance
    importance_df['cumulative_importance'] = importance_df['importance'].cumsum()
    print(f"\n{GREEN} Feature Importance Analysis{GOLD}")
    print(f"{'='*51}")
    print(f"{'Feature':<27} {'Importance':<12} {'Cumulative':<12}")
    print(f"{'-'*51}")
    lne = False   # has the threshold divider line been printed yet?
    for idx, row in importance_df.iterrows():
        print(f"{row['feature']:.<29} {row['importance']:<12.4f} ",
              f"{row['cumulative_importance']:<12.4f}")
        if row['cumulative_importance'] > threshold and not lne:
            print(f"{RED}{25*'- '}{GOLD}"); lne = True
    print(f"{'='*51}")
    # Select features that account for at least threshold of total importance
    cumulative_threshold = threshold
    selected_mask = importance_df['cumulative_importance'] >= cumulative_threshold
    if selected_mask.any():
        # Keep the ranked features up to and including the first one whose
        # cumulative importance reaches the threshold. idxmax() returns the
        # index LABEL of the first True; since labels are unique, .loc slices
        # the sorted frame from the top row through that feature.
        first_idx = selected_mask.idxmax()
        selected_features = importance_df.loc[:first_idx, 'feature'].tolist()
        threshold_reached = importance_df.loc[first_idx, 'cumulative_importance']
    else:
        # No prefix reaches the threshold (only possible when threshold > 1,
        # since importances sum to 1): fall back to all features.
        selected_features = importance_df['feature'].tolist()
        threshold_reached = importance_df['cumulative_importance'].max()
        # FIX: message hard-coded "90%"; report the actual threshold argument
        print(f"\n{RED}No feature combination reaches {threshold:.0%} importance. ",
              f"Using all features (cumulative: {threshold_reached:.1%})")
    print(f"\n{GREEN}Selected {len(selected_features)} features accounting for ",
          f"{threshold_reached:.0%} of importance")
    # Reduce feature set to selected features
    X_selected = X[selected_features]
    print(f"{GREEN}Feature selection complete - reduction: ",
          f"{RED}{X.shape[1]} -→ {X_selected.shape[1]} {GREEN}features{RESET}")
    return X_selected
""" =========================================================== """
# Step 1: Read the Customer Churn Data
lbl = "Step 1: Reading Customer Churn Data"
print_boundary(lbl)
# Read the customer churn data
df = pd.read_csv("../data/customer_churn_data.csv")
print(f"{GOLD}Data loaded: {df.shape[0]} observations and {df.shape[1]} columns.{RESET}")
# Display first few rows to verify data structure
print(f"\n{GOLD}First 5 rows of the data:{RESET}")
print(df.head())
# Step 2: Create Data Map and Apply ReplaceImputeEncode
lbl = "Step 2: Data Map and ReplaceImputeEncode Processing"
print_boundary(lbl)
# Create data map based on data dictionary: each entry maps a column name
# to [AdvancedAnalytics DT type, (valid value range or category levels)]
data_map = {
    "customer_id": [DT.ID, ("")],
    "churn": [DT.Binary, (0, 1)],
    "has_partner": [DT.Binary, (0, 1)],
    "has_dependents": [DT.Binary, (0, 1)],
    "internet_service": [DT.Nominal, ("No", "DSL", "Fiber optic")],
    "contract_type": [DT.Nominal, ("Month-to-month", "One year",
                                   "Two year")],
    "age": [DT.Interval, (25, 65)],
    "income": [DT.Interval, (30000, 95000)],
    "tenure_months": [DT.Interval, (1, 60)],
    "monthly_charges": [DT.Interval, (10, 150)],
    "num_support_tickets": [DT.Interval, (0, 8)],
    "satisfaction_score": [DT.Interval, (1.0, 5.0)]
}
print(f"{GOLD}")
print(15*"=", "DATA MAP", 15*"=")
# Width of the longest column name (+1) for aligned dot-leader printing
lk = len(max(data_map, key=len)) + 1
ignored = 0  # count of ID/Ignore columns excluded from the attribute total
for col, (dt_type, valid_values) in data_map.items():
    if dt_type.name == "ID" or dt_type.name == "Ignore":
        ignored += 1
    print(f" {TEAL}{col:.<{lk}s} {GOLD}{dt_type.name:9s}{GREEN}{valid_values}")
print(f"{GOLD} === Data Map has{RED}", len(data_map)-ignored,
      f"{GOLD}attribute columns", 3*"=",f"{RESET}")
# Set target variable
target = "churn"
print(f"{GOLD}Target variable: {target}{RESET}")
# Apply ReplaceImputeEncode preprocessing
rie = ReplaceImputeEncode(data_map=data_map,
                          interval_scale=None, # No standardization of interval features
                          no_impute=[target], # Do not impute target variable
                          binary_encoding="one-hot",
                          nominal_encoding="one-hot",
                          drop=False, # Keep all columns
                          display=True)
# Transform the data (replace/impute invalid values, one-hot encode categoricals)
encoded_df = rie.fit_transform(df)
print(f"\n{RED}encoded_df{RESET}:",
      f"{encoded_df.shape[0]} cases and",
      f"{encoded_df.shape[1]} columns,\n",
      " including targets.")
print(f"{RESET}")
print(f"\n{GOLD}Preprocessing complete. Ready for next step.{RESET}")
# Step 3: Kitchen Sink Random Forest Evaluation
lbl = "Step 3: Kitchen Sink Random Forest (Default Parameters)"
print_boundary(lbl)
y = encoded_df[target]
X = encoded_df.drop(target, axis=1)
# First show the overfitting on full data: fit and evaluate on the SAME
# cases, which is expected to look unrealistically good
print(f"{GOLD}Fitting kitchen sink random forest using entire dataset")
kitchen_sink_forest = RandomForestClassifier(random_state=42) # Defaults
kitchen_sink_forest = kitchen_sink_forest.fit(X, y)
forest_classifier.display_metrics(kitchen_sink_forest, X, y)
print(f"{RED}'Overfitting?'{RESET}")
# Now evaluate with proper holdout validation
lbl = "70/30 Holdout Validation of Kitchen Sink Forest"
print_boundary(lbl)
# Split the data 70/30, stratified to preserve the churn class ratio
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                 stratify=y, random_state=42)
# Fit kitchen sink forest on training data only
kitchen_sink_forest_cv = RandomForestClassifier(random_state=42) # Defaults
kitchen_sink_forest_cv = kitchen_sink_forest_cv.fit(X_train, y_train)
# Evaluate using AdvancedAnalytics display_split_metrics
print(f"{GOLD}")
forest_classifier.display_split_metrics(kitchen_sink_forest_cv,
                                        X_train, y_train, X_val, y_val)
# Calculate and display accuracy ratio and misclassification ratio
train_pred = kitchen_sink_forest_cv.predict(X_train)
val_pred = kitchen_sink_forest_cv.predict(X_val)
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
lbl = "Kitchen Sink 70/30 Validation"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc) # train/validation summary table
# Show feature importance from the properly trained model
print(f"{GOLD}\nTop 10 Feature Importance (from training data):")
forest_classifier.display_importance(kitchen_sink_forest_cv, X.columns,
                                     top='all', plot=True)
print(f"{RESET}")
# Step 4.1: Feature Selection using Decision Tree Importance
lbl = "Step 4.1: Feature Selection using Decision Tree Importance"
print_boundary(lbl)
threshold = 0.9 # Keep features covering the top 90% of tree importance
X_selected = tree_selection(X, y, threshold)
selected_features = X_selected.columns
# Step 4.2: Case Reduction - Construct Stratified Random Sample
lbl = "Step 4.2: Case Reduction using Stratified Random Sample"
print_boundary(lbl)
# Using different random_state (123) than Step 3 (42) to get a different
# split; only half the cases are kept to speed up the grid search below
train_size = 0.5
X_train, X_val, y_train, y_val = \
    train_test_split(X_selected, y, train_size=train_size,
                     stratify=y, random_state=123)
# Step 4.3: Random Forest Hyperparameter Optimization using selected features
lbl = "Step 4.3: Random Forest Hyperparameter Optimization"
print_boundary(lbl)
# NOTE(review): the grid search runs on the 50% stratified training sample
# from Step 4.2 (train_size=0.5), restricted to the tree-selected features.
# The original comment here claimed "70% of total cases (1750 cases)",
# which contradicted train_size above.
param_grid = {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [16, 18, 20, 22],
    'min_samples_leaf': [ 8, 9, 10, 16],
    'max_features': ['sqrt', 4, None]
}
# Sample output from a previous run, kept for reference:
"""
Grid Search: 1152 parameter combinations with
4-fold CV requires 4608 total fits.
Grid search completed in 22.3 seconds
Average time per parameter combination: 0.02 seconds
===============================================
*********** Optimum Hyperparameters ***********
===============================================
criterion........... gini
max_depth........... 4
max_features........ 4
min_samples_leaf.... 16
min_samples_split... 16
n_estimators........ 50
===============================================
****** Optimum Forest Performance Metrics *****
===============================================
                       TRAIN  VALIDATION   RATIO
 ACCURACY............ 0.7525      0.7408  1.0158
 MISCLASSIFICATION... 0.2475      0.2592  1.0474
-----------------------------------------------
"""
lbl = "Hyperparameters"
print_boundary(lbl, 47)
for parm in param_grid:
    print(f"{GREEN} {parm:.<20s}{GOLD}{param_grid[parm][0:]}{RESET}")
# Calculate and display grid search information
total_combinations = 1
for param_list in param_grid.values():
    total_combinations *= len(param_list)
total_fits = total_combinations * 4 # cv=4
njobs = -1
print(f"\n{GOLD}Grid Search: {total_combinations} parameter combinations with")
print(f"4-fold CV requires {total_fits} total fits.\n")
# FIX: format the fractions as percentages (0.9 -> "90%", 0.5 -> "50%").
# The original appended a literal '%' to the raw floats, printing the
# nonsensical "top 0.9% of features" and "0.5% of data".
print(f"Hyperparameter optimization uses only the top {threshold:.0%} of\n",
      f"features and randomly selected {train_size:.0%} of data.\n")
print(f"{GOLD}Parallel processing using {RED}n_jobs={njobs}")
# Start timing and run grid search
start_time = time.time()
print(f"\n{GREEN}Starting grid search...{RESET}")
rf = RandomForestClassifier(random_state=42)
# Grid Search using only selected features (X_train from Step 4.2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=4, scoring='accuracy', return_train_score=True,
                           n_jobs=njobs).fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"{GOLD}Grid search completed in {elapsed_time:.1f} seconds")
print(f"Average time per parameter combination: ",
      f"{elapsed_time/total_combinations:.2f} seconds{RESET}")
lbl = "Optimum Hyperparameters"
print_boundary(lbl, 47)
# Find the longest parameter value for consistent formatting
max_param_len = max(len(str(val)) for val in grid_search.best_params_.values())
sze = max_param_len + 3
for parm in grid_search.best_params_:
parameter = str(grid_search.best_params_[parm])
print(f"{GREEN} {parm:.<20s}{GOLD}{parameter:>{sze}s}{RESET}")
best_idx = np.argmin(grid_search.cv_results_['rank_test_score'])
val_acc = grid_search.cv_results_['mean_test_score'][best_idx]
train_acc = grid_search.cv_results_['mean_train_score'][best_idx]
lbl = "Optimum Forest Performance Metrics"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc)
lbl = "Best Random Forest Importance"
print_boundary(lbl, 47)
best_forest = grid_search.best_estimator_
importance = best_forest.feature_importances_
feature = X_train.columns
data = {'feature': feature, 'importance':importance}
df = pd.DataFrame(data)
df = df.sort_values(by='importance', ascending=False)
df['cumulative'] = df['importance'].cumsum()
print(df.to_string(index=False))
# Step 5: Holdout Validation (70/30 split) - Selected Features
lbl = "Step 5: Holdout Validation (Selected Features)"
print_boundary(lbl)
# Same 70/30 stratified split as Step 3 (random_state=42) so the results
# are comparable with the kitchen-sink holdout
Xt_full, Xv_full, yt, yv = train_test_split(X, y, test_size=0.3,
                                            stratify=y, random_state=42)
# Use only selected features for training
Xt = Xt_full[selected_features]
Xv = Xv_full[selected_features]
# Train optimized model on final training set.
# NOTE(review): hold_out_forest aliases best_forest, so .fit() below also
# refits grid_search.best_estimator_ in place (harmless here: sklearn's
# cross_validate in Step 6 clones the estimator before fitting)
hold_out_forest = best_forest
hold_out_forest = hold_out_forest.fit(Xt, yt)
print(f"{GOLD}")
forest_classifier.display_split_metrics(hold_out_forest, Xt, yt, Xv, yv)
print(f"{RESET}")
# Calculate final performance metrics
train_pred = hold_out_forest.predict(Xt)
val_pred = hold_out_forest.predict(Xv)
train_acc = accuracy_score(yt, train_pred)
val_acc = accuracy_score(yv, val_pred)
train_misc = 1.0 - train_acc
val_misc = 1.0 - val_acc
ratio_acc = train_acc / val_acc if val_acc > 0 else np.inf
ratio_misc = val_misc / train_misc if train_misc > 0 else np.inf
lbl = "Holdout Validation Performance Summary"
print_boundary(lbl, 47)
print_summary(train_acc, val_acc)
# Show feature importance from the optimized model (selected features only)
print(f"{GOLD}\nFeature Importance (optimized model - selected features):")
forest_classifier.display_importance(hold_out_forest, selected_features,
                                     top='all', plot=True)
# Step 6: K-Fold Cross Validation - Selected Features
lbl = "Step 6: K-Fold Cross-Validation (Selected Features)"
print_boundary(lbl)
# Suppress RuntimeWarnings (e.g. from std() over fold ratios containing inf
# inside print_acc_ratio)
warnings.filterwarnings('ignore', category=RuntimeWarning)
n = X_selected.shape[0]
# FIX: initialize best_k and best_train_acc before the loop. In the original,
# both were assigned only inside `if val_acc > best_val_acc:`; in the
# degenerate case where no fold ever beats 0 validation accuracy, the
# summary below would raise NameError.
best_k = 2
best_train_acc = 0.0
best_val_acc = 0
for k in range(2, 11): # Test 2-fold through 10-fold CV
    # cross_validate clones best_forest, so each k is fit independently
    scores = cross_validate(best_forest, X_selected, y, scoring='accuracy',
                            cv=k, return_train_score=True)
    # Calculate metrics
    train_acc = scores["train_score"].mean()
    val_acc = scores["test_score"].mean()
    print_acc_ratio(scores, n)
    # Track the fold count with the highest mean validation accuracy
    if val_acc > best_val_acc:
        best_k = k
        best_train_acc = train_acc
        best_val_acc = val_acc
print(f"\n{GOLD} Best K :",
      f"{RED}{best_k}-Fold{GOLD}")
lbl = "K-Fold Cross-Validation Performance Summary (Selected Features)"
print_boundary(lbl, 47)
print_summary(best_train_acc, best_val_acc)
lbl = "Customer Churn Random Forest Analysis Complete (Improved Version 2)"
print_boundary(lbl)