Spaces:

anly656
/

dr_jones

Sleeping

App Files Files Community

dr_jones / Regression /example_reg_stepwise.py

anly656

Upload 50 files

8643b59 verified about 2 months ago

raw

history blame contribute delete

8.5 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from matplotlib.lines import Line2D
	import statsmodels.api as sm
	from statsmodels.stats.outliers_influence import OLSInfluence
	from AdvancedAnalytics.Regression import stepwise
	from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode

	# Data Map for OilProduction.csv
	# Interval-typed attributes and the numeric pair stored with each one
	# (presumably the lower/upper limits used by ReplaceImputeEncode --
	# TODO confirm against the AdvancedAnalytics documentation).
	interval_limits = {
	    'Log_Cum_Production': (8.0, 15.0),
	    'Log_Proppant_LB': (6.0, 18.0),
	    'Log_Carbonate': (-4.0, 4.0),
	    'Log_Frac_Fluid_GL': (7.0, 18.0),
	    'Log_GrossPerforatedInterval': (4.0, 9.0),
	    'Log_LowerPerforation_xy': (8.0, 10.0),
	    'Log_UpperPerforation_xy': (8.0, 10.0),
	    'Log_TotalDepth': (8.0, 10.0),
	    'N_Stages': (2, 14),
	    'X_Well': (-100.0, -95.0),
	    'Y_Well': (30.0, 35.0),
	}
	data_map = {name: [DT.Interval, limits]
	            for name, limits in interval_limits.items()}
	# Nominal attributes carry a tuple of integer codes (1..28 and 1..14).
	data_map['Operator'] = [DT.Nominal, tuple(range(1, 29))]
	data_map['County'] = [DT.Nominal, tuple(range(1, 15))]

	target = "Log_Cum_Production"  # Identify Target Attribute in Data File
	df = pd.read_csv("OilProduction.csv")

	# RIE - Data Cleaning & Preprocessing
	# The target is excluded from both imputation and encoding.
	rie = ReplaceImputeEncode(
	    data_map=data_map,
	    nominal_encoding='one-hot',
	    no_impute=[target],
	    no_encode=[target],
	    drop=False,
	    display=True,
	)
	encoded_df = rie.fit_transform(df)
	n_encoded_columns = encoded_df.shape[1]
	print("Encoded Data has", n_encoded_columns, "Columns\n")

	# Stepwise feature selection on the encoded data (linear regression,
	# entry and removal criteria both 0.05).
	selector = stepwise(
	    encoded_df, target,
	    reg="linear", method="stepwise",
	    crit_in=0.05, crit_out=0.05,
	    verbose=True,
	)
	selected = selector.fit_transform()
	# Every encoded column except the target counts as a candidate feature.
	n_candidates = encoded_df.shape[1] - 1
	print(len(selected), "out of", n_candidates,
	      "Features were selected by Stepwise.")

	# Selected predictors and the target, both taken from encoded_df
	X = encoded_df[selected]
	y = encoded_df[target]

	# Add the intercept column, fit OLS, and print the fit summary
	X_sm = sm.add_constant(X)
	sm_model = sm.OLS(y, X_sm).fit()
	print(sm_model.summary())

	# Regression diagnostics taken from the fitted statsmodels OLS model
	y_pred = sm_model.predict(X_sm)                    # Predicted values
	influence = OLSInfluence(sm_model)                 # Statsmodels influence object
	std_resid = influence.resid_studentized_internal   # Studentized residuals
	cooks_d = influence.cooks_distance[0]              # Cook's D (first item of the returned pair)

	# Count residuals beyond 2, 3 and 6 in absolute value.  The comparison is
	# vectorised; int() keeps the "d" format spec below working on a plain int.
	abs_resid = np.abs(std_resid)
	count_gt2 = int((abs_resid > 2).sum())
	count_gt3 = int((abs_resid > 3).sum())
	count_gt6 = int((abs_resid > 6).sum())
	print(f"Abs Std. Residuals >2 {count_gt2: 4d}")
	print(f"Abs Std. Residuals >3 {count_gt3: 4d}")
	print(f"Abs Std. Residuals >6 {count_gt6: 4d}")

	# Determine threshold based on sample size: 6 when there are more than
	# 500 observations, otherwise 3.
	threshold = 6 if len(y) > 500 else 3
	print(f"Using threshold of ±{threshold} for standardized residuals")

	gold = '#D4AF37'
	plt.style.use('dark_background')
	# Masks used by the plots: beyond the active threshold (drawn red) and
	# beyond the fixed +/-3 band (drawn gold).
	outlier_mask = abs_resid > threshold
	yellow_mask = abs_resid > 3
	print(f"Found {np.sum(outlier_mask)} outliers beyond threshold")

	# Standardized Residuals vs Predicted Values
	plt.figure(figsize=(12, 6))
	# Points not beyond +/-3 are drawn in cyan
	plt.scatter(y_pred[~yellow_mask], std_resid[~yellow_mask],
	            alpha=0.9, color="cyan", edgecolors='k')
	# Points beyond +/-3 are drawn in gold; points beyond the active
	# threshold are drawn afterwards in red so they sit on top.
	if np.any(yellow_mask):
	    plt.scatter(y_pred[yellow_mask], std_resid[yellow_mask],
	                alpha=0.9, color=gold, edgecolors='k')
	if np.any(outlier_mask):
	    plt.scatter(y_pred[outlier_mask], std_resid[outlier_mask],
	                alpha=0.9, color='r', edgecolors='k')

	# Reference lines: zero, the fixed +/-3 band, and the active threshold.
	plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
	for level in (3, -3):
	    plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
	                linewidth=2)
	# The red lines follow `threshold` (not a hard-coded 6) so they always
	# match the cut-off that selects the red points.
	for level in (threshold, -threshold):
	    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

	plt.xlabel("Predicted "+target, color=gold,
	           fontweight="bold", fontsize=14)
	plt.ylabel('Standardized Residuals', color=gold,
	           fontweight="bold", fontsize=14)
	plt.title('Standardized Residuals vs Predicted',
	          color=gold, fontweight="bold", fontsize=16)
	plt.grid(True, linestyle='--', alpha=0.7)

	# Custom legend with explicit colors; the red entry is labelled with the
	# threshold actually in use.
	legend_elements = [
	    Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
	           markersize=10, label='3 Sigma'),
	    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
	           markersize=10, label=f'{threshold} Sigma'),
	]
	legend_properties = {'size': 14, 'weight': 'bold'}
	plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
	           prop=legend_properties)
	plt.tight_layout()
	plt.savefig('residuals_vs_predicted.png', dpi=300)
	plt.show()
	plt.close()  # release the figure once it has been saved and shown

	# Standardized Residuals vs Sequence Number
	plt.figure(figsize=(12, 6))
	# Points not beyond +/-3 are drawn in cyan, positioned by their index
	plt.scatter(std_resid.index[~yellow_mask], std_resid[~yellow_mask],
	            alpha=0.9, color="cyan", edgecolors='k')
	# Points beyond +/-3 are drawn in gold; points beyond the active
	# threshold are drawn afterwards in red so they sit on top.
	if np.any(yellow_mask):
	    plt.scatter(std_resid.index[yellow_mask], std_resid[yellow_mask],
	                alpha=0.9, color=gold, edgecolors='k')
	if np.any(outlier_mask):
	    plt.scatter(std_resid.index[outlier_mask], std_resid[outlier_mask],
	                alpha=0.9, color='r', edgecolors='k')

	# Reference lines: zero, the fixed +/-3 band, and the active threshold.
	plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
	for level in (3, -3):
	    plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
	                linewidth=2)
	# The red lines follow `threshold` (not a hard-coded 6) so they always
	# match the cut-off that selects the red points.
	for level in (threshold, -threshold):
	    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

	plt.xlabel('Observation Number', color=gold,
	           fontweight="bold", fontsize=14)
	plt.ylabel('Standardized Residuals', color=gold,
	           fontweight="bold", fontsize=14)
	plt.title('Time Series of Standardized Residuals', color=gold,
	          fontweight="bold", fontsize=16)
	plt.grid(True, linestyle='--', alpha=0.7)

	# Custom legend with explicit colors; the red entry is labelled with the
	# threshold actually in use.
	legend_elements = [
	    Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
	           markersize=10, label='3 Sigma'),
	    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
	           markersize=10, label=f'{threshold} Sigma'),
	]
	legend_properties = {'size': 14, 'weight': 'bold'}
	plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
	           prop=legend_properties)
	plt.tight_layout()
	plt.savefig('residuals_vs_sequence.png', dpi=300)
	plt.show()
	plt.close()  # release the figure once it has been saved and shown

	# Standardized Residuals vs Cook's Distance
	plt.figure(figsize=(12, 6))
	# Points not beyond +/-3 are drawn in cyan
	plt.scatter(cooks_d[~yellow_mask], std_resid[~yellow_mask],
	            alpha=0.9, color="cyan", edgecolors='k')
	# Points beyond +/-3 are drawn in gold; points beyond the active
	# threshold are drawn afterwards in red so they sit on top.
	if np.any(yellow_mask):
	    plt.scatter(cooks_d[yellow_mask], std_resid[yellow_mask],
	                alpha=0.9, color=gold, edgecolors='k')
	if np.any(outlier_mask):
	    plt.scatter(cooks_d[outlier_mask], std_resid[outlier_mask],
	                alpha=0.9, color='r', edgecolors='k')

	# Horizontal reference lines: zero, the fixed +/-3 band, and the active
	# threshold.
	plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
	for level in (3, -3):
	    plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
	                linewidth=2)
	# The red lines follow `threshold` (not a hard-coded 6) so they always
	# match the cut-off that selects the red points.
	for level in (threshold, -threshold):
	    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)
	# Vertical cut-off for Cook's distance at 4 / (number of observations)
	plt.axvline(x=4/len(X), color='r', linestyle='--',
	            label="Cook's D Threshold")

	plt.ylabel('Standardized Residuals', color=gold,
	           fontweight="bold", fontsize=14)
	plt.xlabel("Cook's Distance", color=gold,
	           fontweight="bold", fontsize=14)
	plt.title("Standardized Residuals vs Cook's Distance", color=gold,
	          fontweight="bold", fontsize=16)
	plt.grid(True, linestyle='--', alpha=0.7)

	# Single custom legend.  The Cook's D line gets its own handle here: a
	# second plt.legend() call replaces the first, so an earlier automatic
	# legend would never be shown.
	legend_elements = [
	    Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
	           markersize=10, label='3 Sigma'),
	    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
	           markersize=10, label=f'{threshold} Sigma'),
	    Line2D([0], [0], color='r', linestyle='--',
	           label="Cook's D Threshold"),
	]
	legend_properties = {'size': 14, 'weight': 'bold'}
	plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
	           prop=legend_properties)
	plt.tight_layout()
	plt.savefig('residuals_vs_cooks_d.png', dpi=300)
	plt.show()
	plt.close()