# dr_jones / Regression / example_reg_stepwise.py
# anly656's picture
# Upload 50 files
# 8643b59 verified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
from AdvancedAnalytics.Regression import stepwise
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
# Data map for OilProduction.csv: column name -> [data type, allowed values].
# NOTE(review): the tuples look like (low, high) limits for interval columns
# and the complete set of category codes for nominal columns -- confirm
# against the ReplaceImputeEncode documentation.
interval_limits = {
    'Log_Cum_Production': (8.0, 15.0),
    'Log_Proppant_LB': (6.0, 18.0),
    'Log_Carbonate': (-4.0, 4.0),
    'Log_Frac_Fluid_GL': (7.0, 18.0),
    'Log_GrossPerforatedInterval': (4.0, 9.0),
    'Log_LowerPerforation_xy': (8.0, 10.0),
    'Log_UpperPerforation_xy': (8.0, 10.0),
    'Log_TotalDepth': (8.0, 10.0),
    'N_Stages': (2, 14),
    'X_Well': (-100.0, -95.0),
    'Y_Well': (30.0, 35.0),
}
nominal_codes = {
    'Operator': tuple(range(1, 29)),  # codes 1 through 28
    'County': tuple(range(1, 15)),    # codes 1 through 14
}

# Interval columns first, then nominal columns, each as [type, values].
data_map = {
    name: [DT.Interval, limits] for name, limits in interval_limits.items()
}
data_map.update(
    {name: [DT.Nominal, codes] for name, codes in nominal_codes.items()}
)

# Column of the data file that the regression will predict.
target = "Log_Cum_Production"

# Load the raw data from the current working directory.
df = pd.read_csv("OilProduction.csv")
# Clean and encode the raw frame with ReplaceImputeEncode (RIE).
# The target column is excluded from both imputation and encoding.
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding='one-hot',
    no_impute=[target],
    no_encode=[target],
    drop=False,
    display=True,
)
encoded_df = rie.fit_transform(df)
n_encoded_columns = encoded_df.shape[1]
print("Encoded Data has", n_encoded_columns, "Columns\n")

# Stepwise feature selection for a linear model.
# NOTE(review): crit_in / crit_out are presumably the entry and removal
# significance levels -- confirm against AdvancedAnalytics.Regression.stepwise.
selector = stepwise(
    encoded_df,
    target,
    reg="linear",
    method="stepwise",
    crit_in=0.05,
    crit_out=0.05,
    verbose=True,
)
selected = selector.fit_transform()

# Every encoded column except the target was a candidate feature.
n_candidates = encoded_df.shape[1] - 1
print(len(selected), "out of", n_candidates,
      "Features were selected by Stepwise.")
# Split the encoded frame into the response and the selected predictors.
y = encoded_df[target]
X = encoded_df[selected]

# statsmodels OLS has no implicit intercept, so add a constant column.
X_sm = sm.add_constant(X)

# Fit ordinary least squares and report the fit.
ols = sm.OLS(y, X_sm)
sm_model = ols.fit()
print(sm_model.summary())

# Fitted values and influence diagnostics for the residual plots.
y_pred = sm_model.predict(X_sm)
influence = OLSInfluence(sm_model)
std_resid = influence.resid_studentized_internal  # studentized residuals
cooks_d = influence.cooks_distance[0]             # first element: Cook's D

# Count observations whose absolute residual exceeds 2, 3 and 6.
abs_resid = np.abs(std_resid)
count_gt2 = int((abs_resid > 2).sum())
count_gt3 = int((abs_resid > 3).sum())
count_gt6 = int((abs_resid > 6).sum())
print(f"Abs Std. Residuals >2 {count_gt2: 4d}")
print(f"Abs Std. Residuals >3 {count_gt3: 4d}")
print(f"Abs Std. Residuals >6 {count_gt6: 4d}")
# Choose the outlier cutoff from the sample size: 6 for more than 500
# observations, otherwise 3.
threshold = 3
if len(y) > 500:
    threshold = 6
print(f"Using threshold of ±{threshold} for standardized residuals")

# Shared plot styling for the figures drawn later in the script.
gold = '#D4AF37'
plt.style.use('dark_background')

# Boolean masks over the observations:
#   yellow_mask  - |residual| > 3 (plotted in gold)
#   outlier_mask - |residual| > threshold (plotted in red)
abs_std_resid = np.abs(std_resid)
outlier_mask = abs_std_resid > threshold
yellow_mask = abs_std_resid > 3
print(f"Found {np.sum(outlier_mask)} outliers beyond threshold")
# Figure: standardized residuals against the predicted target values.
plt.figure(figsize=(12, 6))

# Points within +/-3 are drawn first, in cyan.
plt.scatter(y_pred[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')

# Gold points (beyond +/-3) next, then red points (beyond the threshold)
# on top; a layer is skipped when its mask selects nothing.
for layer_mask, layer_color in ((yellow_mask, gold), (outlier_mask, 'r')):
    if np.any(layer_mask):
        plt.scatter(y_pred[layer_mask], std_resid[layer_mask],
                    alpha=0.9, color=layer_color, edgecolors='k')

# Reference lines: zero, +/-3 (gold, dashed) and +/-6 (red, solid).
plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
for level in (3, -3):
    plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8, linewidth=2)
for level in (6, -6):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

# Axis labels, title and grid.
label_style = dict(color=gold, fontweight="bold", fontsize=14)
plt.xlabel("Predicted "+target, **label_style)
plt.ylabel('Standardized Residuals', **label_style)
plt.title('Standardized Residuals vs Predicted',
          color=gold, fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# Hand-built legend entries so the colours are explicit.
legend_elements = [
    Line2D([0], [0], marker='o', color=entry_color,
           markerfacecolor=entry_color, markersize=10, label=entry_label)
    for entry_color, entry_label in ((gold, '3 Sigma'), ('r', '6 Sigma'))
]
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)

# Save to disk, then display.
plt.tight_layout()
plt.savefig('residuals_vs_predicted.png', dpi=300)
plt.show()
# Figure: standardized residuals against the observation index.
plt.figure(figsize=(12, 6))
obs_index = std_resid.index

# Points within +/-3 are drawn first, in cyan.
plt.scatter(obs_index[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')

# Gold points (beyond +/-3) next, then red points (beyond the threshold)
# on top; a group is skipped when its mask selects nothing.
for group_mask, group_color in ((yellow_mask, gold), (outlier_mask, 'r')):
    if np.any(group_mask):
        plt.scatter(obs_index[group_mask], std_resid[group_mask],
                    alpha=0.9, color=group_color, edgecolors='k')

# Reference lines: zero, +/-3 (gold, dashed) and +/-6 (red, solid).
plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
for sigma in (3, -3):
    plt.axhline(y=sigma, color=gold, linestyle='--', alpha=0.8, linewidth=2)
for sigma in (6, -6):
    plt.axhline(y=sigma, color='r', linestyle='-', linewidth=1.5)

# Axis labels, title and grid.
axis_font = dict(color=gold, fontweight="bold", fontsize=14)
plt.xlabel('Observation Number', **axis_font)
plt.ylabel('Standardized Residuals', **axis_font)
plt.title('Time Series of Standardized Residuals', color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# Hand-built legend entries so the colours are explicit.
legend_elements = [
    Line2D([0], [0], marker='o', color=swatch, markerfacecolor=swatch,
           markersize=10, label=caption)
    for swatch, caption in ((gold, '3 Sigma'), ('r', '6 Sigma'))
]
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)

# Save to disk, then display.
plt.tight_layout()
plt.savefig('residuals_vs_sequence.png', dpi=300)
plt.show()
# Figure: standardized residuals against Cook's distance.
plt.figure(figsize=(12, 6))

# Points within +/-3 are drawn in cyan.
plt.scatter(cooks_d[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')

# Points beyond +/-3 in gold, then points beyond the threshold in red on top.
if np.any(yellow_mask):
    plt.scatter(cooks_d[yellow_mask], std_resid[yellow_mask],
                alpha=0.9, color=gold, edgecolors='k')
if np.any(outlier_mask):
    plt.scatter(cooks_d[outlier_mask], std_resid[outlier_mask],
                alpha=0.9, color='r', edgecolors='k')

# Horizontal reference lines: zero, +/-3 (gold, dashed) and +/-6 (red, solid).
plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
plt.axhline(y=3, color=gold, linestyle='--', alpha=0.8, linewidth=2)
plt.axhline(y=-3, color=gold, linestyle='--', alpha=0.8, linewidth=2)
plt.axhline(y=6, color='r', linestyle='-', linewidth=1.5)
plt.axhline(y=-6, color='r', linestyle='-', linewidth=1.5)

# Vertical line at 4 / (number of observations). Keep the artist handle so
# it can be listed in the legend below.
cooks_line = plt.axvline(x=4/len(X), color='r', linestyle='--',
                         label="Cook's D Threshold")

plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.xlabel("Cook's Distance", color=gold,
           fontweight="bold", fontsize=14)
plt.title("Standardized Residuals vs Cook's Distance", color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# One legend for the whole figure. A second plt.legend() call replaces the
# first on the same axes, so the Cook's D line is included here explicitly
# instead of in a separate, earlier legend call.
legend_elements = [
    Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
           markersize=10, label='3 Sigma'),
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label='6 Sigma'),
    cooks_line,
]
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)

# Save to disk, display, then release the figure.
plt.tight_layout()
plt.savefig('residuals_vs_cooks_d.png', dpi=300)
plt.show()
plt.close()