"""Stepwise OLS regression and residual diagnostics for OilProduction.csv.

The script reads the CSV, cleans and one-hot encodes it with
ReplaceImputeEncode, selects predictors with stepwise regression, fits a
statsmodels OLS model on the selected predictors, and saves three
standardized-residual diagnostic plots:

    residuals_vs_predicted.png
    residuals_vs_sequence.png
    residuals_vs_cooks_d.png
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
from AdvancedAnalytics.Regression import stepwise
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode

# Accent colour used for warning-level points, reference lines and labels.
gold = '#D4AF37'
# Residuals beyond this many sigma (but within the outlier threshold) are
# drawn in gold as a "warning" tier.
WARN_SIGMA = 3


def _plot_std_residuals(x, resid, threshold, xlabel, title, filename,
                        vline=None, vline_label=None):
    """Scatter standardized residuals against *x* and save the figure.

    Points are coloured by tier: cyan for regular points, gold for
    ``WARN_SIGMA < |resid| <= threshold`` and red for ``|resid| > threshold``.
    The red reference lines and red legend entry follow *threshold*, so the
    legend always describes what the colours actually mean.

    Args:
        x: Values for the horizontal axis, one per residual.
        resid: Standardized residuals (vertical axis).
        threshold: Outlier cutoff in sigma units.
        xlabel: Label for the horizontal axis.
        title: Figure title.
        filename: Path the figure is saved to (dpi=300).
        vline: Optional x position of a dashed red vertical reference line.
        vline_label: Legend label for the vertical line, if any.
    """
    # Work on plain arrays so boolean masking behaves the same whether the
    # inputs are pandas Series, an Index, or ndarrays.
    x = np.asarray(x)
    resid = np.asarray(resid)
    abs_resid = np.abs(resid)

    is_outlier = abs_resid > threshold
    is_warning = (abs_resid > WARN_SIGMA) & ~is_outlier
    is_regular = ~(is_outlier | is_warning)
    # The gold tier only exists when it lies strictly inside the threshold.
    show_warning = WARN_SIGMA < threshold

    plt.figure(figsize=(12, 6))

    # Each point is drawn exactly once, in the colour of its tier.
    plt.scatter(x[is_regular], resid[is_regular], alpha=0.9,
                color="cyan", edgecolors='k')
    if is_warning.any():
        plt.scatter(x[is_warning], resid[is_warning], alpha=0.9,
                    color=gold, edgecolors='k')
    if is_outlier.any():
        plt.scatter(x[is_outlier], resid[is_outlier], alpha=0.9,
                    color='r', edgecolors='k')

    # Horizontal reference lines: zero, warning tier, outlier threshold.
    plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
    if show_warning:
        for level in (WARN_SIGMA, -WARN_SIGMA):
            plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
                        linewidth=2)
    for level in (threshold, -threshold):
        plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)
    if vline is not None:
        plt.axvline(x=vline, color='r', linestyle='--')

    plt.xlabel(xlabel, color=gold, fontweight="bold", fontsize=14)
    plt.ylabel('Standardized Residuals', color=gold, fontweight="bold",
               fontsize=14)
    plt.title(title, color=gold, fontweight="bold", fontsize=16)
    plt.grid(True, linestyle='--', alpha=0.7)

    # A single custom legend. A second plt.legend() call would replace the
    # first, so every entry (including the vertical line) goes in here.
    legend_elements = []
    if show_warning:
        legend_elements.append(
            Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
                   markersize=10, label=f'{WARN_SIGMA} Sigma'))
    legend_elements.append(
        Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
               markersize=10, label=f'{threshold} Sigma'))
    if vline is not None and vline_label:
        legend_elements.append(
            Line2D([0], [0], color='r', linestyle='--', label=vline_label))

    plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
               prop={'size': 14, 'weight': 'bold'})
    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.show()


# Data Map for OilProduction.csv
# NOTE(review): each entry looks like [attribute type, (low, high)] for
# interval columns and [attribute type, allowed categories] for nominal
# columns -- confirm against the ReplaceImputeEncode documentation.
data_map = {
    'Log_Cum_Production':          [DT.Interval, (8.0, 15.0)],
    'Log_Proppant_LB':             [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate':               [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL':           [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth':              [DT.Interval, (8.0, 10.0)],
    'N_Stages':                    [DT.Interval, (2, 14)],
    'X_Well':                      [DT.Interval, (-100.0, -95.0)],
    'Y_Well':                      [DT.Interval, (30.0, 35.0)],
    'Operator':                    [DT.Nominal, tuple(range(1, 29))],
    'County':                      [DT.Nominal, tuple(range(1, 15))],
}
target = "Log_Cum_Production"  # Target attribute in the data file

df = pd.read_csv("OilProduction.csv")

# RIE - Data Cleaning & Preprocessing (the target is neither imputed nor
# encoded).
rie = ReplaceImputeEncode(data_map=data_map, nominal_encoding='one-hot',
                          no_impute=[target], no_encode=[target],
                          drop=False, display=True)
encoded_df = rie.fit_transform(df)
print("Encoded Data has", encoded_df.shape[1], "Columns\n")

# Stepwise feature selection; `selected` is used below as a collection of
# column names of encoded_df.
selected = stepwise(encoded_df, target, reg="linear", method="stepwise",
                    crit_in=0.05, crit_out=0.05, verbose=True).fit_transform()
print(len(selected), "out of", encoded_df.shape[1]-1,
      "Features were selected by Stepwise.")

# Extract target and predictors from encoded_df
y = encoded_df[target]
X = encoded_df[selected]

# Add constant for intercept, fit the model and display its summary
X_sm = sm.add_constant(X)
sm_model = sm.OLS(y, X_sm).fit()
print(sm_model.summary())

y_pred = sm_model.predict(X_sm)      # Predicted values
influence = OLSInfluence(sm_model)   # Statsmodels influence object

# Coerce to Series aligned with y so that `.index` and boolean masking work
# regardless of whether statsmodels hands back a Series or an ndarray.
std_resid = pd.Series(np.asarray(influence.resid_studentized_internal),
                      index=y.index)   # Studentized residuals
cooks_d = pd.Series(np.asarray(influence.cooks_distance[0]),
                    index=y.index)     # Cook's distance

# Vectorised counts of large residuals
abs_resid = std_resid.abs()
count_gt2 = int((abs_resid > 2).sum())
count_gt3 = int((abs_resid > 3).sum())
count_gt6 = int((abs_resid > 6).sum())
print(f"Abs Std. Residuals >2 {count_gt2: 4d}")
print(f"Abs Std. Residuals >3 {count_gt3: 4d}")
print(f"Abs Std. Residuals >6 {count_gt6: 4d}")

# Determine threshold based on sample size
threshold = 6 if len(y) > 500 else 3
print(f"Using threshold of ±{threshold} for standardized residuals")

plt.style.use('dark_background')

# Mask of outliers (points beyond threshold)
outlier_mask = abs_resid > threshold
print(f"Found {int(outlier_mask.sum())} outliers beyond threshold")

# Standardized Residuals vs Predicted Values
_plot_std_residuals(y_pred, std_resid, threshold,
                    xlabel="Predicted "+target,
                    title='Standardized Residuals vs Predicted',
                    filename='residuals_vs_predicted.png')

# Standardized Residuals vs Sequence Number
_plot_std_residuals(std_resid.index, std_resid, threshold,
                    xlabel='Observation Number',
                    title='Time Series of Standardized Residuals',
                    filename='residuals_vs_sequence.png')

# Standardized Residuals vs Cook's Distance, with a vertical cutoff at 4/n
# (n = number of observations in X).
_plot_std_residuals(cooks_d, std_resid, threshold,
                    xlabel="Cook's Distance",
                    title="Standardized Residuals vs Cook's Distance",
                    filename='residuals_vs_cooks_d.png',
                    vline=4/len(X), vline_label="Cook's D Threshold")
plt.close()