| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from matplotlib.lines import Line2D |
| import statsmodels.api as sm |
| from statsmodels.stats.outliers_influence import OLSInfluence |
| from AdvancedAnalytics.Regression import stepwise |
| from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode |
|
|
| |
# ---------------------------------------------------------------------------
# Describe the columns, load the raw data and encode it.
# ---------------------------------------------------------------------------
# Interval-scaled columns and the 2-tuple passed to ReplaceImputeEncode for
# each one (presumably the accepted (lower, upper) range -- confirm against
# the AdvancedAnalytics documentation).
interval_limits = {
    'Log_Cum_Production': (8.0, 15.0),
    'Log_Proppant_LB': (6.0, 18.0),
    'Log_Carbonate': (-4.0, 4.0),
    'Log_Frac_Fluid_GL': (7.0, 18.0),
    'Log_GrossPerforatedInterval': (4.0, 9.0),
    'Log_LowerPerforation_xy': (8.0, 10.0),
    'Log_UpperPerforation_xy': (8.0, 10.0),
    'Log_TotalDepth': (8.0, 10.0),
    'N_Stages': (2, 14),
    'X_Well': (-100.0, -95.0),
    'Y_Well': (30.0, 35.0),
}

# Nominal columns and the integer codes listed for each: 1..28 and 1..14.
nominal_levels = {
    'Operator': tuple(range(1, 29)),
    'County': tuple(range(1, 15)),
}

# Assemble the map in the layout used throughout this script:
# column name -> [data type, allowed values]; intervals first, then nominals.
data_map = {
    column: [DT.Interval, limits]
    for column, limits in interval_limits.items()
}
data_map.update(
    (column, [DT.Nominal, levels])
    for column, levels in nominal_levels.items()
)

target = "Log_Cum_Production"
df = pd.read_csv("OilProduction.csv")

# The target is listed in both no_impute and no_encode so it is passed
# through as-is; nominal columns are one-hot encoded with drop=False.
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding='one-hot',
    no_impute=[target],
    no_encode=[target],
    drop=False,
    display=True,
)
encoded_df = rie.fit_transform(df)
n_encoded_columns = encoded_df.shape[1]
print("Encoded Data has", n_encoded_columns, "Columns\n")
|
|
| |
# ---------------------------------------------------------------------------
# Stepwise feature selection on the encoded data.
# ---------------------------------------------------------------------------
# The same 0.05 criterion is used for entering and for leaving the model.
selector = stepwise(
    encoded_df,
    target,
    reg="linear",
    method="stepwise",
    crit_in=0.05,
    crit_out=0.05,
    verbose=True,
)
selected = selector.fit_transform()

# Every encoded column except the target counts as a candidate feature.
n_candidates = encoded_df.shape[1]-1
print(len(selected), "out of", n_candidates,
      "Features were selected by Stepwise.")

# Design matrix and response restricted to the selected features.
X = encoded_df[selected]
y = encoded_df[target]
|
|
| |
# ---------------------------------------------------------------------------
# Ordinary least squares fit on the selected features.
# ---------------------------------------------------------------------------
# sm.OLS does not add an intercept on its own, so a constant column is
# added to the design matrix first.
X_sm = sm.add_constant(X)

ols_spec = sm.OLS(y, X_sm)
sm_model = ols_spec.fit()

fit_report = sm_model.summary()
print(fit_report)
|
|
# ---------------------------------------------------------------------------
# Residual diagnostics: fitted values, studentized residuals, Cook's distance.
# ---------------------------------------------------------------------------
y_pred = sm_model.predict(X_sm)
influence = OLSInfluence(sm_model)

# Internally studentized residuals; called "standardized" in the rest of the
# script.
std_resid = influence.resid_studentized_internal

# cooks_distance is indexed at [0] to take its first component (the distances
# themselves, per the statsmodels documentation).
cooks_d = influence.cooks_distance[0]

# Count the residuals beyond each cut-off with one vectorised comparison
# instead of a Python-level loop per cut-off. int() keeps the ' 4d' format
# spec working on a plain Python integer.
abs_std_resid = np.abs(std_resid)
count_gt2 = int(np.sum(abs_std_resid > 2))
count_gt3 = int(np.sum(abs_std_resid > 3))
count_gt6 = int(np.sum(abs_std_resid > 6))

print(f"Abs Std. Residuals >2 {count_gt2: 4d}")
print(f"Abs Std. Residuals >3 {count_gt3: 4d}")
print(f"Abs Std. Residuals >6 {count_gt6: 4d}")
|
|
| |
# Outlier cut-off for the standardized residuals: 6 when there are more than
# 500 observations, otherwise 3.
threshold = 6 if len(y) > 500 else 3
print(f"Using threshold of ±{threshold} for standardized residuals")
|
|
# ---------------------------------------------------------------------------
# Shared plot styling and the masks that colour the points in every figure.
# ---------------------------------------------------------------------------
# Accent colour used for axis labels, titles and the 3-sigma markers.
gold = '#D4AF37'
plt.style.use('dark_background')

# Take the absolute value once and derive both masks from it.
abs_resid = np.abs(std_resid)
outlier_mask = abs_resid > threshold   # drawn in red
yellow_mask = abs_resid > 3            # drawn in gold
# Message typo fixed: it previously ended in "threshol".
print(f"Found {np.sum(outlier_mask)} outliers beyond threshold")
| |
| |
# ---------------------------------------------------------------------------
# Figure 1: standardized residuals against the predicted target.
# ---------------------------------------------------------------------------
plt.figure(figsize=(12, 6))

# Points are painted in layers so the most extreme class ends up on top:
# cyan = within +/-3, gold = beyond +/-3, red = beyond +/-threshold.
plt.scatter(y_pred[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
if np.any(yellow_mask):
    plt.scatter(y_pred[yellow_mask], std_resid[yellow_mask],
                alpha=0.9, color=gold, edgecolors='k')
if np.any(outlier_mask):
    plt.scatter(y_pred[outlier_mask], std_resid[outlier_mask],
                alpha=0.9, color='r', edgecolors='k')

# The gold 3-sigma band is only meaningful when the red cut-off lies beyond
# it; with threshold == 3 every gold point is repainted red anyway.
show_gold_band = threshold > 3

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
if show_gold_band:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
                    linewidth=2)
# The red lines follow the cut-off that selects the red points. They used to
# be fixed at +/-6 even when the threshold was 3.
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

plt.xlabel("Predicted "+target, color=gold,
           fontweight="bold", fontsize=14)
plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.title('Standardized Residuals vs Predicted',
          color=gold, fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# Hand-built legend: the scatter calls carry no labels, so proxy artists
# describe the colour classes. The red label reports the real cut-off.
legend_elements = []
if show_gold_band:
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))

legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_predicted.png', dpi=300)
plt.show()
# Release the figure once shown, as the Cook's distance figure already does.
plt.close()
|
|
| |
# ---------------------------------------------------------------------------
# Figure 2: standardized residuals in observation (index) order.
# ---------------------------------------------------------------------------
plt.figure(figsize=(12, 6))

# The x-axis is the index of std_resid. Points are layered cyan -> gold ->
# red so the most extreme class is drawn last and stays visible.
plt.scatter(std_resid.index[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
if np.any(yellow_mask):
    plt.scatter(std_resid.index[yellow_mask], std_resid[yellow_mask],
                alpha=0.9, color=gold, edgecolors='k')
if np.any(outlier_mask):
    plt.scatter(std_resid.index[outlier_mask], std_resid[outlier_mask],
                alpha=0.9, color='r', edgecolors='k')

# The gold 3-sigma band is only meaningful when the red cut-off lies beyond
# it; with threshold == 3 every gold point is repainted red anyway.
show_gold_band = threshold > 3

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
if show_gold_band:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
                    linewidth=2)
# The red lines follow the cut-off that selects the red points. They used to
# be fixed at +/-6 even when the threshold was 3.
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

plt.xlabel('Observation Number', color=gold,
           fontweight="bold", fontsize=14)
plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.title('Time Series of Standardized Residuals', color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# Hand-built legend: the scatter calls carry no labels, so proxy artists
# describe the colour classes. The red label reports the real cut-off.
legend_elements = []
if show_gold_band:
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))

legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_sequence.png', dpi=300)
plt.show()
# Release the figure once shown, as the Cook's distance figure already does.
plt.close()
|
|
| |
# ---------------------------------------------------------------------------
# Figure 3: standardized residuals against Cook's distance.
# ---------------------------------------------------------------------------
plt.figure(figsize=(12, 6))

# Points are layered cyan -> gold -> red so the most extreme class is drawn
# last and stays visible.
plt.scatter(cooks_d[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
if np.any(yellow_mask):
    plt.scatter(cooks_d[yellow_mask], std_resid[yellow_mask],
                alpha=0.9, color=gold, edgecolors='k')
if np.any(outlier_mask):
    plt.scatter(cooks_d[outlier_mask], std_resid[outlier_mask],
                alpha=0.9, color='r', edgecolors='k')

# The gold 3-sigma band is only meaningful when the red cut-off lies beyond
# it; with threshold == 3 every gold point is repainted red anyway.
show_gold_band = threshold > 3

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
if show_gold_band:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--', alpha=0.8,
                    linewidth=2)
# The red lines follow the cut-off that selects the red points. They used to
# be fixed at +/-6 even when the threshold was 3.
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)

# Vertical reference line at 4 divided by the number of rows in X.
cooks_cutoff = 4/len(X)
plt.axvline(x=cooks_cutoff, color='r', linestyle='--',
            label="Cook's D Threshold")

plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.xlabel("Cook's Distance", color=gold,
           fontweight="bold", fontsize=14)
plt.title("Standardized Residuals vs Cook's Distance", color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)

# Single legend call. The earlier plt.legend(fontsize=12) was replaced by
# the hand-built legend below, so the Cook's D line never got a legend entry.
# It is now listed explicitly alongside the sigma markers.
legend_elements = []
if show_gold_band:
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))
legend_elements.append(
    Line2D([0], [0], color='r', linestyle='--',
           label="Cook's D Threshold"))

legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_cooks_d.png', dpi=300)
plt.show()
plt.close()
|
|