import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
from   matplotlib.lines import Line2D
import statsmodels.api  as sm
from   statsmodels.stats.outliers_influence import OLSInfluence
from AdvancedAnalytics.Regression           import stepwise
from AdvancedAnalytics.ReplaceImputeEncode  import DT, ReplaceImputeEncode

# Data map for OilProduction.csv, handed to ReplaceImputeEncode below.
# Each entry has the form  attribute name -> [DT type, tuple of values].
# NOTE(review): the tuples are presumably (lower, upper) limits for interval
# attributes and the valid category codes for nominal ones -- confirm
# against the AdvancedAnalytics documentation.
interval_limits = {
    'Log_Cum_Production':          (8.0, 15.0),
    'Log_Proppant_LB':             (6.0, 18.0),
    'Log_Carbonate':               (-4.0, 4.0),
    'Log_Frac_Fluid_GL':           (7.0, 18.0),
    'Log_GrossPerforatedInterval': (4.0, 9.0),
    'Log_LowerPerforation_xy':     (8.0, 10.0),
    'Log_UpperPerforation_xy':     (8.0, 10.0),
    'Log_TotalDepth':              (8.0, 10.0),
    'N_Stages':                    (2, 14),
    'X_Well':                      (-100.0, -95.0),
    'Y_Well':                      (30.0, 35.0),
}
nominal_codes = {
    'Operator': tuple(range(1, 29)),
    'County':   tuple(range(1, 15)),
}
# Interval attributes first, then nominal ones (same key order as before).
data_map = {
    **{name: [DT.Interval, limits]
       for name, limits in interval_limits.items()},
    **{name: [DT.Nominal, codes]
       for name, codes in nominal_codes.items()},
}

# Target attribute in the data file; excluded from imputation and encoding.
target = "Log_Cum_Production"
df = pd.read_csv("OilProduction.csv")

# RIE - data cleaning & preprocessing with one-hot encoded nominals
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding='one-hot',
    no_impute=[target],
    no_encode=[target],
    drop=False,
    display=True,
)
encoded_df = rie.fit_transform(df)
print("Encoded Data has", len(encoded_df.columns), "Columns\n")

# Feature selection: stepwise linear regression over the encoded data,
# with crit_in and crit_out both set to 0.05.
selector = stepwise(
    encoded_df,
    target,
    reg="linear",
    method="stepwise",
    crit_in=0.05,
    crit_out=0.05,
    verbose=True,
)
selected = selector.fit_transform()
# Every encoded column except the target is a candidate feature.
n_candidates = encoded_df.shape[1] - 1
print(len(selected), "out of", n_candidates,
      "Features were selected by Stepwise.")

# Split the encoded data into the response and the selected predictors
y, X = encoded_df[target], encoded_df[selected]

# OLS needs an explicit constant column to estimate an intercept
X_sm = sm.add_constant(X)
ols = sm.OLS(y, X_sm)
sm_model = ols.fit()

# Regression summary table
print(sm_model.summary())

# Residual diagnostics for the fitted OLS model.
y_pred    = sm_model.predict(X_sm)               # In-sample predicted values
influence = OLSInfluence(sm_model)               # Statsmodels influence object
std_resid = influence.resid_studentized_internal # Internally studentized residuals
cooks_d   = influence.cooks_distance[0]          # First element of the Cook's D result

# Compute |residual| once and reuse it for the counts and the masks below,
# instead of looping over the values in Python for every cut-off.
abs_resid = np.abs(std_resid)
count_gt2 = int((abs_resid > 2).sum())
count_gt3 = int((abs_resid > 3).sum())
count_gt6 = int((abs_resid > 6).sum())
print(f"Abs Std. Residuals >2 {count_gt2: 4d}")
print(f"Abs Std. Residuals >3 {count_gt3: 4d}")
print(f"Abs Std. Residuals >6 {count_gt6: 4d}")

# Outlier cut-off depends on sample size: 6 for more than 500 observations,
# otherwise 3.
threshold = 6 if len(y) > 500 else 3
print(f"Using threshold of ±{threshold} for standardized residuals")

# Shared plot styling used by the figures that follow
gold = '#D4AF37'
plt.style.use('dark_background')

# Boolean masks: points beyond the chosen threshold (drawn red) and points
# beyond 3 (drawn gold).
outlier_mask = abs_resid > threshold
yellow_mask  = abs_resid > 3
print(f"Found {np.sum(outlier_mask)} outliers beyond threshold")
    
# Standardized Residuals vs Predicted Values
# Red points are selected with `threshold`, so the red reference lines and
# the red legend entry follow `threshold` too rather than a fixed 6.
plt.figure(figsize=(12, 6))
# Points not beyond +/-3 in cyan
plt.scatter(y_pred[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
# Highlighted tiers; red is drawn last so it over-plots gold
for tier_mask, tier_color in ((yellow_mask, gold), (outlier_mask, 'r')):
    if np.any(tier_mask):
        plt.scatter(y_pred[tier_mask], std_resid[tier_mask],
                    alpha=0.9, color=tier_color, edgecolors='k')

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
legend_elements = []
# The gold (3 sigma) tier is only distinguishable when the red tier starts
# above it; otherwise every gold point is covered by a red one.
if threshold > 3:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--',
                    alpha=0.8, linewidth=2)
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))

plt.xlabel("Predicted "+target, color=gold,
           fontweight="bold", fontsize=14)
plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.title('Standardized Residuals vs Predicted',
          color=gold, fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)
# Custom legend with explicit colors
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_predicted.png', dpi=300)
plt.show()

# Standardized Residuals vs Sequence Number
# The x-axis is the index of std_resid. Red points are selected with
# `threshold`, so the red reference lines and the red legend entry follow
# `threshold` too rather than a fixed 6.
plt.figure(figsize=(12, 6))
# Points not beyond +/-3 in cyan
plt.scatter(std_resid.index[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
# Highlighted tiers; red is drawn last so it over-plots gold
for tier_mask, tier_color in ((yellow_mask, gold), (outlier_mask, 'r')):
    if np.any(tier_mask):
        plt.scatter(std_resid.index[tier_mask], std_resid[tier_mask],
                    alpha=0.9, color=tier_color, edgecolors='k')

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
legend_elements = []
# The gold (3 sigma) tier is only distinguishable when the red tier starts
# above it; otherwise every gold point is covered by a red one.
if threshold > 3:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--',
                    alpha=0.8, linewidth=2)
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))

plt.xlabel('Observation Number', color=gold,
           fontweight="bold", fontsize=14)
plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.title('Time Series of Standardized Residuals', color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)
# Custom legend with explicit colors
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_sequence.png', dpi=300)
plt.show()

# Standardized Residuals vs Cook's Distance
# Red points are selected with `threshold`, so the red reference lines and
# the red legend entry follow `threshold` too rather than a fixed 6.
plt.figure(figsize=(12, 6))
# Points not beyond +/-3 in cyan
plt.scatter(cooks_d[~yellow_mask], std_resid[~yellow_mask],
            alpha=0.9, color="cyan", edgecolors='k')
# Highlighted tiers; red is drawn last so it over-plots gold
for tier_mask, tier_color in ((yellow_mask, gold), (outlier_mask, 'r')):
    if np.any(tier_mask):
        plt.scatter(cooks_d[tier_mask], std_resid[tier_mask],
                    alpha=0.9, color=tier_color, edgecolors='k')

plt.axhline(y=0, color='r', linestyle='-', linewidth=1.5)
legend_elements = []
# The gold (3 sigma) tier is only distinguishable when the red tier starts
# above it; otherwise every gold point is covered by a red one.
if threshold > 3:
    for level in (3, -3):
        plt.axhline(y=level, color=gold, linestyle='--',
                    alpha=0.8, linewidth=2)
    legend_elements.append(
        Line2D([0], [0], marker='o', color=gold, markerfacecolor=gold,
               markersize=10, label='3 Sigma'))
for level in (threshold, -threshold):
    plt.axhline(y=level, color='r', linestyle='-', linewidth=1.5)
legend_elements.append(
    Line2D([0], [0], marker='o', color='r', markerfacecolor='r',
           markersize=10, label=f'{threshold} Sigma'))

# Vertical cut-off at 4 / (number of rows in X). The line is kept as a
# legend handle: a second plt.legend() call replaces the first, so relying
# on a separate auto-generated legend would drop this entry.
cooks_line = plt.axvline(x=4/len(X), color='r', linestyle='--',
                         label="Cook's D Threshold")
legend_elements.append(cooks_line)

plt.ylabel('Standardized Residuals', color=gold,
           fontweight="bold", fontsize=14)
plt.xlabel("Cook's Distance", color=gold,
           fontweight="bold", fontsize=14)
plt.title("Standardized Residuals vs Cook's Distance", color=gold,
          fontweight="bold", fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)
# Single custom legend holding the sigma tiers and the Cook's D cut-off
legend_properties = {'size': 14, 'weight': 'bold'}
plt.legend(handles=legend_elements, loc='lower center', framealpha=0.9,
           prop=legend_properties)
plt.tight_layout()
plt.savefig('residuals_vs_cooks_d.png', dpi=300)
plt.show()
plt.close()