# dr_jones / Regression / example_reg_validation.py
# (repository upload metadata: "Upload 50 files", commit 8643b59, user anly656)
import pandas as pd
from sklearn.model_selection import train_test_split #Hold-Out Sets
from sklearn.linear_model import LinearRegression #Ordinary Regression
from sklearn.model_selection import cross_validate #k-fold validation
from sklearn.metrics import mean_squared_error #ASE
from AdvancedAnalytics.Regression import linreg, stepwise
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT
# Data dictionary: attribute name -> [data type, valid values].
# Interval attributes carry a (low, high) bounds tuple; nominal attributes
# carry the tuple of permitted category codes.
data_map = {
    # Target: log-transformed cumulative production
    'Log_Cum_Production':          [DT.Interval, (8.0, 15.0)],
    # Log-transformed completion / well measurements
    'Log_Proppant_LB':             [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate':               [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL':           [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth':              [DT.Interval, (8.0, 10.0)],
    # Counts and surface coordinates
    'N_Stages':                    [DT.Interval, (2, 14)],
    'X_Well':                      [DT.Interval, (-100.0, -95.0)],
    'Y_Well':                      [DT.Interval, (30.0, 35.0)],
    # Nominal attributes encoded as integer category codes
    'Operator':                    [DT.Nominal, tuple(range(1, 29))],
    'County':                      [DT.Nominal, tuple(range(1, 15))],
}
# Name of the target attribute in the data file
target = "Log_Cum_Production"

# Preprocessing: one-hot encode nominal attributes, keep every column
# (drop=False), and never impute the target itself.
encoder = ReplaceImputeEncode(data_map=data_map, nominal_encoding='one-hot',
                              no_impute=[target], drop=False, display=True)

raw_df = pd.read_csv("OilProduction.csv")
encoded_df = encoder.fit_transform(raw_df)
# --- Feature selection -------------------------------------------------
# (The original re-assigned `target` here; it is already defined above,
# so the duplicate assignment has been removed.)
print("\nSTEPWISE SELECTION")
# Stepwise linear-regression selection: enter/stay thresholds of 0.05.
selected = stepwise(encoded_df, target, reg="linear", method="stepwise",
                    crit_in=0.05, crit_out=0.05, verbose=True).fit_transform()
y = encoded_df[target]    # response vector
X = encoded_df[selected]  # design matrix restricted to the selected features
print("\nHOLD-OUT VALIDATION ==========================================")
# 70/30 train/validation split; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=12345)
# Display number of cases in each partition
print(f"Training/Validation Cases: {y_train.shape[0]}/{y_val.shape[0]} ")
# Fit OLS regression to the training partition only
lr = LinearRegression()
lr = lr.fit(X_train, y_train)
# Display hold-out metrics using AdvancedAnalytics
print("\nTraining and Validation Metrics:")
linreg.display_split_metrics(lr, X_train, y_train, X_val, y_val)
# Examine possible overfitting: compare average squared error (ASE)
# on the training vs. validation partitions.
train_predict = lr.predict(X_train)
val_predict = lr.predict(X_val)
ASE_train = mean_squared_error(y_train, train_predict)
ASE_val = mean_squared_error(y_val, val_predict)
overfit_ratio = ASE_val / ASE_train
# Fix: reuse overfit_ratio rather than recomputing the same quotient.
print(f"\nASE ratio (validation/train): {overfit_ratio:.2f}")
# Heuristic threshold for flagging overfitting
if overfit_ratio > 1.2:
    print("Warning: Potential Overfitting Detected")
print("\nN-FOLD CROSS-VALIDATION ======================================")
def print_ase_ratio(scores, n_folds, n):
    """Summarize cross-validation results: average train/test ASE and ratio.

    Parameters
    ----------
    scores : dict returned by sklearn's cross_validate with
        return_train_score=True and scoring="neg_mean_squared_error".
    n_folds : number of folds used in the cross-validation.
    n : total number of cases in the data.
    """
    neg_train = scores["train_score"]
    neg_test = scores["test_score"]
    # Scores are negative MSE; negate the means to report positive ASE.
    # The +/- spread shown is two standard deviations across folds.
    avg_train = -neg_train.mean()
    spread_train = 2.0 * neg_train.std()
    avg_test = -neg_test.mean()
    spread_test = 2.0 * neg_test.std()
    # Per-fold test/train ratio; the negative signs cancel.
    fold_ratios = neg_test / neg_train
    avg_ratio = fold_ratios.mean()
    spread_ratio = 2.0 * fold_ratios.std()
    print(f"\n====== {n_folds:.0f}-Fold Cross Validation ======")
    print(f" Train Avg. ASE..... {avg_train:.4f} +/-{spread_train:.4f}")
    print(f" Test Avg. ASE..... {avg_test:.4f} +/-{spread_test:.4f}")
    print(f" Mean ASE Ratio..... {avg_ratio:.4f} +/-{spread_ratio:.4f}")
    print(38 * "=")
    # Each fold holds out 1/n_folds of the cases for testing.
    cases_val = n * (1.0 / n_folds)
    cases_train = n - cases_val
    print(f"Equivalent to {n_folds:.0f} splits each with " +
          f"{cases_train:.0f}/{cases_val:.0f} Cases")
# --- k-fold cross-validation for k = 2..5 ------------------------------
n = X.shape[0]  # total number of cases
for n_folds in range(2, 6):
    # Fresh estimator per fold count. (The original also instantiated one
    # before the loop, which was immediately shadowed — removed.)
    lr = LinearRegression()
    # X already contains only the selected features, so re-indexing with
    # X[selected] was redundant; scoring is negative MSE (higher = better).
    scores = cross_validate(lr, X, y,
                            scoring="neg_mean_squared_error",
                            cv=n_folds, return_train_score=True)
    print_ase_ratio(scores, n_folds, n)