# Spaces: anly656/dr_jones (Sleeping)
# File size: 3,946 Bytes
# 8643b59

import pandas as pd
from sklearn.model_selection import train_test_split   #Hold-Out Sets
from sklearn.linear_model    import LinearRegression   #Ordinary Regression
from sklearn.model_selection import cross_validate     #k-fold validation
from sklearn.metrics         import mean_squared_error #ASE
from AdvancedAnalytics.Regression          import linreg, stepwise
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT

# Attribute map handed to ReplaceImputeEncode. Every column name maps to a
# two-item list: the DT data type followed by a tuple.
# NOTE(review): the tuple is presumably the (low, high) valid range for
# interval columns and the allowed category codes for nominal columns --
# confirm against the AdvancedAnalytics documentation.
data_map = {
    "Log_Cum_Production": [DT.Interval, (8.0, 15.0)],
    "Log_Proppant_LB": [DT.Interval, (6.0, 18.0)],
    "Log_Carbonate": [DT.Interval, (-4.0, 4.0)],
    "Log_Frac_Fluid_GL": [DT.Interval, (7.0, 18.0)],
    "Log_GrossPerforatedInterval": [DT.Interval, (4.0, 9.0)],
    "Log_LowerPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_UpperPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_TotalDepth": [DT.Interval, (8.0, 10.0)],
    "N_Stages": [DT.Interval, (2, 14)],
    "X_Well": [DT.Interval, (-100.0, -95.0)],
    "Y_Well": [DT.Interval, (30.0, 35.0)],
    "Operator": [DT.Nominal, tuple(range(1, 29))],
    "County": [DT.Nominal, tuple(range(1, 15))],
}

# Name of the target attribute in the data file; it is listed in no_impute.
target = "Log_Cum_Production"

rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding="one-hot",
    no_impute=[target],
    drop=False,
    display=True,
)

# Load the raw data and run it through the configured encoder.
df = pd.read_csv("OilProduction.csv")
encoded_df = rie.fit_transform(df)

# Target attribute used for feature selection and for building y below.
target = "Log_Cum_Production"

# Choose the model features with stepwise selection; 0.05 is used as both
# the entry and the removal criterion.
print("\nSTEPWISE SELECTION")
selected = stepwise(
    encoded_df,
    target,
    reg="linear",
    method="stepwise",
    crit_in=0.05,
    crit_out=0.05,
    verbose=True,
).fit_transform()

# Response column and the design matrix restricted to the selected columns.
y = encoded_df[target]
X = encoded_df[selected]

print("\nHOLD-OUT  VALIDATION ==========================================")
# Hold out 30% of the cases for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=12345
)
print(f"Training/Validation Cases: {len(y_train)}/{len(y_val)} ")

# Fit ordinary least squares on the training portion only.
lr = LinearRegression().fit(X_train, y_train)

# Hold-out metrics as reported by AdvancedAnalytics.
print("\nTraining and Validation Metrics:")
linreg.display_split_metrics(lr, X_train, y_train, X_val, y_val)

# Compare the average squared error on both partitions; a validation error
# well above the training error points to overfitting.
train_predict = lr.predict(X_train)
val_predict = lr.predict(X_val)
ASE_train = mean_squared_error(y_train, train_predict)
ASE_val = mean_squared_error(y_val, val_predict)
overfit_ratio = ASE_val / ASE_train
print(f"\nASE ratio (validation/train): {overfit_ratio:.2f}")
# Warn once the validation ASE exceeds the training ASE by more than 20%.
if overfit_ratio > 1.2:
    print("Warning: Potential Overfitting Detected")

print("\nN-FOLD CROSS-VALIDATION ======================================")
def print_ase_ratio(scores, n_folds, n):
    """Print train/test ASE and their ratio for one k-fold run.

    Args:
        scores: Mapping with "train_score" and "test_score" arrays holding
            one value per fold. The fold means are sign-flipped before
            printing, so the values are expected to be negated errors.
        n_folds: Number of folds that produced ``scores``.
        n: Total number of cases that were divided into folds.

    Returns:
        None. The summary is written to standard output.
    """
    fold_train = scores["train_score"]
    fold_test = scores["test_score"]
    fold_ratios = fold_test / fold_train

    # Centers are fold means; each "+/-" figure is two standard deviations
    # taken across the folds.
    mean_train, spread_train = -fold_train.mean(), 2.0 * fold_train.std()
    mean_test, spread_test = -fold_test.mean(), 2.0 * fold_test.std()
    mean_ratio, spread_ratio = fold_ratios.mean(), 2.0 * fold_ratios.std()

    # Approximate partition sizes implied by an even n_folds-way split.
    held_out = n * (1.0 / n_folds)
    fitted = n - held_out

    print(f"\n====== {n_folds:.0f}-Fold Cross Validation ======")
    print(f"  Train Avg. ASE..... {mean_train:.4f} +/-{spread_train:.4f}")
    print(f"  Test  Avg. ASE..... {mean_test:.4f} +/-{spread_test:.4f}")
    print(f"  Mean ASE Ratio..... {mean_ratio:.4f} +/-{spread_ratio:.4f}")
    print("=" * 38)
    print(
        f"Equivalent to {n_folds:.0f} splits each with "
        f"{fitted:.0f}/{held_out:.0f} Cases"
    )

# Total number of cases, used only to report the approximate fold sizes.
n = X.shape[0]

# Repeat k-fold cross-validation for k = 2..5 and summarize each run.
# X is passed as-is: it was built from the selected columns, so selecting
# them again on every iteration only made a redundant copy.
for n_folds in range(2, 6):
    # Fresh, unfitted estimator for every fold count.
    lr = LinearRegression()
    scores = cross_validate(
        lr,
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=n_folds,
        return_train_score=True,
    )
    print_ase_ratio(scores, n_folds, n)