"""Oil-production linear regression with hold-out and k-fold validation.

Reads OilProduction.csv, replace/impute/encodes the attributes, selects
predictors with stepwise regression, fits an OLS model, then checks for
overfitting with a 70/30 hold-out split and 2- to 5-fold cross-validation.
"""
import pandas as pd
from sklearn.linear_model import LinearRegression          # Ordinary Regression
from sklearn.metrics import mean_squared_error             # ASE
from sklearn.model_selection import cross_validate, train_test_split
from AdvancedAnalytics.Regression import linreg, stepwise
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT

# Attribute metadata for ReplaceImputeEncode: declared type plus the
# (low, high) bounds used for outlier/invalid-value detection.
data_map = {
    'Log_Cum_Production':          [DT.Interval, (8.0, 15.0)],
    'Log_Proppant_LB':             [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate':               [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL':           [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth':              [DT.Interval, (8.0, 10.0)],
    'N_Stages':                    [DT.Interval, (2, 14)],
    'X_Well':                      [DT.Interval, (-100.0, -95.0)],
    'Y_Well':                      [DT.Interval, (30.0, 35.0)],
    'Operator':                    [DT.Nominal, tuple(range(1, 29))],
    'County':                      [DT.Nominal, tuple(range(1, 15))],
}

target = "Log_Cum_Production"  # Identify Target Attribute in Data File

# One-hot encode nominals; never impute the target itself.
rie = ReplaceImputeEncode(data_map=data_map, nominal_encoding='one-hot',
                          no_impute=[target], drop=False, display=True)
df = pd.read_csv("OilProduction.csv")
encoded_df = rie.fit_transform(df)

# Stepwise selection picks the predictor subset (entry/stay p-value 0.05).
print("\nSTEPWISE SELECTION")
selected = stepwise(encoded_df, target, reg="linear", method="stepwise",
                    crit_in=0.05, crit_out=0.05,
                    verbose=True).fit_transform()
y = encoded_df[target]
X = encoded_df[selected]

print("\nHOLD-OUT VALIDATION ==========================================")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                  random_state=12345)
# Display number of cases
print(f"Training/Validation Cases: {y_train.shape[0]}/{y_val.shape[0]} ")

# Fit Regression Model to Training Data (fit returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

# Display hold-out metrics using AdvancedAnalytics
print("\nTraining and Validation Metrics:")
linreg.display_split_metrics(lr, X_train, y_train, X_val, y_val)

# Examine possible overfitting: validation ASE should not greatly
# exceed training ASE.
ASE_train = mean_squared_error(y_train, lr.predict(X_train))
ASE_val = mean_squared_error(y_val, lr.predict(X_val))
overfit_ratio = ASE_val / ASE_train
print(f"\nASE ratio (validation/train): {overfit_ratio:.2f}")
if overfit_ratio > 1.2:
    print("Warning: Potential Overfitting Detected")

print("\nN-FOLD CROSS-VALIDATION ======================================")


def print_ase_ratio(scores, n_folds, n):
    """Summarize a cross_validate result as ASE means, 2-sigma spreads,
    and the mean test/train ASE ratio.

    Parameters
    ----------
    scores : dict
        Output of sklearn cross_validate with
        scoring="neg_mean_squared_error" and return_train_score=True
        (scores are negated MSE, hence the sign flips below).
    n_folds : int
        Number of folds used.
    n : int
        Total number of cases, used to report the equivalent split sizes.
    """
    train_ase = -scores["train_score"].mean()
    train_sase = 2.0*scores["train_score"].std()
    val_ase = -scores["test_score"].mean()
    val_sase = 2.0*scores["test_score"].std()
    # Negative signs cancel in the ratio of negated MSEs.
    ratios = scores["test_score"]/scores["train_score"]
    ratio = ratios.mean()
    s_ratio = 2.0*ratios.std()
    print(f"\n====== {n_folds:.0f}-Fold Cross Validation ======")
    print(f" Train Avg. ASE..... {train_ase:.4f} +/-{train_sase:.4f}")
    print(f" Test Avg. ASE..... {val_ase:.4f} +/-{val_sase:.4f}")
    print(f" Mean ASE Ratio..... {ratio:.4f} +/-{s_ratio:.4f}")
    print(38*"=")
    # Each fold validates on ~n/n_folds cases and trains on the rest.
    n_v = n*(1.0/n_folds)
    n_t = n - n_v
    print(f"Equivalent to {n_folds:.0f} splits each with "+
          f"{n_t:.0f}/{n_v:.0f} Cases")


n = X.shape[0]
for n_folds in range(2, 6):
    lr = LinearRegression()
    # X already contains only the selected columns.
    scores = cross_validate(lr, X, y,
                            scoring="neg_mean_squared_error",
                            cv=n_folds, return_train_score=True)
    print_ase_ratio(scores, n_folds, n)