| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| from sklearn.linear_model import LinearRegression |
| from sklearn.model_selection import cross_validate |
| from sklearn.metrics import mean_squared_error |
| from AdvancedAnalytics.Regression import linreg, stepwise |
| from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT |
|
|
# Attribute map handed to ReplaceImputeEncode: column name -> [data type, values].
# NOTE(review): for DT.Interval the tuple looks like (lower, upper) limits and
# for DT.Nominal the tuple of admissible category codes -- confirm against the
# AdvancedAnalytics documentation.
data_map = {
    'Log_Cum_Production': [DT.Interval, (8.0, 15.0)],
    'Log_Proppant_LB': [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate': [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL': [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy': [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy': [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth': [DT.Interval, (8.0, 10.0)],
    'N_Stages': [DT.Interval, (2, 14)],
    'X_Well': [DT.Interval, (-100.0, -95.0)],
    'Y_Well': [DT.Interval, (30.0, 35.0)],
    'Operator': [DT.Nominal, tuple(range(1, 29))],
    'County': [DT.Nominal, tuple(range(1, 15))],
}

# Name of the response column; it is listed in `no_impute` below.
target = "Log_Cum_Production"

rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding='one-hot',
    no_impute=[target],
    drop=False,
    display=True,
)

# Load the raw data and run it through the preprocessing object.
df = pd.read_csv("OilProduction.csv")
encoded_df = rie.fit_transform(df)
|
|
| |
# Repeats the earlier assignment of the response column name; kept so the
# statements below do not rely on it having been executed.
target = "Log_Cum_Production"

print("\nSTEPWISE SELECTION")
# NOTE(review): crit_in / crit_out are presumably the entry / removal
# significance thresholds of the stepwise search -- confirm against the
# AdvancedAnalytics documentation.
selected = stepwise(
    encoded_df,
    target,
    reg="linear",
    method="stepwise",
    crit_in=0.05,
    crit_out=0.05,
    verbose=True,
).fit_transform()

# `selected` is used as a column selector: y is the response column and
# X holds only the columns returned by the stepwise search.
y = encoded_df[target]
X = encoded_df[selected]
|
|
print("\nHOLD-OUT VALIDATION ==========================================")
# Keep 30% of the cases for validation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=12345)

print(f"Training/Validation Cases: {y_train.shape[0]}/{y_val.shape[0]} ")

lr = LinearRegression()
lr = lr.fit(X_train, y_train)

print("\nTraining and Validation Metrics:")
linreg.display_split_metrics(lr, X_train, y_train, X_val, y_val)

# Average squared error (mean squared error) on each partition.
train_predict = lr.predict(X_train)
val_predict = lr.predict(X_val)
ASE_train = mean_squared_error(y_train, train_predict)
ASE_val = mean_squared_error(y_val, val_predict)

# Validation ASE relative to training ASE; computed once and reused for
# both the report line and the overfitting check.
overfit_ratio = ASE_val / ASE_train
print(f"\nASE ratio (validation/train): {overfit_ratio:.2f}")

# Ratio above which the script prints an overfitting warning.
OVERFIT_RATIO_LIMIT = 1.2
if overfit_ratio > OVERFIT_RATIO_LIMIT:
    print("Warning: Potential Overfitting Detected")
|
|
print("\nN-FOLD CROSS-VALIDATION ======================================")


def print_ase_ratio(scores, n_folds, n):
    """Print a summary of train/test ASE for one cross-validation run.

    Args:
        scores: Mapping with "train_score" and "test_score" entries, each an
            array of per-fold scores. The values are negated before being
            reported, so they are expected to be negative mean squared
            errors (the caller scores with "neg_mean_squared_error").
        n_folds: Number of folds used for the run.
        n: Total number of cases that were split into folds.

    Returns:
        None. The summary is written to standard output.
    """
    train_scores = scores["train_score"]
    test_scores = scores["test_score"]

    # Flip the sign to report positive ASE; the "+/-" figure is two
    # standard deviations of the per-fold scores.
    train_ase = -train_scores.mean()
    train_sase = 2.0 * train_scores.std()
    val_ase = -test_scores.mean()
    val_sase = 2.0 * test_scores.std()

    # Per-fold test/train ratio; the two negative signs cancel.
    ratios = test_scores / train_scores
    ratio = ratios.mean()
    s_ratio = 2.0 * ratios.std()

    print(f"\n====== {n_folds:.0f}-Fold Cross Validation ======")
    print(f" Train Avg. ASE..... {train_ase:.4f} +/-{train_sase:.4f}")
    print(f" Test Avg. ASE..... {val_ase:.4f} +/-{val_sase:.4f}")
    print(f" Mean ASE Ratio..... {ratio:.4f} +/-{s_ratio:.4f}")
    print(38*"=")

    # Approximate size of the validation and training part of each fold.
    n_v = n*(1.0/n_folds)
    n_t = n - n_v
    print(f"Equivalent to {n_folds:.0f} splits each with "+
          f"{n_t:.0f}/{n_v:.0f} Cases")
|
|
# Total number of cases, used by print_ase_ratio to report fold sizes.
n = X.shape[0]

# One unfitted estimator is enough for every run: cross_validate fits
# clones of it, so it does not need to be rebuilt inside the loop.
lr = LinearRegression()

# Repeat the cross-validation with 2, 3, 4 and 5 folds.
for n_folds in range(2, 6):
    # X already holds only the stepwise-selected columns.
    scores = cross_validate(
        lr, X, y,
        scoring="neg_mean_squared_error",
        cv=n_folds,
        return_train_score=True,
    )
    print_ase_ratio(scores, n_folds, n)
|
|
|
|
|
|
|
|
|
|
|
|
|
|