| import pandas as pd |
| import numpy as np |
| from sklearn.linear_model import LinearRegression |
| from itertools import combinations |
| from AdvancedAnalytics.Regression import linreg |
| from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT |
|
|
# Column metadata handed to ReplaceImputeEncode: each column maps to
# [data type, descriptor].
# NOTE(review): the tuples look like (lower, upper) limits for interval
# columns and the permitted category codes for the nominal column -- confirm
# against the AdvancedAnalytics documentation.
_interval_bounds = {
    "Log_Cum_Production": (8.0, 15.0),
    "Log_Proppant_LB": (6.0, 18.0),
    "Log_Carbonate": (-4.0, 4.0),
    "Log_Frac_Fluid_GL": (7.0, 18.0),
    "Log_GrossPerforatedInterval": (4.0, 9.0),
    "Log_LowerPerforation_xy": (8.0, 10.0),
    "Log_UpperPerforation_xy": (8.0, 10.0),
    "Log_TotalDepth": (8.0, 10.0),
    "N_Stages": (2, 14),
    "X_Well": (-100.0, -95.0),
    "Y_Well": (30.0, 35.0),
}
data_map = {
    column: [DT.Interval, bounds]
    for column, bounds in _interval_bounds.items()
}
# County is the only nominal column, so it is added after the interval ones.
data_map["County"] = [DT.Nominal, (1, 2, 3, 4)]
# Response column; every other encoded column is a candidate predictor.
target = "Log_Cum_Production"

# Load the data, keep rows whose County code is below 5, and discard the
# Operator column before encoding.
df = (
    pd.read_csv("OilProduction.csv")
    .loc[lambda wells: wells["County"] < 5]
    .drop(columns=["Operator"])
)

# Nominal columns are one-hot encoded.
# NOTE(review): display=True presumably makes the encoder print a summary of
# its work -- confirm against the AdvancedAnalytics documentation.
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding="one-hot",
    display=True,
)
encoded_df = rie.fit_transform(df)

# Split the encoded frame into the response and the list of predictor names.
y = encoded_df[target]
features = [name for name in encoded_df.columns if name != target]
|
|
| |
def evaluate_model(X, y):
    """Fit a linear regression on ``X`` and score it on the same data.

    Args:
        X: Two-dimensional feature table; ``X.shape[1]`` is read as the
            number of predictors.
        y: Response values, one per row of ``X``.

    Returns:
        A dict with three keys:
            ``"ase"``: sum of squared residuals divided by ``len(y)``.
            ``"bic"``: ``n * log(ase) + k * log(n)`` where ``n = len(y)``
                and ``k`` is the number of columns in ``X``.
            ``"model"``: the fitted ``LinearRegression`` instance.
    """
    fitted = LinearRegression().fit(X, y)
    errors = y - fitted.predict(X)

    n_obs = len(y)
    n_predictors = X.shape[1]

    # Average squared error, measured on the rows used for fitting.
    ase = np.sum(errors**2) / n_obs
    # The penalty term counts feature columns only; an intercept fitted by
    # the estimator is not part of the count.
    bic = n_obs * np.log(ase) + n_predictors * np.log(n_obs)

    return {"ase": ase, "bic": bic, "model": fitted}
|
|
| |
# One entry per evaluated feature subset, with keys
# "features", "n_features", "ase", "bic" and "model".
results = []

# Exhaustive search over every subset of 1..max_features predictors; the
# subset size is capped at 10 (or at the number of predictors, if smaller).
max_features = min(10, len(features))
for size in range(1, max_features + 1):
    print(f"Testing combinations with {size} features...")
    results.extend(
        {
            "features": subset,
            "n_features": size,
            # evaluate_model returns the "ase", "bic" and "model" entries.
            **evaluate_model(encoded_df[list(subset)], y),
        }
        for subset in combinations(features, size)
    )
|
|
| |
def _print_top(title, ranked, limit=5):
    """Print ``title`` followed by the first ``limit`` entries of ``ranked``.

    Args:
        title: Heading printed before the entries.
        ranked: Result dicts, already sorted best-first; each must carry the
            keys ``"features"``, ``"ase"`` and ``"bic"``.
        limit: Maximum number of entries to print.
    """
    print(title)
    for rank, entry in enumerate(ranked[:limit], start=1):
        print(
            f"{rank}. Features: {entry['features']}, "
            f"ASE: {entry['ase']:.4f}, BIC: {entry['bic']:.4f}"
        )


# Fail with a clear message instead of an IndexError further down.
if not results:
    raise ValueError(
        "No candidate models were evaluated; 'results' is empty."
    )

# Lower is better for both criteria, so ascending order puts the best first.
results_ase = sorted(results, key=lambda entry: entry["ase"])
results_bic = sorted(results, key=lambda entry: entry["bic"])

_print_top("\nTop 5 feature combinations by ASE:", results_ase)
_print_top("\nTop 5 feature combinations by BIC:", results_bic)

best_ase_features = results_ase[0]["features"]
best_bic_features = results_bic[0]["features"]

X_ase = encoded_df[list(best_ase_features)]
X_bic = encoded_df[list(best_bic_features)]

# Each result already holds the estimator that was fitted on exactly these
# columns and on y during the search, so reuse it rather than fitting again.
lr_ase = results_ase[0]["model"]
lr_bic = results_bic[0]["model"]

print("\nBest ASE Model:")
linreg.display_coef(lr_ase, X_ase, y)
linreg.display_metrics(lr_ase, X_ase, y)

print("\nBest BIC Model:")
linreg.display_coef(lr_bic, X_bic, y)
linreg.display_metrics(lr_bic, X_bic, y)
|
|