"""Lasso feature selection for log cumulative oil production.

Steps performed when the script runs:

1. Read ``OilProduction.csv`` and preprocess it with ``ReplaceImputeEncode``
   (one-hot encoding for the nominal attributes).
2. Pick the Lasso penalty ``alpha`` with 10-fold ``LassoCV``.
3. Fit a plain ``Lasso`` at that alpha on the full data.
4. Print the fit metrics and every feature whose coefficient is non-zero.
"""

import numpy as np
import pandas as pd
from AdvancedAnalytics.Regression import linreg
from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
from sklearn.linear_model import Lasso, LassoCV

# Attribute map handed to ReplaceImputeEncode: column name -> [type, values].
# Interval attributes carry a (low, high) pair and nominal attributes carry
# the tuple of integer codes -- presumably the accepted range / categories;
# confirm against the ReplaceImputeEncode documentation.
data_map = {
    "Log_Cum_Production": [DT.Interval, (8.0, 15.0)],
    "Log_Proppant_LB": [DT.Interval, (6.0, 18.0)],
    "Log_Carbonate": [DT.Interval, (-4.0, 4.0)],
    "Log_Frac_Fluid_GL": [DT.Interval, (7.0, 18.0)],
    "Log_GrossPerforatedInterval": [DT.Interval, (4.0, 9.0)],
    "Log_LowerPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_UpperPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_TotalDepth": [DT.Interval, (8.0, 10.0)],
    "N_Stages": [DT.Interval, (2, 14)],
    "X_Well": [DT.Interval, (-100.0, -95.0)],
    "Y_Well": [DT.Interval, (30.0, 35.0)],
    "Operator": [DT.Nominal, tuple(range(1, 29))],
    "County": [DT.Nominal, tuple(range(1, 15))],
}

# Name of the response column; everything else becomes a predictor.
target = "Log_Cum_Production"

# Load the raw data and run it through the preprocessing step.
df = pd.read_csv("OilProduction.csv")
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding="one-hot",
    display=True,
)
encoded_df = rie.fit_transform(df)

# Split the encoded frame into response vector and design matrix.
y = encoded_df[target]
X = encoded_df.drop(target, axis=1)
feature_names = X.columns.tolist()

# Choose the regularisation strength by 10-fold cross-validation.
lasso_cv = LassoCV(cv=10, random_state=42, max_iter=10000)
lasso_cv.fit(X, y)
optimal_alpha = lasso_cv.alpha_

print(f"\nOptimal alpha from CV: {optimal_alpha:.6f}")

# Fit a plain Lasso at the selected alpha on the full data set.
# NOTE(review): LassoCV also refits on the full data at alpha_, so this
# explicit refit presumably exists to have a standalone Lasso object for
# the reporting below -- confirm before removing.
final_lasso = Lasso(alpha=optimal_alpha, max_iter=10000)
final_lasso.fit(X, y)

# Features kept by the Lasso are those with a non-zero coefficient.
selected_indices = np.flatnonzero(final_lasso.coef_ != 0)
selected_features = [feature_names[i] for i in selected_indices]
selected_coef = final_lasso.coef_[selected_indices]

# Report the fit metrics, then one "name....coef" line per kept feature.
linreg.display_metrics(final_lasso, X, y)
print(f"\nSelected {len(selected_features)} features: COEF")
for feature, coef in zip(selected_features, selected_coef):
    print(f"{feature:.<28s}{coef:8.4f}")