"""Select predictors of log cumulative oil production with a lasso regression.

The script reads ``OilProduction.csv``, preprocesses it with
``ReplaceImputeEncode`` (one-hot encoding for the nominal columns), chooses
the lasso penalty by 10-fold cross-validation, fits a lasso at that penalty
and prints the fit metrics together with every non-zero coefficient.
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LassoCV
from AdvancedAnalytics.Regression import linreg
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT

# Column name -> [data type, values] as consumed by ReplaceImputeEncode.
# NOTE(review): the tuples are presumably (lower, upper) bounds for interval
# columns and the permitted category codes for nominal columns -- confirm
# against the AdvancedAnalytics documentation.
data_map = {
    "Log_Cum_Production": [DT.Interval, (8.0, 15.0)],
    "Log_Proppant_LB": [DT.Interval, (6.0, 18.0)],
    "Log_Carbonate": [DT.Interval, (-4.0, 4.0)],
    "Log_Frac_Fluid_GL": [DT.Interval, (7.0, 18.0)],
    "Log_GrossPerforatedInterval": [DT.Interval, (4.0, 9.0)],
    "Log_LowerPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_UpperPerforation_xy": [DT.Interval, (8.0, 10.0)],
    "Log_TotalDepth": [DT.Interval, (8.0, 10.0)],
    "N_Stages": [DT.Interval, (2, 14)],
    "X_Well": [DT.Interval, (-100.0, -95.0)],
    "Y_Well": [DT.Interval, (30.0, 35.0)],
    "Operator": [DT.Nominal, tuple(range(1, 29))],
    "County": [DT.Nominal, tuple(range(1, 15))],
}

# Identify the target attribute in the data file (defined once and reused).
target = "Log_Cum_Production"

df = pd.read_csv("OilProduction.csv")

rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding="one-hot",
    display=True,
)
encoded_df = rie.fit_transform(df)

# Define target and features: every encoded column except the target is a
# candidate predictor.
y = encoded_df[target]
X = encoded_df.drop(target, axis=1)
feature_names = X.columns.tolist()

# Find the optimal alpha using 10-fold cross-validation.
lasso_cv = LassoCV(cv=10, random_state=42, max_iter=10000)
lasso_cv.fit(X, y)
optimal_alpha = lasso_cv.alpha_
print(f"\nOptimal alpha from CV: {optimal_alpha:.6f}")

# Fit the final model with the optimal alpha; this plain Lasso estimator is
# the one handed to linreg.display_metrics below.
final_lasso = Lasso(alpha=optimal_alpha, max_iter=10000)
final_lasso.fit(X, y)

# Selected features are those whose coefficient was not shrunk to zero.
selected_indices = np.flatnonzero(final_lasso.coef_)
selected_features = [feature_names[i] for i in selected_indices]
selected_coef = final_lasso.coef_[selected_indices]

linreg.display_metrics(final_lasso, X, y)

print(f"\nSelected {len(selected_features)} features: COEF")
for feature, coef in zip(selected_features, selected_coef):
    # Feature name left-aligned and dot-padded to 28 columns, then the
    # coefficient in an 8-wide field with 4 decimals.
    print(f"{feature:.<28s}{coef:8.4f}")