"""Exhaustive best-subset selection for a linear model of oil production.

Reads OilProduction.csv, filters and encodes it with ReplaceImputeEncode,
then fits an OLS model for every feature combination up to ``max_features``
features and ranks the candidate models by training ASE and BIC.
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from itertools import combinations
from AdvancedAnalytics.Regression import linreg
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT

# Attribute map for ReplaceImputeEncode:
# column -> [data type, valid interval (lo, hi) or nominal category codes]
data_map = {
    'Log_Cum_Production':          [DT.Interval, (8.0, 15.0)],
    'Log_Proppant_LB':             [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate':               [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL':           [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth':              [DT.Interval, (8.0, 10.0)],
    'N_Stages':                    [DT.Interval, (2, 14)],
    'X_Well':                      [DT.Interval, (-100.0, -95.0)],
    'Y_Well':                      [DT.Interval, (30.0, 35.0)],
    'County':                      [DT.Nominal, (1, 2, 3, 4)],
}

# Target attribute in the data file (was assigned twice in the original;
# one definition suffices).
target = "Log_Cum_Production"

df = pd.read_csv("OilProduction.csv")
df = df[df["County"] < 5]           # keep only counties coded 1-4
df = df.drop(["Operator"], axis=1)  # drop column not present in data_map

rie = ReplaceImputeEncode(data_map=data_map, nominal_encoding='one-hot',
                          display=True)
encoded_df = rie.fit_transform(df)

y = encoded_df[target]
# Candidate predictors: every encoded column except the target.
# NOTE(review): subsets may include individual one-hot County dummies
# without the rest of the group -- confirm that is intended.
features = [col for col in encoded_df.columns if col != target]


def evaluate_model(X, y):
    """Fit OLS on X/y and score it on the training data.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns (n rows, k columns).
    y : pandas.Series
        Response variable, aligned with X.

    Returns
    -------
    dict
        "ase"   -- average squared error on the training data,
        "bic"   -- Schwarz Bayesian Information Criterion,
        "model" -- the fitted LinearRegression instance.
    """
    model = LinearRegression()
    model.fit(X, y)
    residuals = y - model.predict(X)
    n = len(y)
    p = X.shape[1] + 1              # estimated parameters, incl. intercept
    ase = np.sum(residuals**2) / n  # average squared error (SSE / n)
    # Gaussian-likelihood BIC: n*ln(ASE) + p*ln(n).  Counting the
    # intercept shifts every model's BIC by the same log(n), so the
    # ranking is identical to penalizing the k slopes alone, but the
    # reported values match the standard definition.
    bic = n * np.log(ase) + p * np.log(n)
    return {"ase": ase, "bic": bic, "model": model}


results = []

# Exhaustive search over all combinations of up to max_features
# predictors (capped for computational feasibility).
max_features = min(10, len(features))
for k in range(1, max_features + 1):
    print(f"Testing combinations with {k} features...")
    for combo in combinations(features, k):
        X = encoded_df[list(combo)]
        result = evaluate_model(X, y)
        # Do not retain the fitted model object: thousands of
        # combinations are scored and the two winners are refit below,
        # so storing each model only wastes memory.
        results.append({"features": combo, "n_features": k,
                        "ase": result["ase"], "bic": result["bic"]})

# Rank candidates by each criterion (ascending: lower is better).
results_ase = sorted(results, key=lambda x: x["ase"])
results_bic = sorted(results, key=lambda x: x["bic"])

print("\nTop 5 feature combinations by ASE:")
for i, result in enumerate(results_ase[:5]):
    print(f"{i+1}. Features: {result['features']}, "
          f"ASE: {result['ase']:.4f}, BIC: {result['bic']:.4f}")

print("\nTop 5 feature combinations by BIC:")
for i, result in enumerate(results_bic[:5]):
    print(f"{i+1}. Features: {result['features']}, "
          f"ASE: {result['ase']:.4f}, BIC: {result['bic']:.4f}")

# Refit the winning models for coefficient/metric reporting.
best_ase_features = results_ase[0]["features"]
best_bic_features = results_bic[0]["features"]
X_ase = encoded_df[list(best_ase_features)]
X_bic = encoded_df[list(best_bic_features)]
lr_ase = LinearRegression().fit(X_ase, y)
lr_bic = LinearRegression().fit(X_bic, y)

print("\nBest ASE Model:")
linreg.display_coef(lr_ase, X_ase, y)
linreg.display_metrics(lr_ase, X_ase, y)

print("\nBest BIC Model:")
linreg.display_coef(lr_bic, X_bic, y)
linreg.display_metrics(lr_bic, X_bic, y)