# dr_jones / Regression / example_grid_search.py
# anly656's picture
# Upload 50 files
# 8643b59 verified
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from itertools import combinations
from AdvancedAnalytics.Regression import linreg
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT
# Attribute metadata passed to ReplaceImputeEncode as its data_map argument:
# attribute name -> [DT type, tuple of values].
# NOTE(review): for DT.Interval the tuple is presumably (lower, upper) limits and
# for DT.Nominal the allowed categories -- confirm against AdvancedAnalytics docs.
_interval_limits = {
    'Log_Cum_Production': (8.0, 15.0),
    'Log_Proppant_LB': (6.0, 18.0),
    'Log_Carbonate': (-4.0, 4.0),
    'Log_Frac_Fluid_GL': (7.0, 18.0),
    'Log_GrossPerforatedInterval': (4.0, 9.0),
    'Log_LowerPerforation_xy': (8.0, 10.0),
    'Log_UpperPerforation_xy': (8.0, 10.0),
    'Log_TotalDepth': (8.0, 10.0),
    'N_Stages': (2, 14),
    'X_Well': (-100.0, -95.0),
    'Y_Well': (30.0, 35.0),
}
data_map = {
    attribute: [DT.Interval, limits]
    for attribute, limits in _interval_limits.items()
}
# County is the only nominal attribute; it is added last to keep the key order.
data_map['County'] = [DT.Nominal, (1, 2, 3, 4)]
# Name of the target (response) column in the data file
target = "Log_Cum_Production"

# Read the data, keep rows whose County code is below 5, drop the Operator column
df = pd.read_csv("OilProduction.csv")
df = df.loc[df["County"] < 5].drop(columns=["Operator"])

# Run ReplaceImputeEncode with the data_map, requesting one-hot encoding
# for nominal attributes
rie = ReplaceImputeEncode(
    data_map=data_map,
    nominal_encoding='one-hot',
    display=True,
)
encoded_df = rie.fit_transform(df)

# Split the encoded frame into the target and the candidate feature names
y = encoded_df[target]
features = encoded_df.columns[encoded_df.columns != target].tolist()
# Function to evaluate feature combination
def evaluate_model(X, y):
    """Fit a LinearRegression on (X, y) and score it on the same data.

    Parameters
    ----------
    X : two-dimensional feature table; its column count (``X.shape[1]``)
        is used as the number of features k.
    y : target values, one per row of X.

    Returns
    -------
    dict
        "ase"   : average squared error, i.e. the sum of squared
                  residuals divided by n = len(y).
        "bic"   : n * log(ase) + k * log(n).
        "model" : the fitted LinearRegression instance.
    """
    fitted = LinearRegression().fit(X, y)

    n_obs = len(y)
    n_predictors = X.shape[1]

    # In-sample residuals: observed minus predicted
    errors = y - fitted.predict(X)

    ase = np.sum(errors ** 2) / n_obs
    bic = n_obs * np.log(ase) + n_predictors * np.log(n_obs)
    return {"ase": ase, "bic": bic, "model": fitted}
def _score_subset(columns):
    """Evaluate one feature subset and return the record stored in ``results``."""
    scored = evaluate_model(encoded_df[list(columns)], y)
    return {
        "features": columns,
        "n_features": len(columns),
        "ase": scored["ase"],
        "bic": scored["bic"],
        "model": scored["model"],
    }


# One record per evaluated feature subset
results = []

# Evaluate every subset of 1..max_features features; the cap of 10 keeps the
# number of combinations computationally feasible
max_features = min(10, len(features))
for k in range(1, max_features + 1):
    print(f"Testing combinations with {k} features...")
    results.extend(_score_subset(combo) for combo in combinations(features, k))
def _print_top(heading, ranked, limit=5):
    """Print ``heading`` and then the first ``limit`` records of ``ranked``."""
    print(heading)
    for i, result in enumerate(ranked[:limit]):
        print(f"{i+1}. Features: {result['features']}, ASE: {result['ase']:.4f}, BIC: {result['bic']:.4f}")


# Two rankings of the same records, each in ascending order (smallest first)
results_ase = sorted(results, key=lambda entry: entry["ase"])
results_bic = sorted(results, key=lambda entry: entry["bic"])

# Show the five leading subsets under each criterion
_print_top("\nTop 5 feature combinations by ASE:", results_ase)
_print_top("\nTop 5 feature combinations by BIC:", results_bic)
# Take the top-ranked feature subset under each criterion
best_ase_features, best_bic_features = (
    ranking[0]["features"] for ranking in (results_ase, results_bic)
)

# Refit one LinearRegression per winning subset
X_ase = encoded_df[list(best_ase_features)]
X_bic = encoded_df[list(best_bic_features)]
lr_ase = LinearRegression().fit(X_ase, y)
lr_bic = LinearRegression().fit(X_bic, y)

# Report coefficients and metrics, ASE winner first, then BIC winner
for banner, final_model, X_best in (
    ("\nBest ASE Model:", lr_ase, X_ase),
    ("\nBest BIC Model:", lr_bic, X_bic),
):
    print(banner)
    linreg.display_coef(final_model, X_best, y)
    linreg.display_metrics(final_model, X_best, y)