# dr_jones / Regression / example_reg_validation.py
# (repository upload metadata: "Upload 50 files", commit 8643b59, user anly656)
import pandas as pd
from sklearn.model_selection import train_test_split #Hold-Out Sets
from sklearn.linear_model import LinearRegression #Ordinary Regression
from sklearn.model_selection import cross_validate #k-fold validation
from sklearn.metrics import mean_squared_error #ASE
from AdvancedAnalytics.Regression import linreg, stepwise
from AdvancedAnalytics.ReplaceImputeEncode import ReplaceImputeEncode, DT
# Data dictionary: attribute name -> [data type, valid values].
# Interval attributes carry a (low, high) bounds tuple; nominal attributes
# carry the tuple of permitted category codes.
data_map = {
    # Target: log-transformed cumulative production
    'Log_Cum_Production':          [DT.Interval, (8.0, 15.0)],
    # Log-transformed completion / well measurements
    'Log_Proppant_LB':             [DT.Interval, (6.0, 18.0)],
    'Log_Carbonate':               [DT.Interval, (-4.0, 4.0)],
    'Log_Frac_Fluid_GL':           [DT.Interval, (7.0, 18.0)],
    'Log_GrossPerforatedInterval': [DT.Interval, (4.0, 9.0)],
    'Log_LowerPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_UpperPerforation_xy':     [DT.Interval, (8.0, 10.0)],
    'Log_TotalDepth':              [DT.Interval, (8.0, 10.0)],
    # Counts and surface coordinates
    'N_Stages':                    [DT.Interval, (2, 14)],
    'X_Well':                      [DT.Interval, (-100.0, -95.0)],
    'Y_Well':                      [DT.Interval, (30.0, 35.0)],
    # Nominal attributes encoded as integer category codes
    'Operator':                    [DT.Nominal, tuple(range(1, 29))],
    'County':                      [DT.Nominal, tuple(range(1, 15))],
}
# Name of the target attribute in the data file
target = "Log_Cum_Production"

# Preprocessing: one-hot encode nominal attributes, keep every column
# (drop=False), and never impute the target itself.
encoder = ReplaceImputeEncode(data_map=data_map, nominal_encoding='one-hot',
                              no_impute=[target], drop=False, display=True)

raw_df = pd.read_csv("OilProduction.csv")
encoded_df = encoder.fit_transform(raw_df)
# --- Feature selection -------------------------------------------------
# (The original re-assigned `target` here; it is already defined above,
# so the duplicate assignment has been removed.)
print("\nSTEPWISE SELECTION")
# Stepwise linear-regression selection: enter/stay thresholds of 0.05.
selected = stepwise(encoded_df, target, reg="linear", method="stepwise",
                    crit_in=0.05, crit_out=0.05, verbose=True).fit_transform()
y = encoded_df[target]    # response vector
X = encoded_df[selected]  # design matrix restricted to the selected features
print("\nHOLD-OUT VALIDATION ==========================================")
# 70/30 train/validation split; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=12345)
# Display number of cases in each partition
print(f"Training/Validation Cases: {y_train.shape[0]}/{y_val.shape[0]} ")
# Fit OLS regression to the training partition only
lr = LinearRegression()
lr = lr.fit(X_train, y_train)
# Display hold-out metrics using AdvancedAnalytics
print("\nTraining and Validation Metrics:")
linreg.display_split_metrics(lr, X_train, y_train, X_val, y_val)
# Examine possible overfitting: compare average squared error (ASE)
# on the training vs. validation partitions.
train_predict = lr.predict(X_train)
val_predict = lr.predict(X_val)
ASE_train = mean_squared_error(y_train, train_predict)
ASE_val = mean_squared_error(y_val, val_predict)
overfit_ratio = ASE_val / ASE_train
# Fix: reuse overfit_ratio rather than recomputing the same quotient.
print(f"\nASE ratio (validation/train): {overfit_ratio:.2f}")
# Heuristic threshold for flagging overfitting
if overfit_ratio > 1.2:
    print("Warning: Potential Overfitting Detected")
print("\nN-FOLD CROSS-VALIDATION ======================================")
def print_ase_ratio(scores, n_folds, n):
    """Summarize cross-validation results: average train/test ASE and ratio.

    Parameters
    ----------
    scores : dict returned by sklearn's cross_validate with
        return_train_score=True and scoring="neg_mean_squared_error".
    n_folds : number of folds used in the cross-validation.
    n : total number of cases in the data.
    """
    neg_train = scores["train_score"]
    neg_test = scores["test_score"]
    # Scores are negative MSE; negate the means to report positive ASE.
    # The +/- spread shown is two standard deviations across folds.
    avg_train = -neg_train.mean()
    spread_train = 2.0 * neg_train.std()
    avg_test = -neg_test.mean()
    spread_test = 2.0 * neg_test.std()
    # Per-fold test/train ratio; the negative signs cancel.
    fold_ratios = neg_test / neg_train
    avg_ratio = fold_ratios.mean()
    spread_ratio = 2.0 * fold_ratios.std()
    print(f"\n====== {n_folds:.0f}-Fold Cross Validation ======")
    print(f" Train Avg. ASE..... {avg_train:.4f} +/-{spread_train:.4f}")
    print(f" Test Avg. ASE..... {avg_test:.4f} +/-{spread_test:.4f}")
    print(f" Mean ASE Ratio..... {avg_ratio:.4f} +/-{spread_ratio:.4f}")
    print(38 * "=")
    # Each fold holds out 1/n_folds of the cases for testing.
    cases_val = n * (1.0 / n_folds)
    cases_train = n - cases_val
    print(f"Equivalent to {n_folds:.0f} splits each with " +
          f"{cases_train:.0f}/{cases_val:.0f} Cases")
# --- k-fold cross-validation for k = 2..5 ------------------------------
n = X.shape[0]  # total number of cases
for n_folds in range(2, 6):
    # Fresh estimator per fold count. (The original also instantiated one
    # before the loop, which was immediately shadowed — removed.)
    lr = LinearRegression()
    # X already contains only the selected features, so re-indexing with
    # X[selected] was redundant; scoring is negative MSE (higher = better).
    scores = cross_validate(lr, X, y,
                            scoring="neg_mean_squared_error",
                            cv=n_folds, return_train_score=True)
    print_ase_ratio(scores, n_folds, n)