# FinRisk-AI/src/tests/pipeline.py
import json
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from config import DATA_PATH, MODELS_PATH, MODEL_FILENAME, FEATURES_PATH

# Top 10 features collected as user input (a subset of the full training schema)
SELECTED_FEATURES = [
    'Credit_Mix_Ordinal',
    'Outstanding_Debt',
    'Delay_from_due_date',
    'Payment_of_Min_Amount_Yes',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Delayed_Payment',
    'Installment_to_Income',
    'Num_Bank_Accounts',
    'Num_Credit_Inquiries'
]


def run_pipeline():
    """Train the stacking ensemble on the processed data and persist the artifacts."""
    print("Starting pipeline...")

    # Load processed training data
    train_processed_path = os.path.join(DATA_PATH, 'processed', 'train_processed.csv')
    if not os.path.exists(train_processed_path):
        raise FileNotFoundError(f"Processed training data not found at {train_processed_path}")
    train_processed = pd.read_csv(train_processed_path)

    # Split features and target. The model is trained on ALL features;
    # the top-10 list above is only the subset exposed to users.
    # Credit_Score is assumed to be integer-encoded upstream, since
    # XGBClassifier rejects string class labels.
    target = 'Credit_Score'
    if target not in train_processed.columns:
        raise ValueError("Target column 'Credit_Score' is missing from processed training data.")
    X = train_processed.drop(target, axis=1)
    y = train_processed[target]
    ALL_FEATURES = X.columns.tolist()
    print(f"Training model using ALL {len(ALL_FEATURES)} features...")
    print(f"Training data loaded: {X.shape[0]} samples, {X.shape[1]} features")

    # Define the base learners
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        class_weight='balanced',
        criterion='entropy',
        random_state=1907,
        n_jobs=-1
    )
    xgb_model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=1907,
        verbosity=0
    )

    # Stack the base learners: with cv=5, the logistic-regression meta-learner is
    # fit on out-of-fold predictions, which limits leakage from the base models.
    stacking_clf = StackingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(max_iter=1000, random_state=1907),
        cv=5
    )

    # Fit the full ensemble (base learners are refit on the complete training set)
    print("Training stacking classifier...")
    stacking_clf.fit(X, y)
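
    # Quick smoke test, not a generalization estimate: this is in-sample accuracy
    # only, since the held-out test split is not loaded in this script.
    train_acc = stacking_clf.score(X, y)
    print(f"In-sample accuracy: {train_acc:.3f}")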

    # Persist the trained model
    os.makedirs(MODELS_PATH, exist_ok=True)
    model_path = os.path.join(MODELS_PATH, MODEL_FILENAME)
    joblib.dump(stacking_clf, model_path)
    print(f"Model saved to {model_path}")

    # Save BOTH feature lists: the full training schema and the top-10 user-input subset
    feature_data = {
        "all_features": ALL_FEATURES,
        "top_10_features": SELECTED_FEATURES
    }
    with open(FEATURES_PATH, 'w') as f:
        json.dump(feature_data, f, indent=4)
    print("Feature lists saved.")

    print("Pipeline completed successfully.")


if __name__ == "__main__":
    run_pipeline()