"""Train, evaluate and pickle a Random Forest pipeline for loan approval.

The script reads ``loan_data.csv``, drops the ``Loan_ID`` column, encodes the
``Loan_Status`` target ('Y' -> 1, 'N' -> 0), fits a preprocessing +
RandomForestClassifier pipeline on a stratified 80/20 split, prints accuracy
and F1 on the held-out part, and pickles the fitted pipeline to
``loan_rf_pipeline.pkl``.
"""

import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Relative paths: both resolve against the current working directory.
DATA_PATH = "loan_data.csv"
MODEL_PATH = "loan_rf_pipeline.pkl"

# =====================
# Load dataset
# =====================
df = pd.read_csv(DATA_PATH)
print(df.head())

# The ID column is dropped so it is not used as a feature.
df = df.drop(columns=["Loan_ID"])

# Encode the target. Series.map leaves NaN for every value that is not a key
# of the mapping, so missing or unexpected labels are rejected here instead of
# surfacing later as an unrelated error from the split or the classifier.
encoded_target = df["Loan_Status"].map({"Y": 1, "N": 0})
invalid_labels = df.loc[encoded_target.isna(), "Loan_Status"]
if not invalid_labels.empty:
    raise ValueError(
        "Loan_Status must be 'Y' or 'N' for every row; found "
        f"{len(invalid_labels)} invalid row(s) with value(s) "
        f"{sorted(invalid_labels.astype(str).unique())}"
    )
df["Loan_Status"] = encoded_target.astype("int64")

# =====================
# Target and features
# =====================
X = df.drop("Loan_Status", axis=1)
y = df["Loan_Status"]

# =====================
# Column split
# =====================
# Only int64/float64 and object columns are routed to a transformer; with
# ColumnTransformer's default remainder='drop', any other dtype is discarded.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# =====================
# Preprocessing
# =====================
# Numeric columns: fill gaps with the median, then standardise.
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_features),
        ("cat", cat_transformer, categorical_features),
    ]
)

# =====================
# Random Forest Model
# =====================
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
)

# =====================
# Full Pipeline
# =====================
# Preprocessing lives inside the pipeline, so it is fitted on the training
# split only and is saved together with the model.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model),
    ]
)

# =====================
# Train-test split
# =====================
# Stratified on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# =====================
# Train
# =====================
rf_pipeline.fit(X_train, y_train)

# =====================
# Evaluation
# =====================
y_pred = rf_pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

# =====================
# Save model
# =====================
with open(MODEL_PATH, "wb") as f:
    pickle.dump(rf_pipeline, f)

print(f"✅ Loan Random Forest pipeline saved as {MODEL_PATH}")