# Loan_Payment_Prediction / backend /src /data_preprocessing.py
# msi
# Add app
# 06702e9
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Shared LabelEncoder instance; transform_data() re-fits it on each
# object-dtype column it encodes.
encoder = LabelEncoder()
# Shared StandardScaler instance; it is not referenced anywhere in the
# visible part of this file.
scale = StandardScaler()
def convert_money(x):
    """Convert a currency value such as ``"$1,234.50 "`` to a float.

    Args:
        x: A currency string (optional ``$`` sign, thousands separators and
            surrounding whitespace are tolerated) or an already-numeric value.

    Returns:
        The value as a ``float``; ``None`` for blank strings and for any
        type that is neither a string nor a number.

    Raises:
        ValueError: If a non-blank string still is not a valid number once
            the ``$`` sign and commas have been removed.
    """
    if isinstance(x, str):
        # Remove the currency symbol explicitly instead of blindly dropping
        # the first character, which would eat a digit of "1,234.50".
        cleaned = x.replace('$', '').replace(',', '').strip()
        if not cleaned:
            return None
        return float(cleaned)
    # NumPy integer scalars are not subclasses of int, so list them
    # explicitly; np.floating is included for symmetry.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    # Unexpected types (None, lists, ...) are reported as missing.
    return None
def clean_year(x):
    """Remove every ``'A'`` character from a string year value.

    Non-string inputs are handed back unchanged.
    """
    if not isinstance(x, str):
        return x
    stripped = x.replace('A', '')
    return stripped
# Load the stored lookup tables once at import time. transform_data() reads
# the 'State' and 'BankState' entries from this object. The path is relative,
# so it is resolved against the current working directory.
# JSON is read explicitly as UTF-8 rather than with the locale default.
with open("store.json", "r", encoding="utf-8") as file:
    x = json.load(file)
# Columns holding currency strings such as "$1,234.00".
_MONEY_COLUMNS = ['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']

# First two characters of the NAICS code -> sector label.
_NAICS_SECTOR_NAMES = {
    '0': 'Unknown',
    '11': 'Ag/For/Fish/Hunt',
    '21': 'Min/Quar/Oil_Gas_ext',
    '22': 'Utilities',
    '23': 'Construction',
    '31': 'Manufacturing',
    '32': 'Manufacturing',
    '33': 'Manufacturing',
    '42': 'Wholesale_trade',
    '44': 'Retail_trade',
    '45': 'Retail_trade',
    '48': 'Trans/Ware',
    '49': 'Trans/Ware',
    '51': 'Information',
    '52': 'Finance/Insurance',
    '53': 'RE/Rental/Lease',
    '54': 'Prof/Science/Tech',
    '55': 'Mgmt_comp',
    '56': 'Admin_sup/Waste_Mgmt_Rem',
    '61': 'Educational',
    '62': 'Healthcare/Social_assist',
    '71': 'Arts/Entertain/Rec',
    '72': 'Accom/Food_serv',
    '81': 'Other_no_pub',
    '92': 'Public_Admin',
}


def transform_data(df):
    """Clean raw loan records and derive the feature matrix and target.

    The input frame is copied first, so the caller's DataFrame is not
    modified.

    Steps, in order: parse currency columns, map ``State``/``BankState``
    through the tables loaded from ``store.json``, drop rows with missing
    key fields, derive ``Industry`` from ``NAICS``, build the indicator
    columns (``IsFranchise``, ``NewBusiness``, ``Default``, ...), derive the
    date-based features, drop the columns that are no longer needed, and
    label-encode whatever object-dtype columns remain.

    Args:
        df: DataFrame of loan records. It must contain every column
            referenced below (``State``, ``BankState``, ``NAICS``,
            ``ApprovalDate``, ``DisbursementDate``, ``Term``, ...).

    Returns:
        A tuple ``(X, y)`` where ``y`` is the ``Default`` column
        (0 when ``MIS_Status`` is ``'P I F'``, otherwise 1) and ``X`` is
        every other remaining column.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a value cannot be cast to the required integer type,
            e.g. a missing ``ApprovalFY``, ``FranchiseCode`` or
            ``ApprovalDate``.
    """
    # Work on a private copy so none of the in-place steps leak to the caller.
    df = df.copy()

    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    df[_MONEY_COLUMNS] = df[_MONEY_COLUMNS].apply(lambda col: col.map(convert_money))

    # Values absent from the stored tables become NaN and are removed by the
    # dropna() below.
    df['State'] = df['State'].map(x['State'])
    df['BankState'] = df['BankState'].map(x['BankState'])
    # NOTE(review): the previous version also ran
    # df['Industry'].map(x['Industry']) here, but that result was always
    # overwritten by the NAICS-derived value below before anything read it,
    # so the dead store was removed. If the stored 'Industry' table was meant
    # to encode the derived sector names, that still needs wiring up --
    # confirm against the training pipeline.
    df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')

    df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist', 'RevLineCr', 'LowDoc',
                      'DisbursementDate', 'MIS_Status'], inplace=True)
    df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str', 'DisbursementGross': 'float',
                    'BalanceGross': 'float', 'ChgOffPrinGr': 'float', 'GrAppv': 'float',
                    'SBA_Appv': 'float'})

    # Sector label from the first two characters of the NAICS code; rows
    # whose prefix is not in the table are dropped.
    df['Industry'] = df['NAICS'].astype('str').str[:2].map(_NAICS_SECTOR_NAMES)
    df.dropna(subset=['Industry'], inplace=True)

    # Assigned through .loc on purpose: a missing FranchiseCode leaves NaN,
    # so the integer cast fails loudly instead of silently labelling the row.
    df.loc[df['FranchiseCode'] <= 1, 'IsFranchise'] = 0
    df.loc[df['FranchiseCode'] > 1, 'IsFranchise'] = 1
    df = df.astype({'IsFranchise': 'int64'})

    # Keep only rows with recognised codes. The explicit copy avoids
    # assigning into a slice of the previous frame.
    valid_rows = (
        df['NewExist'].isin([1, 2])
        & df['RevLineCr'].isin(['Y', 'N'])
        & df['LowDoc'].isin(['Y', 'N'])
    )
    df = df[valid_rows].copy()

    # NewBusiness: 0 = existing business (NewExist == 1), 1 = new (NewExist == 2).
    df['NewBusiness'] = (df['NewExist'] == 2).astype('int64')
    # RevLineCr and LowDoc: 0 = No, 1 = Yes.
    df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
    df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
    df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)

    for date_col in ('ApprovalDate', 'DisbursementDate'):
        df[date_col] = pd.to_datetime(df[date_col])
    df['DisbursementFY'] = df['DisbursementDate'].dt.year.astype('int64')
    # Whole days between approval and disbursement, taken from the timedelta
    # directly rather than by slicing its string representation.
    df['DaysToDisbursement'] = (df['DisbursementDate'] - df['ApprovalDate']).dt.days.astype('int64')

    df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
    df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)

    df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate', 'NewExist',
                     'FranchiseCode', 'ChgOffDate', 'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr',
                     'SBA_Appv', 'MIS_Status', 'CreateJob', 'RetainedJob'], inplace=True)

    # Field for loans backed by Real Estate (loans with a term of at least 20 years)
    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)

    # Field for loans active during the Great Recession (2007-2009): either
    # disbursed in those years, or disbursed earlier with a term reaching 2007.
    disbursed_fy = df['DisbursementFY']
    disbursed_in_recession = disbursed_fy.between(2007, 2009)
    still_running_in_2007 = (disbursed_fy < 2007) & (disbursed_fy + (df['Term'] / 12) >= 2007)
    df['GreatRecession'] = np.where(disbursed_in_recession | still_running_in_2007, 1, 0)

    df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)

    # NOTE(review): the shared encoder is re-fitted on each column here, so
    # the integer codes depend only on the values present in this frame --
    # confirm this matches how the model was trained.
    for column in df.select_dtypes(include='object').columns:
        df[column] = encoder.fit_transform(df[column])

    y = df['Default']
    X = df.drop(columns='Default')
    return X, y