Spaces:

MISSAOUI
/

Loan_Payment_Prediction

Sleeping

Loan_Payment_Prediction / backend /src /data_preprocessing.py

msi

Add app

06702e9 12 months ago

4.74 kB

	import json
	import pandas as pd
	from sklearn.preprocessing import StandardScaler
	import numpy as np
	from sklearn.preprocessing import LabelEncoder

	# Shared preprocessing objects, instantiated once when the module is imported.
	# `scale` is not used by the code shown in this module.
	scale = StandardScaler()
	# `encoder` is refit by transform_data on each object-dtype column it encodes.
	encoder = LabelEncoder()
	def convert_money(x):
	    """Convert a currency-formatted value to ``float``.

	    Strings are stripped of surrounding whitespace and thousands separators.
	    A single leading symbol (such as ``$``) is removed only when the text does
	    not already start with a digit, sign or decimal point, so ``"$1,234.50"``,
	    ``"1,234.50"`` and ``" $1,234.50 "`` all yield ``1234.5``.

	    Args:
	        x: A string such as ``"$1,234.50"``, an ``int``/``float``, or any
	            other object.

	    Returns:
	        The value as ``float``; ``None`` when ``x`` is neither a string nor an
	        ``int``/``float``.

	    Raises:
	        ValueError: If a string is not numeric once cleaned (e.g. ``""``).
	    """
	    if isinstance(x, str):
	        cleaned = x.strip().replace(',', '')
	        # Only drop the first character when it is a prefix symbol; slicing
	        # unconditionally would eat the leading digit of values without "$".
	        if cleaned and not (cleaned[0].isdigit() or cleaned[0] in '+-.'):
	            cleaned = cleaned[1:]
	        return float(cleaned)
	    if isinstance(x, (int, float)):  # already numeric
	        return float(x)
	    return None  # unexpected types are mapped to a missing value

	def clean_year(x):
	    """Return ``x`` with every ``'A'`` character removed when it is a string.

	    Non-string values are handed back unchanged. The result for a string is
	    still a string; no numeric conversion happens here.
	    """
	    return x.replace('A', '') if isinstance(x, str) else x

	# Lookup tables loaded once at import time; transform_data reads the "State",
	# "BankState" and "Industry" entries. The path is resolved against the
	# current working directory.
	# The encoding is given explicitly because open() otherwise falls back to the
	# platform locale, while JSON text is UTF-8.
	with open("store.json", "r", encoding="utf-8") as file:
	    x = json.load(file)

	# Sector label for each two-character NAICS prefix ('0' is what a NAICS value
	# of 0 becomes after string conversion).
	_NAICS_SECTORS = {
	    '0': 'Unknown',
	    '11': 'Ag/For/Fish/Hunt',
	    '21': 'Min/Quar/Oil_Gas_ext',
	    '22': 'Utilities',
	    '23': 'Construction',
	    '31': 'Manufacturing',
	    '32': 'Manufacturing',
	    '33': 'Manufacturing',
	    '42': 'Wholesale_trade',
	    '44': 'Retail_trade',
	    '45': 'Retail_trade',
	    '48': 'Trans/Ware',
	    '49': 'Trans/Ware',
	    '51': 'Information',
	    '52': 'Finance/Insurance',
	    '53': 'RE/Rental/Lease',
	    '54': 'Prof/Science/Tech',
	    '55': 'Mgmt_comp',
	    '56': 'Admin_sup/Waste_Mgmt_Rem',
	    '61': 'Educational',
	    '62': 'Healthcare/Social_assist',
	    '71': 'Arts/Entertain/Rec',
	    '72': 'Accom/Food_serv',
	    '81': 'Other_no_pub',
	    '92': 'Public_Admin',
	}


	def transform_data(df):
	    """Clean raw loan records and derive the model features and target.

	    The input frame is copied first, so the caller's DataFrame is never
	    modified and the function can safely be called again on the same frame.

	    Steps, in order:
	      * parse the money columns with ``convert_money``;
	      * map ``State``/``BankState``/``Industry`` through the tables in the
	        module-level ``x`` (loaded from ``store.json``);
	      * drop rows missing required fields, cast dtypes, and derive
	        ``Industry`` from the first two characters of ``NAICS``;
	      * keep only rows with ``NewExist`` in {1, 2} and ``RevLineCr``/``LowDoc``
	        in {'Y', 'N'};
	      * add ``IsFranchise``, ``NewBusiness``, ``Default``, ``DisbursementFY``,
	        ``DaysToDisbursement``, ``SBA_AppvPct``, ``AppvDisbursed``,
	        ``RealEstate``, ``GreatRecession`` and ``DisbursedGreaterAppv``;
	      * drop identifier/leakage columns and label-encode the remaining
	        object-dtype columns with the module-level ``encoder``.

	    Args:
	        df: DataFrame holding every column referenced below; a missing column
	            raises ``KeyError``.

	    Returns:
	        ``(X, y)`` where ``y`` is the ``Default`` column (0 when ``MIS_Status``
	        is ``'P I F'``, otherwise 1) and ``X`` is every other remaining column.

	    Raises:
	        KeyError: If a referenced column (or ``x`` entry) is absent.
	        ValueError: If a value cannot be converted (non-numeric money text,
	            non-integer year, missing dates when computing day counts, ...).
	    """
	    # Private copy: the original code assigned columns and dropped rows on the
	    # caller's object, so a second call re-mapped already-mapped values to NaN.
	    df = df.copy()

	    money_cols = ['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']
	    # Column-wise Series.map is the version-independent equivalent of
	    # DataFrame.applymap, which is deprecated since pandas 2.1.
	    df[money_cols] = df[money_cols].apply(lambda col: col.map(convert_money))

	    df['State'] = df['State'].map(x['State'])
	    df['BankState'] = df['BankState'].map(x['BankState'])
	    # NOTE(review): this result is overwritten by the NAICS-derived label
	    # further down before anything reads it; kept so the required input
	    # columns/keys stay the same. Confirm the intended order of the two steps.
	    df['Industry'] = df['Industry'].map(x['Industry'])
	    df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')

	    # In-place operations below act on the private copy only.
	    df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist', 'RevLineCr', 'LowDoc',
	                      'DisbursementDate', 'MIS_Status'], inplace=True)
	    df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str',
	                    'DisbursementGross': 'float', 'BalanceGross': 'float',
	                    'ChgOffPrinGr': 'float', 'GrAppv': 'float', 'SBA_Appv': 'float'})

	    # Industry label from the first two characters of the NAICS code; codes
	    # without an entry in the table become NaN and those rows are dropped.
	    df['Industry'] = df['NAICS'].astype('str').str[:2].map(_NAICS_SECTORS)
	    df.dropna(subset=['Industry'], inplace=True)

	    # IsFranchise: 0 for FranchiseCode <= 1, 1 for FranchiseCode > 1.
	    df.loc[df['FranchiseCode'] <= 1, 'IsFranchise'] = 0
	    df.loc[df['FranchiseCode'] > 1, 'IsFranchise'] = 1
	    df = df.astype({'IsFranchise': 'int64'})

	    # Keep only rows with recognised codes. One combined mask plus an explicit
	    # copy avoids assigning onto a filtered slice of another frame.
	    valid_rows = (
	        df['NewExist'].isin([1, 2])
	        & df['RevLineCr'].isin(['Y', 'N'])
	        & df['LowDoc'].isin(['Y', 'N'])
	    )
	    df = df[valid_rows].copy()

	    # NewBusiness: 0 = Existing business (NewExist == 1), 1 = New business (NewExist == 2)
	    df['NewBusiness'] = np.where(df['NewExist'] == 2, 1, 0)

	    # RevLineCr and LowDoc: 0 = No, 1 = Yes
	    df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
	    df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
	    df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)

	    date_cols = ['ApprovalDate', 'DisbursementDate']
	    df[date_cols] = df[date_cols].apply(pd.to_datetime)
	    # Cast keeps the int64 dtype the original element-wise `.year` lookup produced.
	    df['DisbursementFY'] = df['DisbursementDate'].dt.year.astype('int64')
	    # Whole days between approval and disbursement, read from the timedelta
	    # directly instead of slicing its string representation.
	    df['DaysToDisbursement'] = (df['DisbursementDate'] - df['ApprovalDate']).dt.days.astype('int64')

	    df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
	    df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)
	    df = df.astype({'IsFranchise': 'int64', 'NewBusiness': 'int64'})
	    df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate',
	                     'NewExist', 'FranchiseCode', 'ChgOffDate', 'DisbursementDate', 'BalanceGross',
	                     'ChgOffPrinGr', 'SBA_Appv', 'MIS_Status', 'CreateJob', 'RetainedJob'],
	            inplace=True)

	    # Field for loans backed by Real Estate (loans with a term of at least 20 years)
	    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)

	    # Field for loans active during the Great Recession (2007-2009): either
	    # disbursed in 2007-2009, or disbursed earlier with a term reaching 2007.
	    disbursement_fy = df['DisbursementFY']
	    disbursed_in_window = disbursement_fy.between(2007, 2009)
	    running_into_window = (disbursement_fy < 2007) & (disbursement_fy + (df['Term'] / 12) >= 2007)
	    df['GreatRecession'] = np.where(disbursed_in_window | running_into_window, 1, 0)

	    df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)

	    # Label-encode whatever object-dtype columns remain. The shared encoder is
	    # refit per column on every call, so the integer codes depend on the
	    # categories present in this particular frame.
	    for column in df.select_dtypes(include='object').columns:
	        df[column] = encoder.fit_transform(df[column])

	    y = df['Default']
	    X = df.drop('Default', axis=1)
	    return X, y