File size: 4,742 Bytes
d032558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06702e9
d032558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
scale = StandardScaler()
def convert_money(x):
    if isinstance(x, str):  # Check if x is a string
        x = x[1:].replace(',', '')  # Remove the dollar sign and commas
        return float(x)
    elif isinstance(x, (int, float)):  # If already numeric, return as is
        return float(x)
    else:
        return None  # Handle unexpected types gracefully

def clean_year(x):
    if isinstance(x, str):
        return x.replace('A', '')
    return x

with open("store.json", "r") as file:
    x = json.load(file)

def transform_data(df):
    df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']] = df[['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']].applymap(convert_money)
    df['State'] = df['State'].map(x['State'])
    df['BankState'] = df['BankState'].map(x['BankState'])
    df['Industry'] = df['Industry'].map(x['Industry'])
    df['ApprovalFY'] = df['ApprovalFY'].apply(clean_year).astype('int64')
    df.dropna(subset=['Name', 'City', 'State', 'BankState', 'NewExist','RevLineCr', 'LowDoc', 'DisbursementDate', 'MIS_Status'], inplace=True)
    df = df.astype({'Zip': 'str', 'NewExist': 'int64', 'UrbanRural': 'str', 'DisbursementGross': 'float', 'BalanceGross': 'float',
                          'ChgOffPrinGr': 'float', 'GrAppv': 'float', 'SBA_Appv': 'float'})
    df['Industry'] = df['NAICS'].astype('str').apply(lambda x: x[:2])
    df['Industry'] = df['Industry'].map({
    '0':'Unknown',
    '11': 'Ag/For/Fish/Hunt',
    '21': 'Min/Quar/Oil_Gas_ext',
    '22': 'Utilities',
    '23': 'Construction',
    '31': 'Manufacturing',
    '32': 'Manufacturing',
    '33': 'Manufacturing',
    '42': 'Wholesale_trade',
    '44': 'Retail_trade',
    '45': 'Retail_trade',
    '48': 'Trans/Ware',
    '49': 'Trans/Ware',
    '51': 'Information',
    '52': 'Finance/Insurance',
    '53': 'RE/Rental/Lease',
    '54': 'Prof/Science/Tech',
    '55': 'Mgmt_comp',
    '56': 'Admin_sup/Waste_Mgmt_Rem',
    '61': 'Educational',
    '62': 'Healthcare/Social_assist',
    '71': 'Arts/Entertain/Rec',
    '72': 'Accom/Food_serv',
    '81': 'Other_no_pub',
    '92': 'Public_Admin'
})
    df.dropna(subset=['Industry'], inplace=True)
    df.loc[(df['FranchiseCode'] <= 1), 'IsFranchise'] = 0
    df.loc[(df['FranchiseCode'] > 1), 'IsFranchise'] = 1
    df = df.astype({'IsFranchise': 'int64'})
    df = df[(df['NewExist'] == 1) | (df['NewExist'] == 2)]

# Create NewBusiness field where 0 = Existing business and 1 = New business; based on NewExist field
    df.loc[(df['NewExist'] == 1), 'NewBusiness'] = 0
    df.loc[(df['NewExist'] == 2), 'NewBusiness'] = 1
    df = df[(df['RevLineCr'] == 'Y') | (df['RevLineCr'] == 'N')]
    df = df[(df['LowDoc'] == 'Y') | (df['LowDoc'] == 'N')]

# RevLineCr and LowDoc: 0 = No, 1 = Yes
    df['RevLineCr'] = np.where(df['RevLineCr'] == 'N', 0, 1)
    df['LowDoc'] = np.where(df['LowDoc'] == 'N', 0, 1)
    df['Default'] = np.where(df['MIS_Status'] == 'P I F', 0, 1)
    df[['ApprovalDate', 'DisbursementDate']] = df[['ApprovalDate', 'DisbursementDate']].apply(pd.to_datetime)
    df['DisbursementFY'] = df['DisbursementDate'].map(lambda x: x.year)
    df['DaysToDisbursement'] = df['DisbursementDate'] - df['ApprovalDate']
    df['DaysToDisbursement'] = df['DaysToDisbursement'].astype('str').apply(lambda x: x[:x.index('d') - 1]).astype('int64')
    df['SBA_AppvPct'] = df['SBA_Appv'] / df['GrAppv']
    df['AppvDisbursed'] = np.where(df['DisbursementGross'] == df['GrAppv'], 1, 0)
    df = df.astype({'IsFranchise': 'int64', 'NewBusiness': 'int64'})
    df.drop(columns=['LoanNr_ChkDgt', 'Name', 'City', 'Zip', 'Bank', 'NAICS', 'ApprovalDate', 'NewExist', 'FranchiseCode',
                      'ChgOffDate', 'DisbursementDate', 'BalanceGross', 'ChgOffPrinGr', 'SBA_Appv', 'MIS_Status','CreateJob','RetainedJob'], inplace=True)
    # Field for loans backed by Real Estate (loans with a term of at least 20 years)
    df['RealEstate'] = np.where(df['Term'] >= 240, 1, 0)

# Field for loans active during the Great Recession (2007-2009)
    df['GreatRecession'] = np.where(((2007 <= df['DisbursementFY']) & (df['DisbursementFY'] <= 2009)) | 
                                     ((df['DisbursementFY'] < 2007) & (df['DisbursementFY'] + (df['Term']/12) >= 2007)), 1, 0)
    df['DisbursedGreaterAppv'] = np.where(df['DisbursementGross'] > df['GrAppv'], 1, 0)
    for column in df.select_dtypes(include='object').columns:
    # Encode the column
        df[column] = encoder.fit_transform(df[column])
    y = df['Default']
    X = df.drop('Default', axis=1)
    return X,y