# Loan_Payment_Prediction/backend/src/clean_data_json.py
# msi
# Add app
# 06702e9
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Module-level scaler instance used by `scaling`, which calls
# `fit_transform` on it (i.e. re-fits it) every time it is invoked.
scale = StandardScaler()
# Label-encoding tables used by `clean_data`: for each categorical column,
# every known category is assigned its position in the ordered list below.
x = {
    # Borrower state: 50 US states plus DC -> 0..50.
    "State": {
        code: index
        for index, code in enumerate(
            (
                "AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS "
                "KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV "
                "NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY"
            ).split()
        )
    },
    # Bank state: same codes as "State" plus "EN", "GU" and "PR" -> 0..53.
    "BankState": {
        code: index
        for index, code in enumerate(
            (
                "AK AL AR AZ CA CO CT DC DE EN FL GA GU HI IA ID IL IN "
                "KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV "
                "NY OH OK OR PA PR RI SC SD TN TX UT VA VT WA WI WV WY"
            ).split()
        )
    },
    # Industry label -> 0..20.
    "Industry": {
        label: index
        for index, label in enumerate(
            (
                "Accom/Food_serv",
                "Admin_sup/Waste_Mgmt_Rem",
                "Ag/For/Fish/Hunt",
                "Arts/Entertain/Rec",
                "Construction",
                "Educational",
                "Finance/Insurance",
                "Healthcare/Social_assist",
                "Information",
                "Manufacturing",
                "Mgmt_comp",
                "Min/Quar/Oil_Gas_ext",
                "Other_no_pub",
                "Prof/Science/Tech",
                "Public_Admin",
                "RE/Rental/Lease",
                "Retail_trade",
                "Trans/Ware",
                "Unknown",
                "Utilities",
                "Wholesale_trade",
            )
        )
    },
}
def clean_data(df):
    """Label-encode the categorical columns of *df* using the tables in ``x``.

    The ``State``, ``BankState`` and ``Industry`` columns are replaced, in
    that order, by the integer codes looked up in the matching ``x`` table.
    A value with no entry in its table becomes ``NaN`` (the behaviour of
    ``Series.map`` with a dict).

    The frame is modified in place; the same object is returned for
    convenience. A missing column raises ``KeyError``.
    """
    for column in ("State", "BankState", "Industry"):
        encoded = df[column].map(x[column])
        df[column] = encoded
    return df
def scaling(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize the numeric columns of *df* in place and return it.

    Every column selected by ``select_dtypes(include=['number'])`` is
    replaced by the output of the module-level ``scale.fit_transform``.
    Non-numeric columns are left untouched.

    If *df* has no numeric columns, or no rows at all, there is nothing to
    scale and *df* is returned unchanged (``StandardScaler`` would
    otherwise raise ``ValueError`` on such input).
    """
    # Only scale numerical columns
    num_cols = df.select_dtypes(include=['number']).columns

    # Guard: StandardScaler rejects arrays with zero features or zero samples.
    if num_cols.empty or df.empty:
        return df

    # NOTE(review): the scaler is re-fitted on whatever frame is passed in,
    # so values are standardized against this frame's own mean/std rather
    # than against statistics learned from training data. Confirm this is
    # intended when the function is used at prediction time.
    df[num_cols] = scale.fit_transform(df[num_cols])
    return df