# Uploaded by PhunvVi ("Upload 9 files", commit 067ea44, verified).
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
def _indicator(df, column):
    """Return a 0/1 integer Series for a dummy column, or 0 if it is absent."""
    if column in df.columns:
        return (df[column] == 1).astype(int)
    return 0


def get_cleaned_data(csv_path='German Credit Data.csv'):
    """Load the credit CSV, encode it numerically and add a rule-based label.

    Steps performed:
      1. Read ``csv_path`` and replace missing 'Saving accounts' /
         'Checking account' values with 'No Savings' / 'No Checking'.
      2. Drop the 'Unnamed: 0' column if the file has one.
      3. Replace the two account columns with integer codes assigned in
         sorted label order.
      4. One-hot encode 'Purpose', 'Sex', 'Housing' and 'Job', then cast the
         whole frame to ``int``.
      5. Add 'credit_risk': 1 when at least three of the seven risk
         indicators below fire, otherwise 0.

    Risk indicators (one point each): credit amount > 5000, duration > 24,
    no savings account, no checking account, purpose is radio/TV, housing is
    rented, job category 0.

    Args:
        csv_path: Path of the CSV file to load. Defaults to
            'German Credit Data.csv' in the current working directory.

    Returns:
        pandas.DataFrame: the encoded data with the extra 'credit_risk'
        column.
    """
    df = pd.read_csv(csv_path)

    # Fill missing values with explicit "no account" labels.
    df['Saving accounts'] = df['Saving accounts'].fillna('No Savings')
    df['Checking account'] = df['Checking account'].fillna('No Checking')
    # errors='ignore' keeps this working for CSVs saved without an index column.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Capture the "no account" flags from the labels themselves, before
    # encoding, so they do not depend on which label ends up with code 0.
    no_savings = (df['Saving accounts'] == 'No Savings').astype(int)
    no_checking = (df['Checking account'] == 'No Checking').astype(int)

    # Encode the account columns as integer codes in sorted label order.
    for column in ('Saving accounts', 'Checking account'):
        df[column] = pd.factorize(df[column], sort=True)[0]

    # One-hot encode the remaining categorical columns.
    df = pd.get_dummies(
        df, columns=['Purpose', 'Sex', 'Housing', 'Job']
    ).astype(int)

    # Scoring system: every indicator contributes exactly one point. Each
    # optional dummy column is resolved on its own, so a missing column only
    # zeroes its own term instead of swallowing the rest of the sum.
    risk_score = (
        (df['Credit amount'] > 5000).astype(int)
        + (df['Duration'] > 24).astype(int)
        + no_savings
        + no_checking
        + _indicator(df, 'Purpose_radio/TV')
        + _indicator(df, 'Housing_rent')
        + _indicator(df, 'Job_0')
    )

    # Threshold: risk_score >= 3 means high risk (1), else low risk (0).
    df['credit_risk'] = (risk_score >= 3).astype(int)
    return df