Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder | |
def get_cleaned_data(csv_path='German Credit Data.csv'):
    """Load the credit CSV, encode it numerically and add a rule-based label.

    Steps:
      1. Read ``csv_path`` and replace missing 'Saving accounts' /
         'Checking account' values with the sentinel categories
         'No Savings' / 'No Checking'.
      2. Drop the 'Unnamed: 0' column.
      3. Integer-encode the two account columns (codes follow sorted label
         order) and one-hot encode 'Purpose', 'Sex', 'Housing' and 'Job'.
      4. Build a risk score by adding one point for each of:
         credit amount > 5000, duration > 24, no savings account,
         no checking account, purpose radio/TV, rented housing, job class 0.
      5. Store ``credit_risk`` = 1 when the score is at least 3, else 0.

    Args:
        csv_path: Location of the CSV file. Defaults to the file name the
            function has always read, so existing callers are unaffected.

    Returns:
        The encoded DataFrame (all columns cast to int) with the extra
        ``credit_risk`` column.

    Raises:
        KeyError: If a column the function reads is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Missing account information is treated as a category of its own.
    df['Saving accounts'] = df['Saving accounts'].fillna('No Savings')
    df['Checking account'] = df['Checking account'].fillna('No Checking')
    df = df.drop(columns='Unnamed: 0')

    # Take the "no account" flags from the sentinel labels *before* encoding,
    # so they do not depend on which integer code the sentinel receives.
    no_savings = (df['Saving accounts'] == 'No Savings').astype(int)
    no_checking = (df['Checking account'] == 'No Checking').astype(int)

    # Encode the account categories as integers. sort=True assigns codes in
    # sorted label order, the same ordering LabelEncoder uses.
    for column in ('Saving accounts', 'Checking account'):
        df[column] = pd.factorize(df[column], sort=True)[0]

    # One-hot encode the remaining categoricals; cast everything to int so
    # the dummy columns are 0/1 rather than booleans.
    df = pd.get_dummies(df, columns=['Purpose', 'Sex', 'Housing', 'Job']).astype(int)

    # Scoring system: one point per risk indicator.
    risk_score = (
        (df['Credit amount'] > 5000).astype(int)
        + (df['Duration'] > 24).astype(int)
        + no_savings
        + no_checking
    )
    # A dummy column only exists when its category occurs in the data, so
    # each optional indicator is added separately. Chaining them in a single
    # expression as `a + b if cond else 0 + c ...` would parse as one nested
    # conditional and silently discard terms.
    for column in ('Purpose_radio/TV', 'Housing_rent', 'Job_0'):
        if column in df.columns:
            risk_score = risk_score + (df[column] == 1).astype(int)

    # Threshold: a score of 3 or more is high risk (1), otherwise low risk (0).
    df['credit_risk'] = (risk_score >= 3).astype(int)
    return df