|
|
|
|
|
|
|
|
import os |
|
|
import pandas as pd |
|
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# All paths are relative, so they resolve against the current working
# directory of the process that runs this script.

# Input: the raw CSV read by main().
RAW_DATA_PATH = "data/raw/credit_default.csv"

# Output directories; main() creates them if they do not exist.
PROCESSED_DATA_DIR = "data/processed"
MODELS_DIR = "models/v1"

# Output files written by main():
#   - the cleaned dataset (selected columns only),
#   - the "current" subset (tail of the split), stored with processed data,
#   - the "reference" subset (head of the split), stored under MODELS_DIR.
CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "credit_default_clean.csv")
CURRENT_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "current_data.csv")
REFERENCE_DATA_PATH = os.path.join(MODELS_DIR, "reference_data.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps raw CSV column names to the snake_case names used after renaming.
# Columns not listed here keep their raw names.
# NOTE(review): the raw names look like the UCI "Default of Credit Card
# Clients" schema -- confirm against the data source.
COLUMN_RENAME_MAP = {
    "LIMIT_BAL": "credit_limit",
    "AGE": "age",
    # Payment-delay columns. The raw names jump from PAY_0 to PAY_2 (there is
    # no PAY_1 entry); PAY_0 is mapped to the September slot.
    "PAY_0": "pay_delay_sep",
    "PAY_2": "pay_delay_aug",
    "PAY_3": "pay_delay_jul",
    "PAY_4": "pay_delay_jun",
    "PAY_5": "pay_delay_may",
    "PAY_6": "pay_delay_apr",
    # Bill-amount columns, index 1..6 mapped to Sep..Apr.
    "BILL_AMT1": "bill_amt_sep",
    "BILL_AMT2": "bill_amt_aug",
    "BILL_AMT3": "bill_amt_jul",
    "BILL_AMT4": "bill_amt_jun",
    "BILL_AMT5": "bill_amt_may",
    "BILL_AMT6": "bill_amt_apr",
    # Payment-amount columns, index 1..6 mapped to Sep..Apr.
    "PAY_AMT1": "pay_amt_sep",
    "PAY_AMT2": "pay_amt_aug",
    "PAY_AMT3": "pay_amt_jul",
    "PAY_AMT4": "pay_amt_jun",
    "PAY_AMT5": "pay_amt_may",
    "PAY_AMT6": "pay_amt_apr",
    # Label column.
    "default.payment.next.month": "target",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Renamed columns kept as model features. Only the September and August
# variants of the monthly columns are selected; the other months present in
# COLUMN_RENAME_MAP are dropped by main().
FEATURE_COLUMNS = [
    "credit_limit",
    "age",
    "pay_delay_sep",
    "pay_delay_aug",
    "bill_amt_sep",
    "bill_amt_aug",
    "pay_amt_sep",
    "pay_amt_aug",
]

# Name of the label column after renaming.
TARGET_COLUMN = "target"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Clean the raw CSV and write the clean, reference and current datasets.

    Steps, in order:
      1. Create ``PROCESSED_DATA_DIR`` and ``MODELS_DIR`` if missing.
      2. Read ``RAW_DATA_PATH``, drop the ``ID`` column when present, and
         rename columns according to ``COLUMN_RENAME_MAP``.
      3. Keep only ``FEATURE_COLUMNS`` plus ``TARGET_COLUMN``.
      4. Abort if any null value remains.
      5. Write the cleaned frame to ``CLEAN_DATA_PATH``.
      6. Split the rows in file order (no shuffling): the first 70% go to
         ``REFERENCE_DATA_PATH`` and the last 30% to ``CURRENT_DATA_PATH``.

    Raises:
        KeyError: A required column is absent after renaming.
        ValueError: The selected columns contain null values.
    """
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    os.makedirs(MODELS_DIR, exist_ok=True)

    df = pd.read_csv(RAW_DATA_PATH)

    # errors="ignore" makes the drop a no-op when the raw file has no ID column.
    df = df.drop(columns=["ID"], errors="ignore")
    df = df.rename(columns=COLUMN_RENAME_MAP)

    required_columns = FEATURE_COLUMNS + [TARGET_COLUMN]

    # Check up front so the error names every missing column and the source
    # file. KeyError is the type the plain selection below would raise.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Required columns missing from {RAW_DATA_PATH} after renaming: "
            f"{missing_columns}"
        )
    df = df[required_columns]

    if df.isnull().any().any():
        raise ValueError("Null values detected after preprocessing.")

    df.to_csv(CLEAN_DATA_PATH, index=False)

    # shuffle=False keeps file order: the head of the frame becomes the
    # reference set and the tail becomes the current set.
    reference_df, current_df = train_test_split(
        df,
        test_size=0.3,
        shuffle=False,
    )

    reference_df.to_csv(REFERENCE_DATA_PATH, index=False)
    current_df.to_csv(CURRENT_DATA_PATH, index=False)

    print("Data preparation completed successfully.")
    print(f"Clean data saved to: {CLEAN_DATA_PATH}")
    print(f"Reference data saved to: {REFERENCE_DATA_PATH}")
    print(f"Current data saved to: {CURRENT_DATA_PATH}")
|
|
|
|
|
|
|
|
# Run the preparation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|