File size: 2,932 Bytes
b4fadea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# Data preparation: clean the raw credit-default CSV and write the reference/current splits.
import os
import pandas as pd
from sklearn.model_selection import train_test_split
# -----------------------------
# Paths
# -----------------------------
# Input: the raw CSV that main() loads with pandas.
RAW_DATA_PATH = "data/raw/credit_default.csv"
# Output directories; main() creates both when they do not exist yet.
PROCESSED_DATA_DIR = "data/processed"
MODELS_DIR = "models/v1"
# Full cleaned dataset (selected feature columns plus the target column).
CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "credit_default_clean.csv")
# Second part of the unshuffled split made in main() (the test_size=0.3 share).
CURRENT_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, "current_data.csv")
# First part of the unshuffled split; note it is stored under MODELS_DIR,
# not PROCESSED_DATA_DIR.
REFERENCE_DATA_PATH = os.path.join(MODELS_DIR, "reference_data.csv")
# -----------------------------
# Column mapping
# -----------------------------
# Raw column name -> snake_case name used by the rest of the script.
# The mapping is assembled from one group per column family; groups are merged
# in the same order the keys were originally listed.
_ACCOUNT_COLUMNS = {
    "LIMIT_BAL": "credit_limit",
    "AGE": "age",
}

# The raw names go PAY_0, PAY_2, ... PAY_6 (no PAY_1 entry is mapped).
_PAY_DELAY_COLUMNS = {
    "PAY_0": "pay_delay_sep",
    "PAY_2": "pay_delay_aug",
    "PAY_3": "pay_delay_jul",
    "PAY_4": "pay_delay_jun",
    "PAY_5": "pay_delay_may",
    "PAY_6": "pay_delay_apr",
}

_BILL_AMOUNT_COLUMNS = {
    "BILL_AMT1": "bill_amt_sep",
    "BILL_AMT2": "bill_amt_aug",
    "BILL_AMT3": "bill_amt_jul",
    "BILL_AMT4": "bill_amt_jun",
    "BILL_AMT5": "bill_amt_may",
    "BILL_AMT6": "bill_amt_apr",
}

_PAY_AMOUNT_COLUMNS = {
    "PAY_AMT1": "pay_amt_sep",
    "PAY_AMT2": "pay_amt_aug",
    "PAY_AMT3": "pay_amt_jul",
    "PAY_AMT4": "pay_amt_jun",
    "PAY_AMT5": "pay_amt_may",
    "PAY_AMT6": "pay_amt_apr",
}

COLUMN_RENAME_MAP = {
    **_ACCOUNT_COLUMNS,
    **_PAY_DELAY_COLUMNS,
    **_BILL_AMOUNT_COLUMNS,
    **_PAY_AMOUNT_COLUMNS,
    # Label column.
    "default.payment.next.month": "target",
}
# -----------------------------
# Feature selection (frozen)
# -----------------------------
# Name of the label column after renaming.
TARGET_COLUMN = "target"

# Model inputs, in the column order written to the output CSVs.
# Only the "sep" and "aug" variant of each monthly family is selected.
FEATURE_COLUMNS = [
    "credit_limit", "age",
    "pay_delay_sep", "pay_delay_aug",
    "bill_amt_sep", "bill_amt_aug",
    "pay_amt_sep", "pay_amt_aug",
]
# -----------------------------
# Main logic
# -----------------------------
def main():
    """Build the cleaned dataset and its reference/current split on disk.

    Loads RAW_DATA_PATH, removes the "ID" column when it is present, renames
    columns through COLUMN_RENAME_MAP and keeps only FEATURE_COLUMNS plus
    TARGET_COLUMN. The result is written to CLEAN_DATA_PATH, then split in
    row order (no shuffling, test_size=0.3) into a reference part and a
    current part, written to REFERENCE_DATA_PATH and CURRENT_DATA_PATH.

    Raises:
        ValueError: If any selected column contains a null value.
    """
    # Output locations must exist before any CSV is written.
    for out_dir in (PROCESSED_DATA_DIR, MODELS_DIR):
        os.makedirs(out_dir, exist_ok=True)

    raw = pd.read_csv(RAW_DATA_PATH)

    # "ID" is not a feature; errors="ignore" tolerates inputs that lack it.
    # Selecting by name afterwards makes pandas raise if an expected column
    # is absent from the renamed frame.
    wanted = FEATURE_COLUMNS + [TARGET_COLUMN]
    clean = (
        raw.drop(columns=["ID"], errors="ignore")
        .rename(columns=COLUMN_RENAME_MAP)[wanted]
    )

    # Sanity check: refuse to persist anything if nulls are present.
    has_nulls = clean.isnull().any().any()
    if has_nulls:
        raise ValueError("Null values detected after preprocessing.")

    clean.to_csv(CLEAN_DATA_PATH, index=False)

    # shuffle=False keeps the original row order, so the split is
    # deterministic and acts as a simulated "earlier vs. later" partition.
    reference_part, current_part = train_test_split(
        clean, test_size=0.3, shuffle=False
    )

    outputs = (
        (reference_part, REFERENCE_DATA_PATH),
        (current_part, CURRENT_DATA_PATH),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    print("Data preparation completed successfully.")
    print(f"Clean data saved to: {CLEAN_DATA_PATH}")
    print(f"Reference data saved to: {REFERENCE_DATA_PATH}")
    print(f"Current data saved to: {CURRENT_DATA_PATH}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|