# File size: 1,737 Bytes
# a0f3d24 b4fadea
# offline training
import os
import json
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
# -----------------------------
# Paths
# -----------------------------
# Input dataset read by main().
DATA_PATH = "data/processed/credit_default_clean.csv"

# Output directory and the file names of the two artifacts written into it.
MODEL_DIR = "models/v1"
_MODEL_FILENAME = "model.pkl"
_FEATURES_FILENAME = "features.json"

# Full artifact paths: the serialized model and the JSON list of feature names.
MODEL_PATH = os.path.join(MODEL_DIR, _MODEL_FILENAME)
FEATURES_PATH = os.path.join(MODEL_DIR, _FEATURES_FILENAME)
# -----------------------------
# Columns
# -----------------------------
# Columns fed to the model, in the order they are selected from the CSV and
# written to the features JSON. Must stay a list: it is used directly as a
# DataFrame column selector.
FEATURE_COLUMNS = [
    # Account-level attributes.
    "credit_limit",
    "age",
    # Per-month columns; "sep"/"aug" presumably denote September and August
    # (confirm against the dataset schema).
    "pay_delay_sep",
    "pay_delay_aug",
    "bill_amt_sep",
    "bill_amt_aug",
    "pay_amt_sep",
    "pay_amt_aug",
]

# Name of the label column the classifier is trained to predict.
TARGET_COLUMN = "target"
# -----------------------------
# Main
# -----------------------------
def main():
    """Train, evaluate, and persist the logistic-regression classifier.

    Reads the CSV at ``DATA_PATH``, fits a ``LogisticRegression`` on
    ``FEATURE_COLUMNS`` against ``TARGET_COLUMN`` using a stratified 80/20
    train/validation split, prints validation accuracy and ROC-AUC, then
    writes the fitted model to ``MODEL_PATH`` (joblib) and the feature list
    to ``FEATURES_PATH`` (JSON).

    Raises:
        KeyError: If the CSV lacks any feature column or the target column.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    df = pd.read_csv(DATA_PATH)

    # Check the schema up front so a bad input file is reported once, naming
    # the file and every absent column, instead of failing on the first
    # missing key during column selection.
    required_columns = [*FEATURE_COLUMNS, TARGET_COLUMN]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{DATA_PATH} is missing required column(s): {missing_columns}"
        )

    X = df[FEATURE_COLUMNS]
    y = df[TARGET_COLUMN]

    # Fixed seed keeps the split reproducible; stratifying on y keeps the
    # class proportions the same in the train and validation parts.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # NOTE(review): features are passed to lbfgs unscaled. If the columns
    # differ widely in magnitude this can slow or prevent convergence;
    # consider a scaler, but that changes the persisted artifact, so it is
    # left as-is here.
    model = LogisticRegression(
        max_iter=1000,
        solver="lbfgs",
    )
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_val)
    # Column 1 is the probability of the second class in model.classes_;
    # using it for ROC-AUC assumes a binary target (TODO confirm).
    y_proba = model.predict_proba(X_val)[:, 1]
    acc = accuracy_score(y_val, y_pred)
    roc = roc_auc_score(y_val, y_proba)
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation ROC-AUC: {roc:.4f}")

    # Persist artifacts
    joblib.dump(model, MODEL_PATH)
    # Explicit encoding so the JSON file does not depend on the platform's
    # locale default.
    with open(FEATURES_PATH, "w", encoding="utf-8") as f:
        json.dump(FEATURE_COLUMNS, f, indent=2)
    print("Model and features saved successfully.")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|