|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import pickle |
|
|
import json |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
import sys |
|
|
|
|
|
from feature_engineering import FeatureEngineer |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
from sklearn.metrics import (accuracy_score, precision_score, recall_score, |
|
|
f1_score, roc_auc_score, confusion_matrix) |
|
|
import xgboost as xgb |
|
|
import lightgbm as lgb |
|
|
from imblearn.over_sampling import SMOTE |
|
|
|
|
|
|
|
|
def load_data(data_dir):
    """Load the three raw CSV datasets used for training.

    Args:
        data_dir: Directory containing the raw CSV files.

    Returns:
        Tuple of DataFrames: (df_store, df_usage, df_customer).
    """
    print("๋ฐ์ดํฐ ๋ก๋ ์ค...")

    def read_raw(filename):
        # All raw exports share the same encoding and may contain malformed rows.
        return pd.read_csv(f'{data_dir}/{filename}',
                           encoding='cp949', on_bad_lines='skip')

    df_store = read_raw('big_data_set1_f.csv')
    df_usage = read_raw('ds2_monthly_usage.csv')
    df_customer = read_raw('ds3_monthly_customers.csv')

    print(f"๋งค์ฅ ์ ๋ณด: {df_store.shape}")
    print(f"์ด์ฉ ๋ฐ์ดํฐ: {df_usage.shape}")
    print(f"๊ณ ๊ฐ ๋ฐ์ดํฐ: {df_customer.shape}")

    return df_store, df_usage, df_customer
|
|
|
|
|
|
|
|
def create_features(df_store, df_usage, df_customer, max_stores=None):
    """Build the feature matrix X and closure target y, one row per store.

    Args:
        df_store: Store master data (one row per ENCODED_MCT).
        df_usage: Monthly usage records keyed by ENCODED_MCT.
        df_customer: Monthly customer records keyed by ENCODED_MCT.
        max_stores: Optional cap on the number of stores processed
            (useful for quick test runs).

    Returns:
        Tuple (X, y): feature DataFrame and binary Series
        (1 = store has a closure date, 0 = still operating).

    Raises:
        ValueError: If no store has enough usage history to build features.
    """
    print("\nํน์ง ์์ฑ ์ค...")

    engineer = FeatureEngineer(include_weather=False)

    all_features = []
    all_targets = []

    store_ids = df_store['ENCODED_MCT'].unique()
    # Compare against None explicitly so a caller passing max_stores=0
    # is not silently treated the same as "no limit".
    if max_stores is not None:
        store_ids = store_ids[:max_stores]

    for idx, store_id in enumerate(store_ids):
        store_info = df_store[df_store['ENCODED_MCT'] == store_id].iloc[0]
        usage_data = df_usage[df_usage['ENCODED_MCT'] == store_id]
        customer_data = df_customer[df_customer['ENCODED_MCT'] == store_id]

        # Require at least 3 months of usage history; shorter histories
        # produce unreliable trend features.
        if len(usage_data) >= 3:
            store_data = {
                'industry': store_info['HPSN_MCT_BZN_CD_NM'] if pd.notna(store_info['HPSN_MCT_BZN_CD_NM']) else '๊ธฐํ',
                'location': store_info['MCT_SIGUNGU_NM']
            }

            features = engineer.create_features(store_data, usage_data, customer_data)
            # A non-null MCT_ME_D (closure date) marks the positive class.
            target = 1 if pd.notna(store_info['MCT_ME_D']) else 0

            all_features.append(features)
            all_targets.append(target)

        if (idx + 1) % 500 == 0:
            print(f" ์ฒ๋ฆฌ ์ค... {idx + 1}/{len(store_ids)}")

    # Fail with a clear message instead of pandas' opaque
    # "No objects to concatenate" error.
    if not all_features:
        raise ValueError("No stores with at least 3 months of usage data; cannot build features.")

    X = pd.concat(all_features, ignore_index=True)
    y = pd.Series(all_targets)

    print(f"์ด ์ํ: {len(X)}, ํน์ง ์: {X.shape[1]}")
    # NOTE(review): the original message contained a U+0085 control character
    # that broke the literal's rendering; rewritten in English.
    print(f"Positive (closure) ratio: {y.mean():.2%} ({y.sum()} stores)")

    return X, y
|
|
|
|
|
|
|
|
def preprocess_data(X, y):
    """Encode categoricals, split train/test, and impute missing values.

    Args:
        X: Feature DataFrame.
        y: Binary target Series aligned with X.

    Returns:
        Tuple (X_train, X_test, y_train, y_test, label_encoders) where
        label_encoders maps encoded column names to their fitted LabelEncoder.
    """
    print("\n๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ค...")

    # Work on a copy so the caller's DataFrame is not mutated in place.
    X = X.copy()

    label_encoders = {}
    if 'context_industry' in X.columns:
        le = LabelEncoder()
        X['context_industry'] = le.fit_transform(X['context_industry'].astype(str))
        label_encoders['context_industry'] = le

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Impute with medians computed on the TRAINING split only. The original
    # filled NaNs with the full-data median before splitting, which leaks
    # test-set statistics into training.
    fill_values = X_train.median()
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    # NOTE(review): the original message contained U+0085 control characters;
    # rewritten in English.
    print(f"Train closure ratio: {y_train.mean():.2%}, Test closure ratio: {y_test.mean():.2%}")

    return X_train, X_test, y_train, y_test, label_encoders
|
|
|
|
|
|
|
|
def apply_smote(X_train, y_train):
    """Balance the training set by oversampling the minority class with SMOTE.

    Args:
        X_train: Training features.
        y_train: Binary training target.

    Returns:
        Tuple (X_train_balanced, y_train_balanced). If the minority class has
        fewer than 2 samples, SMOTE cannot run and the input is returned
        unchanged (the original code would crash with k_neighbors <= 0).
    """
    print("\nํด๋์ค ๋ถ๊ท ํ ์ฒ๋ฆฌ(SMOTE)...")

    minority_count = int(min(y_train.sum(), len(y_train) - y_train.sum()))

    # SMOTE requires at least k_neighbors + 1 minority samples; with fewer
    # than 2 minority samples no neighbor exists, so skip resampling.
    if minority_count < 2:
        print("SMOTE skipped: minority class has fewer than 2 samples.")
        return X_train, y_train

    k_neighbors = min(5, minority_count - 1)

    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    # NOTE(review): the original summary string contained U+0085 control
    # characters; rewritten in English.
    print(f"After SMOTE: negative {(y_train_balanced == 0).sum()}, positive {(y_train_balanced == 1).sum()}")

    return X_train_balanced, y_train_balanced
|
|
|
|
|
|
|
|
def train_models(X_train, y_train):
    """Fit the two gradient-boosting classifiers used by the ensemble.

    Args:
        X_train: Training features (ideally class-balanced).
        y_train: Binary training target.

    Returns:
        Tuple (xgb_model, lgb_model) of fitted classifiers.
    """
    print("\n๋ชจ๋ธ ํ์ต ์ค...")

    # Both models share the same core hyperparameters.
    shared_params = dict(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        random_state=42,
    )

    print(" - XGBoost ํ์ต...")
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', **shared_params)
    xgb_model.fit(X_train, y_train)

    print(" - LightGBM ํ์ต...")
    lgb_model = lgb.LGBMClassifier(verbose=-1, **shared_params)
    lgb_model.fit(X_train, y_train)

    print("๋ชจ๋ธ ํ์ต ์๋ฃ")

    return xgb_model, lgb_model
|
|
|
|
|
|
|
|
def evaluate_models(xgb_model, lgb_model, X_test, y_test):
    """Evaluate the 50/50 probability ensemble of both models on the test set.

    Args:
        xgb_model: Fitted XGBoost classifier.
        lgb_model: Fitted LightGBM classifier.
        X_test: Held-out features.
        y_test: Held-out binary target.

    Returns:
        Dict of float metrics: accuracy, precision, recall, f1_score, auc_roc.
    """
    print("\n๋ชจ๋ธ ํ๊ฐ ์ค...")

    # Average the positive-class probabilities of the two models, then
    # binarize at the 0.5 decision threshold.
    ensemble_proba = (0.5 * xgb_model.predict_proba(X_test)[:, 1]
                      + 0.5 * lgb_model.predict_proba(X_test)[:, 1])
    predicted = (ensemble_proba > 0.5).astype(int)

    metrics = {
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted, zero_division=0),
        'recall': recall_score(y_test, predicted, zero_division=0),
        'f1_score': f1_score(y_test, predicted, zero_division=0),
        # AUC uses the raw probabilities, not the thresholded labels.
        'auc_roc': roc_auc_score(y_test, ensemble_proba),
    }

    separator = "=" * 70
    print("\n" + separator)
    print("๋ชจ๋ธ ์ฑ๋ฅ (Test Set)")
    print(separator)
    print(f"Accuracy: {metrics['accuracy']:.4f} ({metrics['accuracy'] * 100:.1f}%)")
    print(f"Precision: {metrics['precision']:.4f} ({metrics['precision'] * 100:.1f}%)")
    print(f"Recall: {metrics['recall']:.4f} ({metrics['recall'] * 100:.1f}%)")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC-ROC: {metrics['auc_roc']:.4f}")
    print(separator)

    cm = confusion_matrix(y_test, predicted)
    print(f"\nํผ๋ ํ๋ ฌ:")
    print(f" TN: {cm[0, 0]}, FP: {cm[0, 1]}")
    print(f" FN: {cm[1, 0]}, TP: {cm[1, 1]}")

    return {name: float(value) for name, value in metrics.items()}
|
|
|
|
|
|
|
|
def save_models(xgb_model, lgb_model, X, label_encoders, performance, output_dir):
    """Persist trained models, encoders, feature names and run config to disk.

    Args:
        xgb_model: Fitted XGBoost classifier (pickled).
        lgb_model: Fitted LightGBM classifier (pickled).
        X: Feature DataFrame; only its column names are stored.
        label_encoders: Dict of fitted LabelEncoders (pickled).
        performance: Metrics dict embedded into config.json.
        output_dir: Target directory (created if missing).
    """
    print(f"\n๋ชจ๋ธ ์ ์ฅ ์ค... ({output_dir})")

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Pickle the binary artifacts; dict insertion order fixes the write order.
    pickled_artifacts = {
        'xgboost_model.pkl': xgb_model,
        'lightgbm_model.pkl': lgb_model,
        'label_encoders.pkl': label_encoders,
    }
    for filename, artifact in pickled_artifacts.items():
        with open(out / filename, 'wb') as f:
            pickle.dump(artifact, f)

    # Feature names are stored as JSON so the serving side can align columns.
    feature_names = list(X.columns)
    with open(out / 'feature_names.json', 'w', encoding='utf-8') as f:
        json.dump(feature_names, f, ensure_ascii=False, indent=2)

    config = {
        'model_version': '2.0',
        'ensemble_weights': [0.5, 0.5],
        'threshold': 0.5,
        'n_features': len(feature_names),
        'performance': performance
    }
    with open(out / 'config.json', 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    print("๋ชจ๋ธ ์ ์ฅ ์๋ฃ")
    print(f" - {out / 'xgboost_model.pkl'}")
    print(f" - {out / 'lightgbm_model.pkl'}")
    print(f" - {out / 'config.json'}")
|
|
|
|
|
|
def main():
    """CLI entry point: load data, build features, train, evaluate, save.

    Pipeline order matters: SMOTE is applied only to the training split,
    and evaluation runs against the untouched test split.
    """
    # NOTE(review): the argparse description, the --max-stores help text and
    # the banner title in the original contained U+0085 control characters
    # that corrupted the literals; they were rewritten in English.
    parser = argparse.ArgumentParser(description='Small-business early-warning model training')
    parser.add_argument('--data', type=str, default='data/raw',
                        help='๋ฐ์ดํฐ ๋๋ ํ ๋ฆฌ ๊ฒฝ๋ก')
    parser.add_argument('--output', type=str, default='models',
                        help='๋ชจ๋ธ ์ ์ฅ ๊ฒฝ๋ก')
    parser.add_argument('--max-stores', type=int, default=None,
                        help='Maximum number of stores to process (for quick test runs)')

    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("Small-business early-warning model v2.0 training")
    print(banner)

    df_store, df_usage, df_customer = load_data(args.data)

    X, y = create_features(df_store, df_usage, df_customer, args.max_stores)

    X_train, X_test, y_train, y_test, label_encoders = preprocess_data(X, y)

    X_train_balanced, y_train_balanced = apply_smote(X_train, y_train)

    xgb_model, lgb_model = train_models(X_train_balanced, y_train_balanced)

    performance = evaluate_models(xgb_model, lgb_model, X_test, y_test)

    save_models(xgb_model, lgb_model, X, label_encoders, performance, args.output)

    print("\n" + banner)
    print("ํ์ต ์๋ฃ!")
    print(banner)


if __name__ == "__main__":
    main()
|
|
|