# File size: 12,317 Bytes
#
# 5092c1e

import pickle
import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional, Union
import warnings

# NOTE: this installs a process-wide "ignore" filter at import time, so every
# warning category from every module is silenced, not just this module's.
warnings.filterwarnings('ignore')


class EarlyWarningPredictor:
    """자영업 조기경보 예측 모델"""

    def __init__(self, model_path: Optional[str] = None):
        self.model_path = Path(model_path) if model_path else Path(__file__).parent.parent / 'model'
        self.xgb_model = None
        self.lgb_model = None
        self.catboost_model = None
        self.label_encoders = {}
        self.feature_names = []
        self.config = {}
        self.is_loaded = False

    @classmethod
    def from_pretrained(cls, model_name_or_path: str):
        predictor = cls(model_path=model_name_or_path)
        predictor.load_model()
        return predictor

    def load_model(self):
        """모델 및 설정 로드"""
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model directory not found: {self.model_path}")

        # XGBoost 로드
        xgb_path = self.model_path / 'xgboost_model.pkl'
        if xgb_path.exists():
            with open(xgb_path, 'rb') as f:
                self.xgb_model = pickle.load(f)

        # LightGBM 로드
        lgb_path = self.model_path / 'lightgbm_model.pkl'
        if lgb_path.exists():
            with open(lgb_path, 'rb') as f:
                self.lgb_model = pickle.load(f)

        # CatBoost 로드
        catboost_path = self.model_path / 'catboost_model.pkl'
        if catboost_path.exists():
            with open(catboost_path, 'rb') as f:
                self.catboost_model = pickle.load(f)

        # Label Encoders 로드
        le_path = self.model_path / 'label_encoders.pkl'
        if le_path.exists():
            with open(le_path, 'rb') as f:
                self.label_encoders = pickle.load(f)

        # Feature names 로드
        fn_path = self.model_path / 'feature_names.json'
        if fn_path.exists():
            with open(fn_path, 'r', encoding='utf-8') as f:
                self.feature_names = json.load(f)

        # Config 로드
        config_path = self.model_path / 'config.json'
        if config_path.exists():
            with open(config_path, 'r', encoding='utf-8') as f:
                self.config = json.load(f)

        self.is_loaded = True
        print(f"모델 로드 완료: v{self.config.get('model_version', '2.0')}")

    def predict(self, store_data: Dict,
                monthly_usage: Optional[pd.DataFrame] = None,
                monthly_customers: Optional[pd.DataFrame] = None,
                threshold: Optional[float] = None) -> Dict:
        if not self.is_loaded:
            self.load_model()

        # 특징 생성
        from src.feature_engineering import FeatureEngineer
        engineer = FeatureEngineer()

        if monthly_usage is None or monthly_customers is None:
            # 간단한 데이터 형식
            features = self._create_simple_features(store_data)
        else:
            # 전체 특징 생성
            features = engineer.create_features(store_data, monthly_usage, monthly_customers)

        # 특징 정렬 및 결측치 처리
        features = self._align_features(features)

        # 예측
        threshold = threshold or self.config.get('threshold', 0.5)

        if self.xgb_model and self.lgb_model:
            # 앙상블 예측
            xgb_prob = self.xgb_model.predict_proba(features)[0][1]
            lgb_prob = self.lgb_model.predict_proba(features)[0][1]

            weights = self.config.get('ensemble_weights', [0.5, 0.5])
            closure_probability = weights[0] * xgb_prob + weights[1] * lgb_prob

            if self.catboost_model and len(weights) > 2:
                cat_prob = self.catboost_model.predict_proba(features)[0][1]
                closure_probability = (weights[0] * xgb_prob +
                                       weights[1] * lgb_prob +
                                       weights[2] * cat_prob)
        else:
            closure_probability = 0.5

        # 위험도 점수(0-100)
        risk_score = closure_probability * 100

        # 위험 등급
        if risk_score < 30:
            risk_level = '낮음'
            risk_color = 'green'
        elif risk_score < 60:
            risk_level = '보통'
            risk_color = 'yellow'
        else:
            risk_level = '높음'
            risk_color = 'red'

        # 예측 결과
        result = {
            'risk_score': round(risk_score, 2),
            'risk_level': risk_level,
            'risk_color': risk_color,
            'closure_probability': round(closure_probability, 4),
            'is_at_risk': closure_probability > threshold,
            'threshold': threshold,
            'confidence': max(closure_probability, 1 - closure_probability),
            'model_version': self.config.get('model_version', '2.0')
        }

        # 위험 요인 분석(특징 중요도 기반)
        if self.xgb_model:
            result['risk_factors'] = self._analyze_risk_factors(features)

        # 액션 아이템
        result['action_items'] = self._generate_action_items(result, store_data)

        return result

    def predict_batch(self, stores_df: pd.DataFrame) -> pd.DataFrame:
        results = []

        for idx, row in stores_df.iterrows():
            store_data = row.to_dict()
            result = self.predict(store_data)
            result['store_id'] = row.get('store_id', idx)
            results.append(result)

        return pd.DataFrame(results)

    def explain(self, store_data: Dict, top_n: int = 10) -> Dict:
        # SHAP 분석(간단한 버전)
        result = self.predict(store_data)

        explanation = {
            'prediction': result,
            'top_features': result.get('risk_factors', {}),
            'interpretation': self._interpret_prediction(result)
        }

        return explanation

    def _create_simple_features(self, store_data: Dict) -> pd.DataFrame:
        """간단한 특징 생성"""
        # 기본 특징만 사용
        features = {
            'sales_avg_all': store_data.get('avg_sales', 50),
            'customer_reuse_rate': store_data.get('reuse_rate', 25),
            'operation_months': store_data.get('operating_months', 12),
            'trend_slope': store_data.get('sales_trend', 0),
        }

        # 나머지 특징은 기본값으로
        for fname in self.feature_names:
            if fname not in features:
                features[fname] = 0

        return pd.DataFrame([features])

    def _align_features(self, features: pd.DataFrame) -> pd.DataFrame:
        """특징 정렬 및 전처리"""
        # 모델 학습 시 사용한 특징 순서로 정렬
        aligned = pd.DataFrame()

        for fname in self.feature_names:
            if fname in features.columns:
                aligned[fname] = features[fname]
            else:
                aligned[fname] = 0

        # 결측치 처리
        aligned = aligned.fillna(aligned.median().fillna(0))

        return aligned

    def _analyze_risk_factors(self, features: pd.DataFrame) -> Dict[str, float]:
        """위험 요인 분석"""
        # 특징 중요도 기반
        if not hasattr(self.xgb_model, 'feature_importances_'):
            return {}

        importance = self.xgb_model.feature_importances_
        feature_values = features.iloc[0].values

        # 중요도와 값을 곱해서 기여도 계산
        contributions = {}

        for i, fname in enumerate(self.feature_names):
            if importance[i] > 0.01:  # 중요한 특징만
                score = importance[i] * abs(feature_values[i]) * 10

                # 특징명을 한글로 변환
                readable_name = self._translate_feature_name(fname)
                contributions[readable_name] = min(round(score, 1), 100)

        # 상위 6개만 반환
        sorted_factors = sorted(contributions.items(), key=lambda x: x[1], reverse=True)[:6]

        return dict(sorted_factors)

    def _translate_feature_name(self, fname: str) -> str:
        """특징명을 읽기 쉬운 형태로 변환"""
        translations = {
            'sales_avg': '매출',
            'trend_slope': '매출 추세',
            'trend_consecutive_down': '연속 하락',
            'customer_reuse_rate': '재이용률',
            'volatility_cv': '매출 변동성',
            'operation_months': '영업 기간',
            'sales_recent_vs_previous': '최근 매출 변화'
        }

        for key, value in translations.items():
            if key in fname:
                return value

        return fname

    def _generate_action_items(self, result: Dict, store_data: Dict) -> List[str]:
        """액션 아이템 생성"""
        actions = []

        risk_score = result['risk_score']

        if risk_score > 70:
            actions.append("즉시 조치 필요: 비용 절감 및 매출 증대 방안 마련")
            actions.append("현금흐름 개선: 외상 매출 회수 및 재고 최적화")
            actions.append("전문가 상담: 경영 컨설팅 및 구조조정 검토")
        elif risk_score > 40:
            actions.append("매출 분석: 주력 상품/서비스 재점검")
            actions.append("마케팅 강화: 신규 고객 유치 캠페인")
            actions.append("차별화 전략: 경쟁력 있는 요소 발굴 및 강화")
        else:
            actions.append("현재 상태 유지: 정기적인 모니터링 지속")
            actions.append("성장 기회 탐색: 추가 매출원 발굴")
            actions.append("고객 충성도 강화: 멤버십 프로그램 등")

        return actions

    def _interpret_prediction(self, result: Dict) -> str:
        """예측 결과 해석"""
        risk_level = result['risk_level']
        risk_score = result['risk_score']

        if risk_level == '높음':
            return f"위험도가 매우 높습니다 ({risk_score:.1f}점). 폐업 위험이 높으므로 즉각적인 대응이 필요합니다."
        elif risk_level == '보통':
            return f"주의가 필요합니다 ({risk_score:.1f}점). 개선 방안을 마련하여 위험을 줄이세요."
        else:
            return f"안정적입니다 ({risk_score:.1f}점). 현재의 운영 방식을 유지하면서 지속적으로 모니터링하세요."

    def get_model_info(self) -> Dict:
        """모델 정보 반환"""
        return {
            'version': self.config.get('model_version', '2.0'),
            'n_features': self.config.get('n_features', 0),
            'performance': self.config.get('performance', {}),
            'ensemble_weights': self.config.get('ensemble_weights', []),
            'models': {
                'xgboost': self.xgb_model is not None,
                'lightgbm': self.lgb_model is not None,
                'catboost': self.catboost_model is not None
            }
        }


if __name__ == "__main__":
    # Usage example: load the models from ../model and score one sample store.
    banner = "=" * 70
    print(banner)
    print("Early Warning Predictor v2.0 테스트")
    print(banner)

    demo_predictor = EarlyWarningPredictor(model_path='../model')

    try:
        demo_predictor.load_model()

        # Sample input in the simple format (no monthly tables).
        sample_store = {
            'store_id': 'TEST_001',
            'industry': '카페',
            'location': '서울 강남구',
            'avg_sales': 45,
            'reuse_rate': 22.5,
            'operating_months': 18,
            'sales_trend': -0.05
        }

        prediction = demo_predictor.predict(sample_store)

        # Headline numbers
        print("\n예측 결과:")
        print(f"  위험도 점수: {prediction['risk_score']}/100")
        print(f"  위험 등급: {prediction['risk_level']}")
        print(f"  폐업 확률: {prediction['closure_probability']:.1%}")

        # Risk factors are only present when an XGBoost model was loaded.
        if 'risk_factors' in prediction:
            print("\n주요 위험 요인:")
            for factor_name, factor_score in prediction['risk_factors'].items():
                print(f"    - {factor_name}: {factor_score:.1f}점")

        print("\n액션 아이템:")
        for item in prediction['action_items']:
            print(f"    {item}")

    except FileNotFoundError:
        # Raised by load_model() when the model directory is missing.
        print("모델 파일이 없습니다. 먼저 모델을 학습해주세요.")