import pandas as pd import numpy as np from typing import Dict, List, Tuple, Optional from scipy import stats from datetime import datetime import warnings warnings.filterwarnings('ignore') def safe_numeric_convert(series, default_value=0): """안전하게 숫자로 변환""" try: converted = pd.to_numeric(series, errors='coerce') converted = converted.replace(-999999.9, np.nan) if converted.isna().all(): return default_value return converted.mean() except: return default_value class FeatureEngineer: """특징 생성 클래스""" def __init__(self, include_weather: bool = False): self.include_weather = include_weather def create_features(self, store_data: Dict, monthly_usage: pd.DataFrame, monthly_customers: pd.DataFrame) -> pd.DataFrame: features = {} # 1. 매출 관련 특징 sales_features = self._create_sales_features(monthly_usage) features.update(sales_features) # 2. 고객 관련 특징 customer_features = self._create_customer_features(monthly_customers) features.update(customer_features) # 3. 운영 관련 특징 operation_features = self._create_operation_features(monthly_usage) features.update(operation_features) # 4. 트렌드 특징 trend_features = self._create_trend_features(monthly_usage) features.update(trend_features) # 5. 변동성 특징 volatility_features = self._create_volatility_features(monthly_usage) features.update(volatility_features) # 6. 계절성 특징 seasonality_features = self._create_seasonality_features(monthly_usage) features.update(seasonality_features) # 7. 맥락 특징 context_features = self._create_context_features(store_data, monthly_usage) features.update(context_features) return pd.DataFrame([features]) def _create_sales_features(self, df: pd.DataFrame) -> Dict: """매출 관련 특징 생성(15개)""" features = {} if len(df) == 0: return self._get_default_sales_features() # 매출 구간 매핑 sales_map = { '1_0-25%': 25, '2_25-50%': 37.5, '3_25-50%': 37.5, '4_50-75%': 62.5, '5_75-100%': 87.5, '6_100%+': 100 } if 'RC_M1_SAA' in df.columns: sales = df['RC_M1_SAA'].map(sales_map).fillna(50) else: sales = pd.Series([50] * len(df)) # 다중 기간 평균 features['sales_avg_1m'] = sales.tail(1).mean() if len(sales) >= 1 else 50 features['sales_avg_3m'] = sales.tail(3).mean() if len(sales) >= 3 else 50 features['sales_avg_6m'] = sales.tail(6).mean() if len(sales) >= 6 else 50 features['sales_avg_12m'] = sales.mean() # 최근 vs 이전 if len(sales) >= 6: recent = sales.tail(3).mean() previous = sales.tail(6).head(3).mean() features['sales_recent_vs_previous'] = (recent / previous - 1) * 100 if previous > 0 else 0 else: features['sales_recent_vs_previous'] = 0 # 전월 대비, 전년 대비 if len(sales) >= 2: features['sales_mom_change'] = (sales.iloc[-1] / sales.iloc[-2] - 1) * 100 if sales.iloc[-2] > 0 else 0 else: features['sales_mom_change'] = 0 if len(sales) >= 13: features['sales_yoy_change'] = (sales.iloc[-1] / sales.iloc[-13] - 1) * 100 if sales.iloc[-13] > 0 else 0 else: features['sales_yoy_change'] = 0 # 최대, 최소, 범위 features['sales_max'] = sales.max() features['sales_min'] = sales.min() features['sales_range'] = features['sales_max'] - features['sales_min'] # 최근 3개월 평균 vs 전체 평균 if len(sales) >= 3: recent_avg = sales.tail(3).mean() total_avg = sales.mean() features['sales_recent_vs_total'] = (recent_avg / total_avg - 1) * 100 if total_avg > 0 else 0 else: features['sales_recent_vs_total'] = 0 # 최근 매출이 평균보다 낮은지 features['sales_below_avg'] = 1 if features['sales_avg_3m'] < features['sales_avg_12m'] else 0 # 최근 매출 추세(최근 3개월) if len(sales) >= 3: recent_sales = sales.tail(3).values if len(recent_sales) >= 2: slope = (recent_sales[-1] - recent_sales[0]) / len(recent_sales) features['sales_recent_trend'] = slope else: features['sales_recent_trend'] = 0 else: features['sales_recent_trend'] = 0 return features def _create_customer_features(self, df: pd.DataFrame) -> Dict: """고객 관련 특징 생성 (12개)""" features = {} if len(df) == 0: return self._get_default_customer_features() # 재이용률 - 안전한 변환 if 'MCT_UE_CLN_REU_RAT' in df.columns: try: reuse_rate = pd.to_numeric(df['MCT_UE_CLN_REU_RAT'], errors='coerce').replace(-999999.9, np.nan) features['customer_reuse_rate'] = reuse_rate.mean() if not reuse_rate.isna().all() else 25.0 features['customer_reuse_rate_last'] = reuse_rate.iloc[-1] if len(reuse_rate) > 0 and pd.notna( reuse_rate.iloc[-1]) else features['customer_reuse_rate'] # 재이용률 추세 if len(reuse_rate) >= 6: recent = reuse_rate.tail(3).mean() previous = reuse_rate.tail(6).head(3).mean() if pd.notna(recent) and pd.notna(previous) and previous > 0: features['customer_reuse_trend'] = (recent / previous - 1) * 100 else: features['customer_reuse_trend'] = 0 else: features['customer_reuse_trend'] = 0 except: features['customer_reuse_rate'] = 25.0 features['customer_reuse_rate_last'] = 25.0 features['customer_reuse_trend'] = 0 else: features['customer_reuse_rate'] = 25.0 features['customer_reuse_rate_last'] = 25.0 features['customer_reuse_trend'] = 0 # 신규 고객 비율 - 안전한 변환 if 'MCT_UE_CLN_NEW_RAT' in df.columns: features['customer_new_rate'] = safe_numeric_convert(df['MCT_UE_CLN_NEW_RAT'], 30.0) else: features['customer_new_rate'] = 30.0 # 연령대별 고객 비율 (남성) - 안전한 변환 age_columns_male = ['M12_MAL_1020_RAT', 'M12_MAL_30_RAT', 'M12_MAL_40_RAT', 'M12_MAL_50_RAT', 'M12_MAL_60_RAT'] for col in age_columns_male: if col in df.columns: features[f'customer_{col.lower()}'] = safe_numeric_convert(df[col], 10.0) else: features[f'customer_{col.lower()}'] = 10.0 # 연령대별 고객 비율 (여성) - 안전한 변환 age_columns_female = ['M12_FME_1020_RAT', 'M12_FME_30_RAT', 'M12_FME_40_RAT', 'M12_FME_50_RAT', 'M12_FME_60_RAT'] for col in age_columns_female: if col in df.columns: features[f'customer_{col.lower()}'] = safe_numeric_convert(df[col], 10.0) else: features[f'customer_{col.lower()}'] = 10.0 return features def _create_operation_features(self, df: pd.DataFrame) -> Dict: """운영 관련 특징 생성(8개)""" features = {} if len(df) == 0: return self._get_default_operation_features() # 영업 개월 수 if 'MCT_OPE_MS_CN' in df.columns: ope_months_map = { '1_0-25%': 3, '2_25-50%': 9, '3_25-50%': 9, '4_50-75%': 18, '5_75-100%': 30, '6_100%+': 48 } ope_numeric = df['MCT_OPE_MS_CN'].map(ope_months_map).fillna(12) features['operation_months'] = ope_numeric.iloc[-1] if len(ope_numeric) > 0 else 12 features['operation_months_avg'] = ope_numeric.mean() else: features['operation_months'] = 12 features['operation_months_avg'] = 12 # 평균 이용 금액 if 'RC_M1_AV_NP_AT' in df.columns: avg_amount_map = { '1_0-25%': 15000, '2_25-50%': 30000, '3_25-50%': 30000, '4_50-75%': 45000, '5_75-100%': 60000, '6_100%+': 80000 } avg_amount = df['RC_M1_AV_NP_AT'].map(avg_amount_map).fillna(30000) features['operation_avg_amount'] = avg_amount.mean() features['operation_avg_amount_last'] = avg_amount.iloc[-1] if len(avg_amount) > 0 else features[ 'operation_avg_amount'] else: features['operation_avg_amount'] = 30000 features['operation_avg_amount_last'] = 30000 # 승인 취소율 - 안전한 변환 if 'APV_CE_RAT' in df.columns: features['operation_cancel_rate'] = safe_numeric_convert(df['APV_CE_RAT'], 5.0) else: features['operation_cancel_rate'] = 5.0 # 배달 매출 비율 - 안전한 변환 if 'DLV_SAA_RAT' in df.columns: features['operation_delivery_rate'] = safe_numeric_convert(df['DLV_SAA_RAT'], 20.0) else: features['operation_delivery_rate'] = 20.0 return features def _create_trend_features(self, df: pd.DataFrame) -> Dict: """트렌드 특징 생성(5개)""" features = {} if len(df) < 3: return self._get_default_trend_features() # 매출 구간 매핑 sales_map = { '1_0-25%': 25, '2_25-50%': 37.5, '3_25-50%': 37.5, '4_50-75%': 62.5, '5_75-100%': 87.5, '6_100%+': 100 } if 'RC_M1_SAA' in df.columns: sales = df['RC_M1_SAA'].map(sales_map).fillna(50).values else: sales = np.array([50] * len(df)) # 선형 회귀 X = np.arange(len(sales)) if len(sales) >= 2 and not np.all(np.isnan(sales)): valid_mask = ~np.isnan(sales) if valid_mask.sum() >= 2: slope, intercept, r_value, p_value, std_err = stats.linregress(X[valid_mask], sales[valid_mask]) features['trend_slope'] = slope features['trend_r2'] = r_value ** 2 features['trend_direction'] = 1 if slope > 0 else -1 if slope < 0 else 0 else: features['trend_slope'] = 0 features['trend_r2'] = 0 features['trend_direction'] = 0 else: features['trend_slope'] = 0 features['trend_r2'] = 0 features['trend_direction'] = 0 # 연속 하락/상승 개월 수 consecutive_down = 0 consecutive_up = 0 for i in range(len(sales) - 1, 0, -1): if not np.isnan(sales[i]) and not np.isnan(sales[i - 1]): if sales[i] < sales[i - 1]: consecutive_down += 1 else: break for i in range(len(sales) - 1, 0, -1): if not np.isnan(sales[i]) and not np.isnan(sales[i - 1]): if sales[i] > sales[i - 1]: consecutive_up += 1 else: break features['trend_consecutive_down'] = consecutive_down features['trend_consecutive_up'] = consecutive_up return features def _create_volatility_features(self, df: pd.DataFrame) -> Dict: """변동성 특징 생성(4개)""" features = {} if len(df) < 2: return self._get_default_volatility_features() # 매출 구간 매핑 sales_map = { '1_0-25%': 25, '2_25-50%': 37.5, '3_25-50%': 37.5, '4_50-75%': 62.5, '5_75-100%': 87.5, '6_100%+': 100 } if 'RC_M1_SAA' in df.columns: sales = df['RC_M1_SAA'].map(sales_map).fillna(50) else: sales = pd.Series([50] * len(df)) # 변동계수(CV) mean_sales = sales.mean() std_sales = sales.std() features['volatility_cv'] = (std_sales / mean_sales * 100) if mean_sales > 0 else 0 # 표준편차 features['volatility_std'] = std_sales # MAD(Mean Absolute Deviation) features['volatility_mad'] = (sales - mean_sales).abs().mean() # 최근 3개월 변동성 if len(sales) >= 3: recent_std = sales.tail(3).std() features['volatility_recent_std'] = recent_std if not np.isnan(recent_std) else 0 else: features['volatility_recent_std'] = 0 return features def _create_seasonality_features(self, df: pd.DataFrame) -> Dict: """계절성 특징 생성(2개)""" features = {} if len(df) < 12: features['seasonality_detected'] = 0 features['seasonality_strength'] = 0 return features # 매출 구간 매핑 sales_map = { '1_0-25%': 25, '2_25-50%': 37.5, '3_25-50%': 37.5, '4_50-75%': 62.5, '5_75-100%': 87.5, '6_100%+': 100 } if 'RC_M1_SAA' in df.columns: sales = df['RC_M1_SAA'].map(sales_map).fillna(50).values else: sales = np.array([50] * len(df)) # 간단한 계절성 감지(최대-최소 차이) max_sales = np.nanmax(sales) min_sales = np.nanmin(sales) mean_sales = np.nanmean(sales) if mean_sales > 0: seasonality_strength = (max_sales - min_sales) / mean_sales * 100 features['seasonality_strength'] = seasonality_strength features['seasonality_detected'] = 1 if seasonality_strength > 30 else 0 else: features['seasonality_strength'] = 0 features['seasonality_detected'] = 0 return features def _create_context_features(self, store_data: Dict, df: pd.DataFrame) -> Dict: """맥락 특징 생성(1개)""" features = {} # 업종 features['context_industry'] = store_data.get('industry', '기타') return features # 기본값 반환 함수들 def _get_default_sales_features(self) -> Dict: """기본 매출 특징""" return { 'sales_avg_1m': 50, 'sales_avg_3m': 50, 'sales_avg_6m': 50, 'sales_avg_12m': 50, 'sales_recent_vs_previous': 0, 'sales_mom_change': 0, 'sales_yoy_change': 0, 'sales_max': 50, 'sales_min': 50, 'sales_range': 0, 'sales_recent_vs_total': 0, 'sales_below_avg': 0, 'sales_recent_trend': 0 } def _get_default_customer_features(self) -> Dict: """기본 고객 특징""" features = { 'customer_reuse_rate': 25.0, 'customer_reuse_rate_last': 25.0, 'customer_reuse_trend': 0, 'customer_new_rate': 30.0 } # 연령대별 기본값 for age in ['1020', '30', '40', '50', '60']: features[f'customer_m12_mal_{age}_rat'] = 10.0 features[f'customer_m12_fme_{age}_rat'] = 10.0 return features def _get_default_operation_features(self) -> Dict: """기본 운영 특징""" return { 'operation_months': 12, 'operation_months_avg': 12, 'operation_avg_amount': 30000, 'operation_avg_amount_last': 30000, 'operation_cancel_rate': 5.0, 'operation_delivery_rate': 20.0 } def _get_default_trend_features(self) -> Dict: """기본 트렌드 특징""" return { 'trend_slope': 0, 'trend_r2': 0, 'trend_direction': 0, 'trend_consecutive_down': 0, 'trend_consecutive_up': 0 } def _get_default_volatility_features(self) -> Dict: """기본 변동성 특징""" return { 'volatility_cv': 0, 'volatility_std': 0, 'volatility_mad': 0, 'volatility_recent_std': 0 }