|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from typing import Dict, List, Tuple, Optional |
|
|
from scipy import stats |
|
|
from datetime import datetime |
|
|
import warnings |
|
|
|
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
def safe_numeric_convert(series, default_value=0):
    """Return the mean of *series* after a best-effort numeric conversion.

    Values that cannot be parsed as numbers, and the sentinel ``-999999.9``,
    are treated as missing and excluded from the mean.

    Args:
        series: A ``pandas.Series`` or anything ``pandas.Series`` can wrap
            (list, tuple, scalar).
        default_value: Returned when no usable number remains or when the
            conversion itself fails.

    Returns:
        The mean of the usable values, or ``default_value``.
    """
    try:
        # Wrapping first guarantees the Series API below; without it,
        # pd.to_numeric returns an ndarray/scalar for non-Series input.
        numbers = pd.to_numeric(pd.Series(series), errors='coerce')
        numbers = numbers.replace(-999999.9, np.nan)
        if numbers.isna().all():
            return default_value
        return numbers.mean()
    except Exception:
        # Deliberately best-effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return default_value
|
|
|
|
|
|
|
|
class FeatureEngineer:
    """Turn one store's monthly history into a single-row feature table.

    Every ``_create_*_features`` method returns a flat ``dict`` for one
    feature family; :meth:`create_features` merges them in a fixed order into
    a one-row ``DataFrame``.

    "Recent" values are always read from the tail of the input frames, so the
    rows are assumed to be in chronological order (oldest first) -- TODO
    confirm against the caller.
    """

    # Bucket label -> numeric score for the ``RC_M1_SAA`` column.
    # NOTE(review): two distinct labels carry "25-50%" and share a score;
    # verify these labels against the values that actually occur in the data.
    _SALES_SCORE_BY_BUCKET = {
        '1_0-25%': 25,
        '2_25-50%': 37.5,
        '3_25-50%': 37.5,
        '4_50-75%': 62.5,
        '5_75-100%': 87.5,
        '6_100%+': 100,
    }

    # Bucket label -> value used for the ``MCT_OPE_MS_CN`` column.
    _OPERATION_MONTHS_BY_BUCKET = {
        '1_0-25%': 3,
        '2_25-50%': 9,
        '3_25-50%': 9,
        '4_50-75%': 18,
        '5_75-100%': 30,
        '6_100%+': 48,
    }

    # Bucket label -> value used for the ``RC_M1_AV_NP_AT`` column.
    _AVG_AMOUNT_BY_BUCKET = {
        '1_0-25%': 15000,
        '2_25-50%': 30000,
        '3_25-50%': 30000,
        '4_50-75%': 45000,
        '5_75-100%': 60000,
        '6_100%+': 80000,
    }

    # Age/gender ratio columns, in the order their features are emitted.
    _AGE_RATIO_COLUMNS = (
        'M12_MAL_1020_RAT', 'M12_MAL_30_RAT', 'M12_MAL_40_RAT',
        'M12_MAL_50_RAT', 'M12_MAL_60_RAT',
        'M12_FME_1020_RAT', 'M12_FME_30_RAT', 'M12_FME_40_RAT',
        'M12_FME_50_RAT', 'M12_FME_60_RAT',
    )

    def __init__(self, include_weather: bool = False):
        """Store the configuration.

        Args:
            include_weather: Kept on the instance; no method of this class
                reads it.
        """
        self.include_weather = include_weather

    def create_features(self, store_data: Dict, monthly_usage: pd.DataFrame,
                        monthly_customers: pd.DataFrame) -> pd.DataFrame:
        """Build the complete feature row for one store.

        Args:
            store_data: Store metadata; only the ``'industry'`` key is read.
            monthly_usage: Monthly frame feeding the sales, operation, trend,
                volatility and seasonality features.
            monthly_customers: Monthly frame feeding the customer features.

        Returns:
            A one-row ``DataFrame`` whose columns are the merged features, in
            the order sales, customer, operation, trend, volatility,
            seasonality, context.
        """
        features: Dict = {}
        for part in (
            self._create_sales_features(monthly_usage),
            self._create_customer_features(monthly_customers),
            self._create_operation_features(monthly_usage),
            self._create_trend_features(monthly_usage),
            self._create_volatility_features(monthly_usage),
            self._create_seasonality_features(monthly_usage),
            self._create_context_features(store_data, monthly_usage),
        ):
            features.update(part)
        return pd.DataFrame([features])

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _row_count(df: Optional[pd.DataFrame]) -> int:
        """Return the number of rows in *df*, treating ``None`` as empty."""
        return 0 if df is None else len(df)

    @staticmethod
    def _pct_change(current, base):
        """Return the percent change from *base* to *current*.

        Yields 0 when *base* is not strictly positive (including NaN).
        """
        return (current / base - 1) * 100 if base > 0 else 0

    @staticmethod
    def _trailing_streak(flags) -> int:
        """Count how many consecutive truthy entries end the sequence *flags*."""
        streak = 0
        for flag in flags[::-1]:
            if not flag:
                break
            streak += 1
        return streak

    @staticmethod
    def _column_mean(df: pd.DataFrame, column: str, default: float):
        """Return the cleaned mean of ``df[column]``, or *default* if absent."""
        if column in df.columns:
            return safe_numeric_convert(df[column], default)
        return default

    def _sales_scores(self, df: pd.DataFrame) -> pd.Series:
        """Map ``RC_M1_SAA`` bucket labels to numeric scores.

        Unknown or missing labels become 50; when the column itself is
        missing, every row scores 50.  The result never contains NaN.
        """
        if 'RC_M1_SAA' in df.columns:
            return df['RC_M1_SAA'].map(self._SALES_SCORE_BY_BUCKET).fillna(50)
        return pd.Series([50] * len(df))

    # ------------------------------------------------------------------
    # Feature families
    # ------------------------------------------------------------------

    def _create_sales_features(self, df: pd.DataFrame) -> Dict:
        """Create the 13 sales features from the ``RC_M1_SAA`` scores.

        Returns the defaults from :meth:`_get_default_sales_features` when
        *df* is empty or ``None``.
        """
        if self._row_count(df) == 0:
            return self._get_default_sales_features()

        sales = self._sales_scores(df)
        months = len(sales)
        has_3m = months >= 3
        avg_3m = sales.tail(3).mean()
        avg_all = sales.mean()

        features = {
            'sales_avg_1m': sales.tail(1).mean(),
            'sales_avg_3m': avg_3m if has_3m else 50,
            'sales_avg_6m': sales.tail(6).mean() if months >= 6 else 50,
            # NOTE(review): despite the name this averages the whole history,
            # not just the last 12 rows; kept as-is so values do not shift.
            'sales_avg_12m': avg_all,
        }

        # Last three months versus the three months before them.
        features['sales_recent_vs_previous'] = (
            self._pct_change(avg_3m, sales.tail(6).head(3).mean())
            if months >= 6 else 0
        )
        features['sales_mom_change'] = (
            self._pct_change(sales.iloc[-1], sales.iloc[-2])
            if months >= 2 else 0
        )
        features['sales_yoy_change'] = (
            self._pct_change(sales.iloc[-1], sales.iloc[-13])
            if months >= 13 else 0
        )

        features['sales_max'] = sales.max()
        features['sales_min'] = sales.min()
        features['sales_range'] = features['sales_max'] - features['sales_min']

        features['sales_recent_vs_total'] = (
            self._pct_change(avg_3m, avg_all) if has_3m else 0
        )
        features['sales_below_avg'] = (
            1 if features['sales_avg_3m'] < features['sales_avg_12m'] else 0
        )

        if has_3m:
            window = sales.tail(3).values
            # NOTE(review): divides by the number of points (3), not the
            # number of intervals (2); kept to preserve existing values.
            features['sales_recent_trend'] = (window[-1] - window[0]) / len(window)
        else:
            features['sales_recent_trend'] = 0

        return features

    def _reuse_features(self, df: pd.DataFrame) -> Dict:
        """Create the three reuse-rate features from ``MCT_UE_CLN_REU_RAT``.

        Falls back to fixed defaults when the column is missing or cannot be
        processed.  The sentinel ``-999999.9`` is treated as missing.
        """
        fallback = {
            'customer_reuse_rate': 25.0,
            'customer_reuse_rate_last': 25.0,
            'customer_reuse_trend': 0,
        }
        if 'MCT_UE_CLN_REU_RAT' not in df.columns:
            return fallback

        try:
            reuse = pd.to_numeric(df['MCT_UE_CLN_REU_RAT'], errors='coerce')
            reuse = reuse.replace(-999999.9, np.nan)

            mean_rate = reuse.mean() if not reuse.isna().all() else 25.0
            has_last = len(reuse) > 0 and pd.notna(reuse.iloc[-1])
            last_rate = reuse.iloc[-1] if has_last else mean_rate

            trend = 0
            if len(reuse) >= 6:
                recent = reuse.tail(3).mean()
                previous = reuse.tail(6).head(3).mean()
                if pd.notna(recent) and pd.notna(previous) and previous > 0:
                    trend = (recent / previous - 1) * 100
        except Exception:
            # Best-effort: any processing failure yields the defaults.
            return fallback

        return {
            'customer_reuse_rate': mean_rate,
            'customer_reuse_rate_last': last_rate,
            'customer_reuse_trend': trend,
        }

    def _create_customer_features(self, df: pd.DataFrame) -> Dict:
        """Create the 14 customer features.

        Three reuse-rate features, the new-customer rate, and one mean ratio
        per age/gender column.  Returns the defaults from
        :meth:`_get_default_customer_features` when *df* is empty or ``None``.
        """
        if self._row_count(df) == 0:
            return self._get_default_customer_features()

        features = dict(self._reuse_features(df))
        features['customer_new_rate'] = self._column_mean(
            df, 'MCT_UE_CLN_NEW_RAT', 30.0)
        for column in self._AGE_RATIO_COLUMNS:
            features[f'customer_{column.lower()}'] = self._column_mean(
                df, column, 10.0)
        return features

    def _create_operation_features(self, df: pd.DataFrame) -> Dict:
        """Create the 6 operation features.

        Returns the defaults from :meth:`_get_default_operation_features`
        when *df* is empty or ``None``; a missing column falls back to that
        column's own default.
        """
        if self._row_count(df) == 0:
            return self._get_default_operation_features()

        features = {}

        if 'MCT_OPE_MS_CN' in df.columns:
            months = df['MCT_OPE_MS_CN'].map(
                self._OPERATION_MONTHS_BY_BUCKET).fillna(12)
            features['operation_months'] = months.iloc[-1]
            features['operation_months_avg'] = months.mean()
        else:
            features['operation_months'] = 12
            features['operation_months_avg'] = 12

        if 'RC_M1_AV_NP_AT' in df.columns:
            amounts = df['RC_M1_AV_NP_AT'].map(
                self._AVG_AMOUNT_BY_BUCKET).fillna(30000)
            features['operation_avg_amount'] = amounts.mean()
            features['operation_avg_amount_last'] = amounts.iloc[-1]
        else:
            features['operation_avg_amount'] = 30000
            features['operation_avg_amount_last'] = 30000

        features['operation_cancel_rate'] = self._column_mean(
            df, 'APV_CE_RAT', 5.0)
        features['operation_delivery_rate'] = self._column_mean(
            df, 'DLV_SAA_RAT', 20.0)
        return features

    def _create_trend_features(self, df: pd.DataFrame) -> Dict:
        """Create the 5 trend features from the ``RC_M1_SAA`` scores.

        Fits a least-squares line over the whole history and counts how many
        of the most recent month-to-month steps were strictly down / up.
        Returns the defaults when fewer than three rows are available.
        """
        if self._row_count(df) < 3:
            return self._get_default_trend_features()

        # _sales_scores never yields NaN, so no masking is required here.
        sales = self._sales_scores(df).values
        slope, _intercept, r_value, _p_value, _std_err = stats.linregress(
            np.arange(len(sales)), sales)

        later, earlier = sales[1:], sales[:-1]
        return {
            'trend_slope': slope,
            'trend_r2': r_value ** 2,
            'trend_direction': 1 if slope > 0 else -1 if slope < 0 else 0,
            'trend_consecutive_down': self._trailing_streak(later < earlier),
            'trend_consecutive_up': self._trailing_streak(later > earlier),
        }

    def _create_volatility_features(self, df: pd.DataFrame) -> Dict:
        """Create the 4 volatility features from the ``RC_M1_SAA`` scores.

        Returns the defaults when fewer than two rows are available.
        """
        if self._row_count(df) < 2:
            return self._get_default_volatility_features()

        sales = self._sales_scores(df)
        mean_sales = sales.mean()
        std_sales = sales.std()

        features = {
            # Coefficient of variation, in percent.
            'volatility_cv': (std_sales / mean_sales * 100) if mean_sales > 0 else 0,
            'volatility_std': std_sales,
            # Mean absolute deviation from the mean.
            'volatility_mad': (sales - mean_sales).abs().mean(),
        }

        if len(sales) >= 3:
            recent_std = sales.tail(3).std()
            features['volatility_recent_std'] = (
                recent_std if not np.isnan(recent_std) else 0
            )
        else:
            features['volatility_recent_std'] = 0
        return features

    def _create_seasonality_features(self, df: pd.DataFrame) -> Dict:
        """Create the 2 seasonality features from the ``RC_M1_SAA`` scores.

        Strength is the max-min spread as a percentage of the mean; the flag
        is set when that exceeds 30.  Both are 0 with fewer than 12 rows.
        Keys are always emitted as ``strength, detected`` so the column order
        does not depend on the history length.
        """
        if self._row_count(df) < 12:
            return {'seasonality_strength': 0, 'seasonality_detected': 0}

        sales = self._sales_scores(df).values
        mean_sales = np.nanmean(sales)
        if not mean_sales > 0:
            return {'seasonality_strength': 0, 'seasonality_detected': 0}

        strength = (np.nanmax(sales) - np.nanmin(sales)) / mean_sales * 100
        return {
            'seasonality_strength': strength,
            'seasonality_detected': 1 if strength > 30 else 0,
        }

    def _create_context_features(self, store_data: Dict, df: pd.DataFrame) -> Dict:
        """Create the single context feature (the store's industry).

        *df* is accepted for signature parity with the other builders and is
        not read.  A missing key or ``None`` *store_data* yields ``'기타'``.
        """
        return {'context_industry': (store_data or {}).get('industry', '기타')}

    # ------------------------------------------------------------------
    # Defaults used when there is not enough history
    # ------------------------------------------------------------------

    def _get_default_sales_features(self) -> Dict:
        """Return the default sales features."""
        return {
            'sales_avg_1m': 50, 'sales_avg_3m': 50, 'sales_avg_6m': 50, 'sales_avg_12m': 50,
            'sales_recent_vs_previous': 0, 'sales_mom_change': 0, 'sales_yoy_change': 0,
            'sales_max': 50, 'sales_min': 50, 'sales_range': 0,
            'sales_recent_vs_total': 0, 'sales_below_avg': 0, 'sales_recent_trend': 0,
        }

    def _get_default_customer_features(self) -> Dict:
        """Return the default customer features.

        Keys follow the same order as :meth:`_create_customer_features`
        (all male columns, then all female columns).
        """
        features = {
            'customer_reuse_rate': 25.0,
            'customer_reuse_rate_last': 25.0,
            'customer_reuse_trend': 0,
            'customer_new_rate': 30.0,
        }
        for column in self._AGE_RATIO_COLUMNS:
            features[f'customer_{column.lower()}'] = 10.0
        return features

    def _get_default_operation_features(self) -> Dict:
        """Return the default operation features."""
        return {
            'operation_months': 12,
            'operation_months_avg': 12,
            'operation_avg_amount': 30000,
            'operation_avg_amount_last': 30000,
            'operation_cancel_rate': 5.0,
            'operation_delivery_rate': 20.0,
        }

    def _get_default_trend_features(self) -> Dict:
        """Return the default trend features."""
        return {
            'trend_slope': 0,
            'trend_r2': 0,
            'trend_direction': 0,
            'trend_consecutive_down': 0,
            'trend_consecutive_up': 0,
        }

    def _get_default_volatility_features(self) -> Dict:
        """Return the default volatility features."""
        return {
            'volatility_cv': 0,
            'volatility_std': 0,
            'volatility_mad': 0,
            'volatility_recent_std': 0,
        }
|
|
|