diff --git "a/Analysis_code/deeplearning_model_multi.ipynb" "b/Analysis_code/deeplearning_model_multi.ipynb" new file mode 100644--- /dev/null +++ "b/Analysis_code/deeplearning_model_multi.ipynb" @@ -0,0 +1,2676 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 통합 적용 모델 구성" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 통일 데이터셋 변환 함수 정의" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'%pip install torch torchvision scikit-learn'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''%pip install torch torchvision scikit-learn'''" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "import random\n", + "\n", + "# Python 및 Numpy 시드 고정\n", + "seed = 42\n", + "random.seed(seed)\n", + "np.random.seed(seed)\n", + "\n", + "# PyTorch 시드 고정\n", + "torch.manual_seed(seed)\n", + "torch.cuda.manual_seed(seed)\n", + "torch.cuda.manual_seed_all(seed) # Multi-GPU 환경에서 동일한 시드 적용\n", + "\n", + "# PyTorch 연산의 결정적 모드 설정\n", + "torch.backends.cudnn.deterministic = True # 실행마다 동일한 결과를 보장\n", + "torch.backends.cudnn.benchmark = True # 성능 최적화를 활성화 (가능한 한 빠른 연산 수행)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "import torch\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "import random\n", + "\n", + "# 전처리 함수\n", + "def preprocessing(df):\n", + " df = df[df.columns].copy()\n", + " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n", + " df['wind_dir'] = df['wind_dir'].astype('int')\n", + " df['lm_cloudcover'] = df['lm_cloudcover'].astype('int')\n", + " df['cloudcover'] = df['cloudcover'].astype('int')\n", + " return df\n", + "\n", + "# 데이터셋 준비 함수\n", + "def prepare_dataset(region, data_sample='pure', target='multi', fold=3):\n", + "\n", + " # 데이터 경로 지정\n", + " dat_path = f\"../data/data_for_modeling/{region}_train.csv\"\n", + " if data_sample == 'pure':\n", + " train_path = dat_path\n", + " else:\n", + " train_path = f'../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'\n", + " test_path = f\"../data/data_for_modeling/{region}_test.csv\"\n", + " drop_col = ['binary_class','multi_class','visi','year']\n", + " target_col = f'{target}_class'\n", + " \n", + " # 데이터 로드\n", + " region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))\n", + " if data_sample == 'pure':\n", + " region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]\n", + " else:\n", + " region_train = preprocessing(pd.read_csv(train_path))\n", + " region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]\n", + " region_test = preprocessing(pd.read_csv(test_path))\n", + "\n", + " # 컬럼 정렬 (일관성 유지)\n", + " common_columns = region_train.columns.to_list()\n", + " train_data = region_train[common_columns]\n", + " val_data = region_val[common_columns]\n", + " test_data = region_test[common_columns]\n", + "\n", + " # 설명변수 & 타겟 분리\n", + " X_train = train_data.drop(columns=drop_col)\n", + " y_train = train_data[target_col]\n", + " X_val = val_data.drop(columns=drop_col)\n", + " y_val = val_data[target_col]\n", + " X_test = test_data.drop(columns=drop_col)\n", + " y_test = test_data[target_col]\n", + "\n", + " # 범주형 & 연속형 변수 분리\n", + " categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns\n", + " numerical_cols = X_train.select_dtypes(include=['float64']).columns\n", + "\n", + " # 범주형 변수 Label Encoding\n", + " label_encoders = {}\n", + " for col in categorical_cols:\n", + " le = LabelEncoder()\n", + " le.fit(X_train[col]) # Train 데이터 기준으로 학습\n", + " label_encoders[col] = le\n", + "\n", + " # 변환 적용\n", + " for col in categorical_cols:\n", + " X_train[col] = label_encoders[col].transform(X_train[col])\n", + " X_val[col] = label_encoders[col].transform(X_val[col])\n", + " X_test[col] = label_encoders[col].transform(X_test[col])\n", + "\n", + " # 연속형 변수 Standard Scaling\n", + " scaler = StandardScaler()\n", + " scaler.fit(X_train[numerical_cols]) # Train 데이터 기준으로 학습\n", + "\n", + " # 변환 적용\n", + " X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])\n", + " X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])\n", + " X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])\n", + "\n", + " return X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, numerical_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "import torch\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "import random\n", + "\n", + "# 전처리 함수\n", + "def preprocessing(df):\n", + " df = df[df.columns].copy()\n", + " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n", + " df['wind_dir'] = df['wind_dir'].astype('int')\n", + " df['lm_cloudcover'] = df['lm_cloudcover'].astype('int')\n", + " df['cloudcover'] = df['cloudcover'].astype('int')\n", + " return df\n", + "\n", + "# 데이터 변환 및 dataloader 생성 함수\n", + "def prepare_dataloader(region, data_sample='pure', target='multi', fold=3, random_state=None):\n", + "\n", + " # 데이터 경로 지정\n", + " dat_path = f\"../data/data_for_modeling/{region}_train.csv\"\n", + " if data_sample == 'pure':\n", + " train_path = dat_path\n", + " else:\n", + " train_path = f'../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'\n", + " test_path = f\"../data/data_for_modeling/{region}_test.csv\"\n", + " drop_col = ['binary_class','multi_class','visi','year']\n", + " target_col = f'{target}_class'\n", + " \n", + " # 데이터 로드\n", + " region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))\n", + " if data_sample == 'pure':\n", + " region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]\n", + " else:\n", + " region_train = preprocessing(pd.read_csv(train_path))\n", + " region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]\n", + " region_test = preprocessing(pd.read_csv(test_path))\n", + "\n", + " # 컬럼 정렬 (일관성 유지)\n", + " common_columns = region_train.columns.to_list()\n", + " train_data = region_train[common_columns]\n", + " val_data = region_val[common_columns]\n", + " test_data = region_test[common_columns]\n", + "\n", + " # 설명변수 & 타겟 분리\n", + " X_train = train_data.drop(columns=drop_col)\n", + " y_train = train_data[target_col]\n", + " X_val = val_data.drop(columns=drop_col)\n", + " y_val = val_data[target_col]\n", + " X_test = test_data.drop(columns=drop_col)\n", + " y_test = test_data[target_col]\n", + "\n", + " # 범주형 & 연속형 변수 분리\n", + " categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns\n", + " numerical_cols = X_train.select_dtypes(include=['float64']).columns\n", + "\n", + " # 범주형 변수 Label Encoding\n", + " label_encoders = {}\n", + " for col in categorical_cols:\n", + " le = LabelEncoder()\n", + " le.fit(X_train[col]) # Train 데이터 기준으로 학습\n", + " label_encoders[col] = le\n", + "\n", + " # 변환 적용\n", + " for col in categorical_cols:\n", + " X_train[col] = label_encoders[col].transform(X_train[col])\n", + " X_val[col] = label_encoders[col].transform(X_val[col])\n", + " X_test[col] = label_encoders[col].transform(X_test[col])\n", + "\n", + " # 연속형 변수 Standard Scaling\n", + " scaler = StandardScaler()\n", + " scaler.fit(X_train[numerical_cols]) # Train 데이터 기준으로 학습\n", + "\n", + " # 변환 적용\n", + " X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])\n", + " X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])\n", + " X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])\n", + "\n", + " # 연속형 변수와 범주형 변수 분리\n", + " X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)\n", + " X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)\n", + "\n", + " X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)\n", + " X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)\n", + "\n", + " X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)\n", + " X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)\n", + "\n", + " # 레이블 변환\n", + " if target == \"binary\":\n", + " y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32) # 이진 분류 → float32\n", + " y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)\n", + " y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)\n", + " elif target == \"multi\":\n", + " y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) # 다중 분류 → long\n", + " y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)\n", + " y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)\n", + " else:\n", + " raise ValueError(\"target must be 'binary' or 'multi'\")\n", + "\n", + " # TensorDataset 생성\n", + " train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)\n", + " val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)\n", + " test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)\n", + "\n", + " # DataLoader 생성\n", + " if random_state == None:\n", + " train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n", + " else:\n", + " train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(random_state))\n", + " val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)\n", + " test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)\n", + " \n", + " return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 사용자 정의 성능지표 함수" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.utils.class_weight import compute_class_weight\n", + "\n", + "def calculate_csi(Y_test, pred):\n", + "\n", + " cm = confusion_matrix(Y_test, pred) # 변수 이름을 cm으로 변경\n", + " # 혼동 행렬에서 H, F, M 추출\n", + " H = (cm[0, 0] + cm[1, 1])\n", + " \n", + " F = (cm[1, 0] + cm[2, 0] +\n", + " cm[0, 1] + cm[2, 1])\n", + " \n", + " M = (cm[0, 2] + cm[1, 2])\n", + " \n", + " # CSI 계산\n", + " CSI = H / (H + F + M + 1e-10)\n", + " return CSI\n", + "\n", + "def eval_metric_csi(y_true, pred_prob):\n", + "\n", + " pred = np.argmax(pred_prob, axis=1)\n", + " y_true = y_true\n", + " y_pred = pred\n", + " csi = calculate_csi(y_true, y_pred)\n", + " return -1*csi\n", + "\n", + "def sample_weight(y_train):\n", + " class_weights = compute_class_weight(\n", + " class_weight='balanced',\n", + " classes=np.unique(y_train), # 고유 클래스\n", + " y=y_train # 학습 데이터 레이블\n", + " )\n", + " sample_weights = np.array([class_weights[label] for label in y_train])\n", + "\n", + " return sample_weights" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 통일 하이퍼파라미터 최적화 함수 정의" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'%pip install --upgrade ipywidgets'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''%pip install --upgrade ipywidgets'''" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import optuna\n", + "from sklearn.metrics import accuracy_score, f1_score\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from ft_transformer import FTTransformer\n", + "from resnet_like import ResNetLike\n", + "from deepgbm import DeepGBM\n", + "\n", + "# Optuna의 Trial 로그 숨기기 (WARNING 레벨 이상만 출력)\n", + "optuna.logging.set_verbosity(optuna.logging.WARNING)\n", + "\n", + "# 모델을 GPU로 전송\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "\n", + "# 하이퍼파라미터 최적화 함수 정의\n", + "def objective(trial, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=None):\n", + "\n", + " val_scores = []\n", + "\n", + " # fold별로 반복\n", + " for fold in range(1, n_folds+1):\n", + " X_train, categorical_cols, numerical_cols, train_loader, val_loader, _ = prepare_dataloader(region, data_sample=data_sample, target=target, fold=fold, random_state=random_state)\n", + "\n", + " if model_choose == \"ft_transformer\":\n", + " d_token = trial.suggest_categorical(\"d_token\", [64, 128, 192, 256])\n", + " n_blocks = trial.suggest_int(\"n_blocks\", 4, 8)\n", + " attention_dropout = trial.suggest_float(\"attention_dropout\", 0.2, 0.5)\n", + " ffn_dropout = trial.suggest_float(\"ffn_dropout\", 0.2, 0.5)\n", + " lr = trial.suggest_float(\"lr\", 1e-4, 1e-3, log=True)\n", + " weight_decay = trial.suggest_float(\"weight_decay\", 1e-5, 1e-3, log=True)\n", + "\n", + " # FT-Transformer 초기화(다중분류: 3개 범주)\n", + " model = FTTransformer(\n", + " num_features=len(numerical_cols),\n", + " cat_cardinalities=[len(X_train[col].unique()) for col in categorical_cols],\n", + " d_token=d_token,\n", + " n_blocks=n_blocks,\n", + " attention_dropout=attention_dropout,\n", + " ffn_dropout=ffn_dropout,\n", + " num_classes=3\n", + " ).to(device)\n", + "\n", + " elif model_choose == 'resnet_like':\n", + " # 하이퍼파라미터 탐색 공간 정의\n", + " d_main = trial.suggest_categorical(\"d_main\", [64, 128, 192, 256])\n", + " d_hidden = trial.suggest_categorical(\"d_hidden\", [32, 64, 128])\n", + " n_blocks = trial.suggest_int(\"n_blocks\", 3, 8) # ResNet 블록 수\n", + " dropout_first = trial.suggest_float(\"dropout_first\", 0.1, 0.5) # 첫 번째 Dropout\n", + " dropout_second = trial.suggest_float(\"dropout_second\", 0.0, 0.3) # 두 번째 Dropout\n", + " lr = trial.suggest_float(\"lr\", 1e-4, 1e-2, log=True) # 학습률\n", + " weight_decay = trial.suggest_float(\"weight_decay\", 1e-6, 1e-3, log=True) # L2 정규화\n", + "\n", + " # 연속형 변수 + 범주형 변수 개수 반영하여 모델 입력 크기 설정\n", + " input_dim = len(numerical_cols) + len(categorical_cols)\n", + "\n", + " # 모델 초기화 및 GPU로 이동\n", + " model = ResNetLike(\n", + " input_dim=input_dim,\n", + " d_main=d_main, \n", + " d_hidden=d_hidden, \n", + " n_blocks=n_blocks, \n", + " dropout_first=dropout_first, \n", + " dropout_second=dropout_second,\n", + " num_classes=3\n", + " ).to(device)\n", + "\n", + " elif model_choose == 'deepgbm':\n", + " d_main = trial.suggest_categorical(\"d_main\", [64, 128, 192, 256])\n", + " d_hidden = trial.suggest_categorical(\"d_hidden\", [32, 64, 128])\n", + " n_blocks = trial.suggest_int(\"n_blocks\", 3, 8) # ResNet 블록 개수\n", + " dropout = trial.suggest_float(\"dropout\", 0.1, 0.5) # Dropout 비율\n", + " lr = trial.suggest_float(\"lr\", 1e-4, 1e-3, log=True) # 학습률\n", + " weight_decay = trial.suggest_float(\"weight_decay\", 1e-5, 1e-3, log=True) # 정규화\n", + "\n", + " # DeepGBM 모델 초기화 (x_num, x_cat을 따로 받는 구조)\n", + " model = DeepGBM(\n", + " num_features=len(numerical_cols),\n", + " cat_features=[len(X_train[col].unique()) for col in categorical_cols],\n", + " d_main=d_main,\n", + " d_hidden=d_hidden,\n", + " n_blocks=n_blocks,\n", + " dropout=dropout,\n", + " num_classes=3\n", + " ).to(device)\n", + "\n", + " # 손실 함수 및 옵티마이저 설정\n", + " if target == 'binary':\n", + " criterion = nn.BCEWithLogitsLoss() # 이진 분류용\n", + " elif target == 'multi':\n", + " criterion = nn.CrossEntropyLoss() # 다중 분류용\n", + "\n", + " # 가중치 조정\n", + " optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)\n", + "\n", + " # 학습 설정\n", + " epochs = 50 # epoch 증가\n", + " patience = 8 # Early Stopping 기준 (8 epoch 동안 개선 없으면 중지)\n", + " best_val_score = 0 \n", + " counter = 0 \n", + "\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " for x_num_batch, x_cat_batch, y_batch in train_loader:\n", + " x_num_batch, x_cat_batch, y_batch = (\n", + " x_num_batch.to(device),\n", + " x_cat_batch.to(device),\n", + " y_batch.to(device)\n", + " )\n", + " optimizer.zero_grad()\n", + " y_pred = model(x_num_batch, x_cat_batch)\n", + "\n", + " # 손실 계산 (이진 분류 | 다중 분류)\n", + " if target == 'binary':\n", + " loss = criterion(y_pred, y_batch.float())\n", + " elif target == 'multi':\n", + " loss = criterion(y_pred, y_batch)\n", + "\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # Validation 평가\n", + " model.eval()\n", + " y_pred_val, y_true_val = [], []\n", + " with torch.no_grad():\n", + " for x_num_batch, x_cat_batch, y_batch in val_loader:\n", + " x_num_batch, x_cat_batch, y_batch = (\n", + " x_num_batch.to(device),\n", + " x_cat_batch.to(device),\n", + " y_batch.to(device)\n", + " )\n", + " output = model(x_num_batch, x_cat_batch)\n", + "\n", + " if target == 'binary':\n", + " pred = (torch.sigmoid(output) >= 0.5).long()\n", + " elif target == 'multi':\n", + " pred = output.argmax(dim=1)\n", + "\n", + " y_pred_val.extend(pred.cpu().numpy()) \n", + " y_true_val.extend(y_batch.cpu().numpy())\n", + "\n", + " # csi-score 계산 (다중클래스용)\n", + " val_csi = calculate_csi(y_true_val, y_pred_val) \n", + "\n", + " # Optuna Pruning 적용 (조기 종료)\n", + " trial.report(val_csi, epoch)\n", + " if trial.should_prune():\n", + " raise optuna.exceptions.TrialPruned()\n", + "\n", + " # Early Stopping 체크\n", + " if val_csi > best_val_score:\n", + " best_val_score = val_csi\n", + " counter = 0 # 개선되었으므로 카운터 초기화\n", + " else:\n", + " counter += 1 # 개선되지 않으면 카운터 증가\n", + "\n", + " if counter >= patience:\n", + " break # Early Stopping 발동\n", + "\n", + " val_scores.append(best_val_score)\n", + "\n", + " # 모든 fold에서 평균 성능을 반환\n", + " avg_val_score = sum(val_scores) / len(val_scores)\n", + " return avg_val_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 통일 최적화 + soft voting 함수 정의" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import optuna\n", + "from sklearn.metrics import accuracy_score, f1_score\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from ft_transformer import FTTransformer\n", + "from resnet_like import ResNetLike\n", + "from deepgbm import DeepGBM\n", + "import copy\n", + "import os\n", + "\n", + "# 모델을 GPU로 전송\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "\n", + "def fold_voting(model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state = None):\n", + "\n", + " # Optuna 실행\n", + " sampler = optuna.samplers.TPESampler(seed=seed)\n", + " study = optuna.create_study(direction=\"maximize\", sampler=sampler)\n", + " study.optimize(lambda trial: objective(trial, model_choose=model_choose, region=region, data_sample=data_sample, target=target, n_folds=n_folds, random_state=random_state), n_trials=50, show_progress_bar=True)\n", + "\n", + " # 최적의 하이퍼파라미터 가져오기\n", + " best_params = study.best_trial.params\n", + " print(f\"### Best Params (All Folds): {best_params} ###\")\n", + "\n", + " model_paths = []\n", + "\n", + " for fold in range(1, n_folds + 1):\n", + " X_train, categorical_cols, numerical_cols, train_loader, val_loader, _ = prepare_dataloader(region=region, data_sample=data_sample, target=target, fold=fold, random_state=seed)\n", + "\n", + " # 구현모델 선택\n", + " if model_choose == 'ft_transformer':\n", + " # FT-Transformer 초기화 (최적화된 하이퍼파라미터로 설정)\n", + " model = FTTransformer(\n", + " num_features=len(numerical_cols),\n", + " cat_cardinalities=[len(X_train[col].unique()) for col in categorical_cols],\n", + " d_token=best_params[\"d_token\"],\n", + " n_blocks=best_params[\"n_blocks\"],\n", + " attention_dropout=best_params[\"attention_dropout\"],\n", + " ffn_dropout=best_params[\"ffn_dropout\"],\n", + " num_classes=3\n", + " ).to(device)\n", + " elif model_choose == 'resnet_like':\n", + " # ResNet-Like 초기화 (최적화된 하이퍼파라미터로 설정)\n", + " model = ResNetLike(\n", + " input_dim=len(numerical_cols) + len(categorical_cols), # 입력 차원\n", + " d_main=best_params[\"d_main\"],\n", + " d_hidden=best_params[\"d_hidden\"],\n", + " n_blocks=best_params[\"n_blocks\"],\n", + " dropout_first=best_params[\"dropout_first\"],\n", + " dropout_second=best_params[\"dropout_second\"],\n", + " num_classes=3\n", + " ).to(device)\n", + " elif model_choose == 'deepgbm':\n", + " # DeepGBM 초기화 (최적화된 하이퍼파라미터로 설정)\n", + " model = DeepGBM(\n", + " num_features=len(numerical_cols),\n", + " cat_features=[len(X_train[col].unique()) for col in categorical_cols],\n", + " d_main=best_params[\"d_main\"],\n", + " d_hidden=best_params[\"d_hidden\"],\n", + " n_blocks=best_params[\"n_blocks\"],\n", + " dropout=best_params[\"dropout\"],\n", + " num_classes=3\n", + " ).to(device)\n", + "\n", + " # 손실 함수 및 옵티마이저 설정\n", + " if target == 'binary':\n", + " criterion = nn.BCEWithLogitsLoss() # 이진 분류용\n", + " elif target == 'multi':\n", + " criterion = nn.CrossEntropyLoss() # 다중 분류용\n", + " optimizer_ft = optim.AdamW(model.parameters(), lr=best_params[\"lr\"], weight_decay=best_params[\"weight_decay\"])\n", + "\n", + " # Early Stopping 설정\n", + " best_csi = -float('inf') # CSI-Score는 최대화가 목표이므로 -inf로 초기화\n", + " patience = 10 # F1-Score가 개선되지 않는 Epoch 수\n", + " counter = 0 # 개선되지 않은 Epoch 수를 기록\n", + " best_model = None\n", + "\n", + " # 학습 루프\n", + " epochs = 50 # 최대 Epoch 수\n", + " for epoch in range(epochs):\n", + " # Training Phase\n", + " model.train()\n", + " for x_num_batch, x_cat_batch, y_batch in train_loader:\n", + " x_num_batch, x_cat_batch, y_batch = (\n", + " x_num_batch.to(device),\n", + " x_cat_batch.to(device),\n", + " y_batch.to(device),\n", + " )\n", + " optimizer_ft.zero_grad()\n", + " y_pred = model(x_num_batch, x_cat_batch)\n", + " \n", + " # 손실 계산 (이진 분류 | 다중 분류)\n", + " if target == 'binary':\n", + " loss = criterion(y_pred.squeeze(-1), y_batch.float())\n", + " elif target == 'multi':\n", + " loss = criterion(y_pred, y_batch)\n", + "\n", + " loss.backward()\n", + " optimizer_ft.step()\n", + "\n", + " # Validation Phase\n", + " model.eval()\n", + " y_true_val, y_pred_val = [], []\n", + " with torch.no_grad():\n", + " for x_num_batch, x_cat_batch, y_batch in val_loader:\n", + " x_num_batch, x_cat_batch, y_batch = (\n", + " x_num_batch.to(device),\n", + " x_cat_batch.to(device),\n", + " y_batch.to(device),\n", + " )\n", + " y_pred = model(x_num_batch, x_cat_batch)\n", + "\n", + " if target == 'binary':\n", + " pred = (torch.sigmoid(y_pred) >= 0.5).long()\n", + " elif target == 'multi':\n", + " pred = y_pred.argmax(dim=1)\n", + " y_true_val.extend(y_batch.cpu().numpy())\n", + " y_pred_val.extend(pred.cpu().numpy()) # 가장 높은 확률의 클래스 선택\n", + "\n", + " # CSI-Score 계산\n", + " val_csi = calculate_csi(y_true_val, y_pred_val)\n", + "\n", + " # Early Stopping 체크\n", + " if val_csi > best_csi:\n", + " best_csi = val_csi\n", + " counter = 0\n", + " best_model = copy.deepcopy(model)\n", + " else:\n", + " counter += 1\n", + " if counter >= patience:\n", + " print(f\"Early stopping at epoch {epoch+1}\")\n", + " break\n", + " \n", + " # 모델 저장 경로 설정\n", + " save_dir = f\"./save_model/{model_choose}/{data_sample}\"\n", + " os.makedirs(save_dir, exist_ok=True) # 폴더 없으면 자동 생성\n", + "\n", + " # 모델 저장\n", + " model_path = f\"./save_model/{model_choose}/{data_sample}/{region}_fold{fold}.pth\"\n", + " torch.save(best_model, model_path)\n", + " model_paths.append(model_path)\n", + " print(f\"Saving model to {model_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 사용자 soft voting 정의 함수" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "\n", + "# 모델을 GPU로 전송\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "\n", + "# Soft Voting 앙상블\n", + "def pred_fold(region, model_choose, data_sample, fold, target='multi'):\n", + " _, _, _, _, _, y_test, _, _ = prepare_dataset(region=region, data_sample=data_sample, target=target)\n", + " _, _, _, _, _, test_loader = prepare_dataloader(region=region, data_sample=data_sample, target=target, random_state=seed)\n", + "\n", + " folder_path = f'./save_model/{model_choose}/{data_sample}'\n", + " model_paths = [path for path in glob.glob(f'{folder_path}/*.pth') if f'{region}' in path]\n", + "\n", + " model = torch.load(model_paths[fold-1], weights_only=False).to(device)\n", + " model.eval()\n", + "\n", + " test_preds = []\n", + " with torch.no_grad():\n", + " for x_num_batch, x_cat_batch, _ in test_loader:\n", + " output = model(x_num_batch.to(device), x_cat_batch.to(device))\n", + " output = torch.softmax(output, dim=1)\n", + " test_preds.extend(output.cpu().numpy())\n", + "\n", + " # 최종 예측 (Soft Voting)\n", + " final_preds = np.argmax(test_preds, axis=1)\n", + "\n", + " return y_test, final_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "\n", + "# 모델을 GPU로 전송\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "\n", + "# Soft Voting 앙상블\n", + "def soft_voting(region, model_choose, data_sample, target='multi'):\n", + " _, _, _, _, _, y_test, _, _ = prepare_dataset(region=region, data_sample=data_sample, target=target)\n", + " _, _, _, _, _, test_loader = prepare_dataloader(region=region, data_sample=data_sample, target=target, random_state=seed)\n", + "\n", + " folder_path = f'./save_model/{model_choose}/{data_sample}'\n", + " model_paths = [path for path in glob.glob(f'{folder_path}/*.pth') if f'{region}' in path]\n", + "\n", + " if target == 'multi':\n", + " test_probs = np.zeros((len(y_test), 3))\n", + " elif target == 'binary':\n", + " test_probs = np.zeros((len(y_test), 2))\n", + "\n", + " for _, path in enumerate(model_paths):\n", + " model = torch.load(path, weights_only=False).to(device)\n", + " model.eval()\n", + "\n", + " test_preds = []\n", + " with torch.no_grad():\n", + " for x_num_batch, x_cat_batch, _ in test_loader:\n", + " output = model(x_num_batch.to(device), x_cat_batch.to(device))\n", + " output = torch.softmax(output, dim=1)\n", + " test_preds.extend(output.cpu().numpy())\n", + "\n", + " test_probs += np.array(test_preds) / len(model_paths)\n", + "\n", + " # 최종 예측 (Soft Voting)\n", + " final_preds = np.argmax(test_probs, axis=1)\n", + "\n", + " return y_test, test_probs, final_preds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 모델 별 K-fold + Soft Voting 진행" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", + " for data_sample in ['pure','smote','ctgan7000','ctgan10000','ctgan20000']:\n", + " fold_voting(model_choose=model_choose, region=region, data_sample=data_sample, random_state=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "230879b7c5e641c8b8b61492ae92c3e4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.29807076404450805, 'lr': 0.00010824018381500966, 'weight_decay': 0.000658628931758311} ###\n", + "Early stopping at epoch 22\n", + "Saving model to ./save_model/deepgbm/pure/seoul_fold1.pth\n", + "Early stopping at epoch 50\n", + "Saving model to ./save_model/deepgbm/pure/seoul_fold2.pth\n", + "Early stopping at epoch 24\n", + "Saving model to ./save_model/deepgbm/pure/seoul_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "53d2758c71964eb28e446dae5138a3e9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 4, 'dropout': 0.18531333425244892, 'lr': 0.00019904391652517882, 'weight_decay': 7.803511669278675e-05} ###\n", + "Early stopping at epoch 12\n", + "Saving model to ./save_model/deepgbm/smote/seoul_fold1.pth\n", + "Early stopping at epoch 14\n", + "Saving model to ./save_model/deepgbm/smote/seoul_fold2.pth\n", + "Early stopping at epoch 19\n", + "Saving model to ./save_model/deepgbm/smote/seoul_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1a34a1880a7c43a69835c3f95784175e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.10718475024592788, 'lr': 0.0009432177379597497, 'weight_decay': 4.308729113230509e-05} ###\n", + "Early stopping at epoch 13\n", + "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold1.pth\n", + "Early stopping at epoch 19\n", + "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold2.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3ec2807729c641f59163b26756414b28", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.14063926069511057, 'lr': 0.0004059290878693028, 'weight_decay': 0.0008509845719526007} ###\n", + "Early stopping at epoch 14\n", + "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold1.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold2.pth\n", + "Early stopping at epoch 26\n", + "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d7a3ceee7854891aa2cfef6265ef833", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", + "Early stopping at epoch 35\n", + "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold1.pth\n", + "Early stopping at epoch 48\n", + "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold2.pth\n", + "Early stopping at epoch 13\n", + "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d941a8b45f5b47aba5e38e89ba96e5bc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.4651768944020654, 'lr': 0.0005440940639887522, 'weight_decay': 1.0703736319022912e-05} ###\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/pure/busan_fold1.pth\n", + "Early stopping at epoch 21\n", + "Saving model to ./save_model/deepgbm/pure/busan_fold2.pth\n", + "Early stopping at epoch 26\n", + "Saving model to ./save_model/deepgbm/pure/busan_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cd7785fd4688437bad84b4cd3dd17402", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.36241152483468775, 'lr': 0.0005993036540724003, 'weight_decay': 0.0006194094729468491} ###\n", + "Early stopping at epoch 22\n", + "Saving model to ./save_model/deepgbm/smote/busan_fold1.pth\n", + "Early stopping at epoch 15\n", + "Saving model to ./save_model/deepgbm/smote/busan_fold2.pth\n", + "Early stopping at epoch 20\n", + "Saving model to ./save_model/deepgbm/smote/busan_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4236916937fa4d299507f7bf67480433", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 5, 'dropout': 0.32589216101245205, 'lr': 0.000870943966836362, 'weight_decay': 0.0004755258531422745} ###\n", + "Early stopping at epoch 32\n", + "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold1.pth\n", + "Early stopping at epoch 41\n", + "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold2.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a58c944a170f4a27939007d870df4d3b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 5, 'dropout': 0.22554979058504504, 'lr': 0.0007872290481937401, 'weight_decay': 1.3478651710155104e-05} ###\n", + "Early stopping at epoch 30\n", + "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold1.pth\n", + "Early stopping at epoch 15\n", + "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold2.pth\n", + "Early stopping at epoch 17\n", + "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "794104daa0814654b5a7374e63bad90b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 4, 'dropout': 0.23007332881069884, 'lr': 0.0005365450324352025, 'weight_decay': 0.00018841476921545086} ###\n", + "Early stopping at epoch 34\n", + "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold1.pth\n", + "Early stopping at epoch 21\n", + "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold2.pth\n", + "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "516dd122b4cf443eaa6514976e0ba235", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/pure/daejeon_fold1.pth\n", + "Early stopping at epoch 20\n", + "Saving model to ./save_model/deepgbm/pure/daejeon_fold2.pth\n", + "Early stopping at epoch 25\n", + "Saving model to ./save_model/deepgbm/pure/daejeon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9e3bf1f1058f4dcfa1254f06dcb0fe38", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 128, 'n_blocks': 5, 'dropout': 0.2994658107096196, 'lr': 0.0005068273287449525, 'weight_decay': 2.0471105563346853e-05} ###\n", + "Early stopping at epoch 16\n", + "Saving model to ./save_model/deepgbm/smote/daejeon_fold1.pth\n", + "Early stopping at epoch 16\n", + "Saving model to ./save_model/deepgbm/smote/daejeon_fold2.pth\n", + "Early stopping at epoch 21\n", + "Saving model to ./save_model/deepgbm/smote/daejeon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d188977daf5147fe8ed6dffd4d233c06", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.38274293753904687, 'lr': 0.0005358055009231865, 'weight_decay': 0.000348771262454593} ###\n", + "Early stopping at epoch 31\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold1.pth\n", + "Early stopping at epoch 23\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold2.pth\n", + "Early stopping at epoch 16\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "76f8c4fd7727459ba4d87aada9cbcf87", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 3, 'dropout': 0.37335751073629203, 'lr': 0.0005518803643548146, 'weight_decay': 0.0006506083261092598} ###\n", + "Early stopping at epoch 42\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold1.pth\n", + "Early stopping at epoch 26\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold2.pth\n", + "Early stopping at epoch 33\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c2f82c3c6d6a4cf5933e6dd18eede561", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 7, 'dropout': 0.498315839448228, 'lr': 0.000982377661956639, 'weight_decay': 5.1544513708209705e-05} ###\n", + "Early stopping at epoch 44\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold1.pth\n", + "Early stopping at epoch 21\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold2.pth\n", + "Early stopping at epoch 39\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "485ab1b95b0446348036e7f85f06670b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 192, 'd_hidden': 128, 'n_blocks': 4, 'dropout': 0.44295645200373956, 'lr': 0.0006995754135310067, 'weight_decay': 7.588529866409616e-05} ###\n", + "Early stopping at epoch 12\n", + "Saving model to ./save_model/deepgbm/pure/daegu_fold1.pth\n", + "Early stopping at epoch 13\n", + "Saving model to ./save_model/deepgbm/pure/daegu_fold2.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/pure/daegu_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8bf0047f247b4a5caf0f2d12383850cc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 3, 'dropout': 0.34301794076057535, 'lr': 0.00014808945119975197, 'weight_decay': 1.3492834268013232e-05} ###\n", + "Early stopping at epoch 25\n", + "Saving model to ./save_model/deepgbm/smote/daegu_fold1.pth\n", + "Early stopping at epoch 24\n", + "Saving model to ./save_model/deepgbm/smote/daegu_fold2.pth\n", + "Early stopping at epoch 23\n", + "Saving model to ./save_model/deepgbm/smote/daegu_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cf4d89937031420cab15ecba54ffc850", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 6, 'dropout': 0.4749441191638353, 'lr': 0.0007742129275419718, 'weight_decay': 0.0001276488991046795} ###\n", + "Early stopping at epoch 17\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold1.pth\n", + "Early stopping at epoch 39\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold2.pth\n", + "Early stopping at epoch 48\n", + "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c0df822671e74384a66b4b0395e6ab81", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.32191381712732553, 'lr': 0.0006025357742197527, 'weight_decay': 1.6123356366734284e-05} ###\n", + "Early stopping at epoch 25\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold1.pth\n", + "Early stopping at epoch 40\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold2.pth\n", + "Early stopping at epoch 23\n", + "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9492d14036d64bb7bb2eb8c3d2bee972", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 128, 'n_blocks': 7, 'dropout': 0.49897739440085004, 'lr': 0.0009903735540622498, 'weight_decay': 5.1544513708209705e-05} ###\n", + "Early stopping at epoch 29\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold1.pth\n", + "Early stopping at epoch 20\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold2.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5c8a303b51f94df288ff928629f0d728", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", + "Early stopping at epoch 15\n", + "Saving model to ./save_model/deepgbm/pure/incheon_fold1.pth\n", + "Early stopping at epoch 33\n", + "Saving model to ./save_model/deepgbm/pure/incheon_fold2.pth\n", + "Early stopping at epoch 22\n", + "Saving model to ./save_model/deepgbm/pure/incheon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d66e1cd2b2b4aebac519cc9f774853b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 6, 'dropout': 0.4453655419744991, 'lr': 0.0006995754135310067, 'weight_decay': 7.03362294158028e-05} ###\n", + "Early stopping at epoch 28\n", + "Saving model to ./save_model/deepgbm/smote/incheon_fold1.pth\n", + "Early stopping at epoch 17\n", + "Saving model to ./save_model/deepgbm/smote/incheon_fold2.pth\n", + "Early stopping at epoch 14\n", + "Saving model to ./save_model/deepgbm/smote/incheon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "189a4560776e4601b2490693d9456b91", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.29807076404450805, 'lr': 0.00010824018381500966, 'weight_decay': 0.000658628931758311} ###\n", + "Early stopping at epoch 35\n", + "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold1.pth\n", + "Early stopping at epoch 28\n", + "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold2.pth\n", + "Early stopping at epoch 41\n", + "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04414670ee404a1f958e74d114044570", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 4, 'dropout': 0.4656646404627707, 'lr': 0.000725954595588046, 'weight_decay': 9.156653323796898e-05} ###\n", + "Early stopping at epoch 14\n", + "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold1.pth\n", + "Early stopping at epoch 25\n", + "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold2.pth\n", + "Early stopping at epoch 38\n", + "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c03845943e074a83a880a4080d779cfa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 32, 'n_blocks': 3, 'dropout': 0.29122487178013184, 'lr': 0.0008644753350143602, 'weight_decay': 0.0006848083277415096} ###\n", + "Early stopping at epoch 39\n", + "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold1.pth\n", + "Early stopping at epoch 32\n", + "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold2.pth\n", + "Early stopping at epoch 44\n", + "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d15a078f9ea42279a268e574300d613", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 192, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.4268924061648935, 'lr': 0.00031827191626512363, 'weight_decay': 7.927166314511625e-05} ###\n", + "Early stopping at epoch 41\n", + "Saving model to ./save_model/deepgbm/pure/gwangju_fold1.pth\n", + "Early stopping at epoch 26\n", + "Saving model to ./save_model/deepgbm/pure/gwangju_fold2.pth\n", + "Early stopping at epoch 18\n", + "Saving model to ./save_model/deepgbm/pure/gwangju_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "718bf72a28814022bd53ea88c94a39da", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.49840679574448543, 'lr': 0.0007932730569025867, 'weight_decay': 5.2371870545865264e-05} ###\n", + "Early stopping at epoch 34\n", + "Saving model to ./save_model/deepgbm/smote/gwangju_fold1.pth\n", + "Early stopping at epoch 34\n", + "Saving model to ./save_model/deepgbm/smote/gwangju_fold2.pth\n", + "Early stopping at epoch 22\n", + "Saving model to ./save_model/deepgbm/smote/gwangju_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4470573fa1844b6ebf6e59f2ff9d865c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 128, 'n_blocks': 7, 'dropout': 0.3593531534642382, 'lr': 0.0007435971973093597, 'weight_decay': 0.00039310742030944726} ###\n", + "Early stopping at epoch 35\n", + "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold1.pth\n", + "Early stopping at epoch 22\n", + "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold2.pth\n", + "Early stopping at epoch 29\n", + "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2f0a9966e1b04380b4abf79e45a022a7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 3, 'dropout': 0.3613681971081546, 'lr': 0.0008491548609780148, 'weight_decay': 0.0004487641324799225} ###\n", + "Early stopping at epoch 29\n", + "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold1.pth\n", + "Early stopping at epoch 30\n", + "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold2.pth\n", + "Early stopping at epoch 42\n", + "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold3.pth\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f3db776ffbfc436c89040085d212e367", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 6, 'dropout': 0.30240965980622486, 'lr': 0.0008659736510641475, 'weight_decay': 0.0003723756511903005} ###\n", + "Early stopping at epoch 32\n", + "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold1.pth\n", + "Early stopping at epoch 31\n", + "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold2.pth\n", + "Early stopping at epoch 14\n", + "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold3.pth\n" + ] + } + ], + "source": [ + "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", + " for data_sample in ['pure','smote','ctgan7000','ctgan10000','ctgan20000']:\n", + " fold_voting(model_choose='deepgbm', region=region, data_sample=data_sample, random_state=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 모델별 지역 성능(원데이터)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### FT-Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### seoul ###\n", + "Test Accuracy: 0.910958904109589\n", + "Test F1-Score: 0.48826800257083774\n", + "Test CSI: 0.34453781512602144\n", + "### busan ###\n", + "Test Accuracy: 0.9679223744292238\n", + "Test F1-Score: 0.5245668459304146\n", + "Test CSI: 0.4158004158003294\n", + "### daejeon ###\n", + "Test Accuracy: 0.9182648401826484\n", + "Test F1-Score: 0.566496524475418\n", + "Test CSI: 0.31021194605006647\n", + "### daegu ###\n", + "Test Accuracy: 0.9768264840182649\n", + "Test F1-Score: 0.7999244404273532\n", + "Test CSI: 0.26181818181808664\n", + "### incheon ###\n", + "Test Accuracy: 0.9181506849315069\n", + "Test F1-Score: 0.6680714652935209\n", + "Test CSI: 0.5138983050847109\n", + "### gwangju ###\n", + "Test Accuracy: 0.9440639269406392\n", + "Test F1-Score: 0.5373981558893273\n", + "Test CSI: 0.4691224268688549\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", + " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='ft_transformer', data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {region} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ResNet-like" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### seoul ###\n", + "Test Accuracy: 0.9075342465753424\n", + "Test F1-Score: 0.47993840811230576\n", + "Test CSI: 0.32217573221754625\n", + "### busan ###\n", + "Test Accuracy: 0.968607305936073\n", + "Test F1-Score: 0.586245547831333\n", + "Test CSI: 0.45652173913034455\n", + "### daejeon ###\n", + "Test Accuracy: 0.9060502283105023\n", + "Test F1-Score: 0.6242613500628746\n", + "Test CSI: 0.3560250391236028\n", + "### daegu ###\n", + "Test Accuracy: 0.9698630136986301\n", + "Test F1-Score: 0.7948471717999764\n", + "Test CSI: 0.25212464589227984\n", + "### incheon ###\n", + "Test Accuracy: 0.9184931506849315\n", + "Test F1-Score: 0.6724661948944141\n", + "Test CSI: 0.5188679245282669\n", + "### gwangju ###\n", + "Test Accuracy: 0.9299086757990868\n", + "Test F1-Score: 0.5255850318041286\n", + "Test CSI: 0.4387568555758283\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", + " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='resnet_like', data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {region} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### DeepGBM" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### seoul ###\n", + "Test Accuracy: 0.9140410958904109\n", + "Test F1-Score: 0.48377287268662217\n", + "Test CSI: 0.33066666666663724\n", + "### busan ###\n", + "Test Accuracy: 0.9658675799086758\n", + "Test F1-Score: 0.5404572118702554\n", + "Test CSI: 0.3983903420522337\n", + "### daejeon ###\n", + "Test Accuracy: 0.9071917808219178\n", + "Test F1-Score: 0.5336211256730917\n", + "Test CSI: 0.2701974865349847\n", + "### daegu ###\n", + "Test Accuracy: 0.9742009132420091\n", + "Test F1-Score: 0.4604475684598693\n", + "Test CSI: 0.2416107382549525\n", + "### incheon ###\n", + "Test Accuracy: 0.9174657534246575\n", + "Test F1-Score: 0.6883413153781813\n", + "Test CSI: 0.5147651006711064\n", + "### gwangju ###\n", + "Test Accuracy: 0.9417808219178082\n", + "Test F1-Score: 0.6238007071539367\n", + "Test CSI: 0.4539614561027351\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", + " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='deepgbm', data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {region} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 지역별 모델 성능 비교" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### seoul" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### ft_transformer ###\n", + "Test Accuracy: 0.910958904109589\n", + "Test F1-Score: 0.48826800257083774\n", + "Test CSI: 0.34453781512602144\n", + "### resnet_like ###\n", + "Test Accuracy: 0.9075342465753424\n", + "Test F1-Score: 0.47993840811230576\n", + "Test CSI: 0.32217573221754625\n", + "### deepgbm ###\n", + "Test Accuracy: 0.9140410958904109\n", + "Test F1-Score: 0.48377287268662217\n", + "Test CSI: 0.33066666666663724\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='seoul', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### busan" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### ft_transformer ###\n", + "Test Accuracy: 0.9679223744292238\n", + "Test F1-Score: 0.5245668459304146\n", + "Test CSI: 0.4158004158003294\n", + "### resnet_like ###\n", + "Test Accuracy: 0.968607305936073\n", + "Test F1-Score: 0.586245547831333\n", + "Test CSI: 0.45652173913034455\n", + "### deepgbm ###\n", + "Test Accuracy: 0.9658675799086758\n", + "Test F1-Score: 0.5404572118702554\n", + "Test CSI: 0.3983903420522337\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='busan', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### daejeon" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### ft_transformer ###\n", + "Test Accuracy: 0.9182648401826484\n", + "Test F1-Score: 0.566496524475418\n", + "Test CSI: 0.31021194605006647\n", + "### resnet_like ###\n", + "Test Accuracy: 0.9060502283105023\n", + "Test F1-Score: 0.6242613500628746\n", + "Test CSI: 0.3560250391236028\n", + "### deepgbm ###\n", + "Test Accuracy: 0.9071917808219178\n", + "Test F1-Score: 0.5336211256730917\n", + "Test CSI: 0.2701974865349847\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='daejeon', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### daegu" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### ft_transformer ###\n", + "Test Accuracy: 0.9768264840182649\n", + "Test F1-Score: 0.7999244404273532\n", + "Test CSI: 0.26181818181808664\n", + "### resnet_like ###\n", + "Test Accuracy: 0.9698630136986301\n", + "Test F1-Score: 0.7948471717999764\n", + "Test CSI: 0.25212464589227984\n", + "### deepgbm ###\n", + "Test Accuracy: 0.9742009132420091\n", + "Test F1-Score: 0.4604475684598693\n", + "Test CSI: 0.2416107382549525\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='daegu', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### incheon" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### ft_transformer ###\n", + "Test Accuracy: 0.9181506849315069\n", + "Test F1-Score: 0.6680714652935209\n", + "Test CSI: 0.5138983050847109\n", + "### resnet_like ###\n", + "Test Accuracy: 0.9184931506849315\n", + "Test F1-Score: 0.6724661948944141\n", + "Test CSI: 0.5188679245282669\n", + "### deepgbm ###\n", + "Test Accuracy: 0.9174657534246575\n", + "Test F1-Score: 0.6883413153781813\n", + "Test CSI: 0.5147651006711064\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='incheon', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### gwangju" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", + "\n", + "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", + " y_test, test_probs, final_preds = soft_voting(region='gwangju', model_choose=model_choose, data_sample='pure')\n", + "\n", + " # 성능 지표 계산\n", + " print(f\"### {model_choose} ###\")\n", + " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", + " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", + " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", + " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", + " '''print(confusion_matrix(y_test, final_preds))'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 성능비교 시각화 plot" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Region | \n", + "Model | \n", + "CSI | \n", + "
|---|---|---|---|
| 0 | \n", + "seoul | \n", + "ft_transformer | \n", + "0.344538 | \n", + "
| 1 | \n", + "seoul | \n", + "resnet_like | \n", + "0.322176 | \n", + "
| 2 | \n", + "seoul | \n", + "deepgbm | \n", + "0.330667 | \n", + "
| 3 | \n", + "busan | \n", + "ft_transformer | \n", + "0.415800 | \n", + "
| 4 | \n", + "busan | \n", + "resnet_like | \n", + "0.456522 | \n", + "
| 5 | \n", + "busan | \n", + "deepgbm | \n", + "0.398390 | \n", + "
| 6 | \n", + "daejeon | \n", + "ft_transformer | \n", + "0.310212 | \n", + "
| 7 | \n", + "daejeon | \n", + "resnet_like | \n", + "0.356025 | \n", + "
| 8 | \n", + "daejeon | \n", + "deepgbm | \n", + "0.270197 | \n", + "
| 9 | \n", + "daegu | \n", + "ft_transformer | \n", + "0.261818 | \n", + "
| 10 | \n", + "daegu | \n", + "resnet_like | \n", + "0.252125 | \n", + "
| 11 | \n", + "daegu | \n", + "deepgbm | \n", + "0.241611 | \n", + "
| 12 | \n", + "incheon | \n", + "ft_transformer | \n", + "0.513898 | \n", + "
| 13 | \n", + "incheon | \n", + "resnet_like | \n", + "0.518868 | \n", + "
| 14 | \n", + "incheon | \n", + "deepgbm | \n", + "0.514765 | \n", + "
| 15 | \n", + "gwangju | \n", + "ft_transformer | \n", + "0.469122 | \n", + "
| 16 | \n", + "gwangju | \n", + "resnet_like | \n", + "0.438757 | \n", + "
| 17 | \n", + "gwangju | \n", + "deepgbm | \n", + "0.453961 | \n", + "