# Marker stored in `wind_dir` instead of a numeric direction; it is mapped to 0.
# NOTE(review): presumably the station's "calm wind" label - confirm against
# the data dictionary.
_CALM_WIND_LABEL = '정온'

# Columns produced by add_derived_features().
_DERIVED_COLUMNS = (
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'ground_temp - temp_C',
)

# Exact column set and order handed to the model (features + target).
_MODEL_COLUMNS = [
    'temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',
    'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',
    'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',
    'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',
    'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',
    'month_sin', 'month_cos', 'multi_class',
]


def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the derived feature columns added.

    Adds sine/cosine encodings of ``hour`` (period 24) and ``month``
    (period 12), plus ``'ground_temp - temp_C'``, the difference between the
    ``groundtemp`` and ``temp_C`` columns.

    Args:
        df: Frame containing ``hour``, ``month``, ``groundtemp`` and
            ``temp_C``. It is not modified.

    Returns:
        A new frame with the five derived columns appended (or overwritten
        if they already exist).
    """
    hour_angle = 2 * np.pi * df['hour'] / 24
    month_angle = 2 * np.pi * df['month'] / 12
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(**{
        'hour_sin': np.sin(hour_angle),
        'hour_cos': np.cos(hour_angle),
        'month_sin': np.sin(month_angle),
        'month_cos': np.cos(month_angle),
        'ground_temp - temp_C': df['groundtemp'] - df['temp_C'],
    })


def preprocessing(df: pd.DataFrame, *, calm_label: str = _CALM_WIND_LABEL) -> pd.DataFrame:
    """Cast, enrich and reorder a raw region frame into the modelling layout.

    Steps: cast ``year``/``month``/``hour``/``multi_class`` to int, add the
    derived features, turn ``wind_dir`` into an int column (rows equal to
    *calm_label* become 0), then keep only the model columns in fixed order.

    Args:
        df: Raw frame as read from CSV. It is not modified.
        calm_label: Value of ``wind_dir`` that is replaced by 0 before the
            integer cast.

    Returns:
        A new frame with exactly the 28 model columns, ``multi_class`` last.

    Raises:
        KeyError: If any required input column is absent; the message lists
            every missing column rather than only the first one hit.
    """
    required = [col for col in _MODEL_COLUMNS if col not in _DERIVED_COLUMNS]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"preprocessing: missing input columns {missing}")

    out = df.copy()
    for col in ('year', 'month', 'hour', 'multi_class'):
        out[col] = out[col].astype('int')
    out = add_derived_features(out)

    calm_rows = out['wind_dir'] == calm_label
    if calm_rows.any():
        out.loc[calm_rows, 'wind_dir'] = "0"
    out['wind_dir'] = out['wind_dir'].astype('int')

    # Final copy detaches the result from `out` so later in-place edits by
    # callers do not trigger chained-assignment warnings.
    return out[_MODEL_COLUMNS].copy()
# Class layout used by the CSI below: labels 0 and 1 count as events, label 2
# is the non-event class whose correct predictions are left out of the score.
_EVENT_CLASSES = (0, 1)
_NON_EVENT_CLASS = 2


def _predicted_labels(scores, n_samples):
    """Turn per-class scores into predicted class indices.

    Accepts a 2-D ``(n_samples, n_classes)`` array, or the flattened 1-D
    layout that LightGBM releases before 4.0 pass to custom metrics, where
    values are grouped by class first (``scores[j * n_samples + i]``).
    """
    scores = np.asarray(scores)
    if scores.ndim == 1:
        scores = scores.reshape(-1, n_samples).T
    return np.argmax(scores, axis=1)


def calculate_csi(Y_test, pred):
    """Critical Success Index for the 3-class problem: H / (H + F + M).

    * H (hits): true label 0 or 1 and predicted correctly.
    * F (false alarms): predicted 0 or 1 but the true label differs.
    * M (misses): true label 0 or 1 but predicted 2.

    The counts are taken directly from the label arrays, so the result does
    not depend on every class being present in the fold. A confusion matrix
    built without explicit ``labels`` shrinks when a class is absent, which
    breaks fixed ``cm[i, j]`` indexing.

    Args:
        Y_test: True labels (array-like of 0/1/2).
        pred: Predicted labels, same length as *Y_test*.

    Returns:
        The CSI as a float; 0.0 when there are no hits, false alarms or
        misses at all (a 1e-10 term guards the division).

    Raises:
        ValueError: If the two inputs differ in length.
    """
    y_true = np.asarray(Y_test).ravel()
    y_pred = np.asarray(pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"calculate_csi: length mismatch ({y_true.size} vs {y_pred.size})"
        )

    true_event = np.isin(y_true, _EVENT_CLASSES)
    pred_event = np.isin(y_pred, _EVENT_CLASSES)
    correct = y_true == y_pred

    hits = np.sum(true_event & correct)
    false_alarms = np.sum(pred_event & ~correct)
    misses = np.sum(true_event & (y_pred == _NON_EVENT_CLASS))

    return hits / (hits + false_alarms + misses + 1e-10)


def eval_metric_csi(y_true, pred_prob):
    """Negated CSI of the arg-max predictions, i.e. a lower-is-better score."""
    y_pred = _predicted_labels(pred_prob, len(y_true))
    return -1 * calculate_csi(y_true, y_pred)


def multiclass_mcc(y_val, y_pred):
    """Matthews correlation coefficient via scikit-learn, which handles the multiclass case."""
    # Local import keeps this helper usable even if the cell-level import
    # has not been executed.
    from sklearn.metrics import matthews_corrcoef

    return matthews_corrcoef(y_val, y_pred)


def csi_metric(y_true, pred):
    """Custom LightGBM evaluation metric reporting CSI.

    Returns:
        ``('CSI', score, True)``; the last element marks the metric as
        higher-is-better.
    """
    y_pred = _predicted_labels(pred, len(y_true))
    score = calculate_csi(y_true, y_pred)
    return 'CSI', score, True  # higher_better=True
# Years held out in turn: fold 1 validates on 2020, fold 2 on 2019, fold 3 on 2018.
_CV_YEARS = (2020, 2019, 2018)


def year_holdout_folds(frame, years=_CV_YEARS, target='multi_class', year_col='year'):
    """Yield ``(X_tr, X_val, Y_tr, Y_val)`` for leave-one-year-out validation.

    For each year in *years* the rows of that year form the validation set
    and the rows of the remaining listed years form the training set. Rows
    from years not listed are used by neither. The feature frames contain
    every column except *target* and *year_col*.
    """
    feature_cols = [col for col in frame.columns if col not in (target, year_col)]
    for held_out in years:
        train_rows = frame[year_col].isin([y for y in years if y != held_out])
        valid_rows = frame[year_col] == held_out
        # Selecting the feature columns directly replaces the
        # drop(columns=['year'], inplace=True) calls on .loc slices.
        yield (
            frame.loc[train_rows, feature_cols],
            frame.loc[valid_rows, feature_cols],
            frame.loc[train_rows, target],
            frame.loc[valid_rows, target],
        )


def evaluate_region(frame, model, eval_metric):
    """Fit *model* on every year fold of *frame* and score the held-out year.

    Args:
        frame: Preprocessed region frame (features, ``year``, ``multi_class``).
        model: Estimator whose ``fit`` accepts ``eval_set`` / ``eval_metric``.
            It is refitted on each fold.
        eval_metric: Custom metric forwarded to ``fit``.

    Returns:
        Three lists ``(csi, accuracy, mcc)`` holding one value per fold.
    """
    fold_csi, fold_accuracy, fold_mcc = [], [], []
    for X_tr, X_val, Y_tr, Y_val in year_holdout_folds(frame):
        # NOTE(review): the held-out year also serves as the early-stopping
        # set, so the scores below are measured on data that influenced the
        # number of trees - confirm this optimistic bias is acceptable.
        model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=eval_metric)
        Y_hat = model.predict(X_val)  # predict once, reuse for all three scores
        fold_csi.append(calculate_csi(Y_val, Y_hat))
        fold_accuracy.append(accuracy_score(Y_val, Y_hat))
        fold_mcc.append(multiclass_mcc(Y_val, Y_hat))
    return fold_csi, fold_accuracy, fold_mcc


# Baseline: un-augmented ("pure") training data, one result row per region.
for region, region_frame in [
    ('seoul', df_seoul),
    ('busan', df_busan),
    ('incheon', df_incheon),
    ('daegu', df_daegu),
    ('daejeon', df_daejeon),
    ('gwangju', df_gwangju),
]:
    csi, accuracy, mcc = evaluate_region(region_frame, lgb_model, csi_metric)

    print(f"[{region}]")
    print("mean of csi : ", np.mean(csi))
    print("mean of accuracy : ", np.mean(accuracy))
    print("mean of mcc : ", np.mean(mcc))

    new_row = pd.DataFrame([{
        'region': region,
        'model': 'LightGBM',
        'data_sample': 'pure',
        'CSI': np.mean(csi),
        'MCC': np.mean(mcc),
        'Accuracy': np.mean(accuracy),
        # Kept as a one-element list wrapping the per-fold scores so the
        # stored cell has the same shape as before.
        'fold_csi': [csi],
    }])

    df = pd.concat([df, new_row], ignore_index=True)
pd.read_csv(\"../../data/data_oversampled/smote/smote_2_gwangju.csv\")\n", + "df_smote_gwangju_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_gwangju.csv\")\n", + "\n", + "df_smote_incheon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_incheon.csv\")\n", + "df_smote_incheon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_incheon.csv\")\n", + "df_smote_incheon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_incheon.csv\")\n", + "\n", + "df_smote_busan_1 = preprocessing(df_smote_busan_1)\n", + "df_smote_busan_2 = preprocessing(df_smote_busan_2)\n", + "df_smote_busan_3 = preprocessing(df_smote_busan_3)\n", + "\n", + "df_smote_seoul_1 = preprocessing(df_smote_seoul_1)\n", + "df_smote_seoul_2 = preprocessing(df_smote_seoul_2)\n", + "df_smote_seoul_3 = preprocessing(df_smote_seoul_3)\n", + "\n", + "df_smote_daegu_1 = preprocessing(df_smote_daegu_1)\n", + "df_smote_daegu_2 = preprocessing(df_smote_daegu_2)\n", + "df_smote_daegu_3 = preprocessing(df_smote_daegu_3)\n", + "\n", + "df_smote_daejeon_1 = preprocessing(df_smote_daejeon_1)\n", + "df_smote_daejeon_2 = preprocessing(df_smote_daejeon_2)\n", + "df_smote_daejeon_3 = preprocessing(df_smote_daejeon_3)\n", + "\n", + "df_smote_gwangju_1 = preprocessing(df_smote_gwangju_1)\n", + "df_smote_gwangju_2 = preprocessing(df_smote_gwangju_2)\n", + "df_smote_gwangju_3 = preprocessing(df_smote_gwangju_3)\n", + "\n", + "df_smote_incheon_1 = preprocessing(df_smote_incheon_1)\n", + "df_smote_incheon_2 = preprocessing(df_smote_incheon_2)\n", + "df_smote_incheon_3 = preprocessing(df_smote_incheon_3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "smote_oversample=[] # smote 적용 전 f1 score 저장 리스트" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", + " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", + " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", + " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", + " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_smote_seoul_1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", + " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", + " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", + " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", + " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_seoul.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5789392155070722\n", + "mean of accuracy : 0.9399950927797324\n", + "mean of mcc : 0.7084991639282849\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = 
df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), df_smote_seoul_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), df_smote_seoul_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), df_smote_seoul_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + 
"lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.46602091302575205\n", + "mean of accuracy : 0.9501977443421413\n", + "mean of mcc : 0.6318799598547477\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), df_smote_busan_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], 
inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), df_smote_busan_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), df_smote_busan_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + 
"print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.583559682649986\n", + "mean of accuracy : 0.9104636075554557\n", + "mean of mcc : 0.7061374111787998\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), df_smote_incheon_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, 
lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), df_smote_incheon_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), df_smote_incheon_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 
'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.44735416496514874\n", + "mean of accuracy : 0.96373033992897\n", + "mean of mcc : 0.6169211806368756\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), df_smote_daegu_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), df_smote_daegu_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], 
df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), df_smote_daegu_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5213346054616135\n", + "mean of accuracy : 0.9306212624032071\n", + "mean of mcc : 0.6563209583230294\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), df_smote_daejeon_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), df_smote_daejeon_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], 
eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), df_smote_daejeon_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5227311295096367\n", + "mean of accuracy : 0.9368502091806605\n", + "mean of mcc : 0.6604233380668852\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection 
import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), df_smote_gwangju_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), df_smote_gwangju_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = 
df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), df_smote_gwangju_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **2만개**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv\")\n", + "df_seoul_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv\")\n", + "df_incheon_gan20000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv\")\n", + "df_daegu_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv\")\n", + "df_daejeon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv\")\n", + "df_gwangju_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv\")\n", + "df_seoul_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv\")\n", + "df_incheon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv\")\n", + "df_daegu_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv\")\n", + "df_daejeon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv\")\n", + "df_gwangju_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv\")\n", + "df_seoul_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv\")\n", + "df_incheon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv\")\n", + "df_daegu_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv\")\n", + "df_daejeon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv\")\n", + "df_gwangju_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_gan20000_1= preprocessing(df_busan_gan20000_1).copy()\n", + "df_seoul_gan20000_1= 
preprocessing(df_seoul_gan20000_1).copy()\n", + "df_incheon_gan20000_1= preprocessing(df_incheon_gan20000_1).copy()\n", + "df_daegu_gan20000_1= preprocessing(df_daegu_gan20000_1).copy()\n", + "df_daejeon_gan20000_1= preprocessing(df_daejeon_gan20000_1).copy()\n", + "df_gwangju_gan20000_1= preprocessing(df_gwangju_gan20000_1).copy()\n", + "\n", + "df_busan_gan20000_2= preprocessing(df_busan_gan20000_2).copy()\n", + "df_seoul_gan20000_2= preprocessing(df_seoul_gan20000_2).copy()\n", + "df_incheon_gan20000_2= preprocessing(df_incheon_gan20000_2).copy()\n", + "df_daegu_gan20000_2= preprocessing(df_daegu_gan20000_2).copy()\n", + "df_daejeon_gan20000_2= preprocessing(df_daejeon_gan20000_2).copy()\n", + "df_gwangju_gan20000_2= preprocessing(df_gwangju_gan20000_2).copy()\n", + "\n", + "df_busan_gan20000_3= preprocessing(df_busan_gan20000_3).copy()\n", + "df_seoul_gan20000_3= preprocessing(df_seoul_gan20000_3).copy()\n", + "df_incheon_gan20000_3= preprocessing(df_incheon_gan20000_3).copy()\n", + "df_daegu_gan20000_3= preprocessing(df_daegu_gan20000_3).copy()\n", + "df_daejeon_gan20000_3= preprocessing(df_daejeon_gan20000_3).copy()\n", + "df_gwangju_gan20000_3= preprocessing(df_gwangju_gan20000_3).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5430102117038431\n", + "mean of accuracy : 0.9409344303881695\n", + "mean of mcc : 0.6780719447285347\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), df_seoul_gan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns 
!= 'multi_class'], df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), df_seoul_gan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), df_seoul_gan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, 
lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4663461890504854\n", + "mean of accuracy : 0.9575030566160141\n", + "mean of mcc : 0.6265195697208686\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), df_busan_gan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = 
df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), df_busan_gan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), df_busan_gan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, 
new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5666805477334921\n", + "mean of accuracy : 0.9076257371227054\n", + "mean of mcc : 0.6889342293705883\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), df_incheon_gan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), df_incheon_gan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), df_incheon_gan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.44041946056823505\n", + "mean of accuracy : 0.9674465196164052\n", + "mean of mcc : 0.6086254895296773\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = 
df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), df_daegu_gan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), df_daegu_gan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), df_daegu_gan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + 
"X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4808393438710248\n", + "mean of accuracy : 0.9317601117848143\n", + "mean of mcc : 0.6258321334411245\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), df_daejeon_gan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), df_daejeon_gan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), df_daejeon_gan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 
'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4937131678363705\n", + "mean of accuracy : 0.9367825269689182\n", + "mean of mcc : 0.6373125823908727\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), df_gwangju_gan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), df_gwangju_gan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + 
"X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), df_gwangju_gan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **1만개**" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv\")\n", + "df_seoul_gan10000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv\")\n", + "df_incheon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv\")\n", + "df_daegu_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv\")\n", + "df_daejeon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv\")\n", + "df_gwangju_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv\")\n", + "df_seoul_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv\")\n", + "df_incheon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv\")\n", + "df_daegu_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv\")\n", + "df_daejeon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv\")\n", + "df_gwangju_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv\")\n", + "df_seoul_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv\")\n", + "df_incheon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv\")\n", + "df_daegu_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv\")\n", + "df_daejeon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv\")\n", + "df_gwangju_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + 
"df_busan_gan10000_1= preprocessing(df_busan_gan10000_1).copy()\n", + "df_seoul_gan10000_1= preprocessing(df_seoul_gan10000_1).copy()\n", + "df_incheon_gan10000_1= preprocessing(df_incheon_gan10000_1).copy()\n", + "df_daegu_gan10000_1= preprocessing(df_daegu_gan10000_1).copy()\n", + "df_daejeon_gan10000_1= preprocessing(df_daejeon_gan10000_1).copy()\n", + "df_gwangju_gan10000_1= preprocessing(df_gwangju_gan10000_1).copy()\n", + "\n", + "df_busan_gan10000_2= preprocessing(df_busan_gan10000_2).copy()\n", + "df_seoul_gan10000_2= preprocessing(df_seoul_gan10000_2).copy()\n", + "df_incheon_gan10000_2= preprocessing(df_incheon_gan10000_2).copy()\n", + "df_daegu_gan10000_2= preprocessing(df_daegu_gan10000_2).copy()\n", + "df_daejeon_gan10000_2= preprocessing(df_daejeon_gan10000_2).copy()\n", + "df_gwangju_gan10000_2= preprocessing(df_gwangju_gan10000_2).copy()\n", + "\n", + "df_busan_gan10000_3= preprocessing(df_busan_gan10000_3).copy()\n", + "df_seoul_gan10000_3= preprocessing(df_seoul_gan10000_3).copy()\n", + "df_incheon_gan10000_3= preprocessing(df_incheon_gan10000_3).copy()\n", + "df_daegu_gan10000_3= preprocessing(df_daegu_gan10000_3).copy()\n", + "df_daejeon_gan10000_3= preprocessing(df_daejeon_gan10000_3).copy()\n", + "df_gwangju_gan10000_3= preprocessing(df_gwangju_gan10000_3).copy()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5489022319719195\n", + "mean of accuracy : 0.9431400803453353\n", + "mean of mcc : 0.6865310613747596\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), df_seoul_gan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 
'multi_class'], df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), df_seoul_gan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), df_seoul_gan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", 
+ "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4676627137741911\n", + "mean of accuracy : 0.959515848658\n", + "mean of mcc : 0.6314347309502454\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), df_busan_gan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 
2020]), df_busan_gan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), df_busan_gan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5530992219893435\n", + "mean of accuracy : 0.9121080461777744\n", + "mean of mcc : 0.687650733674605\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), df_incheon_gan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), df_incheon_gan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + 
"mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), df_incheon_gan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.406699786423904\n", + "mean of accuracy : 0.9667631476075221\n", + "mean of mcc : 0.5786232842762297\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), df_daegu_gan10000_1.columns != 
'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), df_daegu_gan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), df_daegu_gan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], 
eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4782405346339498\n", + "mean of accuracy : 0.9329755844998378\n", + "mean of mcc : 0.6261770938311793\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), df_daejeon_gan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + 
"\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), df_daejeon_gan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), df_daejeon_gan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': 
np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4814166239972196\n", + "mean of accuracy : 0.9418691934692385\n", + "mean of mcc : 0.6302426323494177\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), df_gwangju_gan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), df_gwangju_gan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + 
"csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), df_gwangju_gan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **7천개**" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_busan.csv\")\n", + "df_seoul_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_seoul.csv\")\n", + "df_incheon_gan7000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_incheon.csv\")\n", + "df_daegu_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daegu.csv\")\n", + "df_daejeon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daejeon.csv\")\n", + "df_gwangju_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_busan.csv\")\n", + "df_seoul_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_seoul.csv\")\n", + "df_incheon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_incheon.csv\")\n", + "df_daegu_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daegu.csv\")\n", + "df_daejeon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daejeon.csv\")\n", + "df_gwangju_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_busan.csv\")\n", + "df_seoul_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_seoul.csv\")\n", + "df_incheon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_incheon.csv\")\n", + "df_daegu_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daegu.csv\")\n", + "df_daejeon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daejeon.csv\")\n", + "df_gwangju_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_gan7000_1= preprocessing(df_busan_gan7000_1).copy()\n", + "df_seoul_gan7000_1= preprocessing(df_seoul_gan7000_1).copy()\n", + "df_incheon_gan7000_1= 
preprocessing(df_incheon_gan7000_1).copy()\n", + "df_daegu_gan7000_1= preprocessing(df_daegu_gan7000_1).copy()\n", + "df_daejeon_gan7000_1= preprocessing(df_daejeon_gan7000_1).copy()\n", + "df_gwangju_gan7000_1= preprocessing(df_gwangju_gan7000_1).copy()\n", + "\n", + "df_busan_gan7000_2= preprocessing(df_busan_gan7000_2).copy()\n", + "df_seoul_gan7000_2= preprocessing(df_seoul_gan7000_2).copy()\n", + "df_incheon_gan7000_2= preprocessing(df_incheon_gan7000_2).copy()\n", + "df_daegu_gan7000_2= preprocessing(df_daegu_gan7000_2).copy()\n", + "df_daejeon_gan7000_2= preprocessing(df_daejeon_gan7000_2).copy()\n", + "df_gwangju_gan7000_2= preprocessing(df_gwangju_gan7000_2).copy()\n", + "\n", + "df_busan_gan7000_3= preprocessing(df_busan_gan7000_3).copy()\n", + "df_seoul_gan7000_3= preprocessing(df_seoul_gan7000_3).copy()\n", + "df_incheon_gan7000_3= preprocessing(df_incheon_gan7000_3).copy()\n", + "df_daegu_gan7000_3= preprocessing(df_daegu_gan7000_3).copy()\n", + "df_daejeon_gan7000_3= preprocessing(df_daejeon_gan7000_3).copy()\n", + "df_gwangju_gan7000_3= preprocessing(df_gwangju_gan7000_3).copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5483176293132692\n", + "mean of accuracy : 0.9431403922449285\n", + "mean of mcc : 0.6873897786091137\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), df_seoul_gan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + 
"X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), df_seoul_gan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), df_seoul_gan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of 
accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4663740712722189\n", + "mean of accuracy : 0.9600495088621072\n", + "mean of mcc : 0.6344100074206912\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), df_busan_gan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), df_busan_gan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], 
df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), df_busan_gan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of 
csi : 0.5400648981230686\n", + "mean of accuracy : 0.9097125533348306\n", + "mean of mcc : 0.6776113780254353\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), df_incheon_gan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), df_incheon_gan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), df_incheon_gan7000_3.columns != 'multi_class'], 
df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.426127621983083\n", + "mean of accuracy : 0.9688532907486422\n", + "mean of mcc : 0.5995660759623473\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), df_daegu_gan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + 
"X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), df_daegu_gan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), df_daegu_gan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of 
csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4701451158455507\n", + "mean of accuracy : 0.9314935415990885\n", + "mean of mcc : 0.619110040349657\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), df_daejeon_gan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), df_daejeon_gan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], 
df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), df_daejeon_gan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + 
"execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4860337424842999\n", + "mean of accuracy : 0.9426673445284495\n", + "mean of mcc : 0.6348033992505139\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), df_gwangju_gan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), df_gwangju_gan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = 
df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), df_gwangju_gan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **SMOTENC+CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **7천개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_incheon.csv\")\n", + 
"df_daegu_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_3 = 
pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan7000_1 = preprocessing(df_busan_smotenc_ctgan7000_1).copy()\n", + "df_seoul_smotenc_ctgan7000_1 = preprocessing(df_seoul_smotenc_ctgan7000_1).copy()\n", + "df_incheon_smotenc_ctgan7000_1 = preprocessing(df_incheon_smotenc_ctgan7000_1).copy()\n", + "df_daegu_smotenc_ctgan7000_1 = preprocessing(df_daegu_smotenc_ctgan7000_1).copy()\n", + "df_daejeon_smotenc_ctgan7000_1 = preprocessing(df_daejeon_smotenc_ctgan7000_1).copy()\n", + "df_gwangju_smotenc_ctgan7000_1 = preprocessing(df_gwangju_smotenc_ctgan7000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan7000_2 = preprocessing(df_busan_smotenc_ctgan7000_2).copy()\n", + "df_seoul_smotenc_ctgan7000_2 = preprocessing(df_seoul_smotenc_ctgan7000_2).copy()\n", + "df_incheon_smotenc_ctgan7000_2 = preprocessing(df_incheon_smotenc_ctgan7000_2).copy()\n", + "df_daegu_smotenc_ctgan7000_2 = preprocessing(df_daegu_smotenc_ctgan7000_2).copy()\n", + "df_daejeon_smotenc_ctgan7000_2 = preprocessing(df_daejeon_smotenc_ctgan7000_2).copy()\n", + "df_gwangju_smotenc_ctgan7000_2 = preprocessing(df_gwangju_smotenc_ctgan7000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan7000_3 = preprocessing(df_busan_smotenc_ctgan7000_3).copy()\n", + "df_seoul_smotenc_ctgan7000_3 = preprocessing(df_seoul_smotenc_ctgan7000_3).copy()\n", + "df_incheon_smotenc_ctgan7000_3 = preprocessing(df_incheon_smotenc_ctgan7000_3).copy()\n", + "df_daegu_smotenc_ctgan7000_3 = preprocessing(df_daegu_smotenc_ctgan7000_3).copy()\n", + "df_daejeon_smotenc_ctgan7000_3 = preprocessing(df_daejeon_smotenc_ctgan7000_3).copy()\n", + "df_gwangju_smotenc_ctgan7000_3 = preprocessing(df_gwangju_smotenc_ctgan7000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5352598289152354\n", + "mean of accuracy : 0.9412404038891801\n", + "mean of mcc : 0.6782186947210392\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + 
"\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4485402157416473\n", + "mean of accuracy : 0.9570824080312065\n", + "mean of mcc : 0.6170303705969965\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan7000_1.columns != 'multi_class'], 
df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], 
inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5518512077229454\n", + "mean of accuracy : 0.9110783616538164\n", + "mean of mcc : 0.6864459032542548\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", 
np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.40224349400243914\n", + "mean of accuracy : 0.9664241127496235\n", + "mean of mcc : 0.5832379476945067\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], 
df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.476308593039838\n", + "mean of accuracy : 0.9321407332551505\n", + "mean of mcc : 0.625183787216149\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + 
"mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": ["## **광주**\n"] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.47155191980971883\n", + "mean of accuracy : 0.9403094875697615\n", + "mean of mcc : 0.6237103634516713\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = 
df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], 
df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **1만개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daejeon.csv\")\n", + 
"df_gwangju_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan10000_1 = 
preprocessing(df_busan_smotenc_ctgan10000_1).copy()\n", + "df_seoul_smotenc_ctgan10000_1 = preprocessing(df_seoul_smotenc_ctgan10000_1).copy()\n", + "df_incheon_smotenc_ctgan10000_1 = preprocessing(df_incheon_smotenc_ctgan10000_1).copy()\n", + "df_daegu_smotenc_ctgan10000_1 = preprocessing(df_daegu_smotenc_ctgan10000_1).copy()\n", + "df_daejeon_smotenc_ctgan10000_1 = preprocessing(df_daejeon_smotenc_ctgan10000_1).copy()\n", + "df_gwangju_smotenc_ctgan10000_1 = preprocessing(df_gwangju_smotenc_ctgan10000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan10000_2 = preprocessing(df_busan_smotenc_ctgan10000_2).copy()\n", + "df_seoul_smotenc_ctgan10000_2 = preprocessing(df_seoul_smotenc_ctgan10000_2).copy()\n", + "df_incheon_smotenc_ctgan10000_2 = preprocessing(df_incheon_smotenc_ctgan10000_2).copy()\n", + "df_daegu_smotenc_ctgan10000_2 = preprocessing(df_daegu_smotenc_ctgan10000_2).copy()\n", + "df_daejeon_smotenc_ctgan10000_2 = preprocessing(df_daejeon_smotenc_ctgan10000_2).copy()\n", + "df_gwangju_smotenc_ctgan10000_2 = preprocessing(df_gwangju_smotenc_ctgan10000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan10000_3 = preprocessing(df_busan_smotenc_ctgan10000_3).copy()\n", + "df_seoul_smotenc_ctgan10000_3 = preprocessing(df_seoul_smotenc_ctgan10000_3).copy()\n", + "df_incheon_smotenc_ctgan10000_3 = preprocessing(df_incheon_smotenc_ctgan10000_3).copy()\n", + "df_daegu_smotenc_ctgan10000_3 = preprocessing(df_daegu_smotenc_ctgan10000_3).copy()\n", + "df_daejeon_smotenc_ctgan10000_3 = preprocessing(df_daejeon_smotenc_ctgan10000_3).copy()\n", + "df_gwangju_smotenc_ctgan10000_3 = preprocessing(df_gwangju_smotenc_ctgan10000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5391446462395447\n", + "mean of accuracy : 0.9419634911129409\n", + "mean of mcc 
: 0.6802382864465635\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan10000_3.columns != 'multi_class'], 
df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.45629913656734233\n", + "mean of accuracy : 0.9579187148073292\n", + "mean of mcc : 0.624299022660101\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], 
df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5474454733447011\n", + "mean of accuracy : 0.9102821859586961\n", + "mean of mcc : 0.6821514877761338\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, 
Y_tr, Y_val = df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 
'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4114169416612505\n", + "mean of accuracy : 0.9674109630627709\n", + "mean of mcc : 0.5898029433914993\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + 
"X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.46834204028253223\n", + "mean of 
accuracy : 0.9317592800525656\n", + "mean of mcc : 0.620788512419694\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = 
df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4859514070622382\n", + "mean of accuracy : 0.9427799402816245\n", + "mean of mcc : 0.6384358903533097\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan10000_1.columns != 'multi_class'], 
df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + 
"X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **2만개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan20000_2 = 
pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan20000_1 = preprocessing(df_busan_smotenc_ctgan20000_1).copy()\n", + "df_seoul_smotenc_ctgan20000_1 = preprocessing(df_seoul_smotenc_ctgan20000_1).copy()\n", + "df_incheon_smotenc_ctgan20000_1 = 
preprocessing(df_incheon_smotenc_ctgan20000_1).copy()\n", + "df_daegu_smotenc_ctgan20000_1 = preprocessing(df_daegu_smotenc_ctgan20000_1).copy()\n", + "df_daejeon_smotenc_ctgan20000_1 = preprocessing(df_daejeon_smotenc_ctgan20000_1).copy()\n", + "df_gwangju_smotenc_ctgan20000_1 = preprocessing(df_gwangju_smotenc_ctgan20000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan20000_2 = preprocessing(df_busan_smotenc_ctgan20000_2).copy()\n", + "df_seoul_smotenc_ctgan20000_2 = preprocessing(df_seoul_smotenc_ctgan20000_2).copy()\n", + "df_incheon_smotenc_ctgan20000_2 = preprocessing(df_incheon_smotenc_ctgan20000_2).copy()\n", + "df_daegu_smotenc_ctgan20000_2 = preprocessing(df_daegu_smotenc_ctgan20000_2).copy()\n", + "df_daejeon_smotenc_ctgan20000_2 = preprocessing(df_daejeon_smotenc_ctgan20000_2).copy()\n", + "df_gwangju_smotenc_ctgan20000_2 = preprocessing(df_gwangju_smotenc_ctgan20000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan20000_3 = preprocessing(df_busan_smotenc_ctgan20000_3).copy()\n", + "df_seoul_smotenc_ctgan20000_3 = preprocessing(df_seoul_smotenc_ctgan20000_3).copy()\n", + "df_incheon_smotenc_ctgan20000_3 = preprocessing(df_incheon_smotenc_ctgan20000_3).copy()\n", + "df_daegu_smotenc_ctgan20000_3 = preprocessing(df_daegu_smotenc_ctgan20000_3).copy()\n", + "df_daejeon_smotenc_ctgan20000_3 = preprocessing(df_daejeon_smotenc_ctgan20000_3).copy()\n", + "df_gwangju_smotenc_ctgan20000_3 = preprocessing(df_gwangju_smotenc_ctgan20000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5358813620881883\n", + "mean of accuracy : 0.9413917791584533\n", + "mean of mcc : 0.6789791066414157\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = 
df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 
'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4336268899145388\n", + "mean of accuracy : 0.9563998677545724\n", + "mean of mcc : 0.6080139285870266\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + 
"lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean 
of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.564798264178871\n", + "mean of accuracy : 0.9092926325157406\n", + "mean of mcc : 0.68899875702518\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan20000_2.columns != 'multi_class'], 
df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, 
new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.422336525695402\n", + "mean of accuracy : 0.963494335903386\n", + "mean of mcc : 0.5910759394692583\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, 
lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4827377579875818\n", + "mean of accuracy : 0.9311888156964511\n", + "mean of mcc : 0.6279627682150313\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + 
"X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], 
df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4753475520930324\n", + "mean of accuracy : 0.9346923838277981\n", + "mean of mcc : 0.6188303813518945\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", 
+ "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | region | \n", + "model | \n", + "data_sample | \n", + "CSI | \n", + "MCC | \n", + "Accuracy | \n", + "fold_csi | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "seoul | \n", + "LightGBM | \n", + "pure | \n", + "0.505041 | \n", + "0.646992 | \n", + "0.936174 | \n", + "[[0.46595932802825235, 0.5771195097037204, 0.4... | \n", + "
| 1 | \n", + "busan | \n", + "LightGBM | \n", + "pure | \n", + "0.430188 | \n", + "0.600801 | \n", + "0.956971 | \n", + "[[0.32824427480911017, 0.4782608695651431, 0.4... | \n", + "
| 2 | \n", + "incheon | \n", + "LightGBM | \n", + "pure | \n", + "0.554663 | \n", + "0.687951 | \n", + "0.911954 | \n", + "[[0.4845292955891715, 0.6037628278220865, 0.57... | \n", + "
| 3 | \n", + "daegu | \n", + "LightGBM | \n", + "pure | \n", + "0.292340 | \n", + "0.481989 | \n", + "0.956964 | \n", + "[[0.28124999999994504, 0.3320537428022395, 0.2... | \n", + "
| 4 | \n", + "daejeon | \n", + "LightGBM | \n", + "pure | \n", + "0.478437 | \n", + "0.625244 | \n", + "0.932748 | \n", + "[[0.43333333333329205, 0.4547920433995972, 0.5... | \n", + "
| 5 | \n", + "gwangju | \n", + "LightGBM | \n", + "pure | \n", + "0.482777 | \n", + "0.636815 | \n", + "0.943236 | \n", + "[[0.3928095872169916, 0.5461624026695722, 0.50... | \n", + "
| 6 | \n", + "seoul | \n", + "LightGBM | \n", + "smote | \n", + "0.578939 | \n", + "0.708499 | \n", + "0.939995 | \n", + "[[0.4550682961897588, 0.6503831417623898, 0.63... | \n", + "
| 7 | \n", + "busan | \n", + "LightGBM | \n", + "smote | \n", + "0.466021 | \n", + "0.631880 | \n", + "0.950198 | \n", + "[[0.4690909090908522, 0.47058823529405874, 0.4... | \n", + "
| 8 | \n", + "incheon | \n", + "LightGBM | \n", + "smote | \n", + "0.583560 | \n", + "0.706137 | \n", + "0.910464 | \n", + "[[0.5613293051359177, 0.6213080168776044, 0.56... | \n", + "
| 9 | \n", + "daegu | \n", + "LightGBM | \n", + "smote | \n", + "0.447354 | \n", + "0.616921 | \n", + "0.963730 | \n", + "[[0.3632567849686089, 0.49122807017536024, 0.4... | \n", + "
| 10 | \n", + "daejeon | \n", + "LightGBM | \n", + "smote | \n", + "0.521335 | \n", + "0.656321 | \n", + "0.930621 | \n", + "[[0.4589041095890018, 0.5326514555467716, 0.57... | \n", + "
| 11 | \n", + "gwangju | \n", + "LightGBM | \n", + "smote | \n", + "0.522731 | \n", + "0.660423 | \n", + "0.936850 | \n", + "[[0.47532729103721294, 0.5410958904109059, 0.5... | \n", + "
| 12 | \n", + "seoul | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.543010 | \n", + "0.678072 | \n", + "0.940934 | \n", + "[[0.47165160230070075, 0.5903500473036338, 0.5... | \n", + "
| 13 | \n", + "busan | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.466346 | \n", + "0.626520 | \n", + "0.957503 | \n", + "[[0.4154262516914187, 0.49489051094883285, 0.4... | \n", + "
| 14 | \n", + "incheon | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.566681 | \n", + "0.688934 | \n", + "0.907626 | \n", + "[[0.5311667554608135, 0.6098117512834792, 0.55... | \n", + "
| 15 | \n", + "daegu | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.440419 | \n", + "0.608625 | \n", + "0.967447 | \n", + "[[0.46130952380938656, 0.4414784394249607, 0.4... | \n", + "
| 16 | \n", + "daejeon | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.480839 | \n", + "0.625832 | \n", + "0.931760 | \n", + "[[0.42667928098387636, 0.48797250859102337, 0.... | \n", + "
| 17 | \n", + "gwangju | \n", + "LightGBM | \n", + "ctgan20000 | \n", + "0.493713 | \n", + "0.637313 | \n", + "0.936783 | \n", + "[[0.42775665399235474, 0.5447427293064268, 0.5... | \n", + "
| 18 | \n", + "seoul | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.548902 | \n", + "0.686531 | \n", + "0.943140 | \n", + "[[0.482333607230856, 0.5810397553516227, 0.583... | \n", + "
| 19 | \n", + "busan | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.467663 | \n", + "0.631435 | \n", + "0.959516 | \n", + "[[0.4084084084083471, 0.5208955223879819, 0.47... | \n", + "
| 20 | \n", + "incheon | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.553099 | \n", + "0.687651 | \n", + "0.912108 | \n", + "[[0.4707429322813629, 0.6094198736358064, 0.57... | \n", + "
| 21 | \n", + "daegu | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.406700 | \n", + "0.578623 | \n", + "0.966763 | \n", + "[[0.44943820224706477, 0.45661157024783955, 0.... | \n", + "
| 22 | \n", + "daejeon | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.478241 | \n", + "0.626177 | \n", + "0.932976 | \n", + "[[0.43804034582128354, 0.4569356300996866, 0.5... | \n", + "
| 23 | \n", + "gwangju | \n", + "LightGBM | \n", + "ctgan10000 | \n", + "0.481417 | \n", + "0.630243 | \n", + "0.941869 | \n", + "[[0.38888888888883977, 0.53999999999994, 0.515... | \n", + "
| 24 | \n", + "seoul | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.548318 | \n", + "0.687390 | \n", + "0.943140 | \n", + "[[0.4815724815724421, 0.5754132231404364, 0.58... | \n", + "
| 25 | \n", + "busan | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.466374 | \n", + "0.634410 | \n", + "0.960050 | \n", + "[[0.3847328244274221, 0.5324074074073252, 0.48... | \n", + "
| 26 | \n", + "incheon | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.540065 | \n", + "0.677611 | \n", + "0.909713 | \n", + "[[0.45490716180368335, 0.6001144164759382, 0.5... | \n", + "
| 27 | \n", + "daegu | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.426128 | \n", + "0.599566 | \n", + "0.968853 | \n", + "[[0.4832826747718896, 0.4640657084187959, 0.33... | \n", + "
| 28 | \n", + "daejeon | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.470145 | \n", + "0.619110 | \n", + "0.931494 | \n", + "[[0.4214559386972776, 0.4623753399818257, 0.52... | \n", + "
| 29 | \n", + "gwangju | \n", + "LightGBM | \n", + "ctgan7000 | \n", + "0.486034 | \n", + "0.634803 | \n", + "0.942667 | \n", + "[[0.3902439024389743, 0.543429844097935, 0.524... | \n", + "
| 30 | \n", + "seoul | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.535260 | \n", + "0.678219 | \n", + "0.941240 | \n", + "[[0.46849757673663417, 0.5743801652891969, 0.5... | \n", + "
| 31 | \n", + "busan | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.448540 | \n", + "0.617030 | \n", + "0.957082 | \n", + "[[0.38415545590427735, 0.4850640113797318, 0.4... | \n", + "
| 32 | \n", + "incheon | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.551851 | \n", + "0.686446 | \n", + "0.911078 | \n", + "[[0.4885695623774991, 0.6043577981651029, 0.56... | \n", + "
| 33 | \n", + "daegu | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.402243 | \n", + "0.583238 | \n", + "0.966424 | \n", + "[[0.41432225063928024, 0.4475806451612001, 0.3... | \n", + "
| 34 | \n", + "daejeon | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.476309 | \n", + "0.625184 | \n", + "0.932141 | \n", + "[[0.4230038022813286, 0.47079964061091906, 0.5... | \n", + "
| 35 | \n", + "gwangju | \n", + "LightGBM | \n", + "smotenc_ctgan7000 | \n", + "0.471552 | \n", + "0.623710 | \n", + "0.940309 | \n", + "[[0.37113402061850886, 0.5363735070574879, 0.5... | \n", + "
| 36 | \n", + "seoul | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.539145 | \n", + "0.680238 | \n", + "0.941963 | \n", + "[[0.4607201309328592, 0.5864583333332722, 0.57... | \n", + "
| 37 | \n", + "busan | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.456299 | \n", + "0.624299 | \n", + "0.957919 | \n", + "[[0.40412979351026485, 0.4999999999999277, 0.4... | \n", + "
| 38 | \n", + "incheon | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.547445 | \n", + "0.682151 | \n", + "0.910282 | \n", + "[[0.4681967213114447, 0.6035067873302826, 0.57... | \n", + "
| 39 | \n", + "daegu | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.411417 | \n", + "0.589803 | \n", + "0.967411 | \n", + "[[0.4368131868130668, 0.45213849287159835, 0.3... | \n", + "
| 40 | \n", + "daejeon | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.468342 | \n", + "0.620789 | \n", + "0.931759 | \n", + "[[0.42665388302968105, 0.4723481414324141, 0.5... | \n", + "
| 41 | \n", + "gwangju | \n", + "LightGBM | \n", + "smotenc_ctgan10000 | \n", + "0.485951 | \n", + "0.638436 | \n", + "0.942780 | \n", + "[[0.3915343915343397, 0.5499451152578979, 0.51... | \n", + "
| 42 | \n", + "seoul | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.535881 | \n", + "0.678979 | \n", + "0.941392 | \n", + "[[0.4706840390879095, 0.5655314757481357, 0.57... | \n", + "
| 43 | \n", + "busan | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.433627 | \n", + "0.608014 | \n", + "0.956400 | \n", + "[[0.3399089529589772, 0.5086956521738393, 0.45... | \n", + "
| 44 | \n", + "incheon | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.564798 | \n", + "0.688999 | \n", + "0.909293 | \n", + "[[0.5387685290763661, 0.5963821368004185, 0.55... | \n", + "
| 45 | \n", + "daegu | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.422337 | \n", + "0.591076 | \n", + "0.963494 | \n", + "[[0.42962962962952356, 0.43951612903216947, 0.... | \n", + "
| 46 | \n", + "daejeon | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.482738 | \n", + "0.627963 | \n", + "0.931189 | \n", + "[[0.4281636536631372, 0.4917627677100089, 0.52... | \n", + "
| 47 | \n", + "gwangju | \n", + "LightGBM | \n", + "smotenc_ctgan20000 | \n", + "0.475348 | \n", + "0.618830 | \n", + "0.934692 | \n", + "[[0.3949903660885939, 0.5378704720087225, 0.49... | \n", + "