diff --git "a/Analysis_code/4.sampling_data_test/lgb_sampled_test.ipynb" "b/Analysis_code/4.sampling_data_test/lgb_sampled_test.ipynb" new file mode 100644--- /dev/null +++ "b/Analysis_code/4.sampling_data_test/lgb_sampled_test.ipynb" @@ -0,0 +1,5215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **lightGBM**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n", + "from collections import Counter\n", + "import sys\n", + "from lightgbm import LGBMClassifier\n", + "\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " 제거했던 파생 변수들을 복구\n", + " \n", + " Args:\n", + " df: 데이터프레임\n", + " \n", + " Returns:\n", + " 파생 변수가 추가된 데이터프레임\n", + " \"\"\"\n", + " df = df.copy()\n", + " df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)\n", + " df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)\n", + " df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)\n", + " df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)\n", + " df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']\n", + " return df\n", + "\n", + "\n", + "def preprocessing(df):\n", + " df = df[df.columns].copy()\n", + " df['year'] = df['year'].astype('int')\n", + " df['month'] = df['month'].astype('int')\n", + " df['hour'] = df['hour'].astype('int')\n", + " df= add_derived_features(df).copy()\n", + " df['multi_class'] = df['multi_class'].astype('int')\n", + " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n", + " df['wind_dir'] = df['wind_dir'].astype('int')\n", + " df= df[['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',\n", + " 'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',\n", + " 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',\n", + " 'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',\n", + " 'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',\n", + " 'month_sin', 'month_cos','multi_class']].copy()\n", + " return df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df_seoul = pd.read_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n", + "df_busan = pd.read_csv(\"../../data/data_for_modeling/busan_train.csv\")\n", + "df_daegu = pd.read_csv(\"../../data/data_for_modeling/daegu_train.csv\")\n", + "df_daejeon = pd.read_csv(\"../../data/data_for_modeling/daejeon_train.csv\")\n", + "df_incheon = pd.read_csv(\"../../data/data_for_modeling/incheon_train.csv\")\n", + "df_gwangju = pd.read_csv(\"../../data/data_for_modeling/gwangju_train.csv\")\n", + "\n", + "df_seoul = preprocessing(df_seoul).copy()\n", + "df_busan = preprocessing(df_busan).copy()\n", + "df_daegu = preprocessing(df_daegu).copy()\n", + "df_daejeon = preprocessing(df_daejeon).copy()\n", + "df_incheon = preprocessing(df_incheon).copy()\n", + "df_gwangju = preprocessing(df_gwangju).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seoul : Counter({2: 23686, 1: 2579, 0: 39})\n", + "\n", + "busan : Counter({2: 24694, 1: 1516, 0: 94})\n", + "\n", + "daegu : Counter({2: 25149, 1: 1107, 0: 48})\n", + "\n", + "gwangju : Counter({2: 23798, 1: 2411, 0: 95})\n", + "\n", + "daejeon : Counter({2: 23471, 1: 2660, 0: 173})\n", + "\n", + "incheon : Counter({2: 21893, 1: 3892, 0: 519})\n" + ] + } + ], + "source": [ + "print(\"seoul : \", Counter(df_seoul['multi_class']))\n", + "print()\n", + "print(\"busan : \", Counter(df_busan['multi_class']))\n", + "print()\n", + "print(\"daegu : \", Counter(df_daegu['multi_class']))\n", + "print()\n", + "print(\"gwangju : \", Counter(df_gwangju['multi_class']))\n", + "print()\n", + "print(\"daejeon : \", Counter(df_daejeon['multi_class']))\n", + "print()\n", + "print(\"incheon : \", Counter(df_incheon['multi_class']))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", + " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", + " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", + " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", + " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_gwangju.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.utils.class_weight import compute_class_weight\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "from sklearn.metrics import matthews_corrcoef\n", + "\n", + "def calculate_csi(Y_test, pred):\n", + "\n", + " cm = confusion_matrix(Y_test, pred) # 변수 이름을 cm으로 변경\n", + " # 혼동 행렬에서 H, F, M 추출\n", + " H = (cm[0, 0] + cm[1, 1])\n", + " \n", + " F = (cm[1, 0] + cm[2, 0] +\n", + " cm[0, 1] + cm[2, 1])\n", + " \n", + " M = (cm[0, 2] + cm[1, 2])\n", + " \n", + " # CSI 계산\n", + " CSI = H / (H + F + M + 1e-10)\n", + " return CSI\n", + "\n", + "def eval_metric_csi(y_true, pred_prob):\n", + "\n", + " pred = np.argmax(pred_prob, axis=1)\n", + " y_true = y_true\n", + " y_pred = pred\n", + " csi = calculate_csi(y_true, y_pred)\n", + " return -1*csi\n", + "\n", + "def multiclass_mcc(y_val, y_pred):\n", + " \"\"\"\n", + " 다중 분류에서도 sklearn의 matthews_corrcoef를 그대로 사용할 수 있음.\n", + " \"\"\"\n", + " return matthews_corrcoef(y_val, y_pred)\n", + "\n", + "# 사용자 정의 평가 지표 함수 정의\n", + "def csi_metric(y_true, pred):\n", + " y_pred_binary = np.argmax(pred, axis=1)\n", + " score = calculate_csi(y_true, y_pred_binary)\n", + " return 'CSI', score, True # higher_better=True" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "lgb_model = LGBMClassifier(\n", + " n_estimators=4000, # 약한 학습기 개수\n", + " tree_method='hist', \n", + " device='gpu', # GPU 사용\n", + " objective='multiclassova',\n", + " early_stopping_rounds=400, # 과적합 방지를 위한 조기 종료 설정\n", + " random_state= 42,\n", + " verbose= -1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "pre_sampled_data= []\n", + "smote_sample_data= []\n", + "gan20000_sample_data= []\n", + "gan10000_sample_data= []\n", + "gan7000_sample_data= []" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df= pd.DataFrame(columns=['region','model','data_sample','CSI','MCC','Accuracy','fold_csi'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5050411887221465\n", + "mean of accuracy : 0.9361739068958922\n", + "mean of mcc : 0.6469923326874802\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2019]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2019, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4301877051295586\n", + "mean of accuracy : 0.9569711638429356\n", + "mean of mcc : 0.6008010957239577\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2019]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2019, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5546627753142914\n", + "mean of accuracy : 0.9119535519125682\n", + "mean of mcc : 0.6879511579878309\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2019]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2019, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.29234018565381\n", + "mean of accuracy : 0.956963678252697\n", + "mean of mcc : 0.4819888130358391\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2019]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2019, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4784367169072478\n", + "mean of accuracy : 0.9327483136628656\n", + "mean of mcc : 0.6252440470551551\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4827774547671669\n", + "mean of accuracy : 0.9432361454200664\n", + "mean of mcc : 0.6368148576215991\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'pure',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **SMOTE 증강기법을 적용시킨 데이터셋에 대한 성능**" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "df_smote_busan_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_busan.csv\")\n", + "df_smote_busan_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_busan.csv\")\n", + "df_smote_busan_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_busan.csv\")\n", + "\n", + "df_smote_seoul_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_seoul.csv\")\n", + "df_smote_seoul_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_seoul.csv\")\n", + "df_smote_seoul_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_seoul.csv\")\n", + "\n", + "df_smote_daegu_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daegu.csv\")\n", + "df_smote_daegu_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daegu.csv\")\n", + "df_smote_daegu_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daegu.csv\")\n", + "\n", + "df_smote_daejeon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daejeon.csv\")\n", + "df_smote_daejeon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daejeon.csv\")\n", + "df_smote_daejeon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daejeon.csv\")\n", + "\n", + "df_smote_gwangju_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_gwangju.csv\")\n", + "df_smote_gwangju_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_gwangju.csv\")\n", + "df_smote_gwangju_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_gwangju.csv\")\n", + "\n", + "df_smote_incheon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_incheon.csv\")\n", + "df_smote_incheon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_incheon.csv\")\n", + "df_smote_incheon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_incheon.csv\")\n", + "\n", + "df_smote_busan_1 = preprocessing(df_smote_busan_1)\n", + "df_smote_busan_2 = preprocessing(df_smote_busan_2)\n", + "df_smote_busan_3 = preprocessing(df_smote_busan_3)\n", + "\n", + "df_smote_seoul_1 = preprocessing(df_smote_seoul_1)\n", + "df_smote_seoul_2 = preprocessing(df_smote_seoul_2)\n", + "df_smote_seoul_3 = preprocessing(df_smote_seoul_3)\n", + "\n", + "df_smote_daegu_1 = preprocessing(df_smote_daegu_1)\n", + "df_smote_daegu_2 = preprocessing(df_smote_daegu_2)\n", + "df_smote_daegu_3 = preprocessing(df_smote_daegu_3)\n", + "\n", + "df_smote_daejeon_1 = preprocessing(df_smote_daejeon_1)\n", + "df_smote_daejeon_2 = preprocessing(df_smote_daejeon_2)\n", + "df_smote_daejeon_3 = preprocessing(df_smote_daejeon_3)\n", + "\n", + "df_smote_gwangju_1 = preprocessing(df_smote_gwangju_1)\n", + "df_smote_gwangju_2 = preprocessing(df_smote_gwangju_2)\n", + "df_smote_gwangju_3 = preprocessing(df_smote_gwangju_3)\n", + "\n", + "df_smote_incheon_1 = preprocessing(df_smote_incheon_1)\n", + "df_smote_incheon_2 = preprocessing(df_smote_incheon_2)\n", + "df_smote_incheon_3 = preprocessing(df_smote_incheon_3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "smote_oversample=[] # smote 적용 전 f1 score 저장 리스트" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", + " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", + " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", + " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", + " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_smote_seoul_1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", + " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", + " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", + " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", + " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_seoul.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5789392155070722\n", + "mean of accuracy : 0.9399950927797324\n", + "mean of mcc : 0.7084991639282849\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), df_smote_seoul_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), df_smote_seoul_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), df_smote_seoul_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.46602091302575205\n", + "mean of accuracy : 0.9501977443421413\n", + "mean of mcc : 0.6318799598547477\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), df_smote_busan_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), df_smote_busan_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), df_smote_busan_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.583559682649986\n", + "mean of accuracy : 0.9104636075554557\n", + "mean of mcc : 0.7061374111787998\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), df_smote_incheon_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), df_smote_incheon_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), df_smote_incheon_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.44735416496514874\n", + "mean of accuracy : 0.96373033992897\n", + "mean of mcc : 0.6169211806368756\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), df_smote_daegu_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), df_smote_daegu_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), df_smote_daegu_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5213346054616135\n", + "mean of accuracy : 0.9306212624032071\n", + "mean of mcc : 0.6563209583230294\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), df_smote_daejeon_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), df_smote_daejeon_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), df_smote_daejeon_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5227311295096367\n", + "mean of accuracy : 0.9368502091806605\n", + "mean of mcc : 0.6604233380668852\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", + "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", + "from warnings import filterwarnings\n", + "filterwarnings('ignore')\n", + "\n", + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), df_smote_gwangju_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), df_smote_gwangju_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), df_smote_gwangju_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smote',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **2만개**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv\")\n", + "df_seoul_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv\")\n", + "df_incheon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv\")\n", + "df_daegu_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv\")\n", + "df_daejeon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv\")\n", + "df_gwangju_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv\")\n", + "df_seoul_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv\")\n", + "df_incheon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv\")\n", + "df_daegu_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv\")\n", + "df_daejeon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv\")\n", + "df_gwangju_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv\")\n", + "df_seoul_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv\")\n", + "df_incheon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv\")\n", + "df_daegu_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv\")\n", + "df_daejeon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv\")\n", + "df_gwangju_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_gan20000_1= preprocessing(df_busan_gan20000_1).copy()\n", + "df_seoul_gan20000_1= preprocessing(df_seoul_gan20000_1).copy()\n", + "df_incheon_gan20000_1= preprocessing(df_incheon_gan20000_1).copy()\n", + "df_daegu_gan20000_1= preprocessing(df_daegu_gan20000_1).copy()\n", + "df_daejeon_gan20000_1= preprocessing(df_daejeon_gan20000_1).copy()\n", + "df_gwangju_gan20000_1= preprocessing(df_gwangju_gan20000_1).copy()\n", + "\n", + "df_busan_gan20000_2= preprocessing(df_busan_gan20000_2).copy()\n", + "df_seoul_gan20000_2= preprocessing(df_seoul_gan20000_2).copy()\n", + "df_incheon_gan20000_2= preprocessing(df_incheon_gan20000_2).copy()\n", + "df_daegu_gan20000_2= preprocessing(df_daegu_gan20000_2).copy()\n", + "df_daejeon_gan20000_2= preprocessing(df_daejeon_gan20000_2).copy()\n", + "df_gwangju_gan20000_2= preprocessing(df_gwangju_gan20000_2).copy()\n", + "\n", + "df_busan_gan20000_3= preprocessing(df_busan_gan20000_3).copy()\n", + "df_seoul_gan20000_3= preprocessing(df_seoul_gan20000_3).copy()\n", + "df_incheon_gan20000_3= preprocessing(df_incheon_gan20000_3).copy()\n", + "df_daegu_gan20000_3= preprocessing(df_daegu_gan20000_3).copy()\n", + "df_daejeon_gan20000_3= preprocessing(df_daejeon_gan20000_3).copy()\n", + "df_gwangju_gan20000_3= preprocessing(df_gwangju_gan20000_3).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5430102117038431\n", + "mean of accuracy : 0.9409344303881695\n", + "mean of mcc : 0.6780719447285347\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), df_seoul_gan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), df_seoul_gan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), df_seoul_gan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4663461890504854\n", + "mean of accuracy : 0.9575030566160141\n", + "mean of mcc : 0.6265195697208686\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), df_busan_gan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), df_busan_gan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), df_busan_gan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5666805477334921\n", + "mean of accuracy : 0.9076257371227054\n", + "mean of mcc : 0.6889342293705883\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), df_incheon_gan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), df_incheon_gan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), df_incheon_gan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.44041946056823505\n", + "mean of accuracy : 0.9674465196164052\n", + "mean of mcc : 0.6086254895296773\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), df_daegu_gan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), df_daegu_gan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), df_daegu_gan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4808393438710248\n", + "mean of accuracy : 0.9317601117848143\n", + "mean of mcc : 0.6258321334411245\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), df_daejeon_gan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), df_daejeon_gan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), df_daejeon_gan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4937131678363705\n", + "mean of accuracy : 0.9367825269689182\n", + "mean of mcc : 0.6373125823908727\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), df_gwangju_gan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), df_gwangju_gan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), df_gwangju_gan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **1만개**" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv\")\n", + "df_seoul_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv\")\n", + "df_incheon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv\")\n", + "df_daegu_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv\")\n", + "df_daejeon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv\")\n", + "df_gwangju_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv\")\n", + "df_seoul_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv\")\n", + "df_incheon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv\")\n", + "df_daegu_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv\")\n", + "df_daejeon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv\")\n", + "df_gwangju_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv\")\n", + "df_seoul_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv\")\n", + "df_incheon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv\")\n", + "df_daegu_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv\")\n", + "df_daejeon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv\")\n", + "df_gwangju_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_gan10000_1= preprocessing(df_busan_gan10000_1).copy()\n", + "df_seoul_gan10000_1= preprocessing(df_seoul_gan10000_1).copy()\n", + "df_incheon_gan10000_1= preprocessing(df_incheon_gan10000_1).copy()\n", + "df_daegu_gan10000_1= preprocessing(df_daegu_gan10000_1).copy()\n", + "df_daejeon_gan10000_1= preprocessing(df_daejeon_gan10000_1).copy()\n", + "df_gwangju_gan10000_1= preprocessing(df_gwangju_gan10000_1).copy()\n", + "\n", + "df_busan_gan10000_2= preprocessing(df_busan_gan10000_2).copy()\n", + "df_seoul_gan10000_2= preprocessing(df_seoul_gan10000_2).copy()\n", + "df_incheon_gan10000_2= preprocessing(df_incheon_gan10000_2).copy()\n", + "df_daegu_gan10000_2= preprocessing(df_daegu_gan10000_2).copy()\n", + "df_daejeon_gan10000_2= preprocessing(df_daejeon_gan10000_2).copy()\n", + "df_gwangju_gan10000_2= preprocessing(df_gwangju_gan10000_2).copy()\n", + "\n", + "df_busan_gan10000_3= preprocessing(df_busan_gan10000_3).copy()\n", + "df_seoul_gan10000_3= preprocessing(df_seoul_gan10000_3).copy()\n", + "df_incheon_gan10000_3= preprocessing(df_incheon_gan10000_3).copy()\n", + "df_daegu_gan10000_3= preprocessing(df_daegu_gan10000_3).copy()\n", + "df_daejeon_gan10000_3= preprocessing(df_daejeon_gan10000_3).copy()\n", + "df_gwangju_gan10000_3= preprocessing(df_gwangju_gan10000_3).copy()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5489022319719195\n", + "mean of accuracy : 0.9431400803453353\n", + "mean of mcc : 0.6865310613747596\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), df_seoul_gan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), df_seoul_gan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), df_seoul_gan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4676627137741911\n", + "mean of accuracy : 0.959515848658\n", + "mean of mcc : 0.6314347309502454\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), df_busan_gan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), df_busan_gan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), df_busan_gan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5530992219893435\n", + "mean of accuracy : 0.9121080461777744\n", + "mean of mcc : 0.687650733674605\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), df_incheon_gan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), df_incheon_gan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), df_incheon_gan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.406699786423904\n", + "mean of accuracy : 0.9667631476075221\n", + "mean of mcc : 0.5786232842762297\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), df_daegu_gan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), df_daegu_gan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), df_daegu_gan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4782405346339498\n", + "mean of accuracy : 0.9329755844998378\n", + "mean of mcc : 0.6261770938311793\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), df_daejeon_gan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), df_daejeon_gan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), df_daejeon_gan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4814166239972196\n", + "mean of accuracy : 0.9418691934692385\n", + "mean of mcc : 0.6302426323494177\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), df_gwangju_gan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), df_gwangju_gan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), df_gwangju_gan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **7천개**" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_busan.csv\")\n", + "df_seoul_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_seoul.csv\")\n", + "df_incheon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_incheon.csv\")\n", + "df_daegu_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daegu.csv\")\n", + "df_daejeon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daejeon.csv\")\n", + "df_gwangju_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_busan.csv\")\n", + "df_seoul_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_seoul.csv\")\n", + "df_incheon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_incheon.csv\")\n", + "df_daegu_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daegu.csv\")\n", + "df_daejeon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daejeon.csv\")\n", + "df_gwangju_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_busan.csv\")\n", + "df_seoul_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_seoul.csv\")\n", + "df_incheon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_incheon.csv\")\n", + "df_daegu_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daegu.csv\")\n", + "df_daejeon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daejeon.csv\")\n", + "df_gwangju_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_gwangju.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_gan7000_1= preprocessing(df_busan_gan7000_1).copy()\n", + "df_seoul_gan7000_1= preprocessing(df_seoul_gan7000_1).copy()\n", + "df_incheon_gan7000_1= preprocessing(df_incheon_gan7000_1).copy()\n", + "df_daegu_gan7000_1= preprocessing(df_daegu_gan7000_1).copy()\n", + "df_daejeon_gan7000_1= preprocessing(df_daejeon_gan7000_1).copy()\n", + "df_gwangju_gan7000_1= preprocessing(df_gwangju_gan7000_1).copy()\n", + "\n", + "df_busan_gan7000_2= preprocessing(df_busan_gan7000_2).copy()\n", + "df_seoul_gan7000_2= preprocessing(df_seoul_gan7000_2).copy()\n", + "df_incheon_gan7000_2= preprocessing(df_incheon_gan7000_2).copy()\n", + "df_daegu_gan7000_2= preprocessing(df_daegu_gan7000_2).copy()\n", + "df_daejeon_gan7000_2= preprocessing(df_daejeon_gan7000_2).copy()\n", + "df_gwangju_gan7000_2= preprocessing(df_gwangju_gan7000_2).copy()\n", + "\n", + "df_busan_gan7000_3= preprocessing(df_busan_gan7000_3).copy()\n", + "df_seoul_gan7000_3= preprocessing(df_seoul_gan7000_3).copy()\n", + "df_incheon_gan7000_3= preprocessing(df_incheon_gan7000_3).copy()\n", + "df_daegu_gan7000_3= preprocessing(df_daegu_gan7000_3).copy()\n", + "df_daejeon_gan7000_3= preprocessing(df_daejeon_gan7000_3).copy()\n", + "df_gwangju_gan7000_3= preprocessing(df_gwangju_gan7000_3).copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5483176293132692\n", + "mean of accuracy : 0.9431403922449285\n", + "mean of mcc : 0.6873897786091137\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), df_seoul_gan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), df_seoul_gan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), df_seoul_gan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4663740712722189\n", + "mean of accuracy : 0.9600495088621072\n", + "mean of mcc : 0.6344100074206912\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), df_busan_gan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), df_busan_gan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), df_busan_gan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5400648981230686\n", + "mean of accuracy : 0.9097125533348306\n", + "mean of mcc : 0.6776113780254353\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), df_incheon_gan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), df_incheon_gan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), df_incheon_gan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.426127621983083\n", + "mean of accuracy : 0.9688532907486422\n", + "mean of mcc : 0.5995660759623473\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), df_daegu_gan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), df_daegu_gan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), df_daegu_gan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4701451158455507\n", + "mean of accuracy : 0.9314935415990885\n", + "mean of mcc : 0.619110040349657\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), df_daejeon_gan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), df_daejeon_gan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), df_daejeon_gan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4860337424842999\n", + "mean of accuracy : 0.9426673445284495\n", + "mean of mcc : 0.6348033992505139\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), df_gwangju_gan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), df_gwangju_gan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), df_gwangju_gan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **SMOTENC+CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **7천개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_incheon.csv\")\n", + "df_daegu_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan7000_1 = preprocessing(df_busan_smotenc_ctgan7000_1).copy()\n", + "df_seoul_smotenc_ctgan7000_1 = preprocessing(df_seoul_smotenc_ctgan7000_1).copy()\n", + "df_incheon_smotenc_ctgan7000_1 = preprocessing(df_incheon_smotenc_ctgan7000_1).copy()\n", + "df_daegu_smotenc_ctgan7000_1 = preprocessing(df_daegu_smotenc_ctgan7000_1).copy()\n", + "df_daejeon_smotenc_ctgan7000_1 = preprocessing(df_daejeon_smotenc_ctgan7000_1).copy()\n", + "df_gwangju_smotenc_ctgan7000_1 = preprocessing(df_gwangju_smotenc_ctgan7000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan7000_2 = preprocessing(df_busan_smotenc_ctgan7000_2).copy()\n", + "df_seoul_smotenc_ctgan7000_2 = preprocessing(df_seoul_smotenc_ctgan7000_2).copy()\n", + "df_incheon_smotenc_ctgan7000_2 = preprocessing(df_incheon_smotenc_ctgan7000_2).copy()\n", + "df_daegu_smotenc_ctgan7000_2 = preprocessing(df_daegu_smotenc_ctgan7000_2).copy()\n", + "df_daejeon_smotenc_ctgan7000_2 = preprocessing(df_daejeon_smotenc_ctgan7000_2).copy()\n", + "df_gwangju_smotenc_ctgan7000_2 = preprocessing(df_gwangju_smotenc_ctgan7000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan7000_3 = preprocessing(df_busan_smotenc_ctgan7000_3).copy()\n", + "df_seoul_smotenc_ctgan7000_3 = preprocessing(df_seoul_smotenc_ctgan7000_3).copy()\n", + "df_incheon_smotenc_ctgan7000_3 = preprocessing(df_incheon_smotenc_ctgan7000_3).copy()\n", + "df_daegu_smotenc_ctgan7000_3 = preprocessing(df_daegu_smotenc_ctgan7000_3).copy()\n", + "df_daejeon_smotenc_ctgan7000_3 = preprocessing(df_daejeon_smotenc_ctgan7000_3).copy()\n", + "df_gwangju_smotenc_ctgan7000_3 = preprocessing(df_gwangju_smotenc_ctgan7000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5352598289152354\n", + "mean of accuracy : 0.9412404038891801\n", + "mean of mcc : 0.6782186947210392\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4485402157416473\n", + "mean of accuracy : 0.9570824080312065\n", + "mean of mcc : 0.6170303705969965\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5518512077229454\n", + "mean of accuracy : 0.9110783616538164\n", + "mean of mcc : 0.6864459032542548\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.40224349400243914\n", + "mean of accuracy : 0.9664241127496235\n", + "mean of mcc : 0.5832379476945067\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.476308593039838\n", + "mean of accuracy : 0.9321407332551505\n", + "mean of mcc : 0.625183787216149\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.47155191980971883\n", + "mean of accuracy : 0.9403094875697615\n", + "mean of mcc : 0.6237103634516713\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan7000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **1만개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan10000_1 = preprocessing(df_busan_smotenc_ctgan10000_1).copy()\n", + "df_seoul_smotenc_ctgan10000_1 = preprocessing(df_seoul_smotenc_ctgan10000_1).copy()\n", + "df_incheon_smotenc_ctgan10000_1 = preprocessing(df_incheon_smotenc_ctgan10000_1).copy()\n", + "df_daegu_smotenc_ctgan10000_1 = preprocessing(df_daegu_smotenc_ctgan10000_1).copy()\n", + "df_daejeon_smotenc_ctgan10000_1 = preprocessing(df_daejeon_smotenc_ctgan10000_1).copy()\n", + "df_gwangju_smotenc_ctgan10000_1 = preprocessing(df_gwangju_smotenc_ctgan10000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan10000_2 = preprocessing(df_busan_smotenc_ctgan10000_2).copy()\n", + "df_seoul_smotenc_ctgan10000_2 = preprocessing(df_seoul_smotenc_ctgan10000_2).copy()\n", + "df_incheon_smotenc_ctgan10000_2 = preprocessing(df_incheon_smotenc_ctgan10000_2).copy()\n", + "df_daegu_smotenc_ctgan10000_2 = preprocessing(df_daegu_smotenc_ctgan10000_2).copy()\n", + "df_daejeon_smotenc_ctgan10000_2 = preprocessing(df_daejeon_smotenc_ctgan10000_2).copy()\n", + "df_gwangju_smotenc_ctgan10000_2 = preprocessing(df_gwangju_smotenc_ctgan10000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan10000_3 = preprocessing(df_busan_smotenc_ctgan10000_3).copy()\n", + "df_seoul_smotenc_ctgan10000_3 = preprocessing(df_seoul_smotenc_ctgan10000_3).copy()\n", + "df_incheon_smotenc_ctgan10000_3 = preprocessing(df_incheon_smotenc_ctgan10000_3).copy()\n", + "df_daegu_smotenc_ctgan10000_3 = preprocessing(df_daegu_smotenc_ctgan10000_3).copy()\n", + "df_daejeon_smotenc_ctgan10000_3 = preprocessing(df_daejeon_smotenc_ctgan10000_3).copy()\n", + "df_gwangju_smotenc_ctgan10000_3 = preprocessing(df_gwangju_smotenc_ctgan10000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5391446462395447\n", + "mean of accuracy : 0.9419634911129409\n", + "mean of mcc : 0.6802382864465635\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.45629913656734233\n", + "mean of accuracy : 0.9579187148073292\n", + "mean of mcc : 0.624299022660101\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5474454733447011\n", + "mean of accuracy : 0.9102821859586961\n", + "mean of mcc : 0.6821514877761338\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4114169416612505\n", + "mean of accuracy : 0.9674109630627709\n", + "mean of mcc : 0.5898029433914993\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.46834204028253223\n", + "mean of accuracy : 0.9317592800525656\n", + "mean of mcc : 0.620788512419694\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4859514070622382\n", + "mean of accuracy : 0.9427799402816245\n", + "mean of mcc : 0.6384358903533097\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan10000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **2만개**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 Fold\n", + "df_busan_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_gwangju.csv\")\n", + "\n", + "# 2 Fold\n", + "df_busan_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_gwangju.csv\")\n", + "\n", + "# 3 Fold\n", + "df_busan_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_busan.csv\")\n", + "df_seoul_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_seoul.csv\")\n", + "df_incheon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_incheon.csv\")\n", + "df_daegu_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daegu.csv\")\n", + "df_daejeon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daejeon.csv\")\n", + "df_gwangju_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_gwangju.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "df_busan_smotenc_ctgan20000_1 = preprocessing(df_busan_smotenc_ctgan20000_1).copy()\n", + "df_seoul_smotenc_ctgan20000_1 = preprocessing(df_seoul_smotenc_ctgan20000_1).copy()\n", + "df_incheon_smotenc_ctgan20000_1 = preprocessing(df_incheon_smotenc_ctgan20000_1).copy()\n", + "df_daegu_smotenc_ctgan20000_1 = preprocessing(df_daegu_smotenc_ctgan20000_1).copy()\n", + "df_daejeon_smotenc_ctgan20000_1 = preprocessing(df_daejeon_smotenc_ctgan20000_1).copy()\n", + "df_gwangju_smotenc_ctgan20000_1 = preprocessing(df_gwangju_smotenc_ctgan20000_1).copy()\n", + "\n", + "df_busan_smotenc_ctgan20000_2 = preprocessing(df_busan_smotenc_ctgan20000_2).copy()\n", + "df_seoul_smotenc_ctgan20000_2 = preprocessing(df_seoul_smotenc_ctgan20000_2).copy()\n", + "df_incheon_smotenc_ctgan20000_2 = preprocessing(df_incheon_smotenc_ctgan20000_2).copy()\n", + "df_daegu_smotenc_ctgan20000_2 = preprocessing(df_daegu_smotenc_ctgan20000_2).copy()\n", + "df_daejeon_smotenc_ctgan20000_2 = preprocessing(df_daejeon_smotenc_ctgan20000_2).copy()\n", + "df_gwangju_smotenc_ctgan20000_2 = preprocessing(df_gwangju_smotenc_ctgan20000_2).copy()\n", + "\n", + "df_busan_smotenc_ctgan20000_3 = preprocessing(df_busan_smotenc_ctgan20000_3).copy()\n", + "df_seoul_smotenc_ctgan20000_3 = preprocessing(df_seoul_smotenc_ctgan20000_3).copy()\n", + "df_incheon_smotenc_ctgan20000_3 = preprocessing(df_incheon_smotenc_ctgan20000_3).copy()\n", + "df_daegu_smotenc_ctgan20000_3 = preprocessing(df_daegu_smotenc_ctgan20000_3).copy()\n", + "df_daejeon_smotenc_ctgan20000_3 = preprocessing(df_daejeon_smotenc_ctgan20000_3).copy()\n", + "df_gwangju_smotenc_ctgan20000_3 = preprocessing(df_gwangju_smotenc_ctgan20000_3).copy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **서울**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.5358813620881883\n", + "mean of accuracy : 0.9413917791584533\n", + "mean of mcc : 0.6789791066414157\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'seoul',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **부산**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4336268899145388\n", + "mean of accuracy : 0.9563998677545724\n", + "mean of mcc : 0.6080139285870266\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'busan',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **인천**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.564798264178871\n", + "mean of accuracy : 0.9092926325157406\n", + "mean of mcc : 0.68899875702518\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'incheon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대구**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.422336525695402\n", + "mean of accuracy : 0.963494335903386\n", + "mean of mcc : 0.5910759394692583\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daegu',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **대전**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4827377579875818\n", + "mean of accuracy : 0.9311888156964511\n", + "mean of mcc : 0.6279627682150313\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'daejeon',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **광주**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of csi : 0.4753475520930324\n", + "mean of accuracy : 0.9346923838277981\n", + "mean of mcc : 0.6188303813518945\n" + ] + } + ], + "source": [ + "csi = []\n", + "accuracy = []\n", + "mcc = []\n", + "\n", + "# Fold 1\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 2\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "# Fold 3\n", + "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", + "X_tr.drop(columns=['year'], inplace=True)\n", + "X_val.drop(columns=['year'], inplace=True)\n", + "\n", + "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", + "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", + "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", + "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", + "\n", + "\n", + "print(\"mean of csi : \", np.mean(csi))\n", + "print(\"mean of accuracy : \", np.mean(accuracy))\n", + "print(\"mean of mcc : \", np.mean(mcc))\n", + "\n", + "new_row = pd.DataFrame([{\n", + " 'region': 'gwangju',\n", + " 'model': 'LightGBM',\n", + " 'data_sample': 'smotenc_ctgan20000',\n", + " 'CSI': np.mean(csi),\n", + " 'MCC': np.mean(mcc),\n", + " 'Accuracy': np.mean(accuracy),\n", + " 'fold_csi': [csi]\n", + "\n", + "}])\n", + "\n", + "df = pd.concat([df, new_row], ignore_index=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regionmodeldata_sampleCSIMCCAccuracyfold_csi
0seoulLightGBMpure0.5050410.6469920.936174[[0.46595932802825235, 0.5771195097037204, 0.4...
1busanLightGBMpure0.4301880.6008010.956971[[0.32824427480911017, 0.4782608695651431, 0.4...
2incheonLightGBMpure0.5546630.6879510.911954[[0.4845292955891715, 0.6037628278220865, 0.57...
3daeguLightGBMpure0.2923400.4819890.956964[[0.28124999999994504, 0.3320537428022395, 0.2...
4daejeonLightGBMpure0.4784370.6252440.932748[[0.43333333333329205, 0.4547920433995972, 0.5...
5gwangjuLightGBMpure0.4827770.6368150.943236[[0.3928095872169916, 0.5461624026695722, 0.50...
6seoulLightGBMsmote0.5789390.7084990.939995[[0.4550682961897588, 0.6503831417623898, 0.63...
7busanLightGBMsmote0.4660210.6318800.950198[[0.4690909090908522, 0.47058823529405874, 0.4...
8incheonLightGBMsmote0.5835600.7061370.910464[[0.5613293051359177, 0.6213080168776044, 0.56...
9daeguLightGBMsmote0.4473540.6169210.963730[[0.3632567849686089, 0.49122807017536024, 0.4...
10daejeonLightGBMsmote0.5213350.6563210.930621[[0.4589041095890018, 0.5326514555467716, 0.57...
11gwangjuLightGBMsmote0.5227310.6604230.936850[[0.47532729103721294, 0.5410958904109059, 0.5...
12seoulLightGBMctgan200000.5430100.6780720.940934[[0.47165160230070075, 0.5903500473036338, 0.5...
13busanLightGBMctgan200000.4663460.6265200.957503[[0.4154262516914187, 0.49489051094883285, 0.4...
14incheonLightGBMctgan200000.5666810.6889340.907626[[0.5311667554608135, 0.6098117512834792, 0.55...
15daeguLightGBMctgan200000.4404190.6086250.967447[[0.46130952380938656, 0.4414784394249607, 0.4...
16daejeonLightGBMctgan200000.4808390.6258320.931760[[0.42667928098387636, 0.48797250859102337, 0....
17gwangjuLightGBMctgan200000.4937130.6373130.936783[[0.42775665399235474, 0.5447427293064268, 0.5...
18seoulLightGBMctgan100000.5489020.6865310.943140[[0.482333607230856, 0.5810397553516227, 0.583...
19busanLightGBMctgan100000.4676630.6314350.959516[[0.4084084084083471, 0.5208955223879819, 0.47...
20incheonLightGBMctgan100000.5530990.6876510.912108[[0.4707429322813629, 0.6094198736358064, 0.57...
21daeguLightGBMctgan100000.4067000.5786230.966763[[0.44943820224706477, 0.45661157024783955, 0....
22daejeonLightGBMctgan100000.4782410.6261770.932976[[0.43804034582128354, 0.4569356300996866, 0.5...
23gwangjuLightGBMctgan100000.4814170.6302430.941869[[0.38888888888883977, 0.53999999999994, 0.515...
24seoulLightGBMctgan70000.5483180.6873900.943140[[0.4815724815724421, 0.5754132231404364, 0.58...
25busanLightGBMctgan70000.4663740.6344100.960050[[0.3847328244274221, 0.5324074074073252, 0.48...
26incheonLightGBMctgan70000.5400650.6776110.909713[[0.45490716180368335, 0.6001144164759382, 0.5...
27daeguLightGBMctgan70000.4261280.5995660.968853[[0.4832826747718896, 0.4640657084187959, 0.33...
28daejeonLightGBMctgan70000.4701450.6191100.931494[[0.4214559386972776, 0.4623753399818257, 0.52...
29gwangjuLightGBMctgan70000.4860340.6348030.942667[[0.3902439024389743, 0.543429844097935, 0.524...
30seoulLightGBMsmotenc_ctgan70000.5352600.6782190.941240[[0.46849757673663417, 0.5743801652891969, 0.5...
31busanLightGBMsmotenc_ctgan70000.4485400.6170300.957082[[0.38415545590427735, 0.4850640113797318, 0.4...
32incheonLightGBMsmotenc_ctgan70000.5518510.6864460.911078[[0.4885695623774991, 0.6043577981651029, 0.56...
33daeguLightGBMsmotenc_ctgan70000.4022430.5832380.966424[[0.41432225063928024, 0.4475806451612001, 0.3...
34daejeonLightGBMsmotenc_ctgan70000.4763090.6251840.932141[[0.4230038022813286, 0.47079964061091906, 0.5...
35gwangjuLightGBMsmotenc_ctgan70000.4715520.6237100.940309[[0.37113402061850886, 0.5363735070574879, 0.5...
36seoulLightGBMsmotenc_ctgan100000.5391450.6802380.941963[[0.4607201309328592, 0.5864583333332722, 0.57...
37busanLightGBMsmotenc_ctgan100000.4562990.6242990.957919[[0.40412979351026485, 0.4999999999999277, 0.4...
38incheonLightGBMsmotenc_ctgan100000.5474450.6821510.910282[[0.4681967213114447, 0.6035067873302826, 0.57...
39daeguLightGBMsmotenc_ctgan100000.4114170.5898030.967411[[0.4368131868130668, 0.45213849287159835, 0.3...
40daejeonLightGBMsmotenc_ctgan100000.4683420.6207890.931759[[0.42665388302968105, 0.4723481414324141, 0.5...
41gwangjuLightGBMsmotenc_ctgan100000.4859510.6384360.942780[[0.3915343915343397, 0.5499451152578979, 0.51...
42seoulLightGBMsmotenc_ctgan200000.5358810.6789790.941392[[0.4706840390879095, 0.5655314757481357, 0.57...
43busanLightGBMsmotenc_ctgan200000.4336270.6080140.956400[[0.3399089529589772, 0.5086956521738393, 0.45...
44incheonLightGBMsmotenc_ctgan200000.5647980.6889990.909293[[0.5387685290763661, 0.5963821368004185, 0.55...
45daeguLightGBMsmotenc_ctgan200000.4223370.5910760.963494[[0.42962962962952356, 0.43951612903216947, 0....
46daejeonLightGBMsmotenc_ctgan200000.4827380.6279630.931189[[0.4281636536631372, 0.4917627677100089, 0.52...
47gwangjuLightGBMsmotenc_ctgan200000.4753480.6188300.934692[[0.3949903660885939, 0.5378704720087225, 0.49...
\n", + "
" + ], + "text/plain": [ + " region model data_sample CSI MCC Accuracy \\\n", + "0 seoul LightGBM pure 0.505041 0.646992 0.936174 \n", + "1 busan LightGBM pure 0.430188 0.600801 0.956971 \n", + "2 incheon LightGBM pure 0.554663 0.687951 0.911954 \n", + "3 daegu LightGBM pure 0.292340 0.481989 0.956964 \n", + "4 daejeon LightGBM pure 0.478437 0.625244 0.932748 \n", + "5 gwangju LightGBM pure 0.482777 0.636815 0.943236 \n", + "6 seoul LightGBM smote 0.578939 0.708499 0.939995 \n", + "7 busan LightGBM smote 0.466021 0.631880 0.950198 \n", + "8 incheon LightGBM smote 0.583560 0.706137 0.910464 \n", + "9 daegu LightGBM smote 0.447354 0.616921 0.963730 \n", + "10 daejeon LightGBM smote 0.521335 0.656321 0.930621 \n", + "11 gwangju LightGBM smote 0.522731 0.660423 0.936850 \n", + "12 seoul LightGBM ctgan20000 0.543010 0.678072 0.940934 \n", + "13 busan LightGBM ctgan20000 0.466346 0.626520 0.957503 \n", + "14 incheon LightGBM ctgan20000 0.566681 0.688934 0.907626 \n", + "15 daegu LightGBM ctgan20000 0.440419 0.608625 0.967447 \n", + "16 daejeon LightGBM ctgan20000 0.480839 0.625832 0.931760 \n", + "17 gwangju LightGBM ctgan20000 0.493713 0.637313 0.936783 \n", + "18 seoul LightGBM ctgan10000 0.548902 0.686531 0.943140 \n", + "19 busan LightGBM ctgan10000 0.467663 0.631435 0.959516 \n", + "20 incheon LightGBM ctgan10000 0.553099 0.687651 0.912108 \n", + "21 daegu LightGBM ctgan10000 0.406700 0.578623 0.966763 \n", + "22 daejeon LightGBM ctgan10000 0.478241 0.626177 0.932976 \n", + "23 gwangju LightGBM ctgan10000 0.481417 0.630243 0.941869 \n", + "24 seoul LightGBM ctgan7000 0.548318 0.687390 0.943140 \n", + "25 busan LightGBM ctgan7000 0.466374 0.634410 0.960050 \n", + "26 incheon LightGBM ctgan7000 0.540065 0.677611 0.909713 \n", + "27 daegu LightGBM ctgan7000 0.426128 0.599566 0.968853 \n", + "28 daejeon LightGBM ctgan7000 0.470145 0.619110 0.931494 \n", + "29 gwangju LightGBM ctgan7000 0.486034 0.634803 0.942667 \n", + "30 seoul LightGBM smotenc_ctgan7000 0.535260 0.678219 0.941240 \n", + "31 busan LightGBM smotenc_ctgan7000 0.448540 0.617030 0.957082 \n", + "32 incheon LightGBM smotenc_ctgan7000 0.551851 0.686446 0.911078 \n", + "33 daegu LightGBM smotenc_ctgan7000 0.402243 0.583238 0.966424 \n", + "34 daejeon LightGBM smotenc_ctgan7000 0.476309 0.625184 0.932141 \n", + "35 gwangju LightGBM smotenc_ctgan7000 0.471552 0.623710 0.940309 \n", + "36 seoul LightGBM smotenc_ctgan10000 0.539145 0.680238 0.941963 \n", + "37 busan LightGBM smotenc_ctgan10000 0.456299 0.624299 0.957919 \n", + "38 incheon LightGBM smotenc_ctgan10000 0.547445 0.682151 0.910282 \n", + "39 daegu LightGBM smotenc_ctgan10000 0.411417 0.589803 0.967411 \n", + "40 daejeon LightGBM smotenc_ctgan10000 0.468342 0.620789 0.931759 \n", + "41 gwangju LightGBM smotenc_ctgan10000 0.485951 0.638436 0.942780 \n", + "42 seoul LightGBM smotenc_ctgan20000 0.535881 0.678979 0.941392 \n", + "43 busan LightGBM smotenc_ctgan20000 0.433627 0.608014 0.956400 \n", + "44 incheon LightGBM smotenc_ctgan20000 0.564798 0.688999 0.909293 \n", + "45 daegu LightGBM smotenc_ctgan20000 0.422337 0.591076 0.963494 \n", + "46 daejeon LightGBM smotenc_ctgan20000 0.482738 0.627963 0.931189 \n", + "47 gwangju LightGBM smotenc_ctgan20000 0.475348 0.618830 0.934692 \n", + "\n", + " fold_csi \n", + "0 [[0.46595932802825235, 0.5771195097037204, 0.4... \n", + "1 [[0.32824427480911017, 0.4782608695651431, 0.4... \n", + "2 [[0.4845292955891715, 0.6037628278220865, 0.57... \n", + "3 [[0.28124999999994504, 0.3320537428022395, 0.2... \n", + "4 [[0.43333333333329205, 0.4547920433995972, 0.5... \n", + "5 [[0.3928095872169916, 0.5461624026695722, 0.50... \n", + "6 [[0.4550682961897588, 0.6503831417623898, 0.63... \n", + "7 [[0.4690909090908522, 0.47058823529405874, 0.4... \n", + "8 [[0.5613293051359177, 0.6213080168776044, 0.56... \n", + "9 [[0.3632567849686089, 0.49122807017536024, 0.4... \n", + "10 [[0.4589041095890018, 0.5326514555467716, 0.57... \n", + "11 [[0.47532729103721294, 0.5410958904109059, 0.5... \n", + "12 [[0.47165160230070075, 0.5903500473036338, 0.5... \n", + "13 [[0.4154262516914187, 0.49489051094883285, 0.4... \n", + "14 [[0.5311667554608135, 0.6098117512834792, 0.55... \n", + "15 [[0.46130952380938656, 0.4414784394249607, 0.4... \n", + "16 [[0.42667928098387636, 0.48797250859102337, 0.... \n", + "17 [[0.42775665399235474, 0.5447427293064268, 0.5... \n", + "18 [[0.482333607230856, 0.5810397553516227, 0.583... \n", + "19 [[0.4084084084083471, 0.5208955223879819, 0.47... \n", + "20 [[0.4707429322813629, 0.6094198736358064, 0.57... \n", + "21 [[0.44943820224706477, 0.45661157024783955, 0.... \n", + "22 [[0.43804034582128354, 0.4569356300996866, 0.5... \n", + "23 [[0.38888888888883977, 0.53999999999994, 0.515... \n", + "24 [[0.4815724815724421, 0.5754132231404364, 0.58... \n", + "25 [[0.3847328244274221, 0.5324074074073252, 0.48... \n", + "26 [[0.45490716180368335, 0.6001144164759382, 0.5... \n", + "27 [[0.4832826747718896, 0.4640657084187959, 0.33... \n", + "28 [[0.4214559386972776, 0.4623753399818257, 0.52... \n", + "29 [[0.3902439024389743, 0.543429844097935, 0.524... \n", + "30 [[0.46849757673663417, 0.5743801652891969, 0.5... \n", + "31 [[0.38415545590427735, 0.4850640113797318, 0.4... \n", + "32 [[0.4885695623774991, 0.6043577981651029, 0.56... \n", + "33 [[0.41432225063928024, 0.4475806451612001, 0.3... \n", + "34 [[0.4230038022813286, 0.47079964061091906, 0.5... \n", + "35 [[0.37113402061850886, 0.5363735070574879, 0.5... \n", + "36 [[0.4607201309328592, 0.5864583333332722, 0.57... \n", + "37 [[0.40412979351026485, 0.4999999999999277, 0.4... \n", + "38 [[0.4681967213114447, 0.6035067873302826, 0.57... \n", + "39 [[0.4368131868130668, 0.45213849287159835, 0.3... \n", + "40 [[0.42665388302968105, 0.4723481414324141, 0.5... \n", + "41 [[0.3915343915343397, 0.5499451152578979, 0.51... \n", + "42 [[0.4706840390879095, 0.5655314757481357, 0.57... \n", + "43 [[0.3399089529589772, 0.5086956521738393, 0.45... \n", + "44 [[0.5387685290763661, 0.5963821368004185, 0.55... \n", + "45 [[0.42962962962952356, 0.43951612903216947, 0.... \n", + "46 [[0.4281636536631372, 0.4917627677100089, 0.52... \n", + "47 [[0.3949903660885939, 0.5378704720087225, 0.49... " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 개별 모델 결과 저장\n", + "df.to_csv(\"../../data/oversampled_data_test_for_model/lightgbm_sampled_data_test.csv\", index=False)\n", + "\n", + "df.to_csv(\"../../data/oversampled_data_test_for_model/combined_sampled_data_test.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}