# Columns returned by preprocessing(), in output order.
_MODEL_COLUMNS = [
    'temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',
    'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',
    'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',
    'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',
    'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',
    'month_sin', 'month_cos', 'multi_class',
]

# Non-numeric marker found in the raw 'wind_dir' column; it is mapped to 0.
# NOTE(review): presumably the Korean label for "calm" (no wind) -- confirm
# against the data dictionary.
_CALM_WIND_LABEL = '정온'


def _add_derived_inplace(df):
    """Write the derived columns into *df* itself and return it.

    Private helper: it mutates its argument so that callers which already
    own a private copy do not pay for a second one.
    """
    hour_angle = 2 * np.pi * df['hour'] / 24
    month_angle = 2 * np.pi * df['month'] / 12
    # Insertion order is kept identical to the original implementation.
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)
    df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
    return df


def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Restore the derived variables that had been removed from the data.

    Adds sine/cosine encodings of ``hour`` (period 24) and ``month``
    (period 12) plus the difference ``groundtemp - temp_C``.

    Args:
        df: Frame with ``hour``, ``month``, ``groundtemp`` and ``temp_C``
            columns. It is not modified.

    Returns:
        A copy of ``df`` with the five derived columns appended.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    return _add_derived_inplace(df.copy())


def preprocessing(df):
    """Cast, enrich and column-filter one raw city frame.

    Steps, in order: cast ``year``/``month``/``hour`` to int, add the derived
    features, cast ``multi_class`` to int, replace the non-numeric calm
    marker in ``wind_dir`` with ``"0"`` and cast that column to int, then
    keep only the columns in ``_MODEL_COLUMNS``.

    Args:
        df: Raw frame as read from CSV. It is not modified.

    Returns:
        A new, independent DataFrame holding exactly ``_MODEL_COLUMNS``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a column cannot be cast to int.
    """
    # One private copy up front; everything below works on it in place.
    df = df.copy()
    for calendar_col in ('year', 'month', 'hour'):
        df[calendar_col] = df[calendar_col].astype('int')

    _add_derived_inplace(df)

    df['multi_class'] = df['multi_class'].astype('int')
    df.loc[df['wind_dir'] == _CALM_WIND_LABEL, 'wind_dir'] = "0"
    df['wind_dir'] = df['wind_dir'].astype('int')

    # The trailing copy detaches the selection from the working frame, so
    # callers can assign into the result without chained-assignment warnings.
    return df[_MODEL_COLUMNS].copy()
# Label convention used by the CSI score below: classes 0 and 1 are scored
# as "event" classes, class 2 is the remaining (non-event) class.
_EVENT_CLASSES = (0, 1)
_NON_EVENT_CLASS = 2


def _labels_from_scores(y_true, scores):
    """Convert per-class scores into predicted class ids (row-wise argmax).

    Accepts the 2-D ``(n_samples, n_classes)`` layout as well as a flat 1-D
    vector.
    NOTE(review): the 1-D branch assumes the class-major layout that the
    LightGBM documentation describes for releases before 4.0
    (``scores[j * n_samples + i]``) -- confirm against the installed version.
    """
    scores = np.asarray(scores)
    if scores.ndim == 1:
        scores = scores.reshape(-1, len(y_true)).T
    return np.argmax(scores, axis=1)


def calculate_csi(Y_test, pred):
    """Return the CSI score ``H / (H + F + M)`` for a 3-class prediction.

    * H (hits): samples of class 0 or 1 that were predicted correctly.
    * F (false alarms): samples predicted as 0 or 1 whose prediction is wrong.
    * M (misses): samples of class 0 or 1 predicted as class 2.

    The counts are taken directly from the label vectors rather than by
    indexing a confusion matrix. A matrix built without a fixed label list
    shrinks when a class is absent from both vectors, and the original
    fixed ``cm[2, 0]``-style indexing then raised ``IndexError``.

    Args:
        Y_test: True class ids (array-like; values expected in {0, 1, 2}).
        pred: Predicted class ids, same length as ``Y_test``.

    Returns:
        The score as a float; 0.0 when there are no hits.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    truth = np.asarray(Y_test).ravel()
    guess = np.asarray(pred).ravel()
    if truth.shape != guess.shape:
        raise ValueError(
            f"Y_test and pred differ in length: {truth.size} vs {guess.size}"
        )

    truth_is_event = np.isin(truth, _EVENT_CLASSES)
    guess_is_event = np.isin(guess, _EVENT_CLASSES)
    correct = truth == guess

    H = np.count_nonzero(truth_is_event & correct)
    F = np.count_nonzero(guess_is_event & ~correct)
    M = np.count_nonzero(truth_is_event & (guess == _NON_EVENT_CLASS))

    # The small epsilon keeps the ratio defined when H + F + M == 0.
    return H / (H + F + M + 1e-10)


def eval_metric_csi(y_true, pred_prob):
    """Return the negated CSI of the argmax prediction (lower is better)."""
    return -1 * calculate_csi(y_true, _labels_from_scores(y_true, pred_prob))


def multiclass_mcc(y_val, y_pred):
    """Return the Matthews correlation coefficient.

    sklearn's ``matthews_corrcoef`` can be used as-is for multi-class labels.
    """
    return matthews_corrcoef(y_val, y_pred)


# Custom evaluation metric passed as ``eval_metric`` to ``lgb_model.fit``.
def csi_metric(y_true, pred):
    """Evaluation callback returning ``('CSI', score, True)``.

    The third element marks the metric as higher-is-better.
    """
    score = calculate_csi(y_true, _labels_from_scores(y_true, pred))
    return 'CSI', score, True  # higher_better=True
import numpy as np
from sklearn.metrics import accuracy_score

# Year held out for validation in fold 1, 2 and 3 respectively. The order
# matters: it fixes the order of the per-fold scores stored in 'fold_csi'.
HOLDOUT_YEARS = (2020, 2019, 2018)

SMOTE_DIR = "../../data/data_oversampled/smote"


def year_holdout_split(train_df, val_df, holdout_year, years=HOLDOUT_YEARS):
    """Build one leave-one-year-out split.

    Training rows come from ``train_df`` for every year in ``years`` except
    ``holdout_year``; validation rows come from ``val_df`` for
    ``holdout_year``. The ``year`` and ``multi_class`` columns are removed
    from both feature frames.

    Returns:
        ``(X_tr, X_val, Y_tr, Y_val)``.
    """
    train_years = [year for year in years if year != holdout_year]
    train_rows = train_df[train_df['year'].isin(train_years)]
    val_rows = val_df[val_df['year'] == holdout_year]
    not_features = ['multi_class', 'year']
    return (
        train_rows.drop(columns=not_features),
        val_rows.drop(columns=not_features),
        train_rows['multi_class'],
        val_rows['multi_class'],
    )


def evaluate_region(model, region, data_sample, train_frames, val_df,
                    model_name='LightGBM'):
    """Run the three-fold year hold-out evaluation for one region.

    Args:
        model: Estimator exposing ``fit(X, y, eval_set=..., eval_metric=...)``
            and ``predict(X)``. It is refitted from scratch for every fold.
        region: Value stored in the ``region`` column of the result row.
        data_sample: Value stored in the ``data_sample`` column.
        train_frames: One training frame per fold, aligned with
            ``HOLDOUT_YEARS``. Pass the same frame three times when training
            and validation data share a source.
        val_df: Frame the validation rows are taken from.
        model_name: Value stored in the ``model`` column.

    Returns:
        A one-row DataFrame with the columns region, model, data_sample, CSI,
        MCC, Accuracy and fold_csi.
    """
    csi, accuracy, mcc = [], [], []
    for holdout_year, train_df in zip(HOLDOUT_YEARS, train_frames):
        X_tr, X_val, Y_tr, Y_val = year_holdout_split(
            train_df, val_df, holdout_year
        )
        model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)
        # Predict once per fold and reuse the labels for all three scores.
        y_hat = model.predict(X_val)
        csi.append(calculate_csi(Y_val, y_hat))
        accuracy.append(accuracy_score(Y_val, y_hat))
        mcc.append(multiclass_mcc(Y_val, y_hat))

    print(f"[{region} / {data_sample}]")
    print("mean of csi : ", np.mean(csi))
    print("mean of accuracy : ", np.mean(accuracy))
    print("mean of mcc : ", np.mean(mcc))

    return pd.DataFrame([{
        'region': region,
        'model': model_name,
        'data_sample': data_sample,
        'CSI': np.mean(csi),
        'MCC': np.mean(mcc),
        'Accuracy': np.mean(accuracy),
        # NOTE(review): kept wrapped in an outer list exactly as before, so
        # the cell holds [[fold1, fold2, fold3]]; anything reading this
        # column may rely on that shape -- confirm before flattening.
        'fold_csi': [csi],
    }])


def load_smote_folds(city, root=SMOTE_DIR):
    """Read and preprocess the three SMOTE files (folds 1-3) of one city."""
    return [
        preprocessing(pd.read_csv(f"{root}/smote_{fold}_{city}.csv"))
        for fold in (1, 2, 3)
    ]


# --- Baseline: train and validate on the original (non-oversampled) data ---
# Regions are evaluated in the same order as the original notebook cells.
pure_frames = {
    'seoul': df_seoul,
    'busan': df_busan,
    'incheon': df_incheon,
    'daegu': df_daegu,
    'daejeon': df_daejeon,
    'gwangju': df_gwangju,
}
for region, frame in pure_frames.items():
    new_row = evaluate_region(lgb_model, region, 'pure', [frame] * 3, frame)
    df = pd.concat([df, new_row], ignore_index=True)

# --- Performance on the data sets augmented with SMOTE ---
# The module-level names are kept because later cells refer to them directly.
df_smote_busan_1, df_smote_busan_2, df_smote_busan_3 = load_smote_folds('busan')
df_smote_seoul_1, df_smote_seoul_2, df_smote_seoul_3 = load_smote_folds('seoul')
df_smote_daegu_1, df_smote_daegu_2, df_smote_daegu_3 = load_smote_folds('daegu')
df_smote_daejeon_1, df_smote_daejeon_2, df_smote_daejeon_3 = load_smote_folds('daejeon')
df_smote_gwangju_1, df_smote_gwangju_2, df_smote_gwangju_3 = load_smote_folds('gwangju')
df_smote_incheon_1, df_smote_incheon_2, df_smote_incheon_3 = load_smote_folds('incheon')

# List for storing F1 scores before SMOTE is applied.
# NOTE(review): comment translated as written; nothing in the visible cells
# appends to this list -- confirm whether it is still used.
smote_oversample = []
"Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", - " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", - " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", - " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", - " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", - " dtype='object')" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_smote_seoul_1.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", - " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", - " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", - " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", - " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", - " dtype='object')" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_seoul.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5789392155070722\n", - "mean of accuracy : 0.9399950927797324\n", - "mean of mcc : 0.7084991639282849\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = 
df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), df_smote_seoul_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), df_smote_seoul_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), df_smote_seoul_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - 
"lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46602091302575205\n", - "mean of accuracy : 0.9501977443421413\n", - "mean of mcc : 0.6318799598547477\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), df_smote_busan_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], 
inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), df_smote_busan_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), df_smote_busan_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - 
"print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.583559682649986\n", - "mean of accuracy : 0.9104636075554557\n", - "mean of mcc : 0.7061374111787998\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), df_smote_incheon_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, 
lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), df_smote_incheon_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), df_smote_incheon_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 
'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.44735416496514874\n", - "mean of accuracy : 0.96373033992897\n", - "mean of mcc : 0.6169211806368756\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), df_smote_daegu_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), df_smote_daegu_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], 
df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), df_smote_daegu_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5213346054616135\n", - "mean of accuracy : 0.9306212624032071\n", - "mean of mcc : 0.6563209583230294\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), df_smote_daejeon_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), df_smote_daejeon_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], 
eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), df_smote_daejeon_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5227311295096367\n", - "mean of accuracy : 0.9368502091806605\n", - "mean of mcc : 0.6604233380668852\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection 
import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), df_smote_gwangju_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), df_smote_gwangju_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = 
df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), df_smote_gwangju_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv\")\n", - "df_seoul_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv\")\n", - "df_incheon_gan20000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv\")\n", - "df_daegu_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv\")\n", - "df_daejeon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv\")\n", - "df_seoul_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv\")\n", - "df_incheon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv\")\n", - "df_daegu_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv\")\n", - "df_daejeon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv\")\n", - "df_seoul_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv\")\n", - "df_incheon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv\")\n", - "df_daegu_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv\")\n", - "df_daejeon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan20000_1= preprocessing(df_busan_gan20000_1).copy()\n", - "df_seoul_gan20000_1= 
preprocessing(df_seoul_gan20000_1).copy()\n", - "df_incheon_gan20000_1= preprocessing(df_incheon_gan20000_1).copy()\n", - "df_daegu_gan20000_1= preprocessing(df_daegu_gan20000_1).copy()\n", - "df_daejeon_gan20000_1= preprocessing(df_daejeon_gan20000_1).copy()\n", - "df_gwangju_gan20000_1= preprocessing(df_gwangju_gan20000_1).copy()\n", - "\n", - "df_busan_gan20000_2= preprocessing(df_busan_gan20000_2).copy()\n", - "df_seoul_gan20000_2= preprocessing(df_seoul_gan20000_2).copy()\n", - "df_incheon_gan20000_2= preprocessing(df_incheon_gan20000_2).copy()\n", - "df_daegu_gan20000_2= preprocessing(df_daegu_gan20000_2).copy()\n", - "df_daejeon_gan20000_2= preprocessing(df_daejeon_gan20000_2).copy()\n", - "df_gwangju_gan20000_2= preprocessing(df_gwangju_gan20000_2).copy()\n", - "\n", - "df_busan_gan20000_3= preprocessing(df_busan_gan20000_3).copy()\n", - "df_seoul_gan20000_3= preprocessing(df_seoul_gan20000_3).copy()\n", - "df_incheon_gan20000_3= preprocessing(df_incheon_gan20000_3).copy()\n", - "df_daegu_gan20000_3= preprocessing(df_daegu_gan20000_3).copy()\n", - "df_daejeon_gan20000_3= preprocessing(df_daejeon_gan20000_3).copy()\n", - "df_gwangju_gan20000_3= preprocessing(df_gwangju_gan20000_3).copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5430102117038431\n", - "mean of accuracy : 0.9409344303881695\n", - "mean of mcc : 0.6780719447285347\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), df_seoul_gan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns 
!= 'multi_class'], df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), df_seoul_gan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), df_seoul_gan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, 
lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4663461890504854\n", - "mean of accuracy : 0.9575030566160141\n", - "mean of mcc : 0.6265195697208686\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), df_busan_gan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = 
df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), df_busan_gan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), df_busan_gan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, 
new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5666805477334921\n", - "mean of accuracy : 0.9076257371227054\n", - "mean of mcc : 0.6889342293705883\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), df_incheon_gan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), df_incheon_gan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), df_incheon_gan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.44041946056823505\n", - "mean of accuracy : 0.9674465196164052\n", - "mean of mcc : 0.6086254895296773\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = 
df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), df_daegu_gan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), df_daegu_gan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), df_daegu_gan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - 
"X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4808393438710248\n", - "mean of accuracy : 0.9317601117848143\n", - "mean of mcc : 0.6258321334411245\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), df_daejeon_gan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), df_daejeon_gan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), df_daejeon_gan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 
'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4937131678363705\n", - "mean of accuracy : 0.9367825269689182\n", - "mean of mcc : 0.6373125823908727\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), df_gwangju_gan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), df_gwangju_gan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - 
"X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), df_gwangju_gan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv\")\n", - "df_seoul_gan10000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv\")\n", - "df_incheon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv\")\n", - "df_daegu_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv\")\n", - "df_daejeon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv\")\n", - "df_gwangju_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv\")\n", - "df_seoul_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv\")\n", - "df_incheon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv\")\n", - "df_daegu_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv\")\n", - "df_daejeon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv\")\n", - "df_seoul_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv\")\n", - "df_incheon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv\")\n", - "df_daegu_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv\")\n", - "df_daejeon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - 
"df_busan_gan10000_1= preprocessing(df_busan_gan10000_1).copy()\n", - "df_seoul_gan10000_1= preprocessing(df_seoul_gan10000_1).copy()\n", - "df_incheon_gan10000_1= preprocessing(df_incheon_gan10000_1).copy()\n", - "df_daegu_gan10000_1= preprocessing(df_daegu_gan10000_1).copy()\n", - "df_daejeon_gan10000_1= preprocessing(df_daejeon_gan10000_1).copy()\n", - "df_gwangju_gan10000_1= preprocessing(df_gwangju_gan10000_1).copy()\n", - "\n", - "df_busan_gan10000_2= preprocessing(df_busan_gan10000_2).copy()\n", - "df_seoul_gan10000_2= preprocessing(df_seoul_gan10000_2).copy()\n", - "df_incheon_gan10000_2= preprocessing(df_incheon_gan10000_2).copy()\n", - "df_daegu_gan10000_2= preprocessing(df_daegu_gan10000_2).copy()\n", - "df_daejeon_gan10000_2= preprocessing(df_daejeon_gan10000_2).copy()\n", - "df_gwangju_gan10000_2= preprocessing(df_gwangju_gan10000_2).copy()\n", - "\n", - "df_busan_gan10000_3= preprocessing(df_busan_gan10000_3).copy()\n", - "df_seoul_gan10000_3= preprocessing(df_seoul_gan10000_3).copy()\n", - "df_incheon_gan10000_3= preprocessing(df_incheon_gan10000_3).copy()\n", - "df_daegu_gan10000_3= preprocessing(df_daegu_gan10000_3).copy()\n", - "df_daejeon_gan10000_3= preprocessing(df_daejeon_gan10000_3).copy()\n", - "df_gwangju_gan10000_3= preprocessing(df_gwangju_gan10000_3).copy()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5489022319719195\n", - "mean of accuracy : 0.9431400803453353\n", - "mean of mcc : 0.6865310613747596\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), df_seoul_gan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 
'multi_class'], df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), df_seoul_gan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), df_seoul_gan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", 
- "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4676627137741911\n", - "mean of accuracy : 0.959515848658\n", - "mean of mcc : 0.6314347309502454\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), df_busan_gan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 
2020]), df_busan_gan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), df_busan_gan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5530992219893435\n", - "mean of accuracy : 0.9121080461777744\n", - "mean of mcc : 0.687650733674605\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), df_incheon_gan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), df_incheon_gan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - 
"mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), df_incheon_gan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.406699786423904\n", - "mean of accuracy : 0.9667631476075221\n", - "mean of mcc : 0.5786232842762297\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), df_daegu_gan10000_1.columns != 
'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), df_daegu_gan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), df_daegu_gan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], 
eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4782405346339498\n", - "mean of accuracy : 0.9329755844998378\n", - "mean of mcc : 0.6261770938311793\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), df_daejeon_gan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - 
"\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), df_daejeon_gan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), df_daejeon_gan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': 
np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4814166239972196\n", - "mean of accuracy : 0.9418691934692385\n", - "mean of mcc : 0.6302426323494177\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), df_gwangju_gan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), df_gwangju_gan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - 
"csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), df_gwangju_gan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_busan.csv\")\n", - "df_seoul_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_seoul.csv\")\n", - "df_incheon_gan7000_1= 
pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_incheon.csv\")\n", - "df_daegu_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daegu.csv\")\n", - "df_daejeon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_busan.csv\")\n", - "df_seoul_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_seoul.csv\")\n", - "df_incheon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_incheon.csv\")\n", - "df_daegu_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daegu.csv\")\n", - "df_daejeon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_busan.csv\")\n", - "df_seoul_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_seoul.csv\")\n", - "df_incheon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_incheon.csv\")\n", - "df_daegu_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daegu.csv\")\n", - "df_daejeon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan7000_1= preprocessing(df_busan_gan7000_1).copy()\n", - "df_seoul_gan7000_1= preprocessing(df_seoul_gan7000_1).copy()\n", - "df_incheon_gan7000_1= 
preprocessing(df_incheon_gan7000_1).copy()\n", - "df_daegu_gan7000_1= preprocessing(df_daegu_gan7000_1).copy()\n", - "df_daejeon_gan7000_1= preprocessing(df_daejeon_gan7000_1).copy()\n", - "df_gwangju_gan7000_1= preprocessing(df_gwangju_gan7000_1).copy()\n", - "\n", - "df_busan_gan7000_2= preprocessing(df_busan_gan7000_2).copy()\n", - "df_seoul_gan7000_2= preprocessing(df_seoul_gan7000_2).copy()\n", - "df_incheon_gan7000_2= preprocessing(df_incheon_gan7000_2).copy()\n", - "df_daegu_gan7000_2= preprocessing(df_daegu_gan7000_2).copy()\n", - "df_daejeon_gan7000_2= preprocessing(df_daejeon_gan7000_2).copy()\n", - "df_gwangju_gan7000_2= preprocessing(df_gwangju_gan7000_2).copy()\n", - "\n", - "df_busan_gan7000_3= preprocessing(df_busan_gan7000_3).copy()\n", - "df_seoul_gan7000_3= preprocessing(df_seoul_gan7000_3).copy()\n", - "df_incheon_gan7000_3= preprocessing(df_incheon_gan7000_3).copy()\n", - "df_daegu_gan7000_3= preprocessing(df_daegu_gan7000_3).copy()\n", - "df_daejeon_gan7000_3= preprocessing(df_daejeon_gan7000_3).copy()\n", - "df_gwangju_gan7000_3= preprocessing(df_gwangju_gan7000_3).copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5483176293132692\n", - "mean of accuracy : 0.9431403922449285\n", - "mean of mcc : 0.6873897786091137\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), df_seoul_gan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - 
"X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), df_seoul_gan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), df_seoul_gan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of 
accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4663740712722189\n", - "mean of accuracy : 0.9600495088621072\n", - "mean of mcc : 0.6344100074206912\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), df_busan_gan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), df_busan_gan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], 
df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), df_busan_gan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of 
csi : 0.5400648981230686\n", - "mean of accuracy : 0.9097125533348306\n", - "mean of mcc : 0.6776113780254353\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), df_incheon_gan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), df_incheon_gan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), df_incheon_gan7000_3.columns != 'multi_class'], 
df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.426127621983083\n", - "mean of accuracy : 0.9688532907486422\n", - "mean of mcc : 0.5995660759623473\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), df_daegu_gan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - 
"X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), df_daegu_gan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), df_daegu_gan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of 
csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4701451158455507\n", - "mean of accuracy : 0.9314935415990885\n", - "mean of mcc : 0.619110040349657\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), df_daejeon_gan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), df_daejeon_gan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], 
df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), df_daejeon_gan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - 
"execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4860337424842999\n", - "mean of accuracy : 0.9426673445284495\n", - "mean of mcc : 0.6348033992505139\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), df_gwangju_gan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), df_gwangju_gan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = 
df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), df_gwangju_gan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **SMOTENC+CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_incheon.csv\")\n", - 
"df_daegu_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_3 = 
pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan7000_1 = preprocessing(df_busan_smotenc_ctgan7000_1).copy()\n", - "df_seoul_smotenc_ctgan7000_1 = preprocessing(df_seoul_smotenc_ctgan7000_1).copy()\n", - "df_incheon_smotenc_ctgan7000_1 = preprocessing(df_incheon_smotenc_ctgan7000_1).copy()\n", - "df_daegu_smotenc_ctgan7000_1 = preprocessing(df_daegu_smotenc_ctgan7000_1).copy()\n", - "df_daejeon_smotenc_ctgan7000_1 = preprocessing(df_daejeon_smotenc_ctgan7000_1).copy()\n", - "df_gwangju_smotenc_ctgan7000_1 = preprocessing(df_gwangju_smotenc_ctgan7000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_2 = preprocessing(df_busan_smotenc_ctgan7000_2).copy()\n", - "df_seoul_smotenc_ctgan7000_2 = preprocessing(df_seoul_smotenc_ctgan7000_2).copy()\n", - "df_incheon_smotenc_ctgan7000_2 = preprocessing(df_incheon_smotenc_ctgan7000_2).copy()\n", - "df_daegu_smotenc_ctgan7000_2 = preprocessing(df_daegu_smotenc_ctgan7000_2).copy()\n", - "df_daejeon_smotenc_ctgan7000_2 = preprocessing(df_daejeon_smotenc_ctgan7000_2).copy()\n", - "df_gwangju_smotenc_ctgan7000_2 = preprocessing(df_gwangju_smotenc_ctgan7000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_3 = preprocessing(df_busan_smotenc_ctgan7000_3).copy()\n", - "df_seoul_smotenc_ctgan7000_3 = preprocessing(df_seoul_smotenc_ctgan7000_3).copy()\n", - "df_incheon_smotenc_ctgan7000_3 = preprocessing(df_incheon_smotenc_ctgan7000_3).copy()\n", - "df_daegu_smotenc_ctgan7000_3 = preprocessing(df_daegu_smotenc_ctgan7000_3).copy()\n", - "df_daejeon_smotenc_ctgan7000_3 = preprocessing(df_daejeon_smotenc_ctgan7000_3).copy()\n", - "df_gwangju_smotenc_ctgan7000_3 = preprocessing(df_gwangju_smotenc_ctgan7000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5352598289152354\n", - "mean of accuracy : 0.9412404038891801\n", - "mean of mcc : 0.6782186947210392\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - 
"\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4485402157416473\n", - "mean of accuracy : 0.9570824080312065\n", - "mean of mcc : 0.6170303705969965\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan7000_1.columns != 'multi_class'], 
df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], 
inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5518512077229454\n", - "mean of accuracy : 0.9110783616538164\n", - "mean of mcc : 0.6864459032542548\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", 
np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.40224349400243914\n", - "mean of accuracy : 0.9664241127496235\n", - "mean of mcc : 0.5832379476945067\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], 
df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] 
- }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.476308593039838\n", - "mean of accuracy : 0.9321407332551505\n", - "mean of mcc : 0.625183787216149\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - 
"mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.47155191980971883\n", - "mean of accuracy : 0.9403094875697615\n", - "mean of mcc : 0.6237103634516713\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = 
df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], 
df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daejeon.csv\")\n", - 
"df_gwangju_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan10000_1 = 
preprocessing(df_busan_smotenc_ctgan10000_1).copy()\n", - "df_seoul_smotenc_ctgan10000_1 = preprocessing(df_seoul_smotenc_ctgan10000_1).copy()\n", - "df_incheon_smotenc_ctgan10000_1 = preprocessing(df_incheon_smotenc_ctgan10000_1).copy()\n", - "df_daegu_smotenc_ctgan10000_1 = preprocessing(df_daegu_smotenc_ctgan10000_1).copy()\n", - "df_daejeon_smotenc_ctgan10000_1 = preprocessing(df_daejeon_smotenc_ctgan10000_1).copy()\n", - "df_gwangju_smotenc_ctgan10000_1 = preprocessing(df_gwangju_smotenc_ctgan10000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_2 = preprocessing(df_busan_smotenc_ctgan10000_2).copy()\n", - "df_seoul_smotenc_ctgan10000_2 = preprocessing(df_seoul_smotenc_ctgan10000_2).copy()\n", - "df_incheon_smotenc_ctgan10000_2 = preprocessing(df_incheon_smotenc_ctgan10000_2).copy()\n", - "df_daegu_smotenc_ctgan10000_2 = preprocessing(df_daegu_smotenc_ctgan10000_2).copy()\n", - "df_daejeon_smotenc_ctgan10000_2 = preprocessing(df_daejeon_smotenc_ctgan10000_2).copy()\n", - "df_gwangju_smotenc_ctgan10000_2 = preprocessing(df_gwangju_smotenc_ctgan10000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_3 = preprocessing(df_busan_smotenc_ctgan10000_3).copy()\n", - "df_seoul_smotenc_ctgan10000_3 = preprocessing(df_seoul_smotenc_ctgan10000_3).copy()\n", - "df_incheon_smotenc_ctgan10000_3 = preprocessing(df_incheon_smotenc_ctgan10000_3).copy()\n", - "df_daegu_smotenc_ctgan10000_3 = preprocessing(df_daegu_smotenc_ctgan10000_3).copy()\n", - "df_daejeon_smotenc_ctgan10000_3 = preprocessing(df_daejeon_smotenc_ctgan10000_3).copy()\n", - "df_gwangju_smotenc_ctgan10000_3 = preprocessing(df_gwangju_smotenc_ctgan10000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5391446462395447\n", - "mean of accuracy : 0.9419634911129409\n", - "mean of mcc 
: 0.6802382864465635\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan10000_3.columns != 'multi_class'], 
df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.45629913656734233\n", - "mean of accuracy : 0.9579187148073292\n", - "mean of mcc : 0.624299022660101\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], 
df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5474454733447011\n", - "mean of accuracy : 0.9102821859586961\n", - "mean of mcc : 0.6821514877761338\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, 
Y_tr, Y_val = df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 
'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4114169416612505\n", - "mean of accuracy : 0.9674109630627709\n", - "mean of mcc : 0.5898029433914993\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - 
"X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46834204028253223\n", - "mean of 
accuracy : 0.9317592800525656\n", - "mean of mcc : 0.620788512419694\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = 
df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4859514070622382\n", - "mean of accuracy : 0.9427799402816245\n", - "mean of mcc : 0.6384358903533097\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan10000_1.columns != 'multi_class'], 
df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - 
"X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan20000_2 = 
pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan20000_1 = preprocessing(df_busan_smotenc_ctgan20000_1).copy()\n", - "df_seoul_smotenc_ctgan20000_1 = preprocessing(df_seoul_smotenc_ctgan20000_1).copy()\n", - "df_incheon_smotenc_ctgan20000_1 = 
preprocessing(df_incheon_smotenc_ctgan20000_1).copy()\n", - "df_daegu_smotenc_ctgan20000_1 = preprocessing(df_daegu_smotenc_ctgan20000_1).copy()\n", - "df_daejeon_smotenc_ctgan20000_1 = preprocessing(df_daejeon_smotenc_ctgan20000_1).copy()\n", - "df_gwangju_smotenc_ctgan20000_1 = preprocessing(df_gwangju_smotenc_ctgan20000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_2 = preprocessing(df_busan_smotenc_ctgan20000_2).copy()\n", - "df_seoul_smotenc_ctgan20000_2 = preprocessing(df_seoul_smotenc_ctgan20000_2).copy()\n", - "df_incheon_smotenc_ctgan20000_2 = preprocessing(df_incheon_smotenc_ctgan20000_2).copy()\n", - "df_daegu_smotenc_ctgan20000_2 = preprocessing(df_daegu_smotenc_ctgan20000_2).copy()\n", - "df_daejeon_smotenc_ctgan20000_2 = preprocessing(df_daejeon_smotenc_ctgan20000_2).copy()\n", - "df_gwangju_smotenc_ctgan20000_2 = preprocessing(df_gwangju_smotenc_ctgan20000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_3 = preprocessing(df_busan_smotenc_ctgan20000_3).copy()\n", - "df_seoul_smotenc_ctgan20000_3 = preprocessing(df_seoul_smotenc_ctgan20000_3).copy()\n", - "df_incheon_smotenc_ctgan20000_3 = preprocessing(df_incheon_smotenc_ctgan20000_3).copy()\n", - "df_daegu_smotenc_ctgan20000_3 = preprocessing(df_daegu_smotenc_ctgan20000_3).copy()\n", - "df_daejeon_smotenc_ctgan20000_3 = preprocessing(df_daejeon_smotenc_ctgan20000_3).copy()\n", - "df_gwangju_smotenc_ctgan20000_3 = preprocessing(df_gwangju_smotenc_ctgan20000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5358813620881883\n", - "mean of accuracy : 0.9413917791584533\n", - "mean of mcc : 0.6789791066414157\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = 
df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 
'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4336268899145388\n", - "mean of accuracy : 0.9563998677545724\n", - "mean of mcc : 0.6080139285870266\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - 
"lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean 
of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.564798264178871\n", - "mean of accuracy : 0.9092926325157406\n", - "mean of mcc : 0.68899875702518\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan20000_2.columns != 'multi_class'], 
df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, 
new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.422336525695402\n", - "mean of accuracy : 0.963494335903386\n", - "mean of mcc : 0.5910759394692583\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, 
lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4827377579875818\n", - "mean of accuracy : 0.9311888156964511\n", - "mean of mcc : 0.6279627682150313\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - 
"X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], 
df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4753475520930324\n", - "mean of accuracy : 0.9346923838277981\n", - "mean of mcc : 0.6188303813518945\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", 
- "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - 
"accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
| \n", - " | region | \n", - "model | \n", - "data_sample | \n", - "CSI | \n", - "MCC | \n", - "Accuracy | \n", - "fold_csi | \n", - "
|---|---|---|---|---|---|---|---|
| 0 | \n", - "seoul | \n", - "LightGBM | \n", - "pure | \n", - "0.505041 | \n", - "0.646992 | \n", - "0.936174 | \n", - "[[0.46595932802825235, 0.5771195097037204, 0.4... | \n", - "
| 1 | \n", - "busan | \n", - "LightGBM | \n", - "pure | \n", - "0.430188 | \n", - "0.600801 | \n", - "0.956971 | \n", - "[[0.32824427480911017, 0.4782608695651431, 0.4... | \n", - "
| 2 | \n", - "incheon | \n", - "LightGBM | \n", - "pure | \n", - "0.554663 | \n", - "0.687951 | \n", - "0.911954 | \n", - "[[0.4845292955891715, 0.6037628278220865, 0.57... | \n", - "
| 3 | \n", - "daegu | \n", - "LightGBM | \n", - "pure | \n", - "0.292340 | \n", - "0.481989 | \n", - "0.956964 | \n", - "[[0.28124999999994504, 0.3320537428022395, 0.2... | \n", - "
| 4 | \n", - "daejeon | \n", - "LightGBM | \n", - "pure | \n", - "0.478437 | \n", - "0.625244 | \n", - "0.932748 | \n", - "[[0.43333333333329205, 0.4547920433995972, 0.5... | \n", - "
| 5 | \n", - "gwangju | \n", - "LightGBM | \n", - "pure | \n", - "0.482777 | \n", - "0.636815 | \n", - "0.943236 | \n", - "[[0.3928095872169916, 0.5461624026695722, 0.50... | \n", - "
| 6 | \n", - "seoul | \n", - "LightGBM | \n", - "smote | \n", - "0.578939 | \n", - "0.708499 | \n", - "0.939995 | \n", - "[[0.4550682961897588, 0.6503831417623898, 0.63... | \n", - "
| 7 | \n", - "busan | \n", - "LightGBM | \n", - "smote | \n", - "0.466021 | \n", - "0.631880 | \n", - "0.950198 | \n", - "[[0.4690909090908522, 0.47058823529405874, 0.4... | \n", - "
| 8 | \n", - "incheon | \n", - "LightGBM | \n", - "smote | \n", - "0.583560 | \n", - "0.706137 | \n", - "0.910464 | \n", - "[[0.5613293051359177, 0.6213080168776044, 0.56... | \n", - "
| 9 | \n", - "daegu | \n", - "LightGBM | \n", - "smote | \n", - "0.447354 | \n", - "0.616921 | \n", - "0.963730 | \n", - "[[0.3632567849686089, 0.49122807017536024, 0.4... | \n", - "
| 10 | \n", - "daejeon | \n", - "LightGBM | \n", - "smote | \n", - "0.521335 | \n", - "0.656321 | \n", - "0.930621 | \n", - "[[0.4589041095890018, 0.5326514555467716, 0.57... | \n", - "
| 11 | \n", - "gwangju | \n", - "LightGBM | \n", - "smote | \n", - "0.522731 | \n", - "0.660423 | \n", - "0.936850 | \n", - "[[0.47532729103721294, 0.5410958904109059, 0.5... | \n", - "
| 12 | \n", - "seoul | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.543010 | \n", - "0.678072 | \n", - "0.940934 | \n", - "[[0.47165160230070075, 0.5903500473036338, 0.5... | \n", - "
| 13 | \n", - "busan | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.466346 | \n", - "0.626520 | \n", - "0.957503 | \n", - "[[0.4154262516914187, 0.49489051094883285, 0.4... | \n", - "
| 14 | \n", - "incheon | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.566681 | \n", - "0.688934 | \n", - "0.907626 | \n", - "[[0.5311667554608135, 0.6098117512834792, 0.55... | \n", - "
| 15 | \n", - "daegu | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.440419 | \n", - "0.608625 | \n", - "0.967447 | \n", - "[[0.46130952380938656, 0.4414784394249607, 0.4... | \n", - "
| 16 | \n", - "daejeon | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.480839 | \n", - "0.625832 | \n", - "0.931760 | \n", - "[[0.42667928098387636, 0.48797250859102337, 0.... | \n", - "
| 17 | \n", - "gwangju | \n", - "LightGBM | \n", - "ctgan20000 | \n", - "0.493713 | \n", - "0.637313 | \n", - "0.936783 | \n", - "[[0.42775665399235474, 0.5447427293064268, 0.5... | \n", - "
| 18 | \n", - "seoul | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.548902 | \n", - "0.686531 | \n", - "0.943140 | \n", - "[[0.482333607230856, 0.5810397553516227, 0.583... | \n", - "
| 19 | \n", - "busan | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.467663 | \n", - "0.631435 | \n", - "0.959516 | \n", - "[[0.4084084084083471, 0.5208955223879819, 0.47... | \n", - "
| 20 | \n", - "incheon | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.553099 | \n", - "0.687651 | \n", - "0.912108 | \n", - "[[0.4707429322813629, 0.6094198736358064, 0.57... | \n", - "
| 21 | \n", - "daegu | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.406700 | \n", - "0.578623 | \n", - "0.966763 | \n", - "[[0.44943820224706477, 0.45661157024783955, 0.... | \n", - "
| 22 | \n", - "daejeon | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.478241 | \n", - "0.626177 | \n", - "0.932976 | \n", - "[[0.43804034582128354, 0.4569356300996866, 0.5... | \n", - "
| 23 | \n", - "gwangju | \n", - "LightGBM | \n", - "ctgan10000 | \n", - "0.481417 | \n", - "0.630243 | \n", - "0.941869 | \n", - "[[0.38888888888883977, 0.53999999999994, 0.515... | \n", - "
| 24 | \n", - "seoul | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.548318 | \n", - "0.687390 | \n", - "0.943140 | \n", - "[[0.4815724815724421, 0.5754132231404364, 0.58... | \n", - "
| 25 | \n", - "busan | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.466374 | \n", - "0.634410 | \n", - "0.960050 | \n", - "[[0.3847328244274221, 0.5324074074073252, 0.48... | \n", - "
| 26 | \n", - "incheon | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.540065 | \n", - "0.677611 | \n", - "0.909713 | \n", - "[[0.45490716180368335, 0.6001144164759382, 0.5... | \n", - "
| 27 | \n", - "daegu | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.426128 | \n", - "0.599566 | \n", - "0.968853 | \n", - "[[0.4832826747718896, 0.4640657084187959, 0.33... | \n", - "
| 28 | \n", - "daejeon | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.470145 | \n", - "0.619110 | \n", - "0.931494 | \n", - "[[0.4214559386972776, 0.4623753399818257, 0.52... | \n", - "
| 29 | \n", - "gwangju | \n", - "LightGBM | \n", - "ctgan7000 | \n", - "0.486034 | \n", - "0.634803 | \n", - "0.942667 | \n", - "[[0.3902439024389743, 0.543429844097935, 0.524... | \n", - "
| 30 | \n", - "seoul | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.535260 | \n", - "0.678219 | \n", - "0.941240 | \n", - "[[0.46849757673663417, 0.5743801652891969, 0.5... | \n", - "
| 31 | \n", - "busan | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.448540 | \n", - "0.617030 | \n", - "0.957082 | \n", - "[[0.38415545590427735, 0.4850640113797318, 0.4... | \n", - "
| 32 | \n", - "incheon | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.551851 | \n", - "0.686446 | \n", - "0.911078 | \n", - "[[0.4885695623774991, 0.6043577981651029, 0.56... | \n", - "
| 33 | \n", - "daegu | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.402243 | \n", - "0.583238 | \n", - "0.966424 | \n", - "[[0.41432225063928024, 0.4475806451612001, 0.3... | \n", - "
| 34 | \n", - "daejeon | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.476309 | \n", - "0.625184 | \n", - "0.932141 | \n", - "[[0.4230038022813286, 0.47079964061091906, 0.5... | \n", - "
| 35 | \n", - "gwangju | \n", - "LightGBM | \n", - "smotenc_ctgan7000 | \n", - "0.471552 | \n", - "0.623710 | \n", - "0.940309 | \n", - "[[0.37113402061850886, 0.5363735070574879, 0.5... | \n", - "
| 36 | \n", - "seoul | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.539145 | \n", - "0.680238 | \n", - "0.941963 | \n", - "[[0.4607201309328592, 0.5864583333332722, 0.57... | \n", - "
| 37 | \n", - "busan | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.456299 | \n", - "0.624299 | \n", - "0.957919 | \n", - "[[0.40412979351026485, 0.4999999999999277, 0.4... | \n", - "
| 38 | \n", - "incheon | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.547445 | \n", - "0.682151 | \n", - "0.910282 | \n", - "[[0.4681967213114447, 0.6035067873302826, 0.57... | \n", - "
| 39 | \n", - "daegu | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.411417 | \n", - "0.589803 | \n", - "0.967411 | \n", - "[[0.4368131868130668, 0.45213849287159835, 0.3... | \n", - "
| 40 | \n", - "daejeon | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.468342 | \n", - "0.620789 | \n", - "0.931759 | \n", - "[[0.42665388302968105, 0.4723481414324141, 0.5... | \n", - "
| 41 | \n", - "gwangju | \n", - "LightGBM | \n", - "smotenc_ctgan10000 | \n", - "0.485951 | \n", - "0.638436 | \n", - "0.942780 | \n", - "[[0.3915343915343397, 0.5499451152578979, 0.51... | \n", - "
| 42 | \n", - "seoul | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.535881 | \n", - "0.678979 | \n", - "0.941392 | \n", - "[[0.4706840390879095, 0.5655314757481357, 0.57... | \n", - "
| 43 | \n", - "busan | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.433627 | \n", - "0.608014 | \n", - "0.956400 | \n", - "[[0.3399089529589772, 0.5086956521738393, 0.45... | \n", - "
| 44 | \n", - "incheon | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.564798 | \n", - "0.688999 | \n", - "0.909293 | \n", - "[[0.5387685290763661, 0.5963821368004185, 0.55... | \n", - "
| 45 | \n", - "daegu | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.422337 | \n", - "0.591076 | \n", - "0.963494 | \n", - "[[0.42962962962952356, 0.43951612903216947, 0.... | \n", - "
| 46 | \n", - "daejeon | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.482738 | \n", - "0.627963 | \n", - "0.931189 | \n", - "[[0.4281636536631372, 0.4917627677100089, 0.52... | \n", - "
| 47 | \n", - "gwangju | \n", - "LightGBM | \n", - "smotenc_ctgan20000 | \n", - "0.475348 | \n", - "0.618830 | \n", - "0.934692 | \n", - "[[0.3949903660885939, 0.5378704720087225, 0.49... | \n", - "