diff --git "a/Analysis_code/4.oversampling_data_test/lgb_sampled_test.ipynb" "b/Analysis_code/4.oversampling_data_test/lgb_sampled_test.ipynb" --- "a/Analysis_code/4.oversampling_data_test/lgb_sampled_test.ipynb" +++ "b/Analysis_code/4.oversampling_data_test/lgb_sampled_test.ipynb" @@ -1,5215 +1,3 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **lightGBM**" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n", - "from collections import Counter\n", - "import sys\n", - "from lightgbm import LGBMClassifier\n", - "\n", - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " 제거했던 파생 변수들을 복구\n", - " \n", - " Args:\n", - " df: 데이터프레임\n", - " \n", - " Returns:\n", - " 파생 변수가 추가된 데이터프레임\n", - " \"\"\"\n", - " df = df.copy()\n", - " df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)\n", - " df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)\n", - " df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)\n", - " df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)\n", - " df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']\n", - " return df\n", - "\n", - "\n", - "def preprocessing(df):\n", - " df = df[df.columns].copy()\n", - " df['year'] = df['year'].astype('int')\n", - " df['month'] = df['month'].astype('int')\n", - " df['hour'] = df['hour'].astype('int')\n", - " df= add_derived_features(df).copy()\n", - " df['multi_class'] = df['multi_class'].astype('int')\n", - " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n", - " df['wind_dir'] = df['wind_dir'].astype('int')\n", - " df= df[['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',\n", - " 'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',\n", - " 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',\n", - " 'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',\n", - " 'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',\n", - " 'month_sin', 'month_cos','multi_class']].copy()\n", - " return df\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "df_seoul = pd.read_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n", - "df_busan = pd.read_csv(\"../../data/data_for_modeling/busan_train.csv\")\n", - "df_daegu = pd.read_csv(\"../../data/data_for_modeling/daegu_train.csv\")\n", - "df_daejeon = pd.read_csv(\"../../data/data_for_modeling/daejeon_train.csv\")\n", - "df_incheon = pd.read_csv(\"../../data/data_for_modeling/incheon_train.csv\")\n", - "df_gwangju = pd.read_csv(\"../../data/data_for_modeling/gwangju_train.csv\")\n", - "\n", - "df_seoul = preprocessing(df_seoul).copy()\n", - "df_busan = preprocessing(df_busan).copy()\n", - "df_daegu = preprocessing(df_daegu).copy()\n", - "df_daejeon = preprocessing(df_daejeon).copy()\n", - "df_incheon = preprocessing(df_incheon).copy()\n", - "df_gwangju = preprocessing(df_gwangju).copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "seoul : Counter({2: 23686, 1: 2579, 0: 39})\n", - "\n", - "busan : Counter({2: 24694, 1: 1516, 0: 94})\n", - "\n", - "daegu : Counter({2: 25149, 1: 1107, 0: 48})\n", - "\n", - "gwangju : Counter({2: 23798, 1: 2411, 0: 95})\n", - "\n", - "daejeon : Counter({2: 23471, 1: 2660, 0: 173})\n", - "\n", - "incheon : Counter({2: 21893, 1: 3892, 0: 519})\n" - ] - } - ], - "source": [ - "print(\"seoul : \", Counter(df_seoul['multi_class']))\n", - "print()\n", - "print(\"busan : \", Counter(df_busan['multi_class']))\n", - "print()\n", - "print(\"daegu : \", Counter(df_daegu['multi_class']))\n", - "print()\n", - "print(\"gwangju : \", Counter(df_gwangju['multi_class']))\n", - "print()\n", - "print(\"daejeon : \", Counter(df_daejeon['multi_class']))\n", - "print()\n", - "print(\"incheon : \", Counter(df_incheon['multi_class']))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", - " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", - " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", - " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", - " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", - " dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_gwangju.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.utils.class_weight import compute_class_weight\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "from sklearn.metrics import matthews_corrcoef\n", - "\n", - "def calculate_csi(Y_test, pred):\n", - "\n", - " cm = confusion_matrix(Y_test, pred) # 변수 이름을 cm으로 변경\n", - " # 혼동 행렬에서 H, F, M 추출\n", - " H = (cm[0, 0] + cm[1, 1])\n", - " \n", - " F = (cm[1, 0] + cm[2, 0] +\n", - " cm[0, 1] + cm[2, 1])\n", - " \n", - " M = (cm[0, 2] + cm[1, 2])\n", - " \n", - " # CSI 계산\n", - " CSI = H / (H + F + M + 1e-10)\n", - " return CSI\n", - "\n", - "def eval_metric_csi(y_true, pred_prob):\n", - "\n", - " pred = np.argmax(pred_prob, axis=1)\n", - " y_true = y_true\n", - " y_pred = pred\n", - " csi = calculate_csi(y_true, y_pred)\n", - " return -1*csi\n", - "\n", - "def multiclass_mcc(y_val, y_pred):\n", - " \"\"\"\n", - " 다중 분류에서도 sklearn의 matthews_corrcoef를 그대로 사용할 수 있음.\n", - " \"\"\"\n", - " return matthews_corrcoef(y_val, y_pred)\n", - "\n", - "# 사용자 정의 평가 지표 함수 정의\n", - "def csi_metric(y_true, pred):\n", - " y_pred_binary = np.argmax(pred, axis=1)\n", - " score = calculate_csi(y_true, y_pred_binary)\n", - " return 'CSI', score, True # higher_better=True" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "lgb_model = LGBMClassifier(\n", - " n_estimators=4000, # 약한 학습기 개수\n", - " tree_method='hist', \n", - " device='gpu', # GPU 사용\n", - " objective='multiclassova',\n", - " early_stopping_rounds=400, # 과적합 방지를 위한 조기 종료 설정\n", - " random_state= 42,\n", - " verbose= -1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "pre_sampled_data= []\n", - "smote_sample_data= []\n", - "gan20000_sample_data= []\n", - "gan10000_sample_data= []\n", - "gan7000_sample_data= []" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "df= pd.DataFrame(columns=['region','model','data_sample','CSI','MCC','Accuracy','fold_csi'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5050411887221465\n", - "mean of accuracy : 0.9361739068958922\n", - "mean of mcc : 0.6469923326874802\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2019]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2019, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4301877051295586\n", - "mean of accuracy : 0.9569711638429356\n", - "mean of mcc : 0.6008010957239577\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2019]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2019, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5546627753142914\n", - "mean of accuracy : 0.9119535519125682\n", - "mean of mcc : 0.6879511579878309\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2019]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2019, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.29234018565381\n", - "mean of accuracy : 0.956963678252697\n", - "mean of mcc : 0.4819888130358391\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2019]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2019, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4784367169072478\n", - "mean of accuracy : 0.9327483136628656\n", - "mean of mcc : 0.6252440470551551\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4827774547671669\n", - "mean of accuracy : 0.9432361454200664\n", - "mean of mcc : 0.6368148576215991\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **SMOTE 증강기법을 적용시킨 데이터셋에 대한 성능**" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "df_smote_busan_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_busan.csv\")\n", - "df_smote_busan_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_busan.csv\")\n", - "df_smote_busan_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_busan.csv\")\n", - "\n", - "df_smote_seoul_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_seoul.csv\")\n", - "df_smote_seoul_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_seoul.csv\")\n", - "df_smote_seoul_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_seoul.csv\")\n", - "\n", - "df_smote_daegu_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daegu.csv\")\n", - "df_smote_daegu_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daegu.csv\")\n", - "df_smote_daegu_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daegu.csv\")\n", - "\n", - "df_smote_daejeon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daejeon.csv\")\n", - "df_smote_daejeon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daejeon.csv\")\n", - "df_smote_daejeon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daejeon.csv\")\n", - "\n", - "df_smote_gwangju_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_gwangju.csv\")\n", - "df_smote_gwangju_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_gwangju.csv\")\n", - "df_smote_gwangju_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_gwangju.csv\")\n", - "\n", - "df_smote_incheon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_incheon.csv\")\n", - "df_smote_incheon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_incheon.csv\")\n", - "df_smote_incheon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_incheon.csv\")\n", - "\n", - "df_smote_busan_1 = preprocessing(df_smote_busan_1)\n", - "df_smote_busan_2 = preprocessing(df_smote_busan_2)\n", - "df_smote_busan_3 = preprocessing(df_smote_busan_3)\n", - "\n", - "df_smote_seoul_1 = preprocessing(df_smote_seoul_1)\n", - "df_smote_seoul_2 = preprocessing(df_smote_seoul_2)\n", - "df_smote_seoul_3 = preprocessing(df_smote_seoul_3)\n", - "\n", - "df_smote_daegu_1 = preprocessing(df_smote_daegu_1)\n", - "df_smote_daegu_2 = preprocessing(df_smote_daegu_2)\n", - "df_smote_daegu_3 = preprocessing(df_smote_daegu_3)\n", - "\n", - "df_smote_daejeon_1 = preprocessing(df_smote_daejeon_1)\n", - "df_smote_daejeon_2 = preprocessing(df_smote_daejeon_2)\n", - "df_smote_daejeon_3 = preprocessing(df_smote_daejeon_3)\n", - "\n", - "df_smote_gwangju_1 = preprocessing(df_smote_gwangju_1)\n", - "df_smote_gwangju_2 = preprocessing(df_smote_gwangju_2)\n", - "df_smote_gwangju_3 = preprocessing(df_smote_gwangju_3)\n", - "\n", - "df_smote_incheon_1 = preprocessing(df_smote_incheon_1)\n", - "df_smote_incheon_2 = preprocessing(df_smote_incheon_2)\n", - "df_smote_incheon_3 = preprocessing(df_smote_incheon_3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "smote_oversample=[] # smote 적용 전 f1 score 저장 리스트" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", - " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", - " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", - " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", - " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", - " dtype='object')" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_smote_seoul_1.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n", - " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n", - " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n", - " 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n", - " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n", - " dtype='object')" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_seoul.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5789392155070722\n", - "mean of accuracy : 0.9399950927797324\n", - "mean of mcc : 0.7084991639282849\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), df_smote_seoul_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), df_smote_seoul_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), df_smote_seoul_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46602091302575205\n", - "mean of accuracy : 0.9501977443421413\n", - "mean of mcc : 0.6318799598547477\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), df_smote_busan_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), df_smote_busan_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), df_smote_busan_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.583559682649986\n", - "mean of accuracy : 0.9104636075554557\n", - "mean of mcc : 0.7061374111787998\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), df_smote_incheon_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), df_smote_incheon_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), df_smote_incheon_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.44735416496514874\n", - "mean of accuracy : 0.96373033992897\n", - "mean of mcc : 0.6169211806368756\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), df_smote_daegu_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), df_smote_daegu_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), df_smote_daegu_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5213346054616135\n", - "mean of accuracy : 0.9306212624032071\n", - "mean of mcc : 0.6563209583230294\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), df_smote_daejeon_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), df_smote_daejeon_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), df_smote_daejeon_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5227311295096367\n", - "mean of accuracy : 0.9368502091806605\n", - "mean of mcc : 0.6604233380668852\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), df_smote_gwangju_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), df_smote_gwangju_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), df_smote_gwangju_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv\")\n", - "df_seoul_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv\")\n", - "df_incheon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv\")\n", - "df_daegu_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv\")\n", - "df_daejeon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv\")\n", - "df_seoul_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv\")\n", - "df_incheon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv\")\n", - "df_daegu_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv\")\n", - "df_daejeon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv\")\n", - "df_seoul_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv\")\n", - "df_incheon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv\")\n", - "df_daegu_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv\")\n", - "df_daejeon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan20000_1= preprocessing(df_busan_gan20000_1).copy()\n", - "df_seoul_gan20000_1= preprocessing(df_seoul_gan20000_1).copy()\n", - "df_incheon_gan20000_1= preprocessing(df_incheon_gan20000_1).copy()\n", - "df_daegu_gan20000_1= preprocessing(df_daegu_gan20000_1).copy()\n", - "df_daejeon_gan20000_1= preprocessing(df_daejeon_gan20000_1).copy()\n", - "df_gwangju_gan20000_1= preprocessing(df_gwangju_gan20000_1).copy()\n", - "\n", - "df_busan_gan20000_2= preprocessing(df_busan_gan20000_2).copy()\n", - "df_seoul_gan20000_2= preprocessing(df_seoul_gan20000_2).copy()\n", - "df_incheon_gan20000_2= preprocessing(df_incheon_gan20000_2).copy()\n", - "df_daegu_gan20000_2= preprocessing(df_daegu_gan20000_2).copy()\n", - "df_daejeon_gan20000_2= preprocessing(df_daejeon_gan20000_2).copy()\n", - "df_gwangju_gan20000_2= preprocessing(df_gwangju_gan20000_2).copy()\n", - "\n", - "df_busan_gan20000_3= preprocessing(df_busan_gan20000_3).copy()\n", - "df_seoul_gan20000_3= preprocessing(df_seoul_gan20000_3).copy()\n", - "df_incheon_gan20000_3= preprocessing(df_incheon_gan20000_3).copy()\n", - "df_daegu_gan20000_3= preprocessing(df_daegu_gan20000_3).copy()\n", - "df_daejeon_gan20000_3= preprocessing(df_daejeon_gan20000_3).copy()\n", - "df_gwangju_gan20000_3= preprocessing(df_gwangju_gan20000_3).copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5430102117038431\n", - "mean of accuracy : 0.9409344303881695\n", - "mean of mcc : 0.6780719447285347\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), df_seoul_gan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), df_seoul_gan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), df_seoul_gan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4663461890504854\n", - "mean of accuracy : 0.9575030566160141\n", - "mean of mcc : 0.6265195697208686\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), df_busan_gan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), df_busan_gan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), df_busan_gan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5666805477334921\n", - "mean of accuracy : 0.9076257371227054\n", - "mean of mcc : 0.6889342293705883\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), df_incheon_gan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), df_incheon_gan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), df_incheon_gan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.44041946056823505\n", - "mean of accuracy : 0.9674465196164052\n", - "mean of mcc : 0.6086254895296773\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), df_daegu_gan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), df_daegu_gan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), df_daegu_gan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4808393438710248\n", - "mean of accuracy : 0.9317601117848143\n", - "mean of mcc : 0.6258321334411245\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), df_daejeon_gan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), df_daejeon_gan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), df_daejeon_gan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4937131678363705\n", - "mean of accuracy : 0.9367825269689182\n", - "mean of mcc : 0.6373125823908727\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), df_gwangju_gan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), df_gwangju_gan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), df_gwangju_gan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv\")\n", - "df_seoul_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv\")\n", - "df_incheon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv\")\n", - "df_daegu_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv\")\n", - "df_daejeon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv\")\n", - "df_gwangju_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv\")\n", - "df_seoul_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv\")\n", - "df_incheon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv\")\n", - "df_daegu_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv\")\n", - "df_daejeon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv\")\n", - "df_seoul_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv\")\n", - "df_incheon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv\")\n", - "df_daegu_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv\")\n", - "df_daejeon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan10000_1= preprocessing(df_busan_gan10000_1).copy()\n", - "df_seoul_gan10000_1= preprocessing(df_seoul_gan10000_1).copy()\n", - "df_incheon_gan10000_1= preprocessing(df_incheon_gan10000_1).copy()\n", - "df_daegu_gan10000_1= preprocessing(df_daegu_gan10000_1).copy()\n", - "df_daejeon_gan10000_1= preprocessing(df_daejeon_gan10000_1).copy()\n", - "df_gwangju_gan10000_1= preprocessing(df_gwangju_gan10000_1).copy()\n", - "\n", - "df_busan_gan10000_2= preprocessing(df_busan_gan10000_2).copy()\n", - "df_seoul_gan10000_2= preprocessing(df_seoul_gan10000_2).copy()\n", - "df_incheon_gan10000_2= preprocessing(df_incheon_gan10000_2).copy()\n", - "df_daegu_gan10000_2= preprocessing(df_daegu_gan10000_2).copy()\n", - "df_daejeon_gan10000_2= preprocessing(df_daejeon_gan10000_2).copy()\n", - "df_gwangju_gan10000_2= preprocessing(df_gwangju_gan10000_2).copy()\n", - "\n", - "df_busan_gan10000_3= preprocessing(df_busan_gan10000_3).copy()\n", - "df_seoul_gan10000_3= preprocessing(df_seoul_gan10000_3).copy()\n", - "df_incheon_gan10000_3= preprocessing(df_incheon_gan10000_3).copy()\n", - "df_daegu_gan10000_3= preprocessing(df_daegu_gan10000_3).copy()\n", - "df_daejeon_gan10000_3= preprocessing(df_daejeon_gan10000_3).copy()\n", - "df_gwangju_gan10000_3= preprocessing(df_gwangju_gan10000_3).copy()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5489022319719195\n", - "mean of accuracy : 0.9431400803453353\n", - "mean of mcc : 0.6865310613747596\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), df_seoul_gan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), df_seoul_gan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), df_seoul_gan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4676627137741911\n", - "mean of accuracy : 0.959515848658\n", - "mean of mcc : 0.6314347309502454\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), df_busan_gan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), df_busan_gan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), df_busan_gan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5530992219893435\n", - "mean of accuracy : 0.9121080461777744\n", - "mean of mcc : 0.687650733674605\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), df_incheon_gan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), df_incheon_gan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), df_incheon_gan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.406699786423904\n", - "mean of accuracy : 0.9667631476075221\n", - "mean of mcc : 0.5786232842762297\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), df_daegu_gan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), df_daegu_gan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), df_daegu_gan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4782405346339498\n", - "mean of accuracy : 0.9329755844998378\n", - "mean of mcc : 0.6261770938311793\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), df_daejeon_gan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), df_daejeon_gan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), df_daejeon_gan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4814166239972196\n", - "mean of accuracy : 0.9418691934692385\n", - "mean of mcc : 0.6302426323494177\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), df_gwangju_gan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), df_gwangju_gan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), df_gwangju_gan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_busan.csv\")\n", - "df_seoul_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_seoul.csv\")\n", - "df_incheon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_incheon.csv\")\n", - "df_daegu_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daegu.csv\")\n", - "df_daejeon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_busan.csv\")\n", - "df_seoul_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_seoul.csv\")\n", - "df_incheon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_incheon.csv\")\n", - "df_daegu_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daegu.csv\")\n", - "df_daejeon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_busan.csv\")\n", - "df_seoul_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_seoul.csv\")\n", - "df_incheon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_incheon.csv\")\n", - "df_daegu_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daegu.csv\")\n", - "df_daejeon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan7000_1= preprocessing(df_busan_gan7000_1).copy()\n", - "df_seoul_gan7000_1= preprocessing(df_seoul_gan7000_1).copy()\n", - "df_incheon_gan7000_1= preprocessing(df_incheon_gan7000_1).copy()\n", - "df_daegu_gan7000_1= preprocessing(df_daegu_gan7000_1).copy()\n", - "df_daejeon_gan7000_1= preprocessing(df_daejeon_gan7000_1).copy()\n", - "df_gwangju_gan7000_1= preprocessing(df_gwangju_gan7000_1).copy()\n", - "\n", - "df_busan_gan7000_2= preprocessing(df_busan_gan7000_2).copy()\n", - "df_seoul_gan7000_2= preprocessing(df_seoul_gan7000_2).copy()\n", - "df_incheon_gan7000_2= preprocessing(df_incheon_gan7000_2).copy()\n", - "df_daegu_gan7000_2= preprocessing(df_daegu_gan7000_2).copy()\n", - "df_daejeon_gan7000_2= preprocessing(df_daejeon_gan7000_2).copy()\n", - "df_gwangju_gan7000_2= preprocessing(df_gwangju_gan7000_2).copy()\n", - "\n", - "df_busan_gan7000_3= preprocessing(df_busan_gan7000_3).copy()\n", - "df_seoul_gan7000_3= preprocessing(df_seoul_gan7000_3).copy()\n", - "df_incheon_gan7000_3= preprocessing(df_incheon_gan7000_3).copy()\n", - "df_daegu_gan7000_3= preprocessing(df_daegu_gan7000_3).copy()\n", - "df_daejeon_gan7000_3= preprocessing(df_daejeon_gan7000_3).copy()\n", - "df_gwangju_gan7000_3= preprocessing(df_gwangju_gan7000_3).copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5483176293132692\n", - "mean of accuracy : 0.9431403922449285\n", - "mean of mcc : 0.6873897786091137\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), df_seoul_gan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), df_seoul_gan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), df_seoul_gan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4663740712722189\n", - "mean of accuracy : 0.9600495088621072\n", - "mean of mcc : 0.6344100074206912\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), df_busan_gan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), df_busan_gan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), df_busan_gan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5400648981230686\n", - "mean of accuracy : 0.9097125533348306\n", - "mean of mcc : 0.6776113780254353\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), df_incheon_gan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), df_incheon_gan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), df_incheon_gan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.426127621983083\n", - "mean of accuracy : 0.9688532907486422\n", - "mean of mcc : 0.5995660759623473\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), df_daegu_gan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), df_daegu_gan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), df_daegu_gan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4701451158455507\n", - "mean of accuracy : 0.9314935415990885\n", - "mean of mcc : 0.619110040349657\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), df_daejeon_gan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), df_daejeon_gan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), df_daejeon_gan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4860337424842999\n", - "mean of accuracy : 0.9426673445284495\n", - "mean of mcc : 0.6348033992505139\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), df_gwangju_gan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), df_gwangju_gan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), df_gwangju_gan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **SMOTENC+CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan7000_1 = preprocessing(df_busan_smotenc_ctgan7000_1).copy()\n", - "df_seoul_smotenc_ctgan7000_1 = preprocessing(df_seoul_smotenc_ctgan7000_1).copy()\n", - "df_incheon_smotenc_ctgan7000_1 = preprocessing(df_incheon_smotenc_ctgan7000_1).copy()\n", - "df_daegu_smotenc_ctgan7000_1 = preprocessing(df_daegu_smotenc_ctgan7000_1).copy()\n", - "df_daejeon_smotenc_ctgan7000_1 = preprocessing(df_daejeon_smotenc_ctgan7000_1).copy()\n", - "df_gwangju_smotenc_ctgan7000_1 = preprocessing(df_gwangju_smotenc_ctgan7000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_2 = preprocessing(df_busan_smotenc_ctgan7000_2).copy()\n", - "df_seoul_smotenc_ctgan7000_2 = preprocessing(df_seoul_smotenc_ctgan7000_2).copy()\n", - "df_incheon_smotenc_ctgan7000_2 = preprocessing(df_incheon_smotenc_ctgan7000_2).copy()\n", - "df_daegu_smotenc_ctgan7000_2 = preprocessing(df_daegu_smotenc_ctgan7000_2).copy()\n", - "df_daejeon_smotenc_ctgan7000_2 = preprocessing(df_daejeon_smotenc_ctgan7000_2).copy()\n", - "df_gwangju_smotenc_ctgan7000_2 = preprocessing(df_gwangju_smotenc_ctgan7000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_3 = preprocessing(df_busan_smotenc_ctgan7000_3).copy()\n", - "df_seoul_smotenc_ctgan7000_3 = preprocessing(df_seoul_smotenc_ctgan7000_3).copy()\n", - "df_incheon_smotenc_ctgan7000_3 = preprocessing(df_incheon_smotenc_ctgan7000_3).copy()\n", - "df_daegu_smotenc_ctgan7000_3 = preprocessing(df_daegu_smotenc_ctgan7000_3).copy()\n", - "df_daejeon_smotenc_ctgan7000_3 = preprocessing(df_daejeon_smotenc_ctgan7000_3).copy()\n", - "df_gwangju_smotenc_ctgan7000_3 = preprocessing(df_gwangju_smotenc_ctgan7000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5352598289152354\n", - "mean of accuracy : 0.9412404038891801\n", - "mean of mcc : 0.6782186947210392\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4485402157416473\n", - "mean of accuracy : 0.9570824080312065\n", - "mean of mcc : 0.6170303705969965\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5518512077229454\n", - "mean of accuracy : 0.9110783616538164\n", - "mean of mcc : 0.6864459032542548\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.40224349400243914\n", - "mean of accuracy : 0.9664241127496235\n", - "mean of mcc : 0.5832379476945067\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.476308593039838\n", - "mean of accuracy : 0.9321407332551505\n", - "mean of mcc : 0.625183787216149\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.47155191980971883\n", - "mean of accuracy : 0.9403094875697615\n", - "mean of mcc : 0.6237103634516713\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan10000_1 = preprocessing(df_busan_smotenc_ctgan10000_1).copy()\n", - "df_seoul_smotenc_ctgan10000_1 = preprocessing(df_seoul_smotenc_ctgan10000_1).copy()\n", - "df_incheon_smotenc_ctgan10000_1 = preprocessing(df_incheon_smotenc_ctgan10000_1).copy()\n", - "df_daegu_smotenc_ctgan10000_1 = preprocessing(df_daegu_smotenc_ctgan10000_1).copy()\n", - "df_daejeon_smotenc_ctgan10000_1 = preprocessing(df_daejeon_smotenc_ctgan10000_1).copy()\n", - "df_gwangju_smotenc_ctgan10000_1 = preprocessing(df_gwangju_smotenc_ctgan10000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_2 = preprocessing(df_busan_smotenc_ctgan10000_2).copy()\n", - "df_seoul_smotenc_ctgan10000_2 = preprocessing(df_seoul_smotenc_ctgan10000_2).copy()\n", - "df_incheon_smotenc_ctgan10000_2 = preprocessing(df_incheon_smotenc_ctgan10000_2).copy()\n", - "df_daegu_smotenc_ctgan10000_2 = preprocessing(df_daegu_smotenc_ctgan10000_2).copy()\n", - "df_daejeon_smotenc_ctgan10000_2 = preprocessing(df_daejeon_smotenc_ctgan10000_2).copy()\n", - "df_gwangju_smotenc_ctgan10000_2 = preprocessing(df_gwangju_smotenc_ctgan10000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_3 = preprocessing(df_busan_smotenc_ctgan10000_3).copy()\n", - "df_seoul_smotenc_ctgan10000_3 = preprocessing(df_seoul_smotenc_ctgan10000_3).copy()\n", - "df_incheon_smotenc_ctgan10000_3 = preprocessing(df_incheon_smotenc_ctgan10000_3).copy()\n", - "df_daegu_smotenc_ctgan10000_3 = preprocessing(df_daegu_smotenc_ctgan10000_3).copy()\n", - "df_daejeon_smotenc_ctgan10000_3 = preprocessing(df_daejeon_smotenc_ctgan10000_3).copy()\n", - "df_gwangju_smotenc_ctgan10000_3 = preprocessing(df_gwangju_smotenc_ctgan10000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5391446462395447\n", - "mean of accuracy : 0.9419634911129409\n", - "mean of mcc : 0.6802382864465635\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.45629913656734233\n", - "mean of accuracy : 0.9579187148073292\n", - "mean of mcc : 0.624299022660101\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5474454733447011\n", - "mean of accuracy : 0.9102821859586961\n", - "mean of mcc : 0.6821514877761338\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4114169416612505\n", - "mean of accuracy : 0.9674109630627709\n", - "mean of mcc : 0.5898029433914993\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46834204028253223\n", - "mean of accuracy : 0.9317592800525656\n", - "mean of mcc : 0.620788512419694\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4859514070622382\n", - "mean of accuracy : 0.9427799402816245\n", - "mean of mcc : 0.6384358903533097\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan20000_1 = preprocessing(df_busan_smotenc_ctgan20000_1).copy()\n", - "df_seoul_smotenc_ctgan20000_1 = preprocessing(df_seoul_smotenc_ctgan20000_1).copy()\n", - "df_incheon_smotenc_ctgan20000_1 = preprocessing(df_incheon_smotenc_ctgan20000_1).copy()\n", - "df_daegu_smotenc_ctgan20000_1 = preprocessing(df_daegu_smotenc_ctgan20000_1).copy()\n", - "df_daejeon_smotenc_ctgan20000_1 = preprocessing(df_daejeon_smotenc_ctgan20000_1).copy()\n", - "df_gwangju_smotenc_ctgan20000_1 = preprocessing(df_gwangju_smotenc_ctgan20000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_2 = preprocessing(df_busan_smotenc_ctgan20000_2).copy()\n", - "df_seoul_smotenc_ctgan20000_2 = preprocessing(df_seoul_smotenc_ctgan20000_2).copy()\n", - "df_incheon_smotenc_ctgan20000_2 = preprocessing(df_incheon_smotenc_ctgan20000_2).copy()\n", - "df_daegu_smotenc_ctgan20000_2 = preprocessing(df_daegu_smotenc_ctgan20000_2).copy()\n", - "df_daejeon_smotenc_ctgan20000_2 = preprocessing(df_daejeon_smotenc_ctgan20000_2).copy()\n", - "df_gwangju_smotenc_ctgan20000_2 = preprocessing(df_gwangju_smotenc_ctgan20000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_3 = preprocessing(df_busan_smotenc_ctgan20000_3).copy()\n", - "df_seoul_smotenc_ctgan20000_3 = preprocessing(df_seoul_smotenc_ctgan20000_3).copy()\n", - "df_incheon_smotenc_ctgan20000_3 = preprocessing(df_incheon_smotenc_ctgan20000_3).copy()\n", - "df_daegu_smotenc_ctgan20000_3 = preprocessing(df_daegu_smotenc_ctgan20000_3).copy()\n", - "df_daejeon_smotenc_ctgan20000_3 = preprocessing(df_daejeon_smotenc_ctgan20000_3).copy()\n", - "df_gwangju_smotenc_ctgan20000_3 = preprocessing(df_gwangju_smotenc_ctgan20000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5358813620881883\n", - "mean of accuracy : 0.9413917791584533\n", - "mean of mcc : 0.6789791066414157\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4336268899145388\n", - "mean of accuracy : 0.9563998677545724\n", - "mean of mcc : 0.6080139285870266\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.564798264178871\n", - "mean of accuracy : 0.9092926325157406\n", - "mean of mcc : 0.68899875702518\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.422336525695402\n", - "mean of accuracy : 0.963494335903386\n", - "mean of mcc : 0.5910759394692583\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4827377579875818\n", - "mean of accuracy : 0.9311888156964511\n", - "mean of mcc : 0.6279627682150313\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4753475520930324\n", - "mean of accuracy : 0.9346923838277981\n", - "mean of mcc : 0.6188303813518945\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "lgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], eval_metric=csi_metric)\n", - "csi.append(calculate_csi(Y_val, lgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, lgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, lgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'LightGBM',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
regionmodeldata_sampleCSIMCCAccuracyfold_csi
0seoulLightGBMpure0.5050410.6469920.936174[[0.46595932802825235, 0.5771195097037204, 0.4...
1busanLightGBMpure0.4301880.6008010.956971[[0.32824427480911017, 0.4782608695651431, 0.4...
2incheonLightGBMpure0.5546630.6879510.911954[[0.4845292955891715, 0.6037628278220865, 0.57...
3daeguLightGBMpure0.2923400.4819890.956964[[0.28124999999994504, 0.3320537428022395, 0.2...
4daejeonLightGBMpure0.4784370.6252440.932748[[0.43333333333329205, 0.4547920433995972, 0.5...
5gwangjuLightGBMpure0.4827770.6368150.943236[[0.3928095872169916, 0.5461624026695722, 0.50...
6seoulLightGBMsmote0.5789390.7084990.939995[[0.4550682961897588, 0.6503831417623898, 0.63...
7busanLightGBMsmote0.4660210.6318800.950198[[0.4690909090908522, 0.47058823529405874, 0.4...
8incheonLightGBMsmote0.5835600.7061370.910464[[0.5613293051359177, 0.6213080168776044, 0.56...
9daeguLightGBMsmote0.4473540.6169210.963730[[0.3632567849686089, 0.49122807017536024, 0.4...
10daejeonLightGBMsmote0.5213350.6563210.930621[[0.4589041095890018, 0.5326514555467716, 0.57...
11gwangjuLightGBMsmote0.5227310.6604230.936850[[0.47532729103721294, 0.5410958904109059, 0.5...
12seoulLightGBMctgan200000.5430100.6780720.940934[[0.47165160230070075, 0.5903500473036338, 0.5...
13busanLightGBMctgan200000.4663460.6265200.957503[[0.4154262516914187, 0.49489051094883285, 0.4...
14incheonLightGBMctgan200000.5666810.6889340.907626[[0.5311667554608135, 0.6098117512834792, 0.55...
15daeguLightGBMctgan200000.4404190.6086250.967447[[0.46130952380938656, 0.4414784394249607, 0.4...
16daejeonLightGBMctgan200000.4808390.6258320.931760[[0.42667928098387636, 0.48797250859102337, 0....
17gwangjuLightGBMctgan200000.4937130.6373130.936783[[0.42775665399235474, 0.5447427293064268, 0.5...
18seoulLightGBMctgan100000.5489020.6865310.943140[[0.482333607230856, 0.5810397553516227, 0.583...
19busanLightGBMctgan100000.4676630.6314350.959516[[0.4084084084083471, 0.5208955223879819, 0.47...
20incheonLightGBMctgan100000.5530990.6876510.912108[[0.4707429322813629, 0.6094198736358064, 0.57...
21daeguLightGBMctgan100000.4067000.5786230.966763[[0.44943820224706477, 0.45661157024783955, 0....
22daejeonLightGBMctgan100000.4782410.6261770.932976[[0.43804034582128354, 0.4569356300996866, 0.5...
23gwangjuLightGBMctgan100000.4814170.6302430.941869[[0.38888888888883977, 0.53999999999994, 0.515...
24seoulLightGBMctgan70000.5483180.6873900.943140[[0.4815724815724421, 0.5754132231404364, 0.58...
25busanLightGBMctgan70000.4663740.6344100.960050[[0.3847328244274221, 0.5324074074073252, 0.48...
26incheonLightGBMctgan70000.5400650.6776110.909713[[0.45490716180368335, 0.6001144164759382, 0.5...
27daeguLightGBMctgan70000.4261280.5995660.968853[[0.4832826747718896, 0.4640657084187959, 0.33...
28daejeonLightGBMctgan70000.4701450.6191100.931494[[0.4214559386972776, 0.4623753399818257, 0.52...
29gwangjuLightGBMctgan70000.4860340.6348030.942667[[0.3902439024389743, 0.543429844097935, 0.524...
30seoulLightGBMsmotenc_ctgan70000.5352600.6782190.941240[[0.46849757673663417, 0.5743801652891969, 0.5...
31busanLightGBMsmotenc_ctgan70000.4485400.6170300.957082[[0.38415545590427735, 0.4850640113797318, 0.4...
32incheonLightGBMsmotenc_ctgan70000.5518510.6864460.911078[[0.4885695623774991, 0.6043577981651029, 0.56...
33daeguLightGBMsmotenc_ctgan70000.4022430.5832380.966424[[0.41432225063928024, 0.4475806451612001, 0.3...
34daejeonLightGBMsmotenc_ctgan70000.4763090.6251840.932141[[0.4230038022813286, 0.47079964061091906, 0.5...
35gwangjuLightGBMsmotenc_ctgan70000.4715520.6237100.940309[[0.37113402061850886, 0.5363735070574879, 0.5...
36seoulLightGBMsmotenc_ctgan100000.5391450.6802380.941963[[0.4607201309328592, 0.5864583333332722, 0.57...
37busanLightGBMsmotenc_ctgan100000.4562990.6242990.957919[[0.40412979351026485, 0.4999999999999277, 0.4...
38incheonLightGBMsmotenc_ctgan100000.5474450.6821510.910282[[0.4681967213114447, 0.6035067873302826, 0.57...
39daeguLightGBMsmotenc_ctgan100000.4114170.5898030.967411[[0.4368131868130668, 0.45213849287159835, 0.3...
40daejeonLightGBMsmotenc_ctgan100000.4683420.6207890.931759[[0.42665388302968105, 0.4723481414324141, 0.5...
41gwangjuLightGBMsmotenc_ctgan100000.4859510.6384360.942780[[0.3915343915343397, 0.5499451152578979, 0.51...
42seoulLightGBMsmotenc_ctgan200000.5358810.6789790.941392[[0.4706840390879095, 0.5655314757481357, 0.57...
43busanLightGBMsmotenc_ctgan200000.4336270.6080140.956400[[0.3399089529589772, 0.5086956521738393, 0.45...
44incheonLightGBMsmotenc_ctgan200000.5647980.6889990.909293[[0.5387685290763661, 0.5963821368004185, 0.55...
45daeguLightGBMsmotenc_ctgan200000.4223370.5910760.963494[[0.42962962962952356, 0.43951612903216947, 0....
46daejeonLightGBMsmotenc_ctgan200000.4827380.6279630.931189[[0.4281636536631372, 0.4917627677100089, 0.52...
47gwangjuLightGBMsmotenc_ctgan200000.4753480.6188300.934692[[0.3949903660885939, 0.5378704720087225, 0.49...
\n", - "
" - ], - "text/plain": [ - " region model data_sample CSI MCC Accuracy \\\n", - "0 seoul LightGBM pure 0.505041 0.646992 0.936174 \n", - "1 busan LightGBM pure 0.430188 0.600801 0.956971 \n", - "2 incheon LightGBM pure 0.554663 0.687951 0.911954 \n", - "3 daegu LightGBM pure 0.292340 0.481989 0.956964 \n", - "4 daejeon LightGBM pure 0.478437 0.625244 0.932748 \n", - "5 gwangju LightGBM pure 0.482777 0.636815 0.943236 \n", - "6 seoul LightGBM smote 0.578939 0.708499 0.939995 \n", - "7 busan LightGBM smote 0.466021 0.631880 0.950198 \n", - "8 incheon LightGBM smote 0.583560 0.706137 0.910464 \n", - "9 daegu LightGBM smote 0.447354 0.616921 0.963730 \n", - "10 daejeon LightGBM smote 0.521335 0.656321 0.930621 \n", - "11 gwangju LightGBM smote 0.522731 0.660423 0.936850 \n", - "12 seoul LightGBM ctgan20000 0.543010 0.678072 0.940934 \n", - "13 busan LightGBM ctgan20000 0.466346 0.626520 0.957503 \n", - "14 incheon LightGBM ctgan20000 0.566681 0.688934 0.907626 \n", - "15 daegu LightGBM ctgan20000 0.440419 0.608625 0.967447 \n", - "16 daejeon LightGBM ctgan20000 0.480839 0.625832 0.931760 \n", - "17 gwangju LightGBM ctgan20000 0.493713 0.637313 0.936783 \n", - "18 seoul LightGBM ctgan10000 0.548902 0.686531 0.943140 \n", - "19 busan LightGBM ctgan10000 0.467663 0.631435 0.959516 \n", - "20 incheon LightGBM ctgan10000 0.553099 0.687651 0.912108 \n", - "21 daegu LightGBM ctgan10000 0.406700 0.578623 0.966763 \n", - "22 daejeon LightGBM ctgan10000 0.478241 0.626177 0.932976 \n", - "23 gwangju LightGBM ctgan10000 0.481417 0.630243 0.941869 \n", - "24 seoul LightGBM ctgan7000 0.548318 0.687390 0.943140 \n", - "25 busan LightGBM ctgan7000 0.466374 0.634410 0.960050 \n", - "26 incheon LightGBM ctgan7000 0.540065 0.677611 0.909713 \n", - "27 daegu LightGBM ctgan7000 0.426128 0.599566 0.968853 \n", - "28 daejeon LightGBM ctgan7000 0.470145 0.619110 0.931494 \n", - "29 gwangju LightGBM ctgan7000 0.486034 0.634803 0.942667 \n", - "30 seoul LightGBM smotenc_ctgan7000 0.535260 0.678219 0.941240 \n", - "31 busan LightGBM smotenc_ctgan7000 0.448540 0.617030 0.957082 \n", - "32 incheon LightGBM smotenc_ctgan7000 0.551851 0.686446 0.911078 \n", - "33 daegu LightGBM smotenc_ctgan7000 0.402243 0.583238 0.966424 \n", - "34 daejeon LightGBM smotenc_ctgan7000 0.476309 0.625184 0.932141 \n", - "35 gwangju LightGBM smotenc_ctgan7000 0.471552 0.623710 0.940309 \n", - "36 seoul LightGBM smotenc_ctgan10000 0.539145 0.680238 0.941963 \n", - "37 busan LightGBM smotenc_ctgan10000 0.456299 0.624299 0.957919 \n", - "38 incheon LightGBM smotenc_ctgan10000 0.547445 0.682151 0.910282 \n", - "39 daegu LightGBM smotenc_ctgan10000 0.411417 0.589803 0.967411 \n", - "40 daejeon LightGBM smotenc_ctgan10000 0.468342 0.620789 0.931759 \n", - "41 gwangju LightGBM smotenc_ctgan10000 0.485951 0.638436 0.942780 \n", - "42 seoul LightGBM smotenc_ctgan20000 0.535881 0.678979 0.941392 \n", - "43 busan LightGBM smotenc_ctgan20000 0.433627 0.608014 0.956400 \n", - "44 incheon LightGBM smotenc_ctgan20000 0.564798 0.688999 0.909293 \n", - "45 daegu LightGBM smotenc_ctgan20000 0.422337 0.591076 0.963494 \n", - "46 daejeon LightGBM smotenc_ctgan20000 0.482738 0.627963 0.931189 \n", - "47 gwangju LightGBM smotenc_ctgan20000 0.475348 0.618830 0.934692 \n", - "\n", - " fold_csi \n", - "0 [[0.46595932802825235, 0.5771195097037204, 0.4... \n", - "1 [[0.32824427480911017, 0.4782608695651431, 0.4... \n", - "2 [[0.4845292955891715, 0.6037628278220865, 0.57... \n", - "3 [[0.28124999999994504, 0.3320537428022395, 0.2... \n", - "4 [[0.43333333333329205, 0.4547920433995972, 0.5... \n", - "5 [[0.3928095872169916, 0.5461624026695722, 0.50... \n", - "6 [[0.4550682961897588, 0.6503831417623898, 0.63... \n", - "7 [[0.4690909090908522, 0.47058823529405874, 0.4... \n", - "8 [[0.5613293051359177, 0.6213080168776044, 0.56... \n", - "9 [[0.3632567849686089, 0.49122807017536024, 0.4... \n", - "10 [[0.4589041095890018, 0.5326514555467716, 0.57... \n", - "11 [[0.47532729103721294, 0.5410958904109059, 0.5... \n", - "12 [[0.47165160230070075, 0.5903500473036338, 0.5... \n", - "13 [[0.4154262516914187, 0.49489051094883285, 0.4... \n", - "14 [[0.5311667554608135, 0.6098117512834792, 0.55... \n", - "15 [[0.46130952380938656, 0.4414784394249607, 0.4... \n", - "16 [[0.42667928098387636, 0.48797250859102337, 0.... \n", - "17 [[0.42775665399235474, 0.5447427293064268, 0.5... \n", - "18 [[0.482333607230856, 0.5810397553516227, 0.583... \n", - "19 [[0.4084084084083471, 0.5208955223879819, 0.47... \n", - "20 [[0.4707429322813629, 0.6094198736358064, 0.57... \n", - "21 [[0.44943820224706477, 0.45661157024783955, 0.... \n", - "22 [[0.43804034582128354, 0.4569356300996866, 0.5... \n", - "23 [[0.38888888888883977, 0.53999999999994, 0.515... \n", - "24 [[0.4815724815724421, 0.5754132231404364, 0.58... \n", - "25 [[0.3847328244274221, 0.5324074074073252, 0.48... \n", - "26 [[0.45490716180368335, 0.6001144164759382, 0.5... \n", - "27 [[0.4832826747718896, 0.4640657084187959, 0.33... \n", - "28 [[0.4214559386972776, 0.4623753399818257, 0.52... \n", - "29 [[0.3902439024389743, 0.543429844097935, 0.524... \n", - "30 [[0.46849757673663417, 0.5743801652891969, 0.5... \n", - "31 [[0.38415545590427735, 0.4850640113797318, 0.4... \n", - "32 [[0.4885695623774991, 0.6043577981651029, 0.56... \n", - "33 [[0.41432225063928024, 0.4475806451612001, 0.3... \n", - "34 [[0.4230038022813286, 0.47079964061091906, 0.5... \n", - "35 [[0.37113402061850886, 0.5363735070574879, 0.5... \n", - "36 [[0.4607201309328592, 0.5864583333332722, 0.57... \n", - "37 [[0.40412979351026485, 0.4999999999999277, 0.4... \n", - "38 [[0.4681967213114447, 0.6035067873302826, 0.57... \n", - "39 [[0.4368131868130668, 0.45213849287159835, 0.3... \n", - "40 [[0.42665388302968105, 0.4723481414324141, 0.5... \n", - "41 [[0.3915343915343397, 0.5499451152578979, 0.51... \n", - "42 [[0.4706840390879095, 0.5655314757481357, 0.57... \n", - "43 [[0.3399089529589772, 0.5086956521738393, 0.45... \n", - "44 [[0.5387685290763661, 0.5963821368004185, 0.55... \n", - "45 [[0.42962962962952356, 0.43951612903216947, 0.... \n", - "46 [[0.4281636536631372, 0.4917627677100089, 0.52... \n", - "47 [[0.3949903660885939, 0.5378704720087225, 0.49... " - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 개별 모델 결과 저장\n", - "df.to_csv(\"../../data/oversampled_data_test_for_model/lightgbm_sampled_data_test.csv\", index=False)\n", - "\n", - "df.to_csv(\"../../data/oversampled_data_test_for_model/combined_sampled_data_test.csv\", index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py39", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +version https://git-lfs.github.com/spec/v1 +oid sha256:d54ff7f9df46e1abe78486edc7cde884c864086006e1013e84061ff54065c4ed +size 240922