diff --git "a/Analysis_code/4.oversampling_data_test/xgb_sampled_test.ipynb" "b/Analysis_code/4.oversampling_data_test/xgb_sampled_test.ipynb" --- "a/Analysis_code/4.oversampling_data_test/xgb_sampled_test.ipynb" +++ "b/Analysis_code/4.oversampling_data_test/xgb_sampled_test.ipynb" @@ -1,4547 +1,3 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **XGBoost**" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.model_selection import train_test_split\n", - "from xgboost import XGBClassifier\n", - "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n", - "from collections import Counter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " 제거했던 파생 변수들을 복구\n", - " \n", - " Args:\n", - " df: 데이터프레임\n", - " \n", - " Returns:\n", - " 파생 변수가 추가된 데이터프레임\n", - " \"\"\"\n", - " df = df.copy()\n", - " df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)\n", - " df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)\n", - " df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)\n", - " df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)\n", - " df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']\n", - " return df\n", - "\n", - "\n", - "def preprocessing(df):\n", - " df = df[df.columns].copy()\n", - " df['year'] = df['year'].astype('int')\n", - " df['month'] = df['month'].astype('int')\n", - " df['hour'] = df['hour'].astype('int')\n", - " df= add_derived_features(df).copy()\n", - " df['multi_class'] = df['multi_class'].astype('int')\n", - "\n", - " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n", - " df['wind_dir'] = df['wind_dir'].astype('int')\n", - " df= df[['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',\n", - " 'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',\n", - " 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',\n", - " 'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',\n", - " 'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',\n", - " 'month_sin', 'month_cos','multi_class']].copy()\n", - " return df\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "df_seoul = pd.read_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n", - "df_busan = pd.read_csv(\"../../data/data_for_modeling/busan_train.csv\")\n", - "df_daegu = pd.read_csv(\"../../data/data_for_modeling/daegu_train.csv\")\n", - "df_daejeon = pd.read_csv(\"../../data/data_for_modeling/daejeon_train.csv\")\n", - "df_incheon = pd.read_csv(\"../../data/data_for_modeling/incheon_train.csv\")\n", - "df_gwangju = pd.read_csv(\"../../data/data_for_modeling/gwangju_train.csv\")\n", - "\n", - "df_seoul = preprocessing(df_seoul).copy()\n", - "df_busan = preprocessing(df_busan).copy()\n", - "df_daegu = preprocessing(df_daegu).copy()\n", - "df_daejeon = preprocessing(df_daejeon).copy()\n", - "df_incheon = preprocessing(df_incheon).copy()\n", - "df_gwangju = preprocessing(df_gwangju).copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "seoul : Counter({2: 23686, 1: 2579, 0: 39})\n", - "\n", - "busan : Counter({2: 24694, 1: 1516, 0: 94})\n", - "\n", - "daegu : Counter({2: 25149, 1: 1107, 0: 48})\n", - "\n", - "gwangju : Counter({2: 23798, 1: 2411, 0: 95})\n", - "\n", - "daejeon : Counter({2: 23471, 1: 2660, 0: 173})\n", - "\n", - "incheon : Counter({2: 21893, 1: 3892, 0: 519})\n" - ] - } - ], - "source": [ - "print(\"seoul : \", Counter(df_seoul['multi_class']))\n", - "print()\n", - "print(\"busan : \", Counter(df_busan['multi_class']))\n", - "print()\n", - "print(\"daegu : \", Counter(df_daegu['multi_class']))\n", - "print()\n", - "print(\"gwangju : \", Counter(df_gwangju['multi_class']))\n", - "print()\n", - "print(\"daejeon : \", Counter(df_daejeon['multi_class']))\n", - "print()\n", - "print(\"incheon : \", Counter(df_incheon['multi_class']))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 1, 2])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.unique(df_seoul['multi_class'])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.utils.class_weight import compute_class_weight\n", - "\n", - "def calculate_csi(Y_test, pred):\n", - "\n", - " cm = confusion_matrix(Y_test, pred) # 변수 이름을 cm으로 변경\n", - " # 혼동 행렬에서 H, F, M 추출\n", - " H = (cm[0, 0] + cm[1, 1])\n", - " \n", - " F = (cm[1, 0] + cm[2, 0] +\n", - " cm[0, 1] + cm[2, 1])\n", - " \n", - " M = (cm[0, 2] + cm[1, 2])\n", - " \n", - " # CSI 계산\n", - " CSI = H / (H + F + M + 1e-10)\n", - " return CSI\n", - "\n", - "def eval_metric_csi(y_true, pred_prob):\n", - "\n", - " pred = np.argmax(pred_prob, axis=1)\n", - " y_true = y_true\n", - " y_pred = pred\n", - " csi = calculate_csi(y_true, y_pred)\n", - " return -1*csi\n", - "\n", - "def sample_weight(y_train):\n", - " class_weights = compute_class_weight(\n", - " class_weight='balanced',\n", - " classes=np.unique(y_train), # 고유 클래스\n", - " y=y_train # 학습 데이터 레이블\n", - " )\n", - " sample_weights = np.array([class_weights[label] for label in y_train])\n", - "\n", - " return sample_weights\n", - "\n", - "\n", - "\n", - "from sklearn.metrics import matthews_corrcoef\n", - "\n", - "def multiclass_mcc(y_val, y_pred):\n", - " \"\"\"\n", - " 다중 분류에서도 sklearn의 matthews_corrcoef를 그대로 사용할 수 있음.\n", - " \"\"\"\n", - " return matthews_corrcoef(y_val, y_pred)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# 사용자 정의 평가 지표 함수 정의\n", - "def f1_metric(y_true, pred):\n", - " y_pred_binary = (pred >= 0.5).astype(int) # 확률 값을 이진 값으로 변환\n", - " score = f1_score(y_true, y_pred_binary)\n", - " return 'f1', score, True # higher_better=True\n", - "\n", - "# 사용자 정의 평가 지표 함수 정의\n", - "def custom_metric(Y_true, preds): \n", - " pred = (preds >= 0.5).astype(int)\n", - " score = f1_score(Y_true, pred)\n", - " return -score" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "pre_sampled_data= []\n", - "smote_sample_data= []\n", - "gan20000_sample_data= []\n", - "gan10000_sample_data= []\n", - "gan7000_sample_data= []" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n", - "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n", - "from warnings import filterwarnings\n", - "filterwarnings('ignore')\n", - "\n", - "# XGBoost 분류기 초기화\n", - "xgb_model = XGBClassifier(\n", - " n_estimators=4000, # 약한 학습기 개수\n", - " tree_method='hist', \n", - " device='cuda', # GPU 사용\n", - " enable_categorical=True, # 범주형 변수 지정\n", - " objective='multi:softprob', \n", - " early_stopping_rounds=400, # 과적합 방지를 위한 조기 종료 설정\n", - " random_state= 42,\n", - " num_class=3,\n", - " eval_metric = eval_metric_csi\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "df= pd.DataFrame(columns=['region','model','data_sample','CSI','MCC','Accuracy','fold_csi'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.565011886957529\n", - "mean of accuracy : 0.9455723773402865\n", - "mean of mcc : 0.7004827765196581\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2019]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2018, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul.loc[df_seoul['year'].isin([2019, 2020]), df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul.loc[df_seoul['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4602444563246328\n", - "mean of accuracy : 0.9597446789929386\n", - "mean of mcc : 0.6260853753404706\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2019]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2018, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan.loc[df_busan['year'].isin([2019, 2020]), df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan.loc[df_busan['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5581443524210103\n", - "mean of accuracy : 0.9130578844058522\n", - "mean of mcc : 0.6892177198119791\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2019]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2018, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon.loc[df_incheon['year'].isin([2019, 2020]), df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon.loc[df_incheon['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4166514505499186\n", - "mean of accuracy : 0.9683242050719033\n", - "mean of mcc : 0.600956509945712\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2019]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2018, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu.loc[df_daegu['year'].isin([2019, 2020]), df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu.loc[df_daegu['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5044504047922097\n", - "mean of accuracy : 0.9357524265788357\n", - "mean of mcc : 0.6471930212882061\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.49003643817143744\n", - "mean of accuracy : 0.943428795402184\n", - "mean of mcc : 0.6379812479286601\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'pure',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **SMOTE 증강기법을 적용시킨 데이터셋에 대한 성능**" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "df_smote_busan_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_busan.csv\")\n", - "df_smote_busan_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_busan.csv\")\n", - "df_smote_busan_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_busan.csv\")\n", - "\n", - "df_smote_seoul_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_seoul.csv\")\n", - "df_smote_seoul_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_seoul.csv\")\n", - "df_smote_seoul_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_seoul.csv\")\n", - "\n", - "df_smote_daegu_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daegu.csv\")\n", - "df_smote_daegu_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daegu.csv\")\n", - "df_smote_daegu_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daegu.csv\")\n", - "\n", - "df_smote_daejeon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_daejeon.csv\")\n", - "df_smote_daejeon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_daejeon.csv\")\n", - "df_smote_daejeon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_daejeon.csv\")\n", - "\n", - "df_smote_gwangju_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_gwangju.csv\")\n", - "df_smote_gwangju_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_gwangju.csv\")\n", - "df_smote_gwangju_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_gwangju.csv\")\n", - "\n", - "df_smote_incheon_1 = pd.read_csv(\"../../data/data_oversampled/smote/smote_1_incheon.csv\")\n", - "df_smote_incheon_2 = pd.read_csv(\"../../data/data_oversampled/smote/smote_2_incheon.csv\")\n", - "df_smote_incheon_3 = pd.read_csv(\"../../data/data_oversampled/smote/smote_3_incheon.csv\")\n", - "\n", - "\n", - "df_smote_busan_1 = preprocessing(df_smote_busan_1)\n", - "df_smote_busan_2 = preprocessing(df_smote_busan_2)\n", - "df_smote_busan_3 = preprocessing(df_smote_busan_3)\n", - "\n", - "df_smote_seoul_1 = preprocessing(df_smote_seoul_1)\n", - "df_smote_seoul_2 = preprocessing(df_smote_seoul_2)\n", - "df_smote_seoul_3 = preprocessing(df_smote_seoul_3)\n", - "\n", - "df_smote_daegu_1 = preprocessing(df_smote_daegu_1)\n", - "df_smote_daegu_2 = preprocessing(df_smote_daegu_2)\n", - "df_smote_daegu_3 = preprocessing(df_smote_daegu_3)\n", - "\n", - "df_smote_daejeon_1 = preprocessing(df_smote_daejeon_1)\n", - "df_smote_daejeon_2 = preprocessing(df_smote_daejeon_2)\n", - "df_smote_daejeon_3 = preprocessing(df_smote_daejeon_3)\n", - "\n", - "df_smote_gwangju_1 = preprocessing(df_smote_gwangju_1)\n", - "df_smote_gwangju_2 = preprocessing(df_smote_gwangju_2)\n", - "df_smote_gwangju_3 = preprocessing(df_smote_gwangju_3)\n", - "\n", - "df_smote_incheon_1 = preprocessing(df_smote_incheon_1)\n", - "df_smote_incheon_2 = preprocessing(df_smote_incheon_2)\n", - "df_smote_incheon_3 = preprocessing(df_smote_incheon_3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5822658808056899\n", - "mean of accuracy : 0.9420404263459506\n", - "mean of mcc : 0.7105015517582441\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), df_smote_seoul_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_smote_seoul_1.loc[df_smote_seoul_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), df_smote_seoul_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_smote_seoul_2.loc[df_smote_seoul_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), df_smote_seoul_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_smote_seoul_3.loc[df_smote_seoul_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46696555439565635\n", - "mean of accuracy : 0.9524436293468407\n", - "mean of mcc : 0.6292568508681569\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), df_smote_busan_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_smote_busan_1.loc[df_smote_busan_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), df_smote_busan_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_smote_busan_2.loc[df_smote_busan_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), df_smote_busan_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_smote_busan_3.loc[df_smote_busan_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.589146277616683\n", - "mean of accuracy : 0.9121357012750456\n", - "mean of mcc : 0.7091428152788728\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), df_smote_incheon_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_smote_incheon_1.loc[df_smote_incheon_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), df_smote_incheon_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_smote_incheon_2.loc[df_smote_incheon_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), df_smote_incheon_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_smote_incheon_3.loc[df_smote_incheon_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4540663399321314\n", - "mean of accuracy : 0.9649076569270821\n", - "mean of mcc : 0.6209477801158471\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), df_smote_daegu_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_smote_daegu_1.loc[df_smote_daegu_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), df_smote_daegu_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_smote_daegu_2.loc[df_smote_daegu_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), df_smote_daegu_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_smote_daegu_3.loc[df_smote_daegu_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5164940408638485\n", - "mean of accuracy : 0.9314560096813634\n", - "mean of mcc : 0.651202614107659\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), df_smote_daejeon_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_smote_daejeon_1.loc[df_smote_daejeon_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), df_smote_daejeon_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_smote_daejeon_2.loc[df_smote_daejeon_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), df_smote_daejeon_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_smote_daejeon_3.loc[df_smote_daejeon_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5121121813474988\n", - "mean of accuracy : 0.9352206377722885\n", - "mean of mcc : 0.6522384095048649\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), df_smote_gwangju_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_smote_gwangju_1.loc[df_smote_gwangju_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), df_smote_gwangju_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_smote_gwangju_2.loc[df_smote_gwangju_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), df_smote_gwangju_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_smote_gwangju_3.loc[df_smote_gwangju_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smote',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv\")\n", - "df_seoul_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv\")\n", - "df_incheon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv\")\n", - "df_daegu_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv\")\n", - "df_daejeon_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_gan20000_1= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv\")\n", - "df_seoul_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv\")\n", - "df_incheon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv\")\n", - "df_daegu_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv\")\n", - "df_daejeon_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_gan20000_2= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv\")\n", - "df_seoul_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv\")\n", - "df_incheon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv\")\n", - "df_daegu_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv\")\n", - "df_daejeon_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_gan20000_3= pd.read_csv(\"../../data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv\")\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan20000_1= preprocessing(df_busan_gan20000_1).copy()\n", - "df_seoul_gan20000_1= preprocessing(df_seoul_gan20000_1).copy()\n", - "df_incheon_gan20000_1= preprocessing(df_incheon_gan20000_1).copy()\n", - "df_daegu_gan20000_1= preprocessing(df_daegu_gan20000_1).copy()\n", - "df_daejeon_gan20000_1= preprocessing(df_daejeon_gan20000_1).copy()\n", - "df_gwangju_gan20000_1= preprocessing(df_gwangju_gan20000_1).copy()\n", - "\n", - "df_busan_gan20000_2= preprocessing(df_busan_gan20000_2).copy()\n", - "df_seoul_gan20000_2= preprocessing(df_seoul_gan20000_2).copy()\n", - "df_incheon_gan20000_2= preprocessing(df_incheon_gan20000_2).copy()\n", - "df_daegu_gan20000_2= preprocessing(df_daegu_gan20000_2).copy()\n", - "df_daejeon_gan20000_2= preprocessing(df_daejeon_gan20000_2).copy()\n", - "df_gwangju_gan20000_2= preprocessing(df_gwangju_gan20000_2).copy()\n", - "\n", - "df_busan_gan20000_3= preprocessing(df_busan_gan20000_3).copy()\n", - "df_seoul_gan20000_3= preprocessing(df_seoul_gan20000_3).copy()\n", - "df_incheon_gan20000_3= preprocessing(df_incheon_gan20000_3).copy()\n", - "df_daegu_gan20000_3= preprocessing(df_daegu_gan20000_3).copy()\n", - "df_daejeon_gan20000_3= preprocessing(df_daejeon_gan20000_3).copy()\n", - "df_gwangju_gan20000_3= preprocessing(df_gwangju_gan20000_3).copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5612905206753843\n", - "mean of accuracy : 0.9447758897455731\n", - "mean of mcc : 0.6967744639852204\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), df_seoul_gan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan20000_1.loc[df_seoul_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), df_seoul_gan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan20000_2.loc[df_seoul_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), df_seoul_gan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan20000_3.loc[df_seoul_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4637131316761601\n", - "mean of accuracy : 0.958679126015753\n", - "mean of mcc : 0.6292212200611739\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), df_busan_gan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan20000_1.loc[df_busan_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), df_busan_gan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan20000_2.loc[df_busan_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), df_busan_gan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan20000_3.loc[df_busan_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5532678910387965\n", - "mean of accuracy : 0.9112315043541184\n", - "mean of mcc : 0.685576203077003\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), df_incheon_gan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan20000_1.loc[df_incheon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), df_incheon_gan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan20000_2.loc[df_incheon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), df_incheon_gan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan20000_3.loc[df_incheon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.42241695688681397\n", - "mean of accuracy : 0.9666134358027464\n", - "mean of mcc : 0.5955993347276635\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), df_daegu_gan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan20000_1.loc[df_daegu_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), df_daegu_gan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan20000_2.loc[df_daegu_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), df_daegu_gan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan20000_3.loc[df_daegu_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.48679908438561315\n", - "mean of accuracy : 0.9327868852459016\n", - "mean of mcc : 0.6323632523449321\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), df_daejeon_gan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_1.loc[df_daejeon_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), df_daejeon_gan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_2.loc[df_daejeon_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), df_daejeon_gan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan20000_3.loc[df_daejeon_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.48353837097976565\n", - "mean of accuracy : 0.9410715622426827\n", - "mean of mcc : 0.6300846989412464\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), df_gwangju_gan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_1.loc[df_gwangju_gan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), df_gwangju_gan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_2.loc[df_gwangju_gan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), df_gwangju_gan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan20000_3.loc[df_gwangju_gan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv\")\n", - "df_seoul_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv\")\n", - "df_incheon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv\")\n", - "df_daegu_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv\")\n", - "df_daejeon_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv\")\n", - "df_gwangju_gan10000_1= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv\")\n", - "df_seoul_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv\")\n", - "df_incheon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv\")\n", - "df_daegu_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv\")\n", - "df_daejeon_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_gan10000_2= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv\")\n", - "df_seoul_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv\")\n", - "df_incheon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv\")\n", - "df_daegu_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv\")\n", - "df_daejeon_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_gan10000_3= pd.read_csv(\"../../data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan10000_1= preprocessing(df_busan_gan10000_1).copy()\n", - "df_seoul_gan10000_1= preprocessing(df_seoul_gan10000_1).copy()\n", - "df_incheon_gan10000_1= preprocessing(df_incheon_gan10000_1).copy()\n", - "df_daegu_gan10000_1= preprocessing(df_daegu_gan10000_1).copy()\n", - "df_daejeon_gan10000_1= preprocessing(df_daejeon_gan10000_1).copy()\n", - "df_gwangju_gan10000_1= preprocessing(df_gwangju_gan10000_1).copy()\n", - "\n", - "df_busan_gan10000_2= preprocessing(df_busan_gan10000_2).copy()\n", - "df_seoul_gan10000_2= preprocessing(df_seoul_gan10000_2).copy()\n", - "df_incheon_gan10000_2= preprocessing(df_incheon_gan10000_2).copy()\n", - "df_daegu_gan10000_2= preprocessing(df_daegu_gan10000_2).copy()\n", - "df_daejeon_gan10000_2= preprocessing(df_daejeon_gan10000_2).copy()\n", - "df_gwangju_gan10000_2= preprocessing(df_gwangju_gan10000_2).copy()\n", - "\n", - "df_busan_gan10000_3= preprocessing(df_busan_gan10000_3).copy()\n", - "df_seoul_gan10000_3= preprocessing(df_seoul_gan10000_3).copy()\n", - "df_incheon_gan10000_3= preprocessing(df_incheon_gan10000_3).copy()\n", - "df_daegu_gan10000_3= preprocessing(df_daegu_gan10000_3).copy()\n", - "df_daejeon_gan10000_3= preprocessing(df_daejeon_gan10000_3).copy()\n", - "df_gwangju_gan10000_3= preprocessing(df_gwangju_gan10000_3).copy()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5726452857277949\n", - "mean of accuracy : 0.9459915703936588\n", - "mean of mcc : 0.7050455795057567\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), df_seoul_gan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan10000_1.loc[df_seoul_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), df_seoul_gan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan10000_2.loc[df_seoul_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), df_seoul_gan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan10000_3.loc[df_seoul_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46215668360603956\n", - "mean of accuracy : 0.957996793672181\n", - "mean of mcc : 0.6232604815328809\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), df_busan_gan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan10000_1.loc[df_busan_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), df_busan_gan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan10000_2.loc[df_busan_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), df_busan_gan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan10000_3.loc[df_busan_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5588257624146101\n", - "mean of accuracy : 0.9126001197694439\n", - "mean of mcc : 0.6900542521034104\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), df_incheon_gan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan10000_1.loc[df_incheon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), df_incheon_gan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan10000_2.loc[df_incheon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), df_incheon_gan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan10000_3.loc[df_incheon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.42209289106237763\n", - "mean of accuracy : 0.9674885220949672\n", - "mean of mcc : 0.598275696961497\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), df_daegu_gan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan10000_1.loc[df_daegu_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), df_daegu_gan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan10000_2.loc[df_daegu_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), df_daegu_gan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan10000_3.loc[df_daegu_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4992083791607569\n", - "mean of accuracy : 0.9357514908800559\n", - "mean of mcc : 0.6446946657921234\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), df_daejeon_gan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_1.loc[df_daejeon_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), df_daejeon_gan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_2.loc[df_daejeon_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), df_daejeon_gan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan10000_3.loc[df_daejeon_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4948663541817706\n", - "mean of accuracy : 0.9419830368207868\n", - "mean of mcc : 0.6398305973894735\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), df_gwangju_gan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_1.loc[df_gwangju_gan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), df_gwangju_gan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_2.loc[df_gwangju_gan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), df_gwangju_gan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan10000_3.loc[df_gwangju_gan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_busan.csv\")\n", - "df_seoul_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_seoul.csv\")\n", - "df_incheon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_incheon.csv\")\n", - "df_daegu_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daegu.csv\")\n", - "df_daejeon_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_gan7000_1= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_busan.csv\")\n", - "df_seoul_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_seoul.csv\")\n", - "df_incheon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_incheon.csv\")\n", - "df_daegu_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daegu.csv\")\n", - "df_daejeon_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_gan7000_2= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_busan.csv\")\n", - "df_seoul_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_seoul.csv\")\n", - "df_incheon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_incheon.csv\")\n", - "df_daegu_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daegu.csv\")\n", - "df_daejeon_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_gan7000_3= pd.read_csv(\"../../data/data_oversampled/ctgan7000/ctgan7000_3_gwangju.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_gan7000_1= preprocessing(df_busan_gan7000_1).copy()\n", - "df_seoul_gan7000_1= preprocessing(df_seoul_gan7000_1).copy()\n", - "df_incheon_gan7000_1= preprocessing(df_incheon_gan7000_1).copy()\n", - "df_daegu_gan7000_1= preprocessing(df_daegu_gan7000_1).copy()\n", - "df_daejeon_gan7000_1= preprocessing(df_daejeon_gan7000_1).copy()\n", - "df_gwangju_gan7000_1= preprocessing(df_gwangju_gan7000_1).copy()\n", - "\n", - "df_busan_gan7000_2= preprocessing(df_busan_gan7000_2).copy()\n", - "df_seoul_gan7000_2= preprocessing(df_seoul_gan7000_2).copy()\n", - "df_incheon_gan7000_2= preprocessing(df_incheon_gan7000_2).copy()\n", - "df_daegu_gan7000_2= preprocessing(df_daegu_gan7000_2).copy()\n", - "df_daejeon_gan7000_2= preprocessing(df_daejeon_gan7000_2).copy()\n", - "df_gwangju_gan7000_2= preprocessing(df_gwangju_gan7000_2).copy()\n", - "\n", - "df_busan_gan7000_3= preprocessing(df_busan_gan7000_3).copy()\n", - "df_seoul_gan7000_3= preprocessing(df_seoul_gan7000_3).copy()\n", - "df_incheon_gan7000_3= preprocessing(df_incheon_gan7000_3).copy()\n", - "df_daegu_gan7000_3= preprocessing(df_daegu_gan7000_3).copy()\n", - "df_daejeon_gan7000_3= preprocessing(df_daejeon_gan7000_3).copy()\n", - "df_gwangju_gan7000_3= preprocessing(df_gwangju_gan7000_3).copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5606594873679361\n", - "mean of accuracy : 0.9448886934318104\n", - "mean of mcc : 0.6970239928104922\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), df_seoul_gan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_gan7000_1.loc[df_seoul_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), df_seoul_gan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_gan7000_2.loc[df_seoul_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), df_seoul_gan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_gan7000_3.loc[df_seoul_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.46498805766493717\n", - "mean of accuracy : 0.959059435586496\n", - "mean of mcc : 0.6331340983154944\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), df_busan_gan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_gan7000_1.loc[df_busan_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), df_busan_gan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_gan7000_2.loc[df_busan_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), df_busan_gan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_gan7000_3.loc[df_busan_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5617695510575865\n", - "mean of accuracy : 0.9131326363417088\n", - "mean of mcc : 0.6926108566907646\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), df_incheon_gan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_gan7000_1.loc[df_incheon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), df_incheon_gan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_gan7000_2.loc[df_incheon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), df_incheon_gan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_gan7000_3.loc[df_incheon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4371777220246276\n", - "mean of accuracy : 0.9704867089186648\n", - "mean of mcc : 0.6091415637717575\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), df_daegu_gan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_gan7000_1.loc[df_daegu_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), df_daegu_gan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_gan7000_2.loc[df_daegu_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), df_daegu_gan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_gan7000_3.loc[df_daegu_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.49032336332073895\n", - "mean of accuracy : 0.9335084129716962\n", - "mean of mcc : 0.6358762844808045\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), df_daejeon_gan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_1.loc[df_daejeon_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), df_daejeon_gan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_2.loc[df_daejeon_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), df_daejeon_gan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_gan7000_3.loc[df_daejeon_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4977293677719481\n", - "mean of accuracy : 0.942971238698838\n", - "mean of mcc : 0.6427810028391122\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), df_gwangju_gan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_1.loc[df_gwangju_gan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), df_gwangju_gan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_2.loc[df_gwangju_gan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), df_gwangju_gan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_gan7000_3.loc[df_gwangju_gan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **SMOTENC+CTGAN을 통해 데이터 증강을 진행한 데이터셋에 대한 성능**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **7천개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan7000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan7000_1 = preprocessing(df_busan_smotenc_ctgan7000_1).copy()\n", - "df_seoul_smotenc_ctgan7000_1 = preprocessing(df_seoul_smotenc_ctgan7000_1).copy()\n", - "df_incheon_smotenc_ctgan7000_1 = preprocessing(df_incheon_smotenc_ctgan7000_1).copy()\n", - "df_daegu_smotenc_ctgan7000_1 = preprocessing(df_daegu_smotenc_ctgan7000_1).copy()\n", - "df_daejeon_smotenc_ctgan7000_1 = preprocessing(df_daejeon_smotenc_ctgan7000_1).copy()\n", - "df_gwangju_smotenc_ctgan7000_1 = preprocessing(df_gwangju_smotenc_ctgan7000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_2 = preprocessing(df_busan_smotenc_ctgan7000_2).copy()\n", - "df_seoul_smotenc_ctgan7000_2 = preprocessing(df_seoul_smotenc_ctgan7000_2).copy()\n", - "df_incheon_smotenc_ctgan7000_2 = preprocessing(df_incheon_smotenc_ctgan7000_2).copy()\n", - "df_daegu_smotenc_ctgan7000_2 = preprocessing(df_daegu_smotenc_ctgan7000_2).copy()\n", - "df_daejeon_smotenc_ctgan7000_2 = preprocessing(df_daejeon_smotenc_ctgan7000_2).copy()\n", - "df_gwangju_smotenc_ctgan7000_2 = preprocessing(df_gwangju_smotenc_ctgan7000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan7000_3 = preprocessing(df_busan_smotenc_ctgan7000_3).copy()\n", - "df_seoul_smotenc_ctgan7000_3 = preprocessing(df_seoul_smotenc_ctgan7000_3).copy()\n", - "df_incheon_smotenc_ctgan7000_3 = preprocessing(df_incheon_smotenc_ctgan7000_3).copy()\n", - "df_daegu_smotenc_ctgan7000_3 = preprocessing(df_daegu_smotenc_ctgan7000_3).copy()\n", - "df_daejeon_smotenc_ctgan7000_3 = preprocessing(df_daejeon_smotenc_ctgan7000_3).copy()\n", - "df_gwangju_smotenc_ctgan7000_3 = preprocessing(df_gwangju_smotenc_ctgan7000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5566046896452139\n", - "mean of accuracy : 0.9440149587044938\n", - "mean of mcc : 0.6956293176165902\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan7000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_1.loc[df_seoul_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan7000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_2.loc[df_seoul_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan7000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan7000_3.loc[df_seoul_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4596744180737112\n", - "mean of accuracy : 0.9579584300222073\n", - "mean of mcc : 0.6249928154033518\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan7000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_1.loc[df_busan_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan7000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_2.loc[df_busan_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan7000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan7000_3.loc[df_busan_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.561186125284916\n", - "mean of accuracy : 0.9126745598057072\n", - "mean of mcc : 0.692361701378235\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan7000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_1.loc[df_incheon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan7000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_2.loc[df_incheon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan7000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan7000_3.loc[df_incheon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.41098147807826974\n", - "mean of accuracy : 0.9657802480225565\n", - "mean of mcc : 0.5914350093053913\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan7000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_1.loc[df_daegu_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan7000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_2.loc[df_daegu_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan7000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan7000_3.loc[df_daegu_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4978467120830509\n", - "mean of accuracy : 0.9340802288927149\n", - "mean of mcc : 0.6418086424001179\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan7000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_1.loc[df_daejeon_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan7000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_2.loc[df_daejeon_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan7000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan7000_3.loc[df_daejeon_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.48499027571193204\n", - "mean of accuracy : 0.9404240586870275\n", - "mean of mcc : 0.6315570899217192\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan7000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_1.loc[df_gwangju_smotenc_ctgan7000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan7000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_2.loc[df_gwangju_smotenc_ctgan7000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan7000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan7000_3.loc[df_gwangju_smotenc_ctgan7000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan7000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **1만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan10000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan10000_1 = preprocessing(df_busan_smotenc_ctgan10000_1).copy()\n", - "df_seoul_smotenc_ctgan10000_1 = preprocessing(df_seoul_smotenc_ctgan10000_1).copy()\n", - "df_incheon_smotenc_ctgan10000_1 = preprocessing(df_incheon_smotenc_ctgan10000_1).copy()\n", - "df_daegu_smotenc_ctgan10000_1 = preprocessing(df_daegu_smotenc_ctgan10000_1).copy()\n", - "df_daejeon_smotenc_ctgan10000_1 = preprocessing(df_daejeon_smotenc_ctgan10000_1).copy()\n", - "df_gwangju_smotenc_ctgan10000_1 = preprocessing(df_gwangju_smotenc_ctgan10000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_2 = preprocessing(df_busan_smotenc_ctgan10000_2).copy()\n", - "df_seoul_smotenc_ctgan10000_2 = preprocessing(df_seoul_smotenc_ctgan10000_2).copy()\n", - "df_incheon_smotenc_ctgan10000_2 = preprocessing(df_incheon_smotenc_ctgan10000_2).copy()\n", - "df_daegu_smotenc_ctgan10000_2 = preprocessing(df_daegu_smotenc_ctgan10000_2).copy()\n", - "df_daejeon_smotenc_ctgan10000_2 = preprocessing(df_daejeon_smotenc_ctgan10000_2).copy()\n", - "df_gwangju_smotenc_ctgan10000_2 = preprocessing(df_gwangju_smotenc_ctgan10000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan10000_3 = preprocessing(df_busan_smotenc_ctgan10000_3).copy()\n", - "df_seoul_smotenc_ctgan10000_3 = preprocessing(df_seoul_smotenc_ctgan10000_3).copy()\n", - "df_incheon_smotenc_ctgan10000_3 = preprocessing(df_incheon_smotenc_ctgan10000_3).copy()\n", - "df_daegu_smotenc_ctgan10000_3 = preprocessing(df_daegu_smotenc_ctgan10000_3).copy()\n", - "df_daejeon_smotenc_ctgan10000_3 = preprocessing(df_daejeon_smotenc_ctgan10000_3).copy()\n", - "df_gwangju_smotenc_ctgan10000_3 = preprocessing(df_gwangju_smotenc_ctgan10000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **서울**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5584588498586123\n", - "mean of accuracy : 0.9448134216632981\n", - "mean of mcc : 0.6964135324563553\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan10000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_1.loc[df_seoul_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan10000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_2.loc[df_seoul_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan10000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan10000_3.loc[df_seoul_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4598598851057194\n", - "mean of accuracy : 0.9579939865758414\n", - "mean of mcc : 0.6250673436769149\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan10000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_1.loc[df_busan_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan10000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_2.loc[df_busan_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan10000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan10000_3.loc[df_busan_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5635605041048001\n", - "mean of accuracy : 0.9126383794528866\n", - "mean of mcc : 0.6931491104960332\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan10000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_1.loc[df_incheon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan10000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_2.loc[df_incheon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan10000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan10000_3.loc[df_incheon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.41696614125706577\n", - "mean of accuracy : 0.9666141635684641\n", - "mean of mcc : 0.592889371098272\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan10000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_1.loc[df_daegu_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan10000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_2.loc[df_daegu_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan10000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan10000_3.loc[df_daegu_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4867345078568066\n", - "mean of accuracy : 0.9333559980371119\n", - "mean of mcc : 0.6340727122032903\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan10000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_1.loc[df_daejeon_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan10000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_2.loc[df_daejeon_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan10000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan10000_3.loc[df_daejeon_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.492041646532344\n", - "mean of accuracy : 0.9422477356089528\n", - "mean of mcc : 0.6400420804072541\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan10000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_1.loc[df_gwangju_smotenc_ctgan10000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan10000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_2.loc[df_gwangju_smotenc_ctgan10000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan10000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan10000_3.loc[df_gwangju_smotenc_ctgan10000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan10000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **2만개**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "# 1 Fold\n", - "df_busan_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_1 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_gwangju.csv\")\n", - "\n", - "# 2 Fold\n", - "df_busan_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_2 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_gwangju.csv\")\n", - "\n", - "# 3 Fold\n", - "df_busan_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_busan.csv\")\n", - "df_seoul_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_seoul.csv\")\n", - "df_incheon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_incheon.csv\")\n", - "df_daegu_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daegu.csv\")\n", - "df_daejeon_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_daejeon.csv\")\n", - "df_gwangju_smotenc_ctgan20000_3 = pd.read_csv(\"../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_gwangju.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "df_busan_smotenc_ctgan20000_1 = preprocessing(df_busan_smotenc_ctgan20000_1).copy()\n", - "df_seoul_smotenc_ctgan20000_1 = preprocessing(df_seoul_smotenc_ctgan20000_1).copy()\n", - "df_incheon_smotenc_ctgan20000_1 = preprocessing(df_incheon_smotenc_ctgan20000_1).copy()\n", - "df_daegu_smotenc_ctgan20000_1 = preprocessing(df_daegu_smotenc_ctgan20000_1).copy()\n", - "df_daejeon_smotenc_ctgan20000_1 = preprocessing(df_daejeon_smotenc_ctgan20000_1).copy()\n", - "df_gwangju_smotenc_ctgan20000_1 = preprocessing(df_gwangju_smotenc_ctgan20000_1).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_2 = preprocessing(df_busan_smotenc_ctgan20000_2).copy()\n", - "df_seoul_smotenc_ctgan20000_2 = preprocessing(df_seoul_smotenc_ctgan20000_2).copy()\n", - "df_incheon_smotenc_ctgan20000_2 = preprocessing(df_incheon_smotenc_ctgan20000_2).copy()\n", - "df_daegu_smotenc_ctgan20000_2 = preprocessing(df_daegu_smotenc_ctgan20000_2).copy()\n", - "df_daejeon_smotenc_ctgan20000_2 = preprocessing(df_daejeon_smotenc_ctgan20000_2).copy()\n", - "df_gwangju_smotenc_ctgan20000_2 = preprocessing(df_gwangju_smotenc_ctgan20000_2).copy()\n", - "\n", - "df_busan_smotenc_ctgan20000_3 = preprocessing(df_busan_smotenc_ctgan20000_3).copy()\n", - "df_seoul_smotenc_ctgan20000_3 = preprocessing(df_seoul_smotenc_ctgan20000_3).copy()\n", - "df_incheon_smotenc_ctgan20000_3 = preprocessing(df_incheon_smotenc_ctgan20000_3).copy()\n", - "df_daegu_smotenc_ctgan20000_3 = preprocessing(df_daegu_smotenc_ctgan20000_3).copy()\n", - "df_daejeon_smotenc_ctgan20000_3 = preprocessing(df_daejeon_smotenc_ctgan20000_3).copy()\n", - "df_gwangju_smotenc_ctgan20000_3 = preprocessing(df_gwangju_smotenc_ctgan20000_3).copy()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5574439981294055\n", - "mean of accuracy : 0.9442437890394325\n", - "mean of mcc : 0.6953786483591325\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_seoul_smotenc_ctgan20000_1.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_1.loc[df_seoul_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_seoul_smotenc_ctgan20000_2.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_2.loc[df_seoul_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_seoul_smotenc_ctgan20000_3.columns != 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, df_seoul.columns != 'multi_class'], df_seoul_smotenc_ctgan20000_3.loc[df_seoul_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_seoul.loc[df_seoul['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'seoul',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **부산**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4671277823825058\n", - "mean of accuracy : 0.958033077991533\n", - "mean of mcc : 0.6300988436353677\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_busan_smotenc_ctgan20000_1.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2020, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_1.loc[df_busan_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_busan.loc[df_busan['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_busan_smotenc_ctgan20000_2.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2019, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_2.loc[df_busan_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_busan_smotenc_ctgan20000_3.columns != 'multi_class'], df_busan.loc[df_busan['year'] == 2018, df_busan.columns != 'multi_class'], df_busan_smotenc_ctgan20000_3.loc[df_busan_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_busan.loc[df_busan['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'busan',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **인천**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.5572409410571025\n", - "mean of accuracy : 0.9114212432733654\n", - "mean of mcc : 0.6879675745560497\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_incheon_smotenc_ctgan20000_1.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_1.loc[df_incheon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_incheon_smotenc_ctgan20000_2.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_2.loc[df_incheon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_incheon_smotenc_ctgan20000_3.columns != 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, df_incheon.columns != 'multi_class'], df_incheon_smotenc_ctgan20000_3.loc[df_incheon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_incheon.loc[df_incheon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'incheon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대구**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.40847028864132034\n", - "mean of accuracy : 0.9655496502565893\n", - "mean of mcc : 0.5839149359742953\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daegu_smotenc_ctgan20000_1.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_1.loc[df_daegu_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daegu_smotenc_ctgan20000_2.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_2.loc[df_daegu_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daegu_smotenc_ctgan20000_3.columns != 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, df_daegu.columns != 'multi_class'], df_daegu_smotenc_ctgan20000_3.loc[df_daegu_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daegu.loc[df_daegu['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daegu',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **대전**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4895872216295751\n", - "mean of accuracy : 0.9336976320582878\n", - "mean of mcc : 0.6360989053768741\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_daejeon_smotenc_ctgan20000_1.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_1.loc[df_daejeon_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_daejeon_smotenc_ctgan20000_2.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_2.loc[df_daejeon_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_daejeon_smotenc_ctgan20000_3.columns != 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, df_daejeon.columns != 'multi_class'], df_daejeon_smotenc_ctgan20000_3.loc[df_daejeon_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_daejeon.loc[df_daejeon['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'daejeon',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## **광주**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean of csi : 0.4877911320588812\n", - "mean of accuracy : 0.9413747286473538\n", - "mean of mcc : 0.6352378678603517\n" - ] - } - ], - "source": [ - "csi = []\n", - "accuracy = []\n", - "mcc = []\n", - "\n", - "# Fold 1\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), df_gwangju_smotenc_ctgan20000_1.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_1.loc[df_gwangju_smotenc_ctgan20000_1['year'].isin([2018, 2019]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2020, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 2\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), df_gwangju_smotenc_ctgan20000_2.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_2.loc[df_gwangju_smotenc_ctgan20000_2['year'].isin([2018, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2019, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "# Fold 3\n", - "X_tr, X_val, Y_tr, Y_val = df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), df_gwangju_smotenc_ctgan20000_3.columns != 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, df_gwangju.columns != 'multi_class'], df_gwangju_smotenc_ctgan20000_3.loc[df_gwangju_smotenc_ctgan20000_3['year'].isin([2019, 2020]), 'multi_class'], df_gwangju.loc[df_gwangju['year'] == 2018, 'multi_class']\n", - "X_tr.drop(columns=['year'], inplace=True)\n", - "X_val.drop(columns=['year'], inplace=True)\n", - "\n", - "xgb_model.fit(X_tr, Y_tr, eval_set=[(X_val, Y_val)], verbose=False)\n", - "csi.append(calculate_csi(Y_val, xgb_model.predict(X_val)))\n", - "accuracy.append(accuracy_score(Y_val, xgb_model.predict(X_val)))\n", - "mcc.append(multiclass_mcc(Y_val, xgb_model.predict(X_val)))\n", - "\n", - "\n", - "print(\"mean of csi : \", np.mean(csi))\n", - "print(\"mean of accuracy : \", np.mean(accuracy))\n", - "print(\"mean of mcc : \", np.mean(mcc))\n", - "\n", - "new_row = pd.DataFrame([{\n", - " 'region': 'gwangju',\n", - " 'model': 'XGBoost',\n", - " 'data_sample': 'smotenc_ctgan20000',\n", - " 'CSI': np.mean(csi),\n", - " 'MCC': np.mean(mcc),\n", - " 'Accuracy': np.mean(accuracy),\n", - " 'fold_csi': [csi]\n", - "\n", - "}])\n", - "\n", - "df = pd.concat([df, new_row], ignore_index=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "통합 CSV 저장 완료: 총 96개 결과 (LGB: 48개, XGB: 48개)\n" - ] - } - ], - "source": [ - "# 개별 모델 결과 저장\n", - "df.to_csv(\"../../data/oversampled_data_test_for_model/xgboost_sampled_data_test.csv\", index=False)\n", - "\n", - "# 통합 CSV 생성: LGB 결과와 XGB 결과 합치기\n", - "import os\n", - "\n", - "# LGB 결과 CSV 파일 경로\n", - "lgb_csv_path = \"../../data/oversampled_data_test_for_model/\"\n", - "\n", - "# LGB 결과가 있으면 읽어서 합치기, 없으면 XGB 결과만 사용\n", - "if os.path.exists(lgb_csv_path):\n", - " df_lgb = pd.read_csv(lgb_csv_path)\n", - " # 두 DataFrame 합치기\n", - " df_combined = pd.concat([df_lgb, df], ignore_index=True)\n", - " lgb_count = len(df_lgb)\n", - "else:\n", - " # LGB 결과가 없으면 XGB 결과만 사용\n", - " df_combined = df.copy()\n", - " lgb_count = 0\n", - "\n", - "# 통합 CSV 저장 (현재 디렉토리에 저장)\n", - "df_combined.to_csv(\"../../data/oversampled_data_test_for_model/combined_sampled_data_test.csv\", index=False)\n", - "print(f\"통합 CSV 저장 완료: 총 {len(df_combined)}개 결과 (LGB: {lgb_count}개, XGB: {len(df)}개)\")" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LightGBM 48\n", - "XGBoost 48\n", - "Name: model, dtype: int64" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_combined['model'].value_counts()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py39", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +version https://git-lfs.github.com/spec/v1 +oid sha256:62e3693f8488dafa51a91db7569af956308f7fb19abc94af2ad2a4b3d430d07a +size 209478