bong9513 commited on
Commit
79a10a0
ยท
0 Parent(s):

first commit

Browse files
This view is limited to 50 files because it contains too many changes. ย  See raw diff
Files changed (50) hide show
  1. .gitignore +6 -0
  2. Analysis_code/1.data_preprocessing/0.air_data_merge.ipynb +1469 -0
  3. Analysis_code/1.data_preprocessing/1.data_merge.ipynb +0 -0
  4. Analysis_code/1.data_preprocessing/3.make_train_test.ipynb +1099 -0
  5. Analysis_code/2.make_oversample_data/gpu0.log +0 -0
  6. Analysis_code/2.make_oversample_data/gpu1.log +0 -0
  7. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_1.py +316 -0
  8. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_2.py +317 -0
  9. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_3.py +317 -0
  10. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_1.py +316 -0
  11. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_2.py +317 -0
  12. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_3.py +317 -0
  13. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_1.py +317 -0
  14. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_2.py +317 -0
  15. Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_3.py +317 -0
  16. Analysis_code/2.make_oversample_data/run_ctgan_gpu0.bash +58 -0
  17. Analysis_code/2.make_oversample_data/run_ctgan_gpu1.bash +58 -0
  18. Analysis_code/2.make_oversample_data/smote_only/smote_sample_1.py +86 -0
  19. Analysis_code/2.make_oversample_data/smote_only/smote_sample_2.py +86 -0
  20. Analysis_code/2.make_oversample_data/smote_only/smote_sample_3.py +86 -0
  21. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_1.py +375 -0
  22. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_2.py +376 -0
  23. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_3.py +376 -0
  24. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_1.py +375 -0
  25. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_2.py +376 -0
  26. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_3.py +376 -0
  27. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_1.py +378 -0
  28. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_2.py +376 -0
  29. Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_3.py +376 -0
  30. Analysis_code/3.sampled_data_analysis/make_plot.py +659 -0
  31. Analysis_code/3.sampled_data_analysis/oversampling_model_hyperparameter.ipynb +574 -0
  32. Analysis_code/4.sampling_data_test/analysis.ipynb +244 -0
  33. Analysis_code/4.sampling_data_test/lgb_sampled_test.ipynb +0 -0
  34. Analysis_code/4.sampling_data_test/xgb_sampled_test.ipynb +0 -0
  35. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_busan.py +98 -0
  36. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_daegu.py +99 -0
  37. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_daejeon.py +99 -0
  38. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_gwangju.py +99 -0
  39. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_incheon.py +99 -0
  40. Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_seoul.py +99 -0
  41. Analysis_code/5.optima/deepgbm_pure/utils.py +720 -0
  42. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_busan.py +97 -0
  43. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_daegu.py +97 -0
  44. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_daejeon.py +97 -0
  45. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_gwangju.py +97 -0
  46. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_incheon.py +97 -0
  47. Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_seoul.py +97 -0
  48. Analysis_code/5.optima/deepgbm_smote/utils.py +720 -0
  49. Analysis_code/5.optima/deepgbm_smotenc_ctgan20000/deepgbm_smotenc_ctgan20000_busan.py +97 -0
  50. Analysis_code/5.optima/deepgbm_smotenc_ctgan20000/deepgbm_smotenc_ctgan20000_daegu.py +97 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ data/*
2
+ Analysis_code/3.sampling_data_test/images/*
3
+ Analysis_code/3.sampled_data_analysis/images/*
4
+ __pycache__/
5
+ Analysis_code/optimization_history/*
6
+ Analysis_code/save_model/*
Analysis_code/1.data_preprocessing/0.air_data_merge.ipynb ADDED
@@ -0,0 +1,1469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Package Version\n",
13
+ "----------------------------- ------------------\n",
14
+ "absl-py 1.4.0\n",
15
+ "accelerate 0.24.0.dev0\n",
16
+ "aiofiles 23.2.1\n",
17
+ "aiohttp 3.8.5\n",
18
+ "aiosignal 1.3.1\n",
19
+ "alabaster 0.7.13\n",
20
+ "albumentations 1.3.1\n",
21
+ "alembic 1.12.0\n",
22
+ "annotated-types 0.5.0\n",
23
+ "anyio 4.0.0\n",
24
+ "appdirs 1.4.4\n",
25
+ "argon2-cffi 23.1.0\n",
26
+ "argon2-cffi-bindings 21.2.0\n",
27
+ "array-record 0.4.1\n",
28
+ "arrow 1.2.3\n",
29
+ "asttokens 2.4.0\n",
30
+ "astunparse 1.6.3\n",
31
+ "async-lru 2.0.4\n",
32
+ "async-timeout 4.0.3\n",
33
+ "attrs 23.1.0\n",
34
+ "audioread 3.0.0\n",
35
+ "Babel 2.12.1\n",
36
+ "backcall 0.2.0\n",
37
+ "backoff 2.2.1\n",
38
+ "bcrypt 4.0.1\n",
39
+ "beautifulsoup4 4.12.2\n",
40
+ "bitsandbytes 0.41.1\n",
41
+ "black 23.9.1\n",
42
+ "bleach 6.0.0\n",
43
+ "blis 0.7.10\n",
44
+ "branca 0.6.0\n",
45
+ "Brotli 1.1.0\n",
46
+ "cachetools 5.3.1\n",
47
+ "captum 0.6.0\n",
48
+ "catalogue 2.0.9\n",
49
+ "catalyst 22.4\n",
50
+ "catboost 1.2.1.1\n",
51
+ "certifi 2023.7.22\n",
52
+ "cffi 1.15.1\n",
53
+ "charset-normalizer 3.2.0\n",
54
+ "chroma-hnswlib 0.7.3\n",
55
+ "chromadb 0.4.10\n",
56
+ "click 8.1.7\n",
57
+ "cloudpickle 2.2.1\n",
58
+ "cmaes 0.10.0\n",
59
+ "cmake 3.27.5\n",
60
+ "cmdstanpy 1.1.0\n",
61
+ "coloredlogs 15.0.1\n",
62
+ "colorlog 6.7.0\n",
63
+ "comm 0.1.4\n",
64
+ "confection 0.1.3\n",
65
+ "contourpy 1.1.1\n",
66
+ "convertdate 2.4.0\n",
67
+ "cubinlinker-cu11 0.3.0.post1\n",
68
+ "cuda-python 11.8.2\n",
69
+ "cudf-cu11 23.8.0\n",
70
+ "cuml-cu11 23.8.0\n",
71
+ "cupy-cuda11x 12.2.0\n",
72
+ "curio 1.6\n",
73
+ "customized-konlpy 0.0.64\n",
74
+ "cycler 0.11.0\n",
75
+ "cymem 2.0.8\n",
76
+ "cysignals 1.11.2\n",
77
+ "Cython 3.0.2\n",
78
+ "dask 2023.7.1\n",
79
+ "dask-cuda 23.8.0\n",
80
+ "dask-cudf-cu11 23.8.0\n",
81
+ "dataclasses-json 0.5.14\n",
82
+ "datasets 2.14.5\n",
83
+ "debugpy 1.8.0\n",
84
+ "decorator 5.1.1\n",
85
+ "defusedxml 0.7.1\n",
86
+ "dill 0.3.7\n",
87
+ "distributed 2023.7.1\n",
88
+ "dm-tree 0.1.8\n",
89
+ "dnspython 2.4.2\n",
90
+ "docker-pycreds 0.4.0\n",
91
+ "docrepr 0.2.0\n",
92
+ "docutils 0.18.1\n",
93
+ "duckduckgo-search 3.8.5\n",
94
+ "entrypoints 0.4\n",
95
+ "ephem 4.1.4\n",
96
+ "etils 1.4.1\n",
97
+ "exceptiongroup 1.1.3\n",
98
+ "executing 1.2.0\n",
99
+ "fastai 2.7.12\n",
100
+ "fastapi 0.99.1\n",
101
+ "fastcore 1.5.29\n",
102
+ "fastdownload 0.0.7\n",
103
+ "fastjsonschema 2.18.0\n",
104
+ "fastprogress 1.0.3\n",
105
+ "fastrlock 0.8.2\n",
106
+ "fasttext 0.9.2\n",
107
+ "filelock 3.12.4\n",
108
+ "flatbuffers 23.5.26\n",
109
+ "folium 0.14.0\n",
110
+ "fonttools 4.42.1\n",
111
+ "fqdn 1.5.1\n",
112
+ "frozenlist 1.4.0\n",
113
+ "fsspec 2023.6.0\n",
114
+ "future 0.18.3\n",
115
+ "fvcore 0.1.5.post20221221\n",
116
+ "gast 0.4.0\n",
117
+ "gensim 4.3.2\n",
118
+ "gitdb 4.0.10\n",
119
+ "GitPython 3.1.36\n",
120
+ "google-auth 2.23.0\n",
121
+ "google-auth-oauthlib 1.0.0\n",
122
+ "google-pasta 0.2.0\n",
123
+ "googleapis-common-protos 1.60.0\n",
124
+ "graphviz 0.20.1\n",
125
+ "greenlet 2.0.2\n",
126
+ "grpcio 1.58.0\n",
127
+ "h11 0.14.0\n",
128
+ "h2 4.1.0\n",
129
+ "h5py 3.9.0\n",
130
+ "holidays 0.33\n",
131
+ "hpack 4.0.0\n",
132
+ "httpcore 0.18.0\n",
133
+ "httptools 0.6.0\n",
134
+ "httpx 0.25.0\n",
135
+ "huggingface-hub 0.16.4\n",
136
+ "humanfriendly 10.0\n",
137
+ "hydra-slayer 0.4.1\n",
138
+ "hyperframe 6.0.1\n",
139
+ "hyperopt 0.2.7\n",
140
+ "idna 3.4\n",
141
+ "imageio 2.31.3\n",
142
+ "imagesize 1.4.1\n",
143
+ "importlib-metadata 6.8.0\n",
144
+ "importlib-resources 6.0.1\n",
145
+ "iniconfig 2.0.0\n",
146
+ "intel-openmp 2023.2.0\n",
147
+ "iopath 0.1.10\n",
148
+ "ipykernel 6.25.2\n",
149
+ "ipyparallel 8.6.1\n",
150
+ "ipython 8.15.0\n",
151
+ "ipython-genutils 0.2.0\n",
152
+ "ipywidgets 8.1.1\n",
153
+ "isoduration 20.11.0\n",
154
+ "jedi 0.19.0\n",
155
+ "Jinja2 3.1.2\n",
156
+ "joblib 1.3.2\n",
157
+ "JPype1 1.4.1\n",
158
+ "JPype1-py3 0.5.5.4\n",
159
+ "json5 0.9.14\n",
160
+ "jsonpointer 2.4\n",
161
+ "jsonschema 4.19.0\n",
162
+ "jsonschema-specifications 2023.7.1\n",
163
+ "jupyter 1.0.0\n",
164
+ "jupyter_client 8.3.1\n",
165
+ "jupyter-console 6.6.3\n",
166
+ "jupyter_core 5.3.1\n",
167
+ "jupyter-events 0.7.0\n",
168
+ "jupyter-lsp 2.2.0\n",
169
+ "jupyter_server 2.7.3\n",
170
+ "jupyter_server_terminals 0.4.4\n",
171
+ "jupyterlab 4.0.6\n",
172
+ "jupyterlab-pygments 0.2.2\n",
173
+ "jupyterlab_server 2.25.0\n",
174
+ "jupyterlab-widgets 3.0.9\n",
175
+ "jupyterthemes 0.20.0\n",
176
+ "kaggle 1.5.16\n",
177
+ "keras 2.13.1\n",
178
+ "kiwisolver 1.4.5\n",
179
+ "konlpy 0.6.0\n",
180
+ "kornia 0.7.0\n",
181
+ "krwordrank 1.0.3\n",
182
+ "langchain 0.0.295\n",
183
+ "langcodes 3.3.0\n",
184
+ "langsmith 0.0.38\n",
185
+ "lazy_loader 0.3\n",
186
+ "lesscpy 0.15.1\n",
187
+ "libclang 16.0.6\n",
188
+ "librosa 0.10.1\n",
189
+ "lightgbm 4.1.0\n",
190
+ "lit 16.0.6\n",
191
+ "llvmlite 0.40.1\n",
192
+ "locket 1.0.0\n",
193
+ "loguru 0.7.2\n",
194
+ "LunarCalendar 0.0.9\n",
195
+ "lxml 4.9.3\n",
196
+ "Mako 1.2.4\n",
197
+ "Markdown 3.4.4\n",
198
+ "MarkupSafe 2.1.3\n",
199
+ "marshmallow 3.20.1\n",
200
+ "matplotlib 3.8.0\n",
201
+ "matplotlib-inline 0.1.6\n",
202
+ "mecab-python3 1.0.7\n",
203
+ "missingno 0.5.2\n",
204
+ "mistune 3.0.1\n",
205
+ "mkl 2023.2.0\n",
206
+ "mlxtend 0.22.0\n",
207
+ "monotonic 1.6\n",
208
+ "mpmath 1.3.0\n",
209
+ "msgpack 1.0.5\n",
210
+ "multidict 6.0.4\n",
211
+ "multiprocess 0.70.15\n",
212
+ "murmurhash 1.0.10\n",
213
+ "mypy-extensions 1.0.0\n",
214
+ "nbclient 0.8.0\n",
215
+ "nbconvert 7.8.0\n",
216
+ "nbformat 5.9.2\n",
217
+ "nest-asyncio 1.5.8\n",
218
+ "networkx 3.1\n",
219
+ "nltk 3.8.1\n",
220
+ "notebook 7.0.3\n",
221
+ "notebook_shim 0.2.3\n",
222
+ "numba 0.57.1\n",
223
+ "numexpr 2.8.6\n",
224
+ "numpy 1.24.3\n",
225
+ "nvidia-cublas-cu11 11.10.3.66\n",
226
+ "nvidia-cuda-cupti-cu11 11.7.101\n",
227
+ "nvidia-cuda-nvrtc-cu11 11.7.99\n",
228
+ "nvidia-cuda-runtime-cu11 11.7.99\n",
229
+ "nvidia-cudnn-cu11 8.5.0.96\n",
230
+ "nvidia-cufft-cu11 10.9.0.58\n",
231
+ "nvidia-curand-cu11 10.2.10.91\n",
232
+ "nvidia-cusolver-cu11 11.4.0.1\n",
233
+ "nvidia-cusparse-cu11 11.7.4.91\n",
234
+ "nvidia-nccl-cu11 2.14.3\n",
235
+ "nvidia-nvtx-cu11 11.7.91\n",
236
+ "nvtx 0.2.8\n",
237
+ "oauthlib 3.2.2\n",
238
+ "onnxruntime 1.15.1\n",
239
+ "openai 0.28.0\n",
240
+ "opencv-python 4.8.0.76\n",
241
+ "opencv-python-headless 4.8.0.76\n",
242
+ "opt-einsum 3.3.0\n",
243
+ "optuna 3.3.0\n",
244
+ "outcome 1.2.0\n",
245
+ "overrides 7.4.0\n",
246
+ "packaging 23.1\n",
247
+ "pandas 1.5.3\n",
248
+ "pandocfilters 1.5.0\n",
249
+ "parso 0.8.3\n",
250
+ "partd 1.4.0\n",
251
+ "pathspec 0.11.2\n",
252
+ "pathtools 0.1.2\n",
253
+ "pathy 0.10.2\n",
254
+ "patsy 0.5.3\n",
255
+ "peft 0.6.0.dev0\n",
256
+ "pexpect 4.8.0\n",
257
+ "pickleshare 0.7.5\n",
258
+ "Pillow 10.0.1\n",
259
+ "pinecone-client 2.2.4\n",
260
+ "pip 23.2.1\n",
261
+ "platformdirs 3.10.0\n",
262
+ "plotly 5.17.0\n",
263
+ "pluggy 1.3.0\n",
264
+ "ply 3.11\n",
265
+ "pooch 1.7.0\n",
266
+ "portalocker 2.8.2\n",
267
+ "posthog 3.0.2\n",
268
+ "preshed 3.0.9\n",
269
+ "prometheus-client 0.17.1\n",
270
+ "promise 2.3\n",
271
+ "prompt-toolkit 3.0.39\n",
272
+ "prophet 1.1.4\n",
273
+ "protobuf 4.24.3\n",
274
+ "psutil 5.9.5\n",
275
+ "ptxcompiler-cu11 0.7.0.post1\n",
276
+ "ptyprocess 0.7.0\n",
277
+ "pulsar-client 3.3.0\n",
278
+ "pure-eval 0.2.2\n",
279
+ "py 1.11.0\n",
280
+ "py4j 0.10.9.7\n",
281
+ "pyarrow 11.0.0\n",
282
+ "pyasn1 0.5.0\n",
283
+ "pyasn1-modules 0.3.0\n",
284
+ "pybind11 2.11.1\n",
285
+ "pycparser 2.21\n",
286
+ "pydantic 1.10.12\n",
287
+ "pydantic_core 2.6.3\n",
288
+ "pydicom 2.4.3\n",
289
+ "pyfasttext 0.4.6\n",
290
+ "Pygments 2.16.1\n",
291
+ "pygraphviz 1.11\n",
292
+ "pylibraft-cu11 23.8.0\n",
293
+ "PyMeeus 0.5.12\n",
294
+ "PyMySQL 1.1.0\n",
295
+ "pynvml 11.4.1\n",
296
+ "pyparsing 3.1.1\n",
297
+ "pypdf 3.16.1\n",
298
+ "PyPika 0.48.9\n",
299
+ "pystan 2.19.1.1\n",
300
+ "pytest 6.2.5\n",
301
+ "pytest-asyncio 0.20.3\n",
302
+ "python-dateutil 2.8.2\n",
303
+ "python-dotenv 1.0.0\n",
304
+ "python-json-logger 2.0.7\n",
305
+ "python-slugify 8.0.1\n",
306
+ "pytz 2023.3.post1\n",
307
+ "PyWavelets 1.4.1\n",
308
+ "PyYAML 6.0.1\n",
309
+ "pyzmq 25.1.1\n",
310
+ "qtconsole 5.4.4\n",
311
+ "QtPy 2.4.0\n",
312
+ "qudida 0.0.4\n",
313
+ "raft-dask-cu11 23.8.0\n",
314
+ "referencing 0.30.2\n",
315
+ "regex 2023.8.8\n",
316
+ "requests 2.31.0\n",
317
+ "requests-oauthlib 1.3.1\n",
318
+ "rfc3339-validator 0.1.4\n",
319
+ "rfc3986-validator 0.1.1\n",
320
+ "rmm-cu11 23.8.0\n",
321
+ "rpds-py 0.10.3\n",
322
+ "rsa 4.9\n",
323
+ "safetensors 0.3.3\n",
324
+ "scikit-image 0.21.0\n",
325
+ "scikit-learn 1.3.0\n",
326
+ "scipy 1.11.2\n",
327
+ "seaborn 0.12.2\n",
328
+ "Send2Trash 1.8.2\n",
329
+ "sentencepiece 0.1.99\n",
330
+ "sentry-sdk 1.31.0\n",
331
+ "setproctitle 1.3.2\n",
332
+ "setuptools 68.0.0\n",
333
+ "shap 0.42.1\n",
334
+ "six 1.16.0\n",
335
+ "slicer 0.0.7\n",
336
+ "smart-open 6.4.0\n",
337
+ "smmap 5.0.1\n",
338
+ "sniffio 1.3.0\n",
339
+ "snowballstemmer 2.2.0\n",
340
+ "socksio 1.0.0\n",
341
+ "sortedcontainers 2.4.0\n",
342
+ "soundfile 0.12.1\n",
343
+ "soupsieve 2.5\n",
344
+ "soxr 0.3.6\n",
345
+ "soynlp 0.0.493\n",
346
+ "soyspacing 1.0.17\n",
347
+ "spacy 3.6.1\n",
348
+ "spacy-legacy 3.0.12\n",
349
+ "spacy-loggers 1.0.5\n",
350
+ "Sphinx 7.2.6\n",
351
+ "sphinx-rtd-theme 1.3.0\n",
352
+ "sphinxcontrib-applehelp 1.0.7\n",
353
+ "sphinxcontrib-devhelp 1.0.5\n",
354
+ "sphinxcontrib-htmlhelp 2.0.4\n",
355
+ "sphinxcontrib-jquery 4.1\n",
356
+ "sphinxcontrib-jsmath 1.0.1\n",
357
+ "sphinxcontrib-qthelp 1.0.6\n",
358
+ "sphinxcontrib-serializinghtml 1.1.9\n",
359
+ "SQLAlchemy 2.0.21\n",
360
+ "srsly 2.4.7\n",
361
+ "stack-data 0.6.2\n",
362
+ "starlette 0.27.0\n",
363
+ "statsmodels 0.14.0\n",
364
+ "sympy 1.12\n",
365
+ "tabulate 0.9.0\n",
366
+ "tbb 2021.10.0\n",
367
+ "tblib 2.0.0\n",
368
+ "tenacity 8.2.3\n",
369
+ "tensorboard 2.13.0\n",
370
+ "tensorboard-data-server 0.7.1\n",
371
+ "tensorboardX 2.6.2.2\n",
372
+ "tensorflow 2.13.0\n",
373
+ "tensorflow-datasets 4.9.3\n",
374
+ "tensorflow-estimator 2.13.0\n",
375
+ "tensorflow-io-gcs-filesystem 0.34.0\n",
376
+ "tensorflow-metadata 1.14.0\n",
377
+ "termcolor 2.3.0\n",
378
+ "terminado 0.17.1\n",
379
+ "testpath 0.6.0\n",
380
+ "text-unidecode 1.3\n",
381
+ "thinc 8.1.12\n",
382
+ "threadpoolctl 3.2.0\n",
383
+ "tifffile 2023.9.18\n",
384
+ "tiktoken 0.5.1\n",
385
+ "tinycss2 1.2.1\n",
386
+ "tokenizers 0.14.0\n",
387
+ "toml 0.10.2\n",
388
+ "tomli 2.0.1\n",
389
+ "toolz 0.12.0\n",
390
+ "torch 2.0.0\n",
391
+ "torchaudio 2.0.2+cu118\n",
392
+ "torchdata 0.6.0\n",
393
+ "torchsummary 1.5.1\n",
394
+ "torchtext 0.15.1\n",
395
+ "torchtriton 2.0.0+f16138d447\n",
396
+ "torchvision 0.15.2\n",
397
+ "tornado 6.3.3\n",
398
+ "tqdm 4.66.1\n",
399
+ "traitlets 5.10.0\n",
400
+ "transformers 4.34.0.dev0\n",
401
+ "treelite 3.2.0\n",
402
+ "treelite-runtime 3.2.0\n",
403
+ "trio 0.22.2\n",
404
+ "triton 2.0.0\n",
405
+ "typer 0.9.0\n",
406
+ "typing_extensions 4.5.0\n",
407
+ "typing-inspect 0.9.0\n",
408
+ "tzdata 2023.3\n",
409
+ "ucx-py-cu11 0.33.0\n",
410
+ "uri-template 1.3.0\n",
411
+ "urllib3 1.26.16\n",
412
+ "uvicorn 0.23.2\n",
413
+ "uvloop 0.17.0\n",
414
+ "wandb 0.15.10\n",
415
+ "wasabi 1.1.2\n",
416
+ "watchfiles 0.20.0\n",
417
+ "wcwidth 0.2.6\n",
418
+ "webcolors 1.13\n",
419
+ "webencodings 0.5.1\n",
420
+ "websocket-client 1.6.3\n",
421
+ "websockets 11.0.3\n",
422
+ "Werkzeug 2.3.7\n",
423
+ "wheel 0.38.4\n",
424
+ "widgetsnbextension 4.0.9\n",
425
+ "wordcloud 1.9.2\n",
426
+ "wrapt 1.15.0\n",
427
+ "xgboost 2.0.0\n",
428
+ "xxhash 3.3.0\n",
429
+ "yacs 0.1.8\n",
430
+ "yarl 1.9.2\n",
431
+ "zict 3.0.0\n",
432
+ "zipp 3.17.0\n"
433
+ ]
434
+ }
435
+ ],
436
+ "source": [
437
+ "!pip list"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 1,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "ename": "ModuleNotFoundError",
447
+ "evalue": "No module named 'numpy'",
448
+ "output_type": "error",
449
+ "traceback": [
450
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
451
+ "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
452
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mos\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnatsort\u001b[39;00m\n",
453
+ "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'numpy'"
454
+ ]
455
+ }
456
+ ],
457
+ "source": [
458
+ "import os\n",
459
+ "import numpy as np\n",
460
+ "import pandas as pd\n",
461
+ "import natsort\n",
462
+ "from datetime import datetime\n",
463
+ "from tqdm.auto import tqdm"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": 2,
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": [
472
+ "def get_data(year):\n",
473
+ " files = natsort.natsorted(os.listdir(f'../../data/๋Œ€๊ธฐ์งˆ/{year}/'))\n",
474
+ " data = []\n",
475
+ " for file in tqdm(files, desc=f\"Reading files...({len(files)})\"):\n",
476
+ " data.append(pd.read_excel(f'../../data/๋Œ€๊ธฐ์งˆ/{year}/{file}', usecols=[\"์ง€์—ญ\", '๋ง', \"์ธก์ •์†Œ์ฝ”๋“œ\", \"์ธก์ •์†Œ๋ช…\", \"์ธก์ •์ผ์‹œ\", \"O3\", \"NO2\", \"PM10\", \"PM25\", \"์ฃผ์†Œ\"]))\n",
477
+ "\n",
478
+ " return pd.concat(data)"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": 3,
484
+ "metadata": {},
485
+ "outputs": [],
486
+ "source": [
487
+ "# ํ•ฉ์นœ ๋ฐ์ดํ„ฐ์— ๋‚ ์งœ ์ •๋ณด๋ฅผ ์ถ”๊ฐ€ํ•œ๋‹ค.\n",
488
+ "def add_date(df):\n",
489
+ "\n",
490
+ " df[\"์ธก์ •์ผ์‹œ\"] = df[\"์ธก์ •์ผ์‹œ\"].astype(str).str[:10]\n",
491
+ " df[\"์ธก์ •์ผ์‹œ\"] = pd.to_datetime(df[\"์ธก์ •์ผ์‹œ\"], format='%Y%m%d%H', errors=\"coerce\")\n",
492
+ "\n",
493
+ " df[\"year\"] = df[\"์ธก์ •์ผ์‹œ\"].dt.year\n",
494
+ " df[\"month\"] = df[\"์ธก์ •์ผ์‹œ\"].dt.month\n",
495
+ " df[\"day\"] = df[\"์ธก์ •์ผ์‹œ\"].dt.day\n",
496
+ " df[\"hour\"] = df[\"์ธก์ •์ผ์‹œ\"].dt.hour\n",
497
+ "\n",
498
+ " return df"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": 4,
504
+ "metadata": {},
505
+ "outputs": [
506
+ {
507
+ "name": "stderr",
508
+ "output_type": "stream",
509
+ "text": [
510
+ " 0%| | 0/6 [00:00<?, ?it/s]\n",
511
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
512
+ "Reading files...(13): 8%|โ–Š | 1/13 [00:34<06:57, 34.80s/it]\u001b[A\n",
513
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [01:12<06:41, 36.47s/it]\u001b[A\n",
514
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [01:47<05:58, 35.89s/it]\u001b[A\n",
515
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [02:23<05:23, 35.96s/it]\u001b[A\n",
516
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [02:59<04:47, 35.92s/it]\u001b[A\n",
517
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [03:35<04:12, 36.09s/it]\u001b[A\n",
518
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [04:12<02:16, 27.35s/it]\u001b[A\n",
519
+ "Reading files...(13): 69%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‰ | 9/13 [04:46<01:56, 29.05s/it]\u001b[A\n",
520
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [05:21<01:31, 30.55s/it]\u001b[A\n",
521
+ "Reading files...(13): 85%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 11/13 [05:58<01:04, 32.46s/it]\u001b[A\n",
522
+ "Reading files...(13): 92%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–| 12/13 [06:37<00:34, 34.28s/it]\u001b[A\n",
523
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [07:08<00:00, 32.93s/it]\u001b[A\n",
524
+ " 17%|โ–ˆโ–‹ | 1/6 [07:18<36:30, 438.18s/it]\n",
525
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
526
+ "Reading files...(13): 8%|โ–Š | 1/13 [00:43<08:41, 43.43s/it]\u001b[A\n",
527
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [01:26<07:56, 43.29s/it]\u001b[A\n",
528
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [02:07<07:02, 42.22s/it]\u001b[A\n",
529
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [02:50<06:23, 42.66s/it]\u001b[A\n",
530
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [03:28<05:27, 40.90s/it]\u001b[A\n",
531
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [04:15<04:59, 42.79s/it]\u001b[A\n",
532
+ "Reading files...(13): 54%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 7/13 [04:58<04:18, 43.14s/it]\u001b[A\n",
533
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [05:43<03:37, 43.47s/it]\u001b[A\n",
534
+ "Reading files...(13): 69%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‰ | 9/13 [06:28<02:55, 43.96s/it]\u001b[A\n",
535
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [07:12<02:12, 44.01s/it]\u001b[A\n",
536
+ "Reading files...(13): 85%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 11/13 [07:52<01:25, 42.90s/it]\u001b[A\n",
537
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [08:34<00:00, 39.61s/it]\u001b[A\n",
538
+ " 33%|โ–ˆโ–ˆโ–ˆโ–Ž | 2/6 [16:05<32:42, 490.55s/it]\n",
539
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
540
+ "Reading files...(13): 8%|โ–Š | 1/13 [00:49<09:56, 49.74s/it]\u001b[A\n",
541
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [01:43<09:31, 51.98s/it]\u001b[A\n",
542
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [02:33<08:29, 50.96s/it]\u001b[A\n",
543
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [03:23<07:38, 50.95s/it]\u001b[A\n",
544
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [04:13<06:43, 50.46s/it]\u001b[A\n",
545
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [04:58<05:40, 48.71s/it]\u001b[A\n",
546
+ "Reading files...(13): 54%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 7/13 [05:50<04:57, 49.66s/it]\u001b[A\n",
547
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [06:45<04:16, 51.29s/it]\u001b[A\n",
548
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [07:38<01:58, 39.46s/it]\u001b[A\n",
549
+ "Reading files...(13): 85%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 11/13 [08:30<01:25, 42.79s/it]\u001b[A\n",
550
+ "Reading files...(13): 92%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–| 12/13 [09:26<00:46, 46.32s/it]\u001b[A\n",
551
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [10:13<00:00, 47.19s/it]\u001b[A\n",
552
+ " 50%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ | 3/6 [26:32<27:38, 552.96s/it]\n",
553
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
554
+ "Reading files...(13): 8%|โ–Š | 1/13 [00:59<11:48, 59.01s/it]\u001b[A\n",
555
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [01:56<10:40, 58.19s/it]\u001b[A\n",
556
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [02:53<09:37, 57.77s/it]\u001b[A\n",
557
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [03:52<08:41, 58.00s/it]\u001b[A\n",
558
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [04:44<07:26, 55.77s/it]\u001b[A\n",
559
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [05:40<06:32, 56.05s/it]\u001b[A\n",
560
+ "Reading files...(13): 54%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 7/13 [06:36<05:36, 56.06s/it]\u001b[A\n",
561
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [07:33<04:42, 56.42s/it]\u001b[A\n",
562
+ "Reading files...(13): 69%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‰ | 9/13 [08:34<03:51, 57.76s/it]\u001b[A\n",
563
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [09:35<02:56, 58.75s/it]\u001b[A\n",
564
+ "Reading files...(13): 92%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–| 12/13 [10:33<00:44, 44.84s/it]\u001b[A\n",
565
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [11:32<00:00, 53.29s/it]\u001b[A\n",
566
+ " 67%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 4/6 [38:20<20:28, 614.26s/it]\n",
567
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
568
+ "Reading files...(13): 8%|โ–Š | 1/13 [00:59<11:57, 59.79s/it]\u001b[A\n",
569
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [02:01<11:07, 60.67s/it]\u001b[A\n",
570
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [03:02<10:10, 61.02s/it]\u001b[A\n",
571
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [03:57<08:48, 58.74s/it]\u001b[A\n",
572
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [04:57<07:53, 59.18s/it]\u001b[A\n",
573
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [06:00<07:03, 60.45s/it]\u001b[A\n",
574
+ "Reading files...(13): 54%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 7/13 [07:00<06:02, 60.38s/it]\u001b[A\n",
575
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [08:02<05:04, 60.85s/it]\u001b[A\n",
576
+ "Reading files...(13): 69%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‰ | 9/13 [09:04<04:04, 61.03s/it]\u001b[A\n",
577
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [10:04<03:02, 60.67s/it]\u001b[A\n",
578
+ "Reading files...(13): 92%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–| 12/13 [11:06<00:46, 46.76s/it]\u001b[A\n",
579
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [12:09<00:00, 56.08s/it]\u001b[A\n",
580
+ " 83%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–Ž | 5/6 [50:46<11:01, 661.78s/it]\n",
581
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
582
+ "Reading files...(13): 8%|โ–Š | 1/13 [01:03<12:46, 63.88s/it]\u001b[A\n",
583
+ "Reading files...(13): 15%|โ–ˆโ–Œ | 2/13 [02:08<11:50, 64.56s/it]\u001b[A\n",
584
+ "Reading files...(13): 23%|โ–ˆโ–ˆโ–Ž | 3/13 [03:10<10:32, 63.22s/it]\u001b[A\n",
585
+ "Reading files...(13): 31%|โ–ˆโ–ˆโ–ˆ | 4/13 [04:07<09:05, 60.63s/it]\u001b[A\n",
586
+ "Reading files...(13): 38%|โ–ˆโ–ˆโ–ˆโ–Š | 5/13 [05:09<08:11, 61.41s/it]\u001b[A\n",
587
+ "Reading files...(13): 46%|โ–ˆโ–ˆโ–ˆโ–ˆโ–Œ | 6/13 [06:12<07:13, 61.92s/it]\u001b[A\n",
588
+ "Reading files...(13): 54%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 7/13 [07:13<06:09, 61.50s/it]\u001b[A\n",
589
+ "Reading files...(13): 62%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ– | 8/13 [08:15<05:08, 61.64s/it]\u001b[A\n",
590
+ "Reading files...(13): 69%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‰ | 9/13 [09:17<04:07, 61.81s/it]\u001b[A\n",
591
+ "Reading files...(13): 77%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‹ | 10/13 [10:19<03:05, 61.96s/it]\u001b[A\n",
592
+ "Reading files...(13): 92%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–| 12/13 [11:23<00:47, 47.75s/it]\u001b[A\n",
593
+ "Reading files...(13): 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 13/13 [12:27<00:00, 57.50s/it]\u001b[A\n",
594
+ "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 6/6 [1:03:31<00:00, 635.28s/it]\n"
595
+ ]
596
+ }
597
+ ],
598
+ "source": [
599
+ "import os\n",
600
+ "import pandas as pd\n",
601
+ "from tqdm.auto import tqdm\n",
602
+ "\n",
603
+ "# ๋Œ€๊ธฐ์งˆ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถˆ๋Ÿฌ์™€์„œ ํ•˜๋‚˜์˜ ํŒŒ์ผ๋กœ ํ•ฉ์นœ๋‹ค.\n",
604
+ "def get_data(year):\n",
605
+ " directory = f'../../data/๋Œ€๊ธฐ์งˆ/{year}/'\n",
606
+ " files = os.listdir(directory)\n",
607
+ " data = []\n",
608
+ " \n",
609
+ " # ํŒŒ์ผ ๋ชฉ๋ก์—์„œ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ ์ œ์™ธํ•˜๊ณ  ์˜ค์ง Excel ํŒŒ์ผ๋งŒ ์ฒ˜๋ฆฌ\n",
610
+ " for file in tqdm(files, desc=f\"Reading files...({len(files)})\"):\n",
611
+ " file_path = os.path.join(directory, file)\n",
612
+ " if os.path.isfile(file_path) and file_path.endswith(('.xls', '.xlsx')): # Excel ํŒŒ์ผ ํ™•์žฅ์ž๋งŒ ํ—ˆ์šฉ\n",
613
+ " data.append(pd.read_excel(file_path, usecols=[\"์ง€์—ญ\", '๋ง', \"์ธก์ •์†Œ์ฝ”๋“œ\", \"์ธก์ •์†Œ๋ช…\", \"์ธก์ •์ผ์‹œ\", \"O3\", \"NO2\", \"PM10\", \"PM25\", \"์ฃผ์†Œ\"]))\n",
614
+ " \n",
615
+ " return pd.concat(data)\n",
616
+ "\n",
617
+ "years = [2018, 2019, 2020,2021,2022,2023] # 2018๋…„๋ถ€ํ„ฐ 2023๋…„๊นŒ์ง€์˜ ๋ฐ์ดํ„ฐ๋ฅผ ํ•ฉ์นœ๋‹ค.\n",
618
+ "for year in tqdm(years):\n",
619
+ " data = get_data(year)\n",
620
+ " data = add_date(data)\n",
621
+ " data.reset_index(drop=True, inplace=True)\n",
622
+ " data.to_feather(f\"../../data/๋Œ€๊ธฐ์งˆ/{year}.feather\")\n"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "execution_count": 6,
628
+ "metadata": {},
629
+ "outputs": [
630
+ {
631
+ "data": {
632
+ "text/html": [
633
+ "<div>\n",
634
+ "<style scoped>\n",
635
+ " .dataframe tbody tr th:only-of-type {\n",
636
+ " vertical-align: middle;\n",
637
+ " }\n",
638
+ "\n",
639
+ " .dataframe tbody tr th {\n",
640
+ " vertical-align: top;\n",
641
+ " }\n",
642
+ "\n",
643
+ " .dataframe thead th {\n",
644
+ " text-align: right;\n",
645
+ " }\n",
646
+ "</style>\n",
647
+ "<table border=\"1\" class=\"dataframe\">\n",
648
+ " <thead>\n",
649
+ " <tr style=\"text-align: right;\">\n",
650
+ " <th></th>\n",
651
+ " <th>์ง€์—ญ</th>\n",
652
+ " <th>๋ง</th>\n",
653
+ " <th>์ธก์ •์†Œ์ฝ”๋“œ</th>\n",
654
+ " <th>์ธก์ •์†Œ๋ช…</th>\n",
655
+ " <th>์ธก์ •์ผ์‹œ</th>\n",
656
+ " <th>O3</th>\n",
657
+ " <th>NO2</th>\n",
658
+ " <th>PM10</th>\n",
659
+ " <th>PM25</th>\n",
660
+ " <th>์ฃผ์†Œ</th>\n",
661
+ " <th>year</th>\n",
662
+ " <th>month</th>\n",
663
+ " <th>day</th>\n",
664
+ " <th>hour</th>\n",
665
+ " </tr>\n",
666
+ " </thead>\n",
667
+ " <tbody>\n",
668
+ " <tr>\n",
669
+ " <th>0</th>\n",
670
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
671
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
672
+ " <td>111121</td>\n",
673
+ " <td>์ค‘๊ตฌ</td>\n",
674
+ " <td>2023-07-01 01:00:00</td>\n",
675
+ " <td>0.0249</td>\n",
676
+ " <td>0.0188</td>\n",
677
+ " <td>21.0</td>\n",
678
+ " <td>19.0</td>\n",
679
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
680
+ " <td>2023.0</td>\n",
681
+ " <td>7.0</td>\n",
682
+ " <td>1.0</td>\n",
683
+ " <td>1.0</td>\n",
684
+ " </tr>\n",
685
+ " <tr>\n",
686
+ " <th>1</th>\n",
687
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
688
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
689
+ " <td>111121</td>\n",
690
+ " <td>์ค‘๊ตฌ</td>\n",
691
+ " <td>2023-07-01 02:00:00</td>\n",
692
+ " <td>0.0263</td>\n",
693
+ " <td>0.0163</td>\n",
694
+ " <td>18.0</td>\n",
695
+ " <td>15.0</td>\n",
696
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
697
+ " <td>2023.0</td>\n",
698
+ " <td>7.0</td>\n",
699
+ " <td>1.0</td>\n",
700
+ " <td>2.0</td>\n",
701
+ " </tr>\n",
702
+ " <tr>\n",
703
+ " <th>2</th>\n",
704
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
705
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
706
+ " <td>111121</td>\n",
707
+ " <td>์ค‘๊ตฌ</td>\n",
708
+ " <td>2023-07-01 03:00:00</td>\n",
709
+ " <td>0.0218</td>\n",
710
+ " <td>0.0192</td>\n",
711
+ " <td>24.0</td>\n",
712
+ " <td>21.0</td>\n",
713
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
714
+ " <td>2023.0</td>\n",
715
+ " <td>7.0</td>\n",
716
+ " <td>1.0</td>\n",
717
+ " <td>3.0</td>\n",
718
+ " </tr>\n",
719
+ " <tr>\n",
720
+ " <th>3</th>\n",
721
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
722
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
723
+ " <td>111121</td>\n",
724
+ " <td>์ค‘๊ตฌ</td>\n",
725
+ " <td>2023-07-01 04:00:00</td>\n",
726
+ " <td>0.0131</td>\n",
727
+ " <td>0.0214</td>\n",
728
+ " <td>25.0</td>\n",
729
+ " <td>19.0</td>\n",
730
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
731
+ " <td>2023.0</td>\n",
732
+ " <td>7.0</td>\n",
733
+ " <td>1.0</td>\n",
734
+ " <td>4.0</td>\n",
735
+ " </tr>\n",
736
+ " <tr>\n",
737
+ " <th>4</th>\n",
738
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
739
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
740
+ " <td>111121</td>\n",
741
+ " <td>์ค‘๊ตฌ</td>\n",
742
+ " <td>2023-07-01 05:00:00</td>\n",
743
+ " <td>0.0131</td>\n",
744
+ " <td>0.0160</td>\n",
745
+ " <td>25.0</td>\n",
746
+ " <td>21.0</td>\n",
747
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
748
+ " <td>2023.0</td>\n",
749
+ " <td>7.0</td>\n",
750
+ " <td>1.0</td>\n",
751
+ " <td>5.0</td>\n",
752
+ " </tr>\n",
753
+ " <tr>\n",
754
+ " <th>5</th>\n",
755
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
756
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
757
+ " <td>111121</td>\n",
758
+ " <td>์ค‘๊ตฌ</td>\n",
759
+ " <td>2023-07-01 06:00:00</td>\n",
760
+ " <td>0.0115</td>\n",
761
+ " <td>0.0196</td>\n",
762
+ " <td>23.0</td>\n",
763
+ " <td>18.0</td>\n",
764
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
765
+ " <td>2023.0</td>\n",
766
+ " <td>7.0</td>\n",
767
+ " <td>1.0</td>\n",
768
+ " <td>6.0</td>\n",
769
+ " </tr>\n",
770
+ " <tr>\n",
771
+ " <th>6</th>\n",
772
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
773
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
774
+ " <td>111121</td>\n",
775
+ " <td>์ค‘๊ตฌ</td>\n",
776
+ " <td>2023-07-01 07:00:00</td>\n",
777
+ " <td>0.0094</td>\n",
778
+ " <td>0.0230</td>\n",
779
+ " <td>26.0</td>\n",
780
+ " <td>21.0</td>\n",
781
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
782
+ " <td>2023.0</td>\n",
783
+ " <td>7.0</td>\n",
784
+ " <td>1.0</td>\n",
785
+ " <td>7.0</td>\n",
786
+ " </tr>\n",
787
+ " <tr>\n",
788
+ " <th>7</th>\n",
789
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
790
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
791
+ " <td>111121</td>\n",
792
+ " <td>์ค‘๊ตฌ</td>\n",
793
+ " <td>2023-07-01 08:00:00</td>\n",
794
+ " <td>0.0222</td>\n",
795
+ " <td>0.0175</td>\n",
796
+ " <td>26.0</td>\n",
797
+ " <td>20.0</td>\n",
798
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
799
+ " <td>2023.0</td>\n",
800
+ " <td>7.0</td>\n",
801
+ " <td>1.0</td>\n",
802
+ " <td>8.0</td>\n",
803
+ " </tr>\n",
804
+ " <tr>\n",
805
+ " <th>8</th>\n",
806
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
807
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
808
+ " <td>111121</td>\n",
809
+ " <td>์ค‘๊ตฌ</td>\n",
810
+ " <td>2023-07-01 09:00:00</td>\n",
811
+ " <td>0.0396</td>\n",
812
+ " <td>0.0153</td>\n",
813
+ " <td>27.0</td>\n",
814
+ " <td>20.0</td>\n",
815
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
816
+ " <td>2023.0</td>\n",
817
+ " <td>7.0</td>\n",
818
+ " <td>1.0</td>\n",
819
+ " <td>9.0</td>\n",
820
+ " </tr>\n",
821
+ " <tr>\n",
822
+ " <th>9</th>\n",
823
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
824
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
825
+ " <td>111121</td>\n",
826
+ " <td>์ค‘๊ตฌ</td>\n",
827
+ " <td>2023-07-01 10:00:00</td>\n",
828
+ " <td>0.0530</td>\n",
829
+ " <td>0.0105</td>\n",
830
+ " <td>19.0</td>\n",
831
+ " <td>16.0</td>\n",
832
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
833
+ " <td>2023.0</td>\n",
834
+ " <td>7.0</td>\n",
835
+ " <td>1.0</td>\n",
836
+ " <td>10.0</td>\n",
837
+ " </tr>\n",
838
+ " <tr>\n",
839
+ " <th>10</th>\n",
840
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
841
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
842
+ " <td>111121</td>\n",
843
+ " <td>์ค‘๊ตฌ</td>\n",
844
+ " <td>2023-07-01 11:00:00</td>\n",
845
+ " <td>0.0607</td>\n",
846
+ " <td>0.0090</td>\n",
847
+ " <td>20.0</td>\n",
848
+ " <td>20.0</td>\n",
849
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
850
+ " <td>2023.0</td>\n",
851
+ " <td>7.0</td>\n",
852
+ " <td>1.0</td>\n",
853
+ " <td>11.0</td>\n",
854
+ " </tr>\n",
855
+ " <tr>\n",
856
+ " <th>11</th>\n",
857
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
858
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
859
+ " <td>111121</td>\n",
860
+ " <td>์ค‘๊ตฌ</td>\n",
861
+ " <td>2023-07-01 12:00:00</td>\n",
862
+ " <td>0.0688</td>\n",
863
+ " <td>0.0114</td>\n",
864
+ " <td>20.0</td>\n",
865
+ " <td>17.0</td>\n",
866
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
867
+ " <td>2023.0</td>\n",
868
+ " <td>7.0</td>\n",
869
+ " <td>1.0</td>\n",
870
+ " <td>12.0</td>\n",
871
+ " </tr>\n",
872
+ " <tr>\n",
873
+ " <th>12</th>\n",
874
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
875
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
876
+ " <td>111121</td>\n",
877
+ " <td>์ค‘๊ตฌ</td>\n",
878
+ " <td>2023-07-01 13:00:00</td>\n",
879
+ " <td>0.0758</td>\n",
880
+ " <td>0.0101</td>\n",
881
+ " <td>23.0</td>\n",
882
+ " <td>17.0</td>\n",
883
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
884
+ " <td>2023.0</td>\n",
885
+ " <td>7.0</td>\n",
886
+ " <td>1.0</td>\n",
887
+ " <td>13.0</td>\n",
888
+ " </tr>\n",
889
+ " <tr>\n",
890
+ " <th>13</th>\n",
891
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
892
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
893
+ " <td>111121</td>\n",
894
+ " <td>์ค‘๊ตฌ</td>\n",
895
+ " <td>2023-07-01 14:00:00</td>\n",
896
+ " <td>0.0743</td>\n",
897
+ " <td>0.0093</td>\n",
898
+ " <td>20.0</td>\n",
899
+ " <td>17.0</td>\n",
900
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
901
+ " <td>2023.0</td>\n",
902
+ " <td>7.0</td>\n",
903
+ " <td>1.0</td>\n",
904
+ " <td>14.0</td>\n",
905
+ " </tr>\n",
906
+ " <tr>\n",
907
+ " <th>14</th>\n",
908
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
909
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
910
+ " <td>111121</td>\n",
911
+ " <td>์ค‘๊ตฌ</td>\n",
912
+ " <td>2023-07-01 15:00:00</td>\n",
913
+ " <td>0.0749</td>\n",
914
+ " <td>0.0100</td>\n",
915
+ " <td>19.0</td>\n",
916
+ " <td>11.0</td>\n",
917
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
918
+ " <td>2023.0</td>\n",
919
+ " <td>7.0</td>\n",
920
+ " <td>1.0</td>\n",
921
+ " <td>15.0</td>\n",
922
+ " </tr>\n",
923
+ " <tr>\n",
924
+ " <th>15</th>\n",
925
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
926
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
927
+ " <td>111121</td>\n",
928
+ " <td>์ค‘๊ตฌ</td>\n",
929
+ " <td>2023-07-01 16:00:00</td>\n",
930
+ " <td>0.0716</td>\n",
931
+ " <td>0.0092</td>\n",
932
+ " <td>19.0</td>\n",
933
+ " <td>15.0</td>\n",
934
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
935
+ " <td>2023.0</td>\n",
936
+ " <td>7.0</td>\n",
937
+ " <td>1.0</td>\n",
938
+ " <td>16.0</td>\n",
939
+ " </tr>\n",
940
+ " <tr>\n",
941
+ " <th>16</th>\n",
942
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
943
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
944
+ " <td>111121</td>\n",
945
+ " <td>์ค‘๊ตฌ</td>\n",
946
+ " <td>2023-07-01 17:00:00</td>\n",
947
+ " <td>0.0613</td>\n",
948
+ " <td>0.0099</td>\n",
949
+ " <td>18.0</td>\n",
950
+ " <td>15.0</td>\n",
951
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
952
+ " <td>2023.0</td>\n",
953
+ " <td>7.0</td>\n",
954
+ " <td>1.0</td>\n",
955
+ " <td>17.0</td>\n",
956
+ " </tr>\n",
957
+ " <tr>\n",
958
+ " <th>17</th>\n",
959
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
960
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
961
+ " <td>111121</td>\n",
962
+ " <td>์ค‘๊ตฌ</td>\n",
963
+ " <td>2023-07-01 18:00:00</td>\n",
964
+ " <td>0.0496</td>\n",
965
+ " <td>0.0098</td>\n",
966
+ " <td>18.0</td>\n",
967
+ " <td>14.0</td>\n",
968
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
969
+ " <td>2023.0</td>\n",
970
+ " <td>7.0</td>\n",
971
+ " <td>1.0</td>\n",
972
+ " <td>18.0</td>\n",
973
+ " </tr>\n",
974
+ " <tr>\n",
975
+ " <th>18</th>\n",
976
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
977
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
978
+ " <td>111121</td>\n",
979
+ " <td>์ค‘๊ตฌ</td>\n",
980
+ " <td>2023-07-01 19:00:00</td>\n",
981
+ " <td>0.0473</td>\n",
982
+ " <td>0.0124</td>\n",
983
+ " <td>17.0</td>\n",
984
+ " <td>17.0</td>\n",
985
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
986
+ " <td>2023.0</td>\n",
987
+ " <td>7.0</td>\n",
988
+ " <td>1.0</td>\n",
989
+ " <td>19.0</td>\n",
990
+ " </tr>\n",
991
+ " <tr>\n",
992
+ " <th>19</th>\n",
993
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
994
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
995
+ " <td>111121</td>\n",
996
+ " <td>์ค‘๊ตฌ</td>\n",
997
+ " <td>2023-07-01 20:00:00</td>\n",
998
+ " <td>0.0498</td>\n",
999
+ " <td>0.0170</td>\n",
1000
+ " <td>17.0</td>\n",
1001
+ " <td>15.0</td>\n",
1002
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1003
+ " <td>2023.0</td>\n",
1004
+ " <td>7.0</td>\n",
1005
+ " <td>1.0</td>\n",
1006
+ " <td>20.0</td>\n",
1007
+ " </tr>\n",
1008
+ " <tr>\n",
1009
+ " <th>20</th>\n",
1010
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1011
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1012
+ " <td>111121</td>\n",
1013
+ " <td>์ค‘๊ตฌ</td>\n",
1014
+ " <td>2023-07-01 21:00:00</td>\n",
1015
+ " <td>0.0616</td>\n",
1016
+ " <td>0.0134</td>\n",
1017
+ " <td>23.0</td>\n",
1018
+ " <td>20.0</td>\n",
1019
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1020
+ " <td>2023.0</td>\n",
1021
+ " <td>7.0</td>\n",
1022
+ " <td>1.0</td>\n",
1023
+ " <td>21.0</td>\n",
1024
+ " </tr>\n",
1025
+ " <tr>\n",
1026
+ " <th>21</th>\n",
1027
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1028
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1029
+ " <td>111121</td>\n",
1030
+ " <td>์ค‘๊ตฌ</td>\n",
1031
+ " <td>2023-07-01 22:00:00</td>\n",
1032
+ " <td>0.0543</td>\n",
1033
+ " <td>0.0109</td>\n",
1034
+ " <td>18.0</td>\n",
1035
+ " <td>16.0</td>\n",
1036
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1037
+ " <td>2023.0</td>\n",
1038
+ " <td>7.0</td>\n",
1039
+ " <td>1.0</td>\n",
1040
+ " <td>22.0</td>\n",
1041
+ " </tr>\n",
1042
+ " <tr>\n",
1043
+ " <th>22</th>\n",
1044
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1045
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1046
+ " <td>111121</td>\n",
1047
+ " <td>์ค‘๊ตฌ</td>\n",
1048
+ " <td>2023-07-01 23:00:00</td>\n",
1049
+ " <td>0.0507</td>\n",
1050
+ " <td>0.0113</td>\n",
1051
+ " <td>17.0</td>\n",
1052
+ " <td>16.0</td>\n",
1053
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1054
+ " <td>2023.0</td>\n",
1055
+ " <td>7.0</td>\n",
1056
+ " <td>1.0</td>\n",
1057
+ " <td>23.0</td>\n",
1058
+ " </tr>\n",
1059
+ " <tr>\n",
1060
+ " <th>23</th>\n",
1061
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1062
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1063
+ " <td>111121</td>\n",
1064
+ " <td>์ค‘๊ตฌ</td>\n",
1065
+ " <td>NaT</td>\n",
1066
+ " <td>0.0427</td>\n",
1067
+ " <td>0.0125</td>\n",
1068
+ " <td>17.0</td>\n",
1069
+ " <td>16.0</td>\n",
1070
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1071
+ " <td>NaN</td>\n",
1072
+ " <td>NaN</td>\n",
1073
+ " <td>NaN</td>\n",
1074
+ " <td>NaN</td>\n",
1075
+ " </tr>\n",
1076
+ " <tr>\n",
1077
+ " <th>24</th>\n",
1078
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1079
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1080
+ " <td>111121</td>\n",
1081
+ " <td>์ค‘๊ตฌ</td>\n",
1082
+ " <td>2023-07-02 01:00:00</td>\n",
1083
+ " <td>0.0334</td>\n",
1084
+ " <td>0.0148</td>\n",
1085
+ " <td>21.0</td>\n",
1086
+ " <td>20.0</td>\n",
1087
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1088
+ " <td>2023.0</td>\n",
1089
+ " <td>7.0</td>\n",
1090
+ " <td>2.0</td>\n",
1091
+ " <td>1.0</td>\n",
1092
+ " </tr>\n",
1093
+ " <tr>\n",
1094
+ " <th>25</th>\n",
1095
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1096
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1097
+ " <td>111121</td>\n",
1098
+ " <td>์ค‘๊ตฌ</td>\n",
1099
+ " <td>2023-07-02 02:00:00</td>\n",
1100
+ " <td>0.0337</td>\n",
1101
+ " <td>0.0133</td>\n",
1102
+ " <td>22.0</td>\n",
1103
+ " <td>18.0</td>\n",
1104
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1105
+ " <td>2023.0</td>\n",
1106
+ " <td>7.0</td>\n",
1107
+ " <td>2.0</td>\n",
1108
+ " <td>2.0</td>\n",
1109
+ " </tr>\n",
1110
+ " <tr>\n",
1111
+ " <th>26</th>\n",
1112
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1113
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1114
+ " <td>111121</td>\n",
1115
+ " <td>์ค‘๊ตฌ</td>\n",
1116
+ " <td>2023-07-02 03:00:00</td>\n",
1117
+ " <td>0.0260</td>\n",
1118
+ " <td>0.0162</td>\n",
1119
+ " <td>25.0</td>\n",
1120
+ " <td>20.0</td>\n",
1121
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1122
+ " <td>2023.0</td>\n",
1123
+ " <td>7.0</td>\n",
1124
+ " <td>2.0</td>\n",
1125
+ " <td>3.0</td>\n",
1126
+ " </tr>\n",
1127
+ " <tr>\n",
1128
+ " <th>27</th>\n",
1129
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1130
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1131
+ " <td>111121</td>\n",
1132
+ " <td>์ค‘๊ตฌ</td>\n",
1133
+ " <td>2023-07-02 04:00:00</td>\n",
1134
+ " <td>0.0195</td>\n",
1135
+ " <td>0.0179</td>\n",
1136
+ " <td>22.0</td>\n",
1137
+ " <td>18.0</td>\n",
1138
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1139
+ " <td>2023.0</td>\n",
1140
+ " <td>7.0</td>\n",
1141
+ " <td>2.0</td>\n",
1142
+ " <td>4.0</td>\n",
1143
+ " </tr>\n",
1144
+ " <tr>\n",
1145
+ " <th>28</th>\n",
1146
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1147
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1148
+ " <td>111121</td>\n",
1149
+ " <td>์ค‘๊ตฌ</td>\n",
1150
+ " <td>2023-07-02 05:00:00</td>\n",
1151
+ " <td>0.0171</td>\n",
1152
+ " <td>0.0170</td>\n",
1153
+ " <td>19.0</td>\n",
1154
+ " <td>17.0</td>\n",
1155
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1156
+ " <td>2023.0</td>\n",
1157
+ " <td>7.0</td>\n",
1158
+ " <td>2.0</td>\n",
1159
+ " <td>5.0</td>\n",
1160
+ " </tr>\n",
1161
+ " <tr>\n",
1162
+ " <th>29</th>\n",
1163
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1164
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1165
+ " <td>111121</td>\n",
1166
+ " <td>์ค‘๊ตฌ</td>\n",
1167
+ " <td>2023-07-02 06:00:00</td>\n",
1168
+ " <td>0.0181</td>\n",
1169
+ " <td>0.0145</td>\n",
1170
+ " <td>14.0</td>\n",
1171
+ " <td>10.0</td>\n",
1172
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1173
+ " <td>2023.0</td>\n",
1174
+ " <td>7.0</td>\n",
1175
+ " <td>2.0</td>\n",
1176
+ " <td>6.0</td>\n",
1177
+ " </tr>\n",
1178
+ " <tr>\n",
1179
+ " <th>30</th>\n",
1180
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1181
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1182
+ " <td>111121</td>\n",
1183
+ " <td>์ค‘๊ตฌ</td>\n",
1184
+ " <td>2023-07-02 07:00:00</td>\n",
1185
+ " <td>0.0174</td>\n",
1186
+ " <td>0.0156</td>\n",
1187
+ " <td>11.0</td>\n",
1188
+ " <td>10.0</td>\n",
1189
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1190
+ " <td>2023.0</td>\n",
1191
+ " <td>7.0</td>\n",
1192
+ " <td>2.0</td>\n",
1193
+ " <td>7.0</td>\n",
1194
+ " </tr>\n",
1195
+ " <tr>\n",
1196
+ " <th>31</th>\n",
1197
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1198
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1199
+ " <td>111121</td>\n",
1200
+ " <td>์ค‘๊ตฌ</td>\n",
1201
+ " <td>2023-07-02 08:00:00</td>\n",
1202
+ " <td>0.0213</td>\n",
1203
+ " <td>0.0147</td>\n",
1204
+ " <td>12.0</td>\n",
1205
+ " <td>9.0</td>\n",
1206
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1207
+ " <td>2023.0</td>\n",
1208
+ " <td>7.0</td>\n",
1209
+ " <td>2.0</td>\n",
1210
+ " <td>8.0</td>\n",
1211
+ " </tr>\n",
1212
+ " <tr>\n",
1213
+ " <th>32</th>\n",
1214
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1215
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1216
+ " <td>111121</td>\n",
1217
+ " <td>์ค‘๊ตฌ</td>\n",
1218
+ " <td>2023-07-02 09:00:00</td>\n",
1219
+ " <td>0.0267</td>\n",
1220
+ " <td>0.0143</td>\n",
1221
+ " <td>11.0</td>\n",
1222
+ " <td>10.0</td>\n",
1223
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1224
+ " <td>2023.0</td>\n",
1225
+ " <td>7.0</td>\n",
1226
+ " <td>2.0</td>\n",
1227
+ " <td>9.0</td>\n",
1228
+ " </tr>\n",
1229
+ " <tr>\n",
1230
+ " <th>33</th>\n",
1231
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1232
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1233
+ " <td>111121</td>\n",
1234
+ " <td>์ค‘๊ตฌ</td>\n",
1235
+ " <td>2023-07-02 10:00:00</td>\n",
1236
+ " <td>0.0289</td>\n",
1237
+ " <td>0.0155</td>\n",
1238
+ " <td>12.0</td>\n",
1239
+ " <td>9.0</td>\n",
1240
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1241
+ " <td>2023.0</td>\n",
1242
+ " <td>7.0</td>\n",
1243
+ " <td>2.0</td>\n",
1244
+ " <td>10.0</td>\n",
1245
+ " </tr>\n",
1246
+ " <tr>\n",
1247
+ " <th>34</th>\n",
1248
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1249
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1250
+ " <td>111121</td>\n",
1251
+ " <td>์ค‘๊ตฌ</td>\n",
1252
+ " <td>2023-07-02 11:00:00</td>\n",
1253
+ " <td>0.0381</td>\n",
1254
+ " <td>0.0108</td>\n",
1255
+ " <td>13.0</td>\n",
1256
+ " <td>13.0</td>\n",
1257
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1258
+ " <td>2023.0</td>\n",
1259
+ " <td>7.0</td>\n",
1260
+ " <td>2.0</td>\n",
1261
+ " <td>11.0</td>\n",
1262
+ " </tr>\n",
1263
+ " <tr>\n",
1264
+ " <th>35</th>\n",
1265
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1266
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1267
+ " <td>111121</td>\n",
1268
+ " <td>์ค‘๊ตฌ</td>\n",
1269
+ " <td>2023-07-02 12:00:00</td>\n",
1270
+ " <td>0.0441</td>\n",
1271
+ " <td>0.0079</td>\n",
1272
+ " <td>13.0</td>\n",
1273
+ " <td>12.0</td>\n",
1274
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1275
+ " <td>2023.0</td>\n",
1276
+ " <td>7.0</td>\n",
1277
+ " <td>2.0</td>\n",
1278
+ " <td>12.0</td>\n",
1279
+ " </tr>\n",
1280
+ " <tr>\n",
1281
+ " <th>36</th>\n",
1282
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1283
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1284
+ " <td>111121</td>\n",
1285
+ " <td>์ค‘๊ตฌ</td>\n",
1286
+ " <td>2023-07-02 13:00:00</td>\n",
1287
+ " <td>0.0489</td>\n",
1288
+ " <td>0.0067</td>\n",
1289
+ " <td>8.0</td>\n",
1290
+ " <td>10.0</td>\n",
1291
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1292
+ " <td>2023.0</td>\n",
1293
+ " <td>7.0</td>\n",
1294
+ " <td>2.0</td>\n",
1295
+ " <td>13.0</td>\n",
1296
+ " </tr>\n",
1297
+ " <tr>\n",
1298
+ " <th>37</th>\n",
1299
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1300
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1301
+ " <td>111121</td>\n",
1302
+ " <td>์ค‘๊ตฌ</td>\n",
1303
+ " <td>2023-07-02 14:00:00</td>\n",
1304
+ " <td>0.0498</td>\n",
1305
+ " <td>0.0072</td>\n",
1306
+ " <td>11.0</td>\n",
1307
+ " <td>10.0</td>\n",
1308
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1309
+ " <td>2023.0</td>\n",
1310
+ " <td>7.0</td>\n",
1311
+ " <td>2.0</td>\n",
1312
+ " <td>14.0</td>\n",
1313
+ " </tr>\n",
1314
+ " <tr>\n",
1315
+ " <th>38</th>\n",
1316
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1317
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1318
+ " <td>111121</td>\n",
1319
+ " <td>์ค‘๊ตฌ</td>\n",
1320
+ " <td>2023-07-02 15:00:00</td>\n",
1321
+ " <td>0.0459</td>\n",
1322
+ " <td>0.0073</td>\n",
1323
+ " <td>14.0</td>\n",
1324
+ " <td>12.0</td>\n",
1325
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1326
+ " <td>2023.0</td>\n",
1327
+ " <td>7.0</td>\n",
1328
+ " <td>2.0</td>\n",
1329
+ " <td>15.0</td>\n",
1330
+ " </tr>\n",
1331
+ " <tr>\n",
1332
+ " <th>39</th>\n",
1333
+ " <td>์„œ์šธ ์ค‘๊ตฌ</td>\n",
1334
+ " <td>๋„์‹œ๋Œ€๊ธฐ</td>\n",
1335
+ " <td>111121</td>\n",
1336
+ " <td>์ค‘๊ตฌ</td>\n",
1337
+ " <td>2023-07-02 16:00:00</td>\n",
1338
+ " <td>0.0474</td>\n",
1339
+ " <td>0.0079</td>\n",
1340
+ " <td>12.0</td>\n",
1341
+ " <td>11.0</td>\n",
1342
+ " <td>์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15</td>\n",
1343
+ " <td>2023.0</td>\n",
1344
+ " <td>7.0</td>\n",
1345
+ " <td>2.0</td>\n",
1346
+ " <td>16.0</td>\n",
1347
+ " </tr>\n",
1348
+ " </tbody>\n",
1349
+ "</table>\n",
1350
+ "</div>"
1351
+ ],
1352
+ "text/plain": [
1353
+ " ์ง€์—ญ ๋ง ์ธก์ •์†Œ์ฝ”๋“œ ์ธก์ •์†Œ๋ช… ์ธก์ •์ผ์‹œ O3 NO2 PM10 PM25 \\\n",
1354
+ "0 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 01:00:00 0.0249 0.0188 21.0 19.0 \n",
1355
+ "1 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 02:00:00 0.0263 0.0163 18.0 15.0 \n",
1356
+ "2 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 03:00:00 0.0218 0.0192 24.0 21.0 \n",
1357
+ "3 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 04:00:00 0.0131 0.0214 25.0 19.0 \n",
1358
+ "4 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 05:00:00 0.0131 0.0160 25.0 21.0 \n",
1359
+ "5 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 06:00:00 0.0115 0.0196 23.0 18.0 \n",
1360
+ "6 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 07:00:00 0.0094 0.0230 26.0 21.0 \n",
1361
+ "7 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 08:00:00 0.0222 0.0175 26.0 20.0 \n",
1362
+ "8 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 09:00:00 0.0396 0.0153 27.0 20.0 \n",
1363
+ "9 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 10:00:00 0.0530 0.0105 19.0 16.0 \n",
1364
+ "10 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 11:00:00 0.0607 0.0090 20.0 20.0 \n",
1365
+ "11 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 12:00:00 0.0688 0.0114 20.0 17.0 \n",
1366
+ "12 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 13:00:00 0.0758 0.0101 23.0 17.0 \n",
1367
+ "13 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 14:00:00 0.0743 0.0093 20.0 17.0 \n",
1368
+ "14 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 15:00:00 0.0749 0.0100 19.0 11.0 \n",
1369
+ "15 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 16:00:00 0.0716 0.0092 19.0 15.0 \n",
1370
+ "16 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 17:00:00 0.0613 0.0099 18.0 15.0 \n",
1371
+ "17 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 18:00:00 0.0496 0.0098 18.0 14.0 \n",
1372
+ "18 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 19:00:00 0.0473 0.0124 17.0 17.0 \n",
1373
+ "19 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 20:00:00 0.0498 0.0170 17.0 15.0 \n",
1374
+ "20 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 21:00:00 0.0616 0.0134 23.0 20.0 \n",
1375
+ "21 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 22:00:00 0.0543 0.0109 18.0 16.0 \n",
1376
+ "22 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-01 23:00:00 0.0507 0.0113 17.0 16.0 \n",
1377
+ "23 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ NaT 0.0427 0.0125 17.0 16.0 \n",
1378
+ "24 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 01:00:00 0.0334 0.0148 21.0 20.0 \n",
1379
+ "25 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 02:00:00 0.0337 0.0133 22.0 18.0 \n",
1380
+ "26 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 03:00:00 0.0260 0.0162 25.0 20.0 \n",
1381
+ "27 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 04:00:00 0.0195 0.0179 22.0 18.0 \n",
1382
+ "28 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 05:00:00 0.0171 0.0170 19.0 17.0 \n",
1383
+ "29 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 06:00:00 0.0181 0.0145 14.0 10.0 \n",
1384
+ "30 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 07:00:00 0.0174 0.0156 11.0 10.0 \n",
1385
+ "31 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 08:00:00 0.0213 0.0147 12.0 9.0 \n",
1386
+ "32 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 09:00:00 0.0267 0.0143 11.0 10.0 \n",
1387
+ "33 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 10:00:00 0.0289 0.0155 12.0 9.0 \n",
1388
+ "34 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 11:00:00 0.0381 0.0108 13.0 13.0 \n",
1389
+ "35 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 12:00:00 0.0441 0.0079 13.0 12.0 \n",
1390
+ "36 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 13:00:00 0.0489 0.0067 8.0 10.0 \n",
1391
+ "37 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 14:00:00 0.0498 0.0072 11.0 10.0 \n",
1392
+ "38 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 15:00:00 0.0459 0.0073 14.0 12.0 \n",
1393
+ "39 ์„œ์šธ ์ค‘๊ตฌ ๋„์‹œ๋Œ€๊ธฐ 111121 ์ค‘๊ตฌ 2023-07-02 16:00:00 0.0474 0.0079 12.0 11.0 \n",
1394
+ "\n",
1395
+ " ์ฃผ์†Œ year month day hour \n",
1396
+ "0 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 1.0 \n",
1397
+ "1 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 2.0 \n",
1398
+ "2 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 3.0 \n",
1399
+ "3 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 4.0 \n",
1400
+ "4 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 5.0 \n",
1401
+ "5 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 6.0 \n",
1402
+ "6 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 7.0 \n",
1403
+ "7 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 8.0 \n",
1404
+ "8 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 9.0 \n",
1405
+ "9 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 10.0 \n",
1406
+ "10 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 11.0 \n",
1407
+ "11 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 12.0 \n",
1408
+ "12 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 13.0 \n",
1409
+ "13 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 14.0 \n",
1410
+ "14 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 15.0 \n",
1411
+ "15 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 16.0 \n",
1412
+ "16 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 17.0 \n",
1413
+ "17 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 18.0 \n",
1414
+ "18 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 19.0 \n",
1415
+ "19 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 20.0 \n",
1416
+ "20 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 21.0 \n",
1417
+ "21 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 22.0 \n",
1418
+ "22 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 1.0 23.0 \n",
1419
+ "23 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 NaN NaN NaN NaN \n",
1420
+ "24 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 1.0 \n",
1421
+ "25 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 2.0 \n",
1422
+ "26 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 3.0 \n",
1423
+ "27 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 4.0 \n",
1424
+ "28 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 5.0 \n",
1425
+ "29 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 6.0 \n",
1426
+ "30 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 7.0 \n",
1427
+ "31 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 8.0 \n",
1428
+ "32 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 9.0 \n",
1429
+ "33 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 10.0 \n",
1430
+ "34 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 11.0 \n",
1431
+ "35 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 12.0 \n",
1432
+ "36 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 13.0 \n",
1433
+ "37 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 14.0 \n",
1434
+ "38 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 15.0 \n",
1435
+ "39 ์„œ์šธ ์ค‘๊ตฌ ๋•์ˆ˜๊ถ๊ธธ 15 2023.0 7.0 2.0 16.0 "
1436
+ ]
1437
+ },
1438
+ "execution_count": 6,
1439
+ "metadata": {},
1440
+ "output_type": "execute_result"
1441
+ }
1442
+ ],
1443
+ "source": [
1444
+ "data.head(40)"
1445
+ ]
1446
+ }
1447
+ ],
1448
+ "metadata": {
1449
+ "kernelspec": {
1450
+ "display_name": "py39",
1451
+ "language": "python",
1452
+ "name": "python3"
1453
+ },
1454
+ "language_info": {
1455
+ "codemirror_mode": {
1456
+ "name": "ipython",
1457
+ "version": 3
1458
+ },
1459
+ "file_extension": ".py",
1460
+ "mimetype": "text/x-python",
1461
+ "name": "python",
1462
+ "nbconvert_exporter": "python",
1463
+ "pygments_lexer": "ipython3",
1464
+ "version": "3.9.18"
1465
+ }
1466
+ },
1467
+ "nbformat": 4,
1468
+ "nbformat_minor": 4
1469
+ }
Analysis_code/1.data_preprocessing/1.data_merge.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/1.data_preprocessing/3.make_train_test.ipynb ADDED
@@ -0,0 +1,1099 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "import matplotlib.pyplot as plt\n",
12
+ "import seaborn as sns\n",
13
+ "from sklearn.model_selection import train_test_split\n",
14
+ "from collections import Counter"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "df_seoul = pd.read_feather(\"../../data/data_for_modeling/df_seoul.feather\")\n",
24
+ "df_busan = pd.read_feather(\"../../data/data_for_modeling/df_busan.feather\")\n",
25
+ "df_incheon = pd.read_feather(\"../../data/data_for_modeling/df_incheon.feather\")\n",
26
+ "df_daegu = pd.read_feather(\"../../data/data_for_modeling/df_daegu.feather\")\n",
27
+ "df_daejeon = pd.read_feather(\"../../data/data_for_modeling/df_daejeon.feather\")\n",
28
+ "df_gwangju = pd.read_feather(\"../../data/data_for_modeling/df_gwangju.feather\")"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "data": {
38
+ "text/plain": [
39
+ "Counter({2: 48534, 1: 3941, 0: 109})"
40
+ ]
41
+ },
42
+ "execution_count": 3,
43
+ "metadata": {},
44
+ "output_type": "execute_result"
45
+ }
46
+ ],
47
+ "source": [
48
+ "Counter(df_seoul['multi_class'])"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 4,
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/plain": [
59
+ "Counter({2: 50069, 1: 2350, 0: 165})"
60
+ ]
61
+ },
62
+ "execution_count": 4,
63
+ "metadata": {},
64
+ "output_type": "execute_result"
65
+ }
66
+ ],
67
+ "source": [
68
+ "Counter(df_busan['multi_class'])"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 5,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "data": {
78
+ "text/plain": [
79
+ "Counter({2: 44944, 1: 6658, 0: 982})"
80
+ ]
81
+ },
82
+ "execution_count": 5,
83
+ "metadata": {},
84
+ "output_type": "execute_result"
85
+ }
86
+ ],
87
+ "source": [
88
+ "Counter(df_incheon['multi_class'])"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "data": {
98
+ "text/plain": [
99
+ "Counter({2: 50919, 1: 1610, 0: 55})"
100
+ ]
101
+ },
102
+ "execution_count": 6,
103
+ "metadata": {},
104
+ "output_type": "execute_result"
105
+ }
106
+ ],
107
+ "source": [
108
+ "Counter(df_daegu['multi_class'])"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 7,
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "data": {
118
+ "text/plain": [
119
+ "Counter({2: 48047, 1: 4227, 0: 310})"
120
+ ]
121
+ },
122
+ "execution_count": 7,
123
+ "metadata": {},
124
+ "output_type": "execute_result"
125
+ }
126
+ ],
127
+ "source": [
128
+ "Counter(df_daejeon['multi_class'])"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 8,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "data": {
138
+ "text/plain": [
139
+ "Counter({2: 48405, 1: 4015, 0: 164})"
140
+ ]
141
+ },
142
+ "execution_count": 8,
143
+ "metadata": {},
144
+ "output_type": "execute_result"
145
+ }
146
+ ],
147
+ "source": [
148
+ "Counter(df_gwangju['multi_class'])"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 9,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "data": {
158
+ "text/plain": [
159
+ "(52584, 30)"
160
+ ]
161
+ },
162
+ "execution_count": 9,
163
+ "metadata": {},
164
+ "output_type": "execute_result"
165
+ }
166
+ ],
167
+ "source": [
168
+ "df_seoul.shape"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 10,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "df_seoul = df_seoul.loc[df_seoul['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
178
+ "df_busan = df_busan.loc[df_busan['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
179
+ "df_incheon = df_incheon.loc[df_incheon['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
180
+ "df_daegu = df_daegu.loc[df_daegu['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
181
+ "df_daejeon = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
182
+ "df_gwangju = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019, 2020, 2021]),:].copy()"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 11,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "cols = [col for col in df_seoul.columns if col != \"multi_class\"] + [\"multi_class\"]"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 12,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "df_seoul = df_seoul[cols]\n",
201
+ "df_busan = df_busan[cols]\n",
202
+ "df_incheon = df_incheon[cols]\n",
203
+ "df_daegu = df_daegu[cols]\n",
204
+ "df_daejeon = df_daejeon[cols]\n",
205
+ "df_gwangju = df_gwangju[cols]"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 13,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "df_seoul_train = df_seoul.loc[df_seoul['year'].isin([2018, 2019, 2020]),:].copy()\n",
215
+ "df_seoul_test = df_seoul.loc[df_seoul['year'].isin([2021]),:].copy()\n",
216
+ "\n",
217
+ "df_busan_train = df_busan.loc[df_busan['year'].isin([2018, 2019, 2020]),:].copy()\n",
218
+ "df_busan_test = df_busan.loc[df_busan['year'].isin([2021]),:].copy()\n",
219
+ "\n",
220
+ "df_incheon_train = df_incheon.loc[df_incheon['year'].isin([2018, 2019, 2020]),:].copy()\n",
221
+ "df_incheon_test = df_incheon.loc[df_incheon['year'].isin([2021]),:].copy()\n",
222
+ "\n",
223
+ "df_daegu_train = df_daegu.loc[df_daegu['year'].isin([2018, 2019, 2020]),:].copy()\n",
224
+ "df_daegu_test = df_daegu.loc[df_daegu['year'].isin([2021]),:].copy()\n",
225
+ "\n",
226
+ "df_daejeon_train = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019, 2020]),:].copy()\n",
227
+ "df_daejeon_test = df_daejeon.loc[df_daejeon['year'].isin([2021]),:].copy()\n",
228
+ "\n",
229
+ "df_gwangju_train = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019, 2020]),:].copy()\n",
230
+ "df_gwangju_test = df_gwangju.loc[df_gwangju['year'].isin([2021]),:].copy()"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 14,
236
+ "metadata": {},
237
+ "outputs": [
238
+ {
239
+ "data": {
240
+ "text/html": [
241
+ "<div>\n",
242
+ "<style scoped>\n",
243
+ " .dataframe tbody tr th:only-of-type {\n",
244
+ " vertical-align: middle;\n",
245
+ " }\n",
246
+ "\n",
247
+ " .dataframe tbody tr th {\n",
248
+ " vertical-align: top;\n",
249
+ " }\n",
250
+ "\n",
251
+ " .dataframe thead th {\n",
252
+ " text-align: right;\n",
253
+ " }\n",
254
+ "</style>\n",
255
+ "<table border=\"1\" class=\"dataframe\">\n",
256
+ " <thead>\n",
257
+ " <tr style=\"text-align: right;\">\n",
258
+ " <th></th>\n",
259
+ " <th>temp_C</th>\n",
260
+ " <th>precip_mm</th>\n",
261
+ " <th>wind_speed</th>\n",
262
+ " <th>wind_dir</th>\n",
263
+ " <th>hm</th>\n",
264
+ " <th>vap_pressure</th>\n",
265
+ " <th>dewpoint_C</th>\n",
266
+ " <th>loc_pressure</th>\n",
267
+ " <th>sea_pressure</th>\n",
268
+ " <th>solarRad</th>\n",
269
+ " <th>...</th>\n",
270
+ " <th>year</th>\n",
271
+ " <th>month</th>\n",
272
+ " <th>hour</th>\n",
273
+ " <th>ground_temp - temp_C</th>\n",
274
+ " <th>hour_sin</th>\n",
275
+ " <th>hour_cos</th>\n",
276
+ " <th>month_sin</th>\n",
277
+ " <th>month_cos</th>\n",
278
+ " <th>visi</th>\n",
279
+ " <th>multi_class</th>\n",
280
+ " </tr>\n",
281
+ " </thead>\n",
282
+ " <tbody>\n",
283
+ " <tr>\n",
284
+ " <th>0</th>\n",
285
+ " <td>1.2</td>\n",
286
+ " <td>0.0</td>\n",
287
+ " <td>1.6</td>\n",
288
+ " <td>360</td>\n",
289
+ " <td>35.0</td>\n",
290
+ " <td>2.3</td>\n",
291
+ " <td>-12.6</td>\n",
292
+ " <td>1015.8</td>\n",
293
+ " <td>1024.6</td>\n",
294
+ " <td>0.00</td>\n",
295
+ " <td>...</td>\n",
296
+ " <td>2018</td>\n",
297
+ " <td>1</td>\n",
298
+ " <td>0</td>\n",
299
+ " <td>-5.4</td>\n",
300
+ " <td>0.000000</td>\n",
301
+ " <td>1.000000e+00</td>\n",
302
+ " <td>0.5</td>\n",
303
+ " <td>0.866025</td>\n",
304
+ " <td>2000.0</td>\n",
305
+ " <td>2</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>1</th>\n",
309
+ " <td>0.5</td>\n",
310
+ " <td>0.0</td>\n",
311
+ " <td>1.3</td>\n",
312
+ " <td>360</td>\n",
313
+ " <td>33.0</td>\n",
314
+ " <td>2.1</td>\n",
315
+ " <td>-13.9</td>\n",
316
+ " <td>1015.5</td>\n",
317
+ " <td>1024.3</td>\n",
318
+ " <td>0.00</td>\n",
319
+ " <td>...</td>\n",
320
+ " <td>2018</td>\n",
321
+ " <td>1</td>\n",
322
+ " <td>1</td>\n",
323
+ " <td>-5.4</td>\n",
324
+ " <td>0.258819</td>\n",
325
+ " <td>9.659258e-01</td>\n",
326
+ " <td>0.5</td>\n",
327
+ " <td>0.866025</td>\n",
328
+ " <td>2000.0</td>\n",
329
+ " <td>2</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>2</th>\n",
333
+ " <td>0.1</td>\n",
334
+ " <td>0.0</td>\n",
335
+ " <td>1.5</td>\n",
336
+ " <td>20</td>\n",
337
+ " <td>34.0</td>\n",
338
+ " <td>2.1</td>\n",
339
+ " <td>-13.9</td>\n",
340
+ " <td>1015.7</td>\n",
341
+ " <td>1024.5</td>\n",
342
+ " <td>0.00</td>\n",
343
+ " <td>...</td>\n",
344
+ " <td>2018</td>\n",
345
+ " <td>1</td>\n",
346
+ " <td>2</td>\n",
347
+ " <td>-5.4</td>\n",
348
+ " <td>0.500000</td>\n",
349
+ " <td>8.660254e-01</td>\n",
350
+ " <td>0.5</td>\n",
351
+ " <td>0.866025</td>\n",
352
+ " <td>2000.0</td>\n",
353
+ " <td>2</td>\n",
354
+ " </tr>\n",
355
+ " <tr>\n",
356
+ " <th>3</th>\n",
357
+ " <td>0.0</td>\n",
358
+ " <td>0.0</td>\n",
359
+ " <td>2.1</td>\n",
360
+ " <td>320</td>\n",
361
+ " <td>37.0</td>\n",
362
+ " <td>2.3</td>\n",
363
+ " <td>-12.9</td>\n",
364
+ " <td>1015.9</td>\n",
365
+ " <td>1024.7</td>\n",
366
+ " <td>0.00</td>\n",
367
+ " <td>...</td>\n",
368
+ " <td>2018</td>\n",
369
+ " <td>1</td>\n",
370
+ " <td>3</td>\n",
371
+ " <td>-5.0</td>\n",
372
+ " <td>0.707107</td>\n",
373
+ " <td>7.071068e-01</td>\n",
374
+ " <td>0.5</td>\n",
375
+ " <td>0.866025</td>\n",
376
+ " <td>2000.0</td>\n",
377
+ " <td>2</td>\n",
378
+ " </tr>\n",
379
+ " <tr>\n",
380
+ " <th>4</th>\n",
381
+ " <td>-0.1</td>\n",
382
+ " <td>0.0</td>\n",
383
+ " <td>2.3</td>\n",
384
+ " <td>340</td>\n",
385
+ " <td>42.0</td>\n",
386
+ " <td>2.5</td>\n",
387
+ " <td>-11.5</td>\n",
388
+ " <td>1016.0</td>\n",
389
+ " <td>1024.9</td>\n",
390
+ " <td>0.00</td>\n",
391
+ " <td>...</td>\n",
392
+ " <td>2018</td>\n",
393
+ " <td>1</td>\n",
394
+ " <td>4</td>\n",
395
+ " <td>-4.3</td>\n",
396
+ " <td>0.866025</td>\n",
397
+ " <td>5.000000e-01</td>\n",
398
+ " <td>0.5</td>\n",
399
+ " <td>0.866025</td>\n",
400
+ " <td>2000.0</td>\n",
401
+ " <td>2</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>5</th>\n",
405
+ " <td>-0.1</td>\n",
406
+ " <td>0.0</td>\n",
407
+ " <td>2.8</td>\n",
408
+ " <td>50</td>\n",
409
+ " <td>43.0</td>\n",
410
+ " <td>2.6</td>\n",
411
+ " <td>-11.2</td>\n",
412
+ " <td>1016.0</td>\n",
413
+ " <td>1024.9</td>\n",
414
+ " <td>0.00</td>\n",
415
+ " <td>...</td>\n",
416
+ " <td>2018</td>\n",
417
+ " <td>1</td>\n",
418
+ " <td>5</td>\n",
419
+ " <td>-4.0</td>\n",
420
+ " <td>0.965926</td>\n",
421
+ " <td>2.588190e-01</td>\n",
422
+ " <td>0.5</td>\n",
423
+ " <td>0.866025</td>\n",
424
+ " <td>2000.0</td>\n",
425
+ " <td>2</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>6</th>\n",
429
+ " <td>-0.5</td>\n",
430
+ " <td>0.0</td>\n",
431
+ " <td>2.1</td>\n",
432
+ " <td>20</td>\n",
433
+ " <td>45.0</td>\n",
434
+ " <td>2.6</td>\n",
435
+ " <td>-11.0</td>\n",
436
+ " <td>1016.5</td>\n",
437
+ " <td>1025.4</td>\n",
438
+ " <td>0.00</td>\n",
439
+ " <td>...</td>\n",
440
+ " <td>2018</td>\n",
441
+ " <td>1</td>\n",
442
+ " <td>6</td>\n",
443
+ " <td>-4.1</td>\n",
444
+ " <td>1.000000</td>\n",
445
+ " <td>6.123234e-17</td>\n",
446
+ " <td>0.5</td>\n",
447
+ " <td>0.866025</td>\n",
448
+ " <td>2000.0</td>\n",
449
+ " <td>2</td>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>7</th>\n",
453
+ " <td>-0.8</td>\n",
454
+ " <td>0.0</td>\n",
455
+ " <td>2.5</td>\n",
456
+ " <td>340</td>\n",
457
+ " <td>45.0</td>\n",
458
+ " <td>2.6</td>\n",
459
+ " <td>-11.2</td>\n",
460
+ " <td>1017.1</td>\n",
461
+ " <td>1026.0</td>\n",
462
+ " <td>0.00</td>\n",
463
+ " <td>...</td>\n",
464
+ " <td>2018</td>\n",
465
+ " <td>1</td>\n",
466
+ " <td>7</td>\n",
467
+ " <td>-4.5</td>\n",
468
+ " <td>0.965926</td>\n",
469
+ " <td>-2.588190e-01</td>\n",
470
+ " <td>0.5</td>\n",
471
+ " <td>0.866025</td>\n",
472
+ " <td>2000.0</td>\n",
473
+ " <td>2</td>\n",
474
+ " </tr>\n",
475
+ " <tr>\n",
476
+ " <th>8</th>\n",
477
+ " <td>-0.5</td>\n",
478
+ " <td>0.0</td>\n",
479
+ " <td>1.2</td>\n",
480
+ " <td>360</td>\n",
481
+ " <td>43.0</td>\n",
482
+ " <td>2.5</td>\n",
483
+ " <td>-11.5</td>\n",
484
+ " <td>1017.4</td>\n",
485
+ " <td>1026.3</td>\n",
486
+ " <td>0.03</td>\n",
487
+ " <td>...</td>\n",
488
+ " <td>2018</td>\n",
489
+ " <td>1</td>\n",
490
+ " <td>8</td>\n",
491
+ " <td>-4.0</td>\n",
492
+ " <td>0.866025</td>\n",
493
+ " <td>-5.000000e-01</td>\n",
494
+ " <td>0.5</td>\n",
495
+ " <td>0.866025</td>\n",
496
+ " <td>2000.0</td>\n",
497
+ " <td>2</td>\n",
498
+ " </tr>\n",
499
+ " <tr>\n",
500
+ " <th>9</th>\n",
501
+ " <td>1.7</td>\n",
502
+ " <td>0.0</td>\n",
503
+ " <td>2.1</td>\n",
504
+ " <td>20</td>\n",
505
+ " <td>39.0</td>\n",
506
+ " <td>2.7</td>\n",
507
+ " <td>-10.8</td>\n",
508
+ " <td>1018.1</td>\n",
509
+ " <td>1026.9</td>\n",
510
+ " <td>0.46</td>\n",
511
+ " <td>...</td>\n",
512
+ " <td>2018</td>\n",
513
+ " <td>1</td>\n",
514
+ " <td>9</td>\n",
515
+ " <td>2.8</td>\n",
516
+ " <td>0.707107</td>\n",
517
+ " <td>-7.071068e-01</td>\n",
518
+ " <td>0.5</td>\n",
519
+ " <td>0.866025</td>\n",
520
+ " <td>1953.0</td>\n",
521
+ " <td>2</td>\n",
522
+ " </tr>\n",
523
+ " </tbody>\n",
524
+ "</table>\n",
525
+ "<p>10 rows × 30 columns</p>\n",
526
+ "</div>"
527
+ ],
528
+ "text/plain": [
529
+ " temp_C precip_mm wind_speed wind_dir hm vap_pressure dewpoint_C \\\n",
530
+ "0 1.2 0.0 1.6 360 35.0 2.3 -12.6 \n",
531
+ "1 0.5 0.0 1.3 360 33.0 2.1 -13.9 \n",
532
+ "2 0.1 0.0 1.5 20 34.0 2.1 -13.9 \n",
533
+ "3 0.0 0.0 2.1 320 37.0 2.3 -12.9 \n",
534
+ "4 -0.1 0.0 2.3 340 42.0 2.5 -11.5 \n",
535
+ "5 -0.1 0.0 2.8 50 43.0 2.6 -11.2 \n",
536
+ "6 -0.5 0.0 2.1 20 45.0 2.6 -11.0 \n",
537
+ "7 -0.8 0.0 2.5 340 45.0 2.6 -11.2 \n",
538
+ "8 -0.5 0.0 1.2 360 43.0 2.5 -11.5 \n",
539
+ "9 1.7 0.0 2.1 20 39.0 2.7 -10.8 \n",
540
+ "\n",
541
+ " loc_pressure sea_pressure solarRad ... year month hour \\\n",
542
+ "0 1015.8 1024.6 0.00 ... 2018 1 0 \n",
543
+ "1 1015.5 1024.3 0.00 ... 2018 1 1 \n",
544
+ "2 1015.7 1024.5 0.00 ... 2018 1 2 \n",
545
+ "3 1015.9 1024.7 0.00 ... 2018 1 3 \n",
546
+ "4 1016.0 1024.9 0.00 ... 2018 1 4 \n",
547
+ "5 1016.0 1024.9 0.00 ... 2018 1 5 \n",
548
+ "6 1016.5 1025.4 0.00 ... 2018 1 6 \n",
549
+ "7 1017.1 1026.0 0.00 ... 2018 1 7 \n",
550
+ "8 1017.4 1026.3 0.03 ... 2018 1 8 \n",
551
+ "9 1018.1 1026.9 0.46 ... 2018 1 9 \n",
552
+ "\n",
553
+ " ground_temp - temp_C hour_sin hour_cos month_sin month_cos visi \\\n",
554
+ "0 -5.4 0.000000 1.000000e+00 0.5 0.866025 2000.0 \n",
555
+ "1 -5.4 0.258819 9.659258e-01 0.5 0.866025 2000.0 \n",
556
+ "2 -5.4 0.500000 8.660254e-01 0.5 0.866025 2000.0 \n",
557
+ "3 -5.0 0.707107 7.071068e-01 0.5 0.866025 2000.0 \n",
558
+ "4 -4.3 0.866025 5.000000e-01 0.5 0.866025 2000.0 \n",
559
+ "5 -4.0 0.965926 2.588190e-01 0.5 0.866025 2000.0 \n",
560
+ "6 -4.1 1.000000 6.123234e-17 0.5 0.866025 2000.0 \n",
561
+ "7 -4.5 0.965926 -2.588190e-01 0.5 0.866025 2000.0 \n",
562
+ "8 -4.0 0.866025 -5.000000e-01 0.5 0.866025 2000.0 \n",
563
+ "9 2.8 0.707107 -7.071068e-01 0.5 0.866025 1953.0 \n",
564
+ "\n",
565
+ " multi_class \n",
566
+ "0 2 \n",
567
+ "1 2 \n",
568
+ "2 2 \n",
569
+ "3 2 \n",
570
+ "4 2 \n",
571
+ "5 2 \n",
572
+ "6 2 \n",
573
+ "7 2 \n",
574
+ "8 2 \n",
575
+ "9 2 \n",
576
+ "\n",
577
+ "[10 rows x 30 columns]"
578
+ ]
579
+ },
580
+ "execution_count": 14,
581
+ "metadata": {},
582
+ "output_type": "execute_result"
583
+ }
584
+ ],
585
+ "source": [
586
+ "df_busan_train.head(10)"
587
+ ]
588
+ },
589
+ {
590
+ "cell_type": "code",
591
+ "execution_count": 15,
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "data": {
596
+ "text/html": [
597
+ "<div>\n",
598
+ "<style scoped>\n",
599
+ " .dataframe tbody tr th:only-of-type {\n",
600
+ " vertical-align: middle;\n",
601
+ " }\n",
602
+ "\n",
603
+ " .dataframe tbody tr th {\n",
604
+ " vertical-align: top;\n",
605
+ " }\n",
606
+ "\n",
607
+ " .dataframe thead th {\n",
608
+ " text-align: right;\n",
609
+ " }\n",
610
+ "</style>\n",
611
+ "<table border=\"1\" class=\"dataframe\">\n",
612
+ " <thead>\n",
613
+ " <tr style=\"text-align: right;\">\n",
614
+ " <th></th>\n",
615
+ " <th>temp_C</th>\n",
616
+ " <th>precip_mm</th>\n",
617
+ " <th>wind_speed</th>\n",
618
+ " <th>wind_dir</th>\n",
619
+ " <th>hm</th>\n",
620
+ " <th>vap_pressure</th>\n",
621
+ " <th>dewpoint_C</th>\n",
622
+ " <th>loc_pressure</th>\n",
623
+ " <th>sea_pressure</th>\n",
624
+ " <th>solarRad</th>\n",
625
+ " <th>...</th>\n",
626
+ " <th>year</th>\n",
627
+ " <th>month</th>\n",
628
+ " <th>hour</th>\n",
629
+ " <th>ground_temp - temp_C</th>\n",
630
+ " <th>hour_sin</th>\n",
631
+ " <th>hour_cos</th>\n",
632
+ " <th>month_sin</th>\n",
633
+ " <th>month_cos</th>\n",
634
+ " <th>visi</th>\n",
635
+ " <th>multi_class</th>\n",
636
+ " </tr>\n",
637
+ " </thead>\n",
638
+ " <tbody>\n",
639
+ " <tr>\n",
640
+ " <th>26294</th>\n",
641
+ " <td>0.1</td>\n",
642
+ " <td>0.0</td>\n",
643
+ " <td>6.3</td>\n",
644
+ " <td>270</td>\n",
645
+ " <td>37.0</td>\n",
646
+ " <td>2.3</td>\n",
647
+ " <td>-12.9</td>\n",
648
+ " <td>1013.3</td>\n",
649
+ " <td>1022.1</td>\n",
650
+ " <td>2.07</td>\n",
651
+ " <td>...</td>\n",
652
+ " <td>2020</td>\n",
653
+ " <td>12</td>\n",
654
+ " <td>14</td>\n",
655
+ " <td>5.8</td>\n",
656
+ " <td>-0.500000</td>\n",
657
+ " <td>-8.660254e-01</td>\n",
658
+ " <td>-2.449294e-16</td>\n",
659
+ " <td>1.0</td>\n",
660
+ " <td>5000.0</td>\n",
661
+ " <td>2</td>\n",
662
+ " </tr>\n",
663
+ " <tr>\n",
664
+ " <th>26295</th>\n",
665
+ " <td>1.2</td>\n",
666
+ " <td>0.0</td>\n",
667
+ " <td>5.9</td>\n",
668
+ " <td>270</td>\n",
669
+ " <td>35.0</td>\n",
670
+ " <td>2.3</td>\n",
671
+ " <td>-12.6</td>\n",
672
+ " <td>1013.2</td>\n",
673
+ " <td>1022.0</td>\n",
674
+ " <td>1.71</td>\n",
675
+ " <td>...</td>\n",
676
+ " <td>2020</td>\n",
677
+ " <td>12</td>\n",
678
+ " <td>15</td>\n",
679
+ " <td>5.6</td>\n",
680
+ " <td>-0.707107</td>\n",
681
+ " <td>-7.071068e-01</td>\n",
682
+ " <td>-2.449294e-16</td>\n",
683
+ " <td>1.0</td>\n",
684
+ " <td>5000.0</td>\n",
685
+ " <td>2</td>\n",
686
+ " </tr>\n",
687
+ " <tr>\n",
688
+ " <th>26296</th>\n",
689
+ " <td>1.6</td>\n",
690
+ " <td>0.0</td>\n",
691
+ " <td>3.6</td>\n",
692
+ " <td>290</td>\n",
693
+ " <td>34.0</td>\n",
694
+ " <td>2.3</td>\n",
695
+ " <td>-12.6</td>\n",
696
+ " <td>1012.8</td>\n",
697
+ " <td>1021.6</td>\n",
698
+ " <td>1.14</td>\n",
699
+ " <td>...</td>\n",
700
+ " <td>2020</td>\n",
701
+ " <td>12</td>\n",
702
+ " <td>16</td>\n",
703
+ " <td>1.4</td>\n",
704
+ " <td>-0.866025</td>\n",
705
+ " <td>-5.000000e-01</td>\n",
706
+ " <td>-2.449294e-16</td>\n",
707
+ " <td>1.0</td>\n",
708
+ " <td>5000.0</td>\n",
709
+ " <td>2</td>\n",
710
+ " </tr>\n",
711
+ " <tr>\n",
712
+ " <th>26297</th>\n",
713
+ " <td>1.2</td>\n",
714
+ " <td>0.0</td>\n",
715
+ " <td>3.8</td>\n",
716
+ " <td>250</td>\n",
717
+ " <td>38.0</td>\n",
718
+ " <td>2.5</td>\n",
719
+ " <td>-11.5</td>\n",
720
+ " <td>1012.8</td>\n",
721
+ " <td>1021.6</td>\n",
722
+ " <td>0.48</td>\n",
723
+ " <td>...</td>\n",
724
+ " <td>2020</td>\n",
725
+ " <td>12</td>\n",
726
+ " <td>17</td>\n",
727
+ " <td>-0.4</td>\n",
728
+ " <td>-0.965926</td>\n",
729
+ " <td>-2.588190e-01</td>\n",
730
+ " <td>-2.449294e-16</td>\n",
731
+ " <td>1.0</td>\n",
732
+ " <td>5000.0</td>\n",
733
+ " <td>2</td>\n",
734
+ " </tr>\n",
735
+ " <tr>\n",
736
+ " <th>26298</th>\n",
737
+ " <td>0.9</td>\n",
738
+ " <td>0.0</td>\n",
739
+ " <td>3.8</td>\n",
740
+ " <td>270</td>\n",
741
+ " <td>40.0</td>\n",
742
+ " <td>2.6</td>\n",
743
+ " <td>-11.2</td>\n",
744
+ " <td>1013.1</td>\n",
745
+ " <td>1021.9</td>\n",
746
+ " <td>0.02</td>\n",
747
+ " <td>...</td>\n",
748
+ " <td>2020</td>\n",
749
+ " <td>12</td>\n",
750
+ " <td>18</td>\n",
751
+ " <td>-0.8</td>\n",
752
+ " <td>-1.000000</td>\n",
753
+ " <td>-1.836970e-16</td>\n",
754
+ " <td>-2.449294e-16</td>\n",
755
+ " <td>1.0</td>\n",
756
+ " <td>5000.0</td>\n",
757
+ " <td>2</td>\n",
758
+ " </tr>\n",
759
+ " <tr>\n",
760
+ " <th>26299</th>\n",
761
+ " <td>0.6</td>\n",
762
+ " <td>0.0</td>\n",
763
+ " <td>6.2</td>\n",
764
+ " <td>270</td>\n",
765
+ " <td>41.0</td>\n",
766
+ " <td>2.6</td>\n",
767
+ " <td>-11.1</td>\n",
768
+ " <td>1014.0</td>\n",
769
+ " <td>1022.8</td>\n",
770
+ " <td>0.00</td>\n",
771
+ " <td>...</td>\n",
772
+ " <td>2020</td>\n",
773
+ " <td>12</td>\n",
774
+ " <td>19</td>\n",
775
+ " <td>-1.1</td>\n",
776
+ " <td>-0.965926</td>\n",
777
+ " <td>2.588190e-01</td>\n",
778
+ " <td>-2.449294e-16</td>\n",
779
+ " <td>1.0</td>\n",
780
+ " <td>5000.0</td>\n",
781
+ " <td>2</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>26300</th>\n",
785
+ " <td>0.1</td>\n",
786
+ " <td>0.0</td>\n",
787
+ " <td>6.0</td>\n",
788
+ " <td>270</td>\n",
789
+ " <td>44.0</td>\n",
790
+ " <td>2.7</td>\n",
791
+ " <td>-10.7</td>\n",
792
+ " <td>1014.8</td>\n",
793
+ " <td>1023.6</td>\n",
794
+ " <td>0.00</td>\n",
795
+ " <td>...</td>\n",
796
+ " <td>2020</td>\n",
797
+ " <td>12</td>\n",
798
+ " <td>20</td>\n",
799
+ " <td>-0.9</td>\n",
800
+ " <td>-0.866025</td>\n",
801
+ " <td>5.000000e-01</td>\n",
802
+ " <td>-2.449294e-16</td>\n",
803
+ " <td>1.0</td>\n",
804
+ " <td>5000.0</td>\n",
805
+ " <td>2</td>\n",
806
+ " </tr>\n",
807
+ " <tr>\n",
808
+ " <th>26301</th>\n",
809
+ " <td>-0.2</td>\n",
810
+ " <td>0.0</td>\n",
811
+ " <td>5.0</td>\n",
812
+ " <td>290</td>\n",
813
+ " <td>48.0</td>\n",
814
+ " <td>2.9</td>\n",
815
+ " <td>-9.9</td>\n",
816
+ " <td>1014.6</td>\n",
817
+ " <td>1023.4</td>\n",
818
+ " <td>0.00</td>\n",
819
+ " <td>...</td>\n",
820
+ " <td>2020</td>\n",
821
+ " <td>12</td>\n",
822
+ " <td>21</td>\n",
823
+ " <td>-0.8</td>\n",
824
+ " <td>-0.707107</td>\n",
825
+ " <td>7.071068e-01</td>\n",
826
+ " <td>-2.449294e-16</td>\n",
827
+ " <td>1.0</td>\n",
828
+ " <td>5000.0</td>\n",
829
+ " <td>2</td>\n",
830
+ " </tr>\n",
831
+ " <tr>\n",
832
+ " <th>26302</th>\n",
833
+ " <td>-0.7</td>\n",
834
+ " <td>0.0</td>\n",
835
+ " <td>2.7</td>\n",
836
+ " <td>270</td>\n",
837
+ " <td>51.0</td>\n",
838
+ " <td>3.0</td>\n",
839
+ " <td>-9.6</td>\n",
840
+ " <td>1014.8</td>\n",
841
+ " <td>1023.6</td>\n",
842
+ " <td>0.00</td>\n",
843
+ " <td>...</td>\n",
844
+ " <td>2020</td>\n",
845
+ " <td>12</td>\n",
846
+ " <td>22</td>\n",
847
+ " <td>-0.6</td>\n",
848
+ " <td>-0.500000</td>\n",
849
+ " <td>8.660254e-01</td>\n",
850
+ " <td>-2.449294e-16</td>\n",
851
+ " <td>1.0</td>\n",
852
+ " <td>5000.0</td>\n",
853
+ " <td>2</td>\n",
854
+ " </tr>\n",
855
+ " <tr>\n",
856
+ " <th>26303</th>\n",
857
+ " <td>-0.7</td>\n",
858
+ " <td>0.0</td>\n",
859
+ " <td>3.8</td>\n",
860
+ " <td>250</td>\n",
861
+ " <td>55.0</td>\n",
862
+ " <td>3.2</td>\n",
863
+ " <td>-8.6</td>\n",
864
+ " <td>1015.1</td>\n",
865
+ " <td>1024.0</td>\n",
866
+ " <td>0.00</td>\n",
867
+ " <td>...</td>\n",
868
+ " <td>2020</td>\n",
869
+ " <td>12</td>\n",
870
+ " <td>23</td>\n",
871
+ " <td>-0.6</td>\n",
872
+ " <td>-0.258819</td>\n",
873
+ " <td>9.659258e-01</td>\n",
874
+ " <td>-2.449294e-16</td>\n",
875
+ " <td>1.0</td>\n",
876
+ " <td>5000.0</td>\n",
877
+ " <td>2</td>\n",
878
+ " </tr>\n",
879
+ " </tbody>\n",
880
+ "</table>\n",
881
+ "<p>10 rows × 30 columns</p>\n",
882
+ "</div>"
883
+ ],
884
+ "text/plain": [
885
+ " temp_C precip_mm wind_speed wind_dir hm vap_pressure dewpoint_C \\\n",
886
+ "26294 0.1 0.0 6.3 270 37.0 2.3 -12.9 \n",
887
+ "26295 1.2 0.0 5.9 270 35.0 2.3 -12.6 \n",
888
+ "26296 1.6 0.0 3.6 290 34.0 2.3 -12.6 \n",
889
+ "26297 1.2 0.0 3.8 250 38.0 2.5 -11.5 \n",
890
+ "26298 0.9 0.0 3.8 270 40.0 2.6 -11.2 \n",
891
+ "26299 0.6 0.0 6.2 270 41.0 2.6 -11.1 \n",
892
+ "26300 0.1 0.0 6.0 270 44.0 2.7 -10.7 \n",
893
+ "26301 -0.2 0.0 5.0 290 48.0 2.9 -9.9 \n",
894
+ "26302 -0.7 0.0 2.7 270 51.0 3.0 -9.6 \n",
895
+ "26303 -0.7 0.0 3.8 250 55.0 3.2 -8.6 \n",
896
+ "\n",
897
+ " loc_pressure sea_pressure solarRad ... year month hour \\\n",
898
+ "26294 1013.3 1022.1 2.07 ... 2020 12 14 \n",
899
+ "26295 1013.2 1022.0 1.71 ... 2020 12 15 \n",
900
+ "26296 1012.8 1021.6 1.14 ... 2020 12 16 \n",
901
+ "26297 1012.8 1021.6 0.48 ... 2020 12 17 \n",
902
+ "26298 1013.1 1021.9 0.02 ... 2020 12 18 \n",
903
+ "26299 1014.0 1022.8 0.00 ... 2020 12 19 \n",
904
+ "26300 1014.8 1023.6 0.00 ... 2020 12 20 \n",
905
+ "26301 1014.6 1023.4 0.00 ... 2020 12 21 \n",
906
+ "26302 1014.8 1023.6 0.00 ... 2020 12 22 \n",
907
+ "26303 1015.1 1024.0 0.00 ... 2020 12 23 \n",
908
+ "\n",
909
+ " ground_temp - temp_C hour_sin hour_cos month_sin month_cos \\\n",
910
+ "26294 5.8 -0.500000 -8.660254e-01 -2.449294e-16 1.0 \n",
911
+ "26295 5.6 -0.707107 -7.071068e-01 -2.449294e-16 1.0 \n",
912
+ "26296 1.4 -0.866025 -5.000000e-01 -2.449294e-16 1.0 \n",
913
+ "26297 -0.4 -0.965926 -2.588190e-01 -2.449294e-16 1.0 \n",
914
+ "26298 -0.8 -1.000000 -1.836970e-16 -2.449294e-16 1.0 \n",
915
+ "26299 -1.1 -0.965926 2.588190e-01 -2.449294e-16 1.0 \n",
916
+ "26300 -0.9 -0.866025 5.000000e-01 -2.449294e-16 1.0 \n",
917
+ "26301 -0.8 -0.707107 7.071068e-01 -2.449294e-16 1.0 \n",
918
+ "26302 -0.6 -0.500000 8.660254e-01 -2.449294e-16 1.0 \n",
919
+ "26303 -0.6 -0.258819 9.659258e-01 -2.449294e-16 1.0 \n",
920
+ "\n",
921
+ " visi multi_class \n",
922
+ "26294 5000.0 2 \n",
923
+ "26295 5000.0 2 \n",
924
+ "26296 5000.0 2 \n",
925
+ "26297 5000.0 2 \n",
926
+ "26298 5000.0 2 \n",
927
+ "26299 5000.0 2 \n",
928
+ "26300 5000.0 2 \n",
929
+ "26301 5000.0 2 \n",
930
+ "26302 5000.0 2 \n",
931
+ "26303 5000.0 2 \n",
932
+ "\n",
933
+ "[10 rows x 30 columns]"
934
+ ]
935
+ },
936
+ "execution_count": 15,
937
+ "metadata": {},
938
+ "output_type": "execute_result"
939
+ }
940
+ ],
941
+ "source": [
942
+ "df_busan_train.tail(10)"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "code",
947
+ "execution_count": 16,
948
+ "metadata": {},
949
+ "outputs": [
950
+ {
951
+ "name": "stdout",
952
+ "output_type": "stream",
953
+ "text": [
954
+ "<class 'pandas.core.frame.DataFrame'>\n",
955
+ "Index: 26304 entries, 0 to 26303\n",
956
+ "Data columns (total 30 columns):\n",
957
+ " # Column Non-Null Count Dtype \n",
958
+ "--- ------ -------------- ----- \n",
959
+ " 0 temp_C 26304 non-null float64 \n",
960
+ " 1 precip_mm 26304 non-null float64 \n",
961
+ " 2 wind_speed 26304 non-null float64 \n",
962
+ " 3 wind_dir 26304 non-null category\n",
963
+ " 4 hm 26304 non-null float64 \n",
964
+ " 5 vap_pressure 26304 non-null float64 \n",
965
+ " 6 dewpoint_C 26304 non-null float64 \n",
966
+ " 7 loc_pressure 26304 non-null float64 \n",
967
+ " 8 sea_pressure 26304 non-null float64 \n",
968
+ " 9 solarRad 26304 non-null float64 \n",
969
+ " 10 snow_cm 26304 non-null float64 \n",
970
+ " 11 cloudcover 26304 non-null category\n",
971
+ " 12 lm_cloudcover 26304 non-null category\n",
972
+ " 13 low_cloudbase 26304 non-null float64 \n",
973
+ " 14 groundtemp 26304 non-null float64 \n",
974
+ " 15 O3 26304 non-null float64 \n",
975
+ " 16 NO2 26304 non-null float64 \n",
976
+ " 17 PM10 26304 non-null float64 \n",
977
+ " 18 PM25 26304 non-null float64 \n",
978
+ " 19 binary_class 26304 non-null int64 \n",
979
+ " 20 year 26304 non-null int64 \n",
980
+ " 21 month 26304 non-null int64 \n",
981
+ " 22 hour 26304 non-null int64 \n",
982
+ " 23 ground_temp - temp_C 26304 non-null float64 \n",
983
+ " 24 hour_sin 26304 non-null float64 \n",
984
+ " 25 hour_cos 26304 non-null float64 \n",
985
+ " 26 month_sin 26304 non-null float64 \n",
986
+ " 27 month_cos 26304 non-null float64 \n",
987
+ " 28 visi 26304 non-null float64 \n",
988
+ " 29 multi_class 26304 non-null int64 \n",
989
+ "dtypes: category(3), float64(22), int64(5)\n",
990
+ "memory usage: 5.7 MB\n"
991
+ ]
992
+ }
993
+ ],
994
+ "source": [
995
+ "df_busan_train.info()"
996
+ ]
997
+ },
998
+ {
999
+ "cell_type": "code",
1000
+ "execution_count": 17,
1001
+ "metadata": {},
1002
+ "outputs": [],
1003
+ "source": [
1004
+ "df_seoul_train.to_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n",
1005
+ "df_seoul_test.to_csv(\"../../data/data_for_modeling/seoul_test.csv\")\n",
1006
+ "\n",
1007
+ "df_busan_train.to_csv(\"../../data/data_for_modeling/busan_train.csv\")\n",
1008
+ "df_busan_test.to_csv(\"../../data/data_for_modeling/busan_test.csv\")\n",
1009
+ "\n",
1010
+ "df_incheon_train.to_csv(\"../../data/data_for_modeling/incheon_train.csv\")\n",
1011
+ "df_incheon_test.to_csv(\"../../data/data_for_modeling/incheon_test.csv\")\n",
1012
+ "\n",
1013
+ "df_daegu_train.to_csv(\"../../data/data_for_modeling/daegu_train.csv\")\n",
1014
+ "df_daegu_test.to_csv(\"../../data/data_for_modeling/daegu_test.csv\")\n",
1015
+ "\n",
1016
+ "df_daejeon_train.to_csv(\"../../data/data_for_modeling/daejeon_train.csv\")\n",
1017
+ "df_daejeon_test.to_csv(\"../../data/data_for_modeling/daejeon_test.csv\")\n",
1018
+ "\n",
1019
+ "df_gwangju_train.to_csv(\"../../data/data_for_modeling/gwangju_train.csv\")\n",
1020
+ "df_gwangju_test.to_csv(\"../../data/data_for_modeling/gwangju_test.csv\")\n",
1021
+ "\n",
1022
+ "df_seoul_train = pd.read_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n",
1023
+ "df_seoul_test = pd.read_csv(\"../../data/data_for_modeling/seoul_test.csv\")\n"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "cell_type": "code",
1028
+ "execution_count": 18,
1029
+ "metadata": {},
1030
+ "outputs": [
1031
+ {
1032
+ "name": "stdout",
1033
+ "output_type": "stream",
1034
+ "text": [
1035
+ "Counter({2: 8266, 1: 481, 0: 13})\n",
1036
+ "Counter({2: 23686, 1: 2579, 0: 39})\n",
1037
+ "Counter({2: 8455, 1: 281, 0: 24})\n",
1038
+ "Counter({2: 24694, 1: 1516, 0: 94})\n",
1039
+ "Counter({2: 7373, 1: 1205, 0: 182})\n",
1040
+ "Counter({2: 21893, 1: 3892, 0: 519})\n",
1041
+ "Counter({2: 8631, 1: 128, 0: 1})\n",
1042
+ "Counter({2: 25149, 1: 1107, 0: 48})\n",
1043
+ "Counter({2: 8089, 1: 618, 0: 53})\n",
1044
+ "Counter({2: 23471, 1: 2660, 0: 173})\n",
1045
+ "Counter({2: 8087, 1: 643, 0: 30})\n",
1046
+ "Counter({2: 23798, 1: 2411, 0: 95})\n"
1047
+ ]
1048
+ }
1049
+ ],
1050
+ "source": [
1051
+ "print(Counter(df_seoul_test['multi_class']))\n",
1052
+ "print(Counter(df_seoul_train['multi_class']))\n",
1053
+ "\n",
1054
+ "print(Counter(df_busan_test['multi_class']))\n",
1055
+ "print(Counter(df_busan_train['multi_class']))\n",
1056
+ "\n",
1057
+ "print(Counter(df_incheon_test['multi_class']))\n",
1058
+ "print(Counter(df_incheon_train['multi_class']))\n",
1059
+ "\n",
1060
+ "print(Counter(df_daegu_test['multi_class']))\n",
1061
+ "print(Counter(df_daegu_train['multi_class']))\n",
1062
+ "\n",
1063
+ "print(Counter(df_daejeon_test['multi_class']))\n",
1064
+ "print(Counter(df_daejeon_train['multi_class']))\n",
1065
+ "\n",
1066
+ "print(Counter(df_gwangju_test['multi_class']))\n",
1067
+ "print(Counter(df_gwangju_train['multi_class']))"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "cell_type": "code",
1072
+ "execution_count": null,
1073
+ "metadata": {},
1074
+ "outputs": [],
1075
+ "source": []
1076
+ }
1077
+ ],
1078
+ "metadata": {
1079
+ "kernelspec": {
1080
+ "display_name": "Python 3",
1081
+ "language": "python",
1082
+ "name": "python3"
1083
+ },
1084
+ "language_info": {
1085
+ "codemirror_mode": {
1086
+ "name": "ipython",
1087
+ "version": 3
1088
+ },
1089
+ "file_extension": ".py",
1090
+ "mimetype": "text/x-python",
1091
+ "name": "python",
1092
+ "nbconvert_exporter": "python",
1093
+ "pygments_lexer": "ipython3",
1094
+ "version": "3.8.10"
1095
+ }
1096
+ },
1097
+ "nbformat": 4,
1098
+ "nbformat_minor": 2
1099
+ }
Analysis_code/2.make_oversample_data/gpu0.log ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/2.make_oversample_data/gpu1.log ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_1.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from pathlib import Path
import optuna
from ctgan import CTGAN
import torch
import warnings

# ==================== Constants ====================
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
TRAIN_YEARS = [2018, 2019]
TARGET_SAMPLES_CLASS_0 = 10000        # synthetic samples to generate for class 0
TARGET_SAMPLES_CLASS_1_BASE = 10000   # desired total (real + synthetic) for class 1
RANDOM_STATE = 42

# Optuna optimization budget per class
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search spaces
# (class 0 is the rarest, so it gets smaller networks and batch sizes)
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before CTGAN training; rebuilt by add_derived_features()
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']

# ==================== Utility functions ====================

def setup_environment() -> torch.device:
    """Select the compute device and silence noisy Optuna distribution warnings.

    Returns:
        The torch device that will be used ("cuda" if available, else "cpu").
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device


def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load one region's training CSV and split it into features and target.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years kept for training; all other rows are discarded.

    Returns:
        (data, X, y): the year-filtered frame, the feature frame (target
        columns and COLUMNS_TO_DROP removed), and the 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    # Cast cloud-cover columns to int so CTGAN treats them as discrete.
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and reconstructed after augmentation.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y


def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of all non-float64 columns (treated as categorical by CTGAN)."""
    return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']


def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    The objective trains a CTGAN on the rows of ``data`` belonging to
    ``class_label``, samples twice the class size, and scores the fit by how
    closely the generated 'visi' mean/std match the real distribution
    (returned as a negative squared error, to be maximized).

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Search space for the hyperparameters.

    Returns:
        An Optuna objective function.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample one hyperparameter configuration from the search space.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Oversample (2x) so the distribution comparison is less noisy.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Score: squared error between real and generated 'visi' mean and std.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        # Negated because the study maximizes; smaller mismatch is better.
        return -mse

    return objective


def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best parameters, and sample.

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Refit a fresh model with the best configuration found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan


def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns that were dropped before CTGAN training.

    Adds 'binary_class' (1 for fog classes 0/1, 0 for class 2), cyclic
    hour/month encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp',
            and 'temp_C'.

    Returns:
        A copy of ``df`` with the derived columns added.
    """
    df = df.copy()
    df['binary_class'] = df['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
    return df


def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated minority samples.

    Tunes and trains one CTGAN per minority class (0 and 1), saves both
    models, filters the synthetic rows to each class's valid 'visi' range,
    writes the synthetic-only frame to an 'augmented_only' sibling folder,
    and writes the merged (real + synthetic) frame to ``output_path``.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged data.
        model_save_dir: Directory where the fitted CTGAN models are saved.
    """
    # Region name, e.g. '../../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    train_data = X.copy()
    train_data['multi_class'] = y

    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target; clamp at 0 so a region that
    # already has >= TARGET_SAMPLES_CLASS_1_BASE rows never requests a
    # negative sample count from CTGAN.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    model_save_dir.mkdir(parents=True, exist_ok=True)

    model_path_0 = model_save_dir / f'ctgan_only_10000_1_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    model_path_1 = model_save_dir / f'ctgan_only_10000_1_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only rows whose visibility falls in each class's definition range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Synthetic-only output (CTGAN samples, no real rows).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge real training rows with the filtered synthetic rows.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Replace the (re-derived) class-2 rows with the untouched originals.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")


# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in REGIONS]
    output_paths = [f'../../../data/data_oversampled/ctgan10000/ctgan10000_1_{region}.csv' for region in REGIONS]
    model_save_dir = Path('../../save_model/oversampling_models')

    for file_path, output_path in zip(file_paths, output_paths):
        process_region(file_path, output_path, model_save_dir)
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_2.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from pathlib import Path
import optuna
from ctgan import CTGAN
import torch
import warnings

# ==================== Constants ====================
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
TRAIN_YEARS = [2018, 2020]
TARGET_SAMPLES_CLASS_0 = 10000        # synthetic samples to generate for class 0
TARGET_SAMPLES_CLASS_1_BASE = 10000   # desired total (real + synthetic) for class 1
RANDOM_STATE = 42

# Optuna optimization budget per class
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search spaces
# (class 0 is the rarest, so it gets smaller networks and batch sizes)
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before CTGAN training; rebuilt by add_derived_features()
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']

# ==================== Utility functions ====================

def setup_environment() -> torch.device:
    """Select the compute device and silence noisy Optuna distribution warnings.

    Returns:
        The torch device that will be used ("cuda" if available, else "cpu").
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device


def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load one region's training CSV and split it into features and target.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years kept for training; all other rows are discarded.

    Returns:
        (data, X, y): the year-filtered frame, the feature frame (target
        columns and COLUMNS_TO_DROP removed), and the 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    # Cast cloud-cover columns to int so CTGAN treats them as discrete.
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and reconstructed after augmentation.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y


def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of all non-float64 columns (treated as categorical by CTGAN)."""
    return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']


def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    The objective trains a CTGAN on the rows of ``data`` belonging to
    ``class_label``, samples twice the class size, and scores the fit by how
    closely the generated 'visi' mean/std match the real distribution
    (returned as a negative squared error, to be maximized).

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Search space for the hyperparameters.

    Returns:
        An Optuna objective function.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample one hyperparameter configuration from the search space.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Oversample (2x) so the distribution comparison is less noisy.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Score: squared error between real and generated 'visi' mean and std.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        # Negated because the study maximizes; smaller mismatch is better.
        return -mse

    return objective


def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best parameters, and sample.

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Refit a fresh model with the best configuration found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan


def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns that were dropped before CTGAN training.

    Adds 'binary_class' (1 for fog classes 0/1, 0 for class 2), cyclic
    hour/month encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp',
            and 'temp_C'.

    Returns:
        A copy of ``df`` with the derived columns added.
    """
    df = df.copy()
    df['binary_class'] = df['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
    return df


def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated minority samples.

    Tunes and trains one CTGAN per minority class (0 and 1), saves both
    models, filters the synthetic rows to each class's valid 'visi' range,
    writes the synthetic-only frame to an 'augmented_only' sibling folder,
    and writes the merged (real + synthetic) frame to ``output_path``.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged data.
        model_save_dir: Directory where the fitted CTGAN models are saved.
    """
    # Region name, e.g. '../../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    train_data = X.copy()
    train_data['multi_class'] = y

    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target; clamp at 0 so a region that
    # already has >= TARGET_SAMPLES_CLASS_1_BASE rows never requests a
    # negative sample count from CTGAN.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    model_save_dir.mkdir(parents=True, exist_ok=True)

    model_path_0 = model_save_dir / f'ctgan_only_10000_2_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    model_path_1 = model_save_dir / f'ctgan_only_10000_2_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only rows whose visibility falls in each class's definition range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Synthetic-only output (CTGAN samples, no real rows).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge real training rows with the filtered synthetic rows.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Replace the (re-derived) class-2 rows with the untouched originals.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")


# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in REGIONS]
    output_paths = [f'../../../data/data_oversampled/ctgan10000/ctgan10000_2_{region}.csv' for region in REGIONS]
    model_save_dir = Path('../../save_model/oversampling_models')

    for file_path, output_path in zip(file_paths, output_paths):
        process_region(file_path, output_path, model_save_dir)
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_10000_3.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
11
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
12
+ TRAIN_YEARS = [2019, 2020]
13
+ TARGET_SAMPLES_CLASS_0 = 10000
14
+ TARGET_SAMPLES_CLASS_1_BASE = 10000
15
+ RANDOM_STATE = 42
16
+
17
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
18
+ CLASS_0_TRIALS = 50
19
+ CLASS_1_TRIALS = 30
20
+
21
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
22
+ CLASS_0_HP_RANGES = {
23
+ 'embedding_dim': (64, 128),
24
+ 'generator_dim': [(64, 64), (128, 128)],
25
+ 'discriminator_dim': [(64, 64), (128, 128)],
26
+ 'pac': [4, 8],
27
+ 'batch_size': [64, 128, 256],
28
+ 'discriminator_steps': (1, 3)
29
+ }
30
+
31
+ CLASS_1_HP_RANGES = {
32
+ 'embedding_dim': (128, 512),
33
+ 'generator_dim': [(128, 128), (256, 256)],
34
+ 'discriminator_dim': [(128, 128), (256, 256)],
35
+ 'pac': [4, 8],
36
+ 'batch_size': [256, 512, 1024],
37
+ 'discriminator_steps': (1, 5)
38
+ }
39
+
40
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
41
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
+ def setup_environment():
46
+ """ํ™˜๊ฒฝ ์„ค์ • (GPU, ๊ฒฝ๊ณ  ๋ฌด์‹œ)"""
47
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
+ print(f"Using device: {device}")
49
+ warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
50
+ return device
51
+
52
+
53
+ def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
54
+ """
55
+ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
56
+
57
+ Args:
58
+ file_path: ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
59
+ train_years: ํ•™์Šต์— ์‚ฌ์šฉํ•  ์—ฐ๋„ ๋ฆฌ์ŠคํŠธ
60
+
61
+ Returns:
62
+ (data, X, y): ์›๋ณธ ๋ฐ์ดํ„ฐ, ํŠน์ง• ๋ฐ์ดํ„ฐ, ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
63
+ """
64
+ data = pd.read_csv(file_path, index_col=0)
65
+ data = data.loc[data['year'].isin(train_years), :]
66
+ data['cloudcover'] = data['cloudcover'].astype('int')
67
+ data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
68
+
69
+ X = data.drop(columns=['multi_class', 'binary_class'])
70
+ y = data['multi_class']
71
+
72
+ # ๋ถˆํ•„์š”ํ•œ ์—ด ์ œ๊ฑฐ
73
+ X.drop(columns=COLUMNS_TO_DROP, inplace=True)
74
+
75
+ return data, X, y
76
+
77
+
78
+ def get_categorical_feature_names(df: pd.DataFrame) -> list:
79
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ด๋ฆ„ ๋ฐ˜ํ™˜"""
80
+ return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']
81
+
82
+
83
+ def create_ctgan_objective(data: pd.DataFrame, class_label: int,
84
+ categorical_features: list,
85
+ hp_ranges: dict) -> callable:
86
+ """
87
+ Optuna ์ตœ์ ํ™”๋ฅผ ์œ„ํ•œ ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
88
+
89
+ Args:
90
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
91
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
92
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
93
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
94
+
95
+ Returns:
96
+ Optuna ๋ชฉ์  ํ•จ์ˆ˜
97
+ """
98
+ class_data = data[data['multi_class'] == class_label]
99
+
100
+ def objective(trial):
101
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์„ค์ •
102
+ embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
103
+ generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
104
+ discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
105
+ pac = trial.suggest_categorical("pac", hp_ranges['pac'])
106
+ batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
107
+ discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])
108
+
109
+ # CTGAN ๋ชจ๋ธ ์ƒ์„ฑ
110
+ ctgan = CTGAN(
111
+ embedding_dim=embedding_dim,
112
+ generator_dim=generator_dim,
113
+ discriminator_dim=discriminator_dim,
114
+ batch_size=batch_size,
115
+ discriminator_steps=discriminator_steps,
116
+ pac=pac
117
+ )
118
+ ctgan.set_random_state(RANDOM_STATE)
119
+
120
+ # ๋ชจ๋ธ ํ•™์Šต
121
+ ctgan.fit(class_data, discrete_columns=categorical_features)
122
+
123
+ # ์ƒ˜ํ”Œ ์ƒ์„ฑ
124
+ generated_data = ctgan.sample(len(class_data) * 2)
125
+
126
+ # ํ‰๊ฐ€: ์ƒ˜ํ”Œ์˜ ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„ํฌ ๋น„๊ต
127
+ real_visi = class_data['visi']
128
+ generated_visi = generated_data['visi']
129
+
130
+ # ๋ถ„ํฌ ๊ฐ„ ์ฐจ์ด(MSE) ๊ณ„์‚ฐ
131
+ mse = ((real_visi.mean() - generated_visi.mean())**2 +
132
+ (real_visi.std() - generated_visi.std())**2)
133
+ return -mse
134
+
135
+ return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, then train a final model and draw samples.

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw.

    Returns:
        Tuple of (generated samples DataFrame, fitted CTGAN model).
    """
    # Search for the hyperparameters that best reproduce the 'visi' distribution.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        create_ctgan_objective(data, class_label, categorical_features, hp_ranges),
        n_trials=n_trials,
    )

    # Rebuild a CTGAN from the best trial and fit it on this class only.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"],
    )
    model.set_random_state(RANDOM_STATE)

    subset = data[data['multi_class'] == class_label]
    model.fit(subset, discrete_columns=categorical_features)
    return model.sample(target_samples), model
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns that were dropped before CTGAN training.

    Adds the binary target, cyclic hour/month encodings, and the
    ground-air temperature difference. The input frame is not modified.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp'
            and 'temp_C'.

    Returns:
        A copy of ``df`` with the derived columns appended.
    """
    out = df.copy()
    # Class 2 (no fog) maps to 0; fog classes (0/1) map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda c: 1 if c != 2 else 0)
    # Cyclic encodings keep hour 23 adjacent to hour 0 (and Dec to Jan).
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated samples only.

    Trains one CTGAN per minority class (0 and 1), filters the generated
    samples to each class's expected visibility band, saves the generated-only
    data, the merged (original + generated) data, and the trained models.

    Args:
        file_path: Input train CSV path (``<region>_train.csv``).
        output_path: Output CSV path for the merged data.
        model_save_dir: Directory where trained CTGAN models are saved.
    """
    # Region name is encoded in the file name, e.g. 'seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Re-attach the target so CTGAN learns it jointly with the features.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Non-float columns are treated as discrete by CTGAN.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target. Clamp at 0 so a class that
    # already exceeds the target cannot pass a negative count to sample().
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the trained models.
    model_save_dir.mkdir(parents=True, exist_ok=True)
    model_path_0 = model_save_dir / f'ctgan_only_10000_3_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")
    model_path_1 = model_save_dir / f'ctgan_only_10000_3_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility falls in each class's valid band
    # (class 0: [0, 100); class 1: [100, 500)).
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the generated-only data under the sibling 'augmented_only' folder.
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge original train data with the filtered CTGAN samples and restore
    # the derived columns that were dropped before training.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Class 2 rows come straight from the original data (never augmented).
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
307
+
308
if __name__ == "__main__":
    setup_environment()

    # One (input, output) pair per region; models all go to one directory.
    save_dir = Path('../../save_model/oversampling_models')
    for region in REGIONS:
        src = f'../../../data/data_for_modeling/{region}_train.csv'
        dst = f'../../../data/data_oversampled/ctgan10000/ctgan10000_3_{region}.csv'
        process_region(src, dst, save_dir)
317
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_1.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ import optuna
5
+ from ctgan import CTGAN
6
+ import torch
7
+ import warnings
8
+
9
# ==================== Constants ====================
# Regions whose '<region>_train.csv' files are augmented.
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years retained from each train file.
TRAIN_YEARS = [2018, 2019]
# Number of synthetic class-0 rows to generate per region.
TARGET_SAMPLES_CLASS_0 = 20000
# Class 1 is topped up to this total (generated = base - existing count).
TARGET_SAMPLES_CLASS_1_BASE = 20000
# Seed for CTGAN sampling reproducibility.
RANDOM_STATE = 42

# Optuna optimization settings (number of trials per class)
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search ranges
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before CTGAN training (re-added afterwards).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
41
+
42
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
43
+
44
def setup_environment():
    """Report the compute device and silence noisy Optuna distribution warnings."""
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Using device: {device}")
    # Tuple-valued categorical choices trigger UserWarnings in Optuna.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
50
+
51
+
52
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a region's train CSV and split it into features and target.

    Args:
        file_path: Path to the train CSV (first column is the index).
        train_years: Years to keep.

    Returns:
        (data, X, y): full filtered frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]

    # Cloud-cover columns hold categorical codes; cast them to int.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Features exclude both targets and the derived columns (re-added later).
    X = data.drop(columns=['multi_class', 'binary_class'])
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
75
+
76
+
77
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return the names of all non-float64 (i.e. discrete) columns."""
    return [name for name in df.columns if df.dtypes[name] != 'float64']
80
+
81
+
82
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN fidelity on 'visi'.

    The objective trains a CTGAN on the rows of the requested class, samples
    twice as many synthetic rows, and scores how closely the synthetic 'visi'
    mean and standard deviation match the real ones (negated MSE, so Optuna
    maximizes).

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.

    Returns:
        An Optuna objective function (trial -> float).
    """
    subset = data[data['multi_class'] == class_label]

    def objective(trial):
        # Draw one hyperparameter configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(subset, discrete_columns=categorical_features)

        # Score: distance between real and synthetic 'visi' mean/std.
        synthetic = model.sample(len(subset) * 2)
        real_visi = subset['visi']
        fake_visi = synthetic['visi']
        mse = ((real_visi.mean() - fake_visi.mean()) ** 2 +
               (real_visi.std() - fake_visi.std()) ** 2)
        return -mse  # negate so "maximize" minimizes the distribution gap

    return objective
135
+
136
+
137
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, then train a final model and draw samples.

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw.

    Returns:
        Tuple of (generated samples DataFrame, fitted CTGAN model).
    """
    # Search for the hyperparameters that best reproduce the 'visi' distribution.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        create_ctgan_objective(data, class_label, categorical_features, hp_ranges),
        n_trials=n_trials,
    )

    # Rebuild a CTGAN from the best trial and fit it on this class only.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"],
    )
    model.set_random_state(RANDOM_STATE)

    subset = data[data['multi_class'] == class_label]
    model.fit(subset, discrete_columns=categorical_features)
    return model.sample(target_samples), model
180
+
181
+
182
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns that were dropped before CTGAN training.

    Adds the binary target, cyclic hour/month encodings, and the
    ground-air temperature difference. The input frame is not modified.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp'
            and 'temp_C'.

    Returns:
        A copy of ``df`` with the derived columns appended.
    """
    out = df.copy()
    # Class 2 (no fog) maps to 0; fog classes (0/1) map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda c: 1 if c != 2 else 0)
    # Cyclic encodings keep hour 23 adjacent to hour 0 (and Dec to Jan).
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
200
+
201
+
202
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated samples only.

    Trains one CTGAN per minority class (0 and 1), filters the generated
    samples to each class's expected visibility band, saves the generated-only
    data, the merged (original + generated) data, and the trained models.

    Args:
        file_path: Input train CSV path (``<region>_train.csv``).
        output_path: Output CSV path for the merged data.
        model_save_dir: Directory where trained CTGAN models are saved.
    """
    # Region name is encoded in the file name, e.g. 'seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Re-attach the target so CTGAN learns it jointly with the features.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Non-float columns are treated as discrete by CTGAN.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target. Clamp at 0 so a class that
    # already exceeds the target cannot pass a negative count to sample().
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the trained models.
    model_save_dir.mkdir(parents=True, exist_ok=True)
    model_path_0 = model_save_dir / f'ctgan_only_20000_1_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")
    model_path_1 = model_save_dir / f'ctgan_only_20000_1_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility falls in each class's valid band
    # (class 0: [0, 100); class 1: [100, 500)).
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the generated-only data under the sibling 'augmented_only' folder.
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge original train data with the filtered CTGAN samples and restore
    # the derived columns that were dropped before training.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Class 2 rows come straight from the original data (never augmented).
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
303
+
304
+
305
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
306
+
307
if __name__ == "__main__":
    setup_environment()

    # One (input, output) pair per region; models all go to one directory.
    save_dir = Path('../../save_model/oversampling_models')
    for region in REGIONS:
        src = f'../../../data/data_for_modeling/{region}_train.csv'
        dst = f'../../../data/data_oversampled/ctgan20000/ctgan20000_1_{region}.csv'
        process_region(src, dst, save_dir)
316
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_2.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
# ==================== Constants ====================
# Regions whose '<region>_train.csv' files are augmented.
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years retained from each train file.
TRAIN_YEARS = [2018, 2020]
# Number of synthetic class-0 rows to generate per region.
TARGET_SAMPLES_CLASS_0 = 20000
# Class 1 is topped up to this total (generated = base - existing count).
TARGET_SAMPLES_CLASS_1_BASE = 20000
# Seed for CTGAN sampling reproducibility.
RANDOM_STATE = 42

# Optuna optimization settings (number of trials per class)
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search ranges
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before CTGAN training (re-added afterwards).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Report the compute device and silence noisy Optuna distribution warnings."""
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Using device: {device}")
    # Tuple-valued categorical choices trigger UserWarnings in Optuna.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a region's train CSV and split it into features and target.

    Args:
        file_path: Path to the train CSV (first column is the index).
        train_years: Years to keep.

    Returns:
        (data, X, y): full filtered frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]

    # Cloud-cover columns hold categorical codes; cast them to int.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Features exclude both targets and the derived columns (re-added later).
    X = data.drop(columns=['multi_class', 'binary_class'])
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return the names of all non-float64 (i.e. discrete) columns."""
    return [name for name in df.columns if df.dtypes[name] != 'float64']
81
+
82
+
83
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN fidelity on 'visi'.

    The objective trains a CTGAN on the rows of the requested class, samples
    twice as many synthetic rows, and scores how closely the synthetic 'visi'
    mean and standard deviation match the real ones (negated MSE, so Optuna
    maximizes).

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.

    Returns:
        An Optuna objective function (trial -> float).
    """
    subset = data[data['multi_class'] == class_label]

    def objective(trial):
        # Draw one hyperparameter configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(subset, discrete_columns=categorical_features)

        # Score: distance between real and synthetic 'visi' mean/std.
        synthetic = model.sample(len(subset) * 2)
        real_visi = subset['visi']
        fake_visi = synthetic['visi']
        mse = ((real_visi.mean() - fake_visi.mean()) ** 2 +
               (real_visi.std() - fake_visi.std()) ** 2)
        return -mse  # negate so "maximize" minimizes the distribution gap

    return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, then train a final model and draw samples.

    Args:
        data: Training data including the 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names CTGAN should treat as discrete.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw.

    Returns:
        Tuple of (generated samples DataFrame, fitted CTGAN model).
    """
    # Search for the hyperparameters that best reproduce the 'visi' distribution.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        create_ctgan_objective(data, class_label, categorical_features, hp_ranges),
        n_trials=n_trials,
    )

    # Rebuild a CTGAN from the best trial and fit it on this class only.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"],
    )
    model.set_random_state(RANDOM_STATE)

    subset = data[data['multi_class'] == class_label]
    model.fit(subset, discrete_columns=categorical_features)
    return model.sample(target_samples), model
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns that were dropped before CTGAN training.

    Adds the binary target, cyclic hour/month encodings, and the
    ground-air temperature difference. The input frame is not modified.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp'
            and 'temp_C'.

    Returns:
        A copy of ``df`` with the derived columns appended.
    """
    out = df.copy()
    # Class 2 (no fog) maps to 0; fog classes (0/1) map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda c: 1 if c != 2 else 0)
    # Cyclic encodings keep hour 23 adjacent to hour 0 (and Dec to Jan).
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated samples only.

    Trains one CTGAN per minority class (0 and 1), filters the generated
    samples to each class's expected visibility band, saves the generated-only
    data, the merged (original + generated) data, and the trained models.

    Args:
        file_path: Input train CSV path (``<region>_train.csv``).
        output_path: Output CSV path for the merged data.
        model_save_dir: Directory where trained CTGAN models are saved.
    """
    # Region name is encoded in the file name, e.g. 'seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Re-attach the target so CTGAN learns it jointly with the features.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Non-float columns are treated as discrete by CTGAN.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target. Clamp at 0 so a class that
    # already exceeds the target cannot pass a negative count to sample().
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the trained models.
    model_save_dir.mkdir(parents=True, exist_ok=True)
    model_path_0 = model_save_dir / f'ctgan_only_20000_2_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")
    model_path_1 = model_save_dir / f'ctgan_only_20000_2_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility falls in each class's valid band
    # (class 0: [0, 100); class 1: [100, 500)).
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the generated-only data under the sibling 'augmented_only' folder.
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge original train data with the filtered CTGAN samples and restore
    # the derived columns that were dropped before training.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Class 2 rows come straight from the original data (never augmented).
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
307
+
308
if __name__ == "__main__":
    setup_environment()

    # One (input, output) pair per region; models all go to one directory.
    save_dir = Path('../../save_model/oversampling_models')
    for region in REGIONS:
        src = f'../../../data/data_for_modeling/{region}_train.csv'
        dst = f'../../../data/data_oversampled/ctgan20000/ctgan20000_2_{region}.csv'
        process_region(src, dst, save_dir)
317
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_20000_3.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
# ==================== Constants ====================
# Regions processed by this script (one '<region>_train.csv' each).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
TRAIN_YEARS = [2019, 2020]  # years kept from each train CSV
TARGET_SAMPLES_CLASS_0 = 20000  # synthetic rows to draw for class 0
TARGET_SAMPLES_CLASS_1_BASE = 20000  # desired total row count for class 1
RANDOM_STATE = 42

# Optuna optimization settings: number of trials per class.
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Hyperparameter search space for class 0 (smaller networks).
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

# Hyperparameter search space for class 1 (larger networks).
CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns removed before CTGAN fitting; recreated afterwards by
# add_derived_features().
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Report the compute device and silence noisy Optuna warnings."""
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    # Optuna warns about non-primitive categorical choices (tuples); suppress.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """
    Load a region's CSV and split it into feature and target frames.

    Args:
        file_path: path to the CSV (first column is the index).
        train_years: years to keep for training.

    Returns:
        (data, X, y): filtered raw frame, feature frame, multi-class target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data[data['year'].isin(train_years)]

    # Cloud-cover columns are categorical levels; store them as ints.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Drop both targets and the derived columns (recreated after sampling).
    X = data.drop(columns=['multi_class', 'binary_class']).drop(columns=COLUMNS_TO_DROP)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of columns whose dtype is not float64 (treated as discrete)."""
    dtypes = df.dtypes
    return dtypes[dtypes != 'float64'].index.tolist()
81
+
82
+
83
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """
    Build an Optuna objective that scores CTGAN hyperparameters for one class.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.

    Returns:
        An Optuna objective function; it returns -MSE, so higher is better.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample hyperparameters from the configured search space.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        # Build the CTGAN model for this trial.
        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        # Fit on the single-class slice only.
        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Generate twice as many rows as the real slice for evaluation.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Evaluate by comparing the distribution of the continuous 'visi' column.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']

        # Squared error between the first two moments (mean and std).
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        return -mse  # negated so the study maximizes distribution similarity

    return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """
    Tune CTGAN with Optuna, then fit the best model and draw samples.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.
        n_trials: number of Optuna trials.
        target_samples: number of synthetic rows to generate.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    # Build the objective for this class.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Run the search; the objective returns -MSE, so we maximize.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Re-create CTGAN with the best hyperparameters found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    # Final fit on the class slice, then sample.
    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Recreate the derived columns that were dropped before CTGAN fitting.

    Args:
        df: frame with 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        A copy of *df* with the derived columns added.
    """
    out = df.copy()
    # Binary target: class 2 maps to 0, classes 0/1 map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda c: 1 if c != 2 else 0)

    # Cyclical encodings of hour-of-day and month-of-year.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    # Ground/air temperature difference.
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Augment one region's training data with CTGAN-generated samples.

    Classes 0 and 1 each get a tuned CTGAN; generated rows are filtered to
    the class's valid visibility ('visi') range, merged with the originals,
    and class 2 rows are passed through from the original data unchanged.

    Args:
        file_path: input CSV path (a '<region>_train.csv' file).
        output_path: output CSV path for the merged (augmented) data.
        model_save_dir: directory where the fitted CTGAN models are saved.
    """
    # Region name from the file name, e.g. '.../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the region's training data.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Re-attach the target so CTGAN learns it jointly with the features.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Discrete columns for CTGAN.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target. Clamp at 0 so we never
    # request a negative sample count when the class already exceeds it.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    # Tune and sample CTGAN for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Tune and sample CTGAN for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist both fitted models.
    model_save_dir.mkdir(parents=True, exist_ok=True)

    model_path_0 = model_save_dir / f'ctgan_only_20000_3_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    model_path_1 = model_save_dir / f'ctgan_only_20000_3_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in the class's valid range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the synthetic rows on their own (with derived columns restored).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge originals with the filtered CTGAN samples and restore the
    # derived columns on the combined frame.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    # Report the synthetic-only counts.
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Class 2 is never synthesized: drop any class-2 rows from the merged
    # frame and append the untouched original class-2 rows instead.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Write the final merged dataset.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    # Report the final class balance.
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
# ==================== Main entry point ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # Augment each region's train split in turn.
    for region in REGIONS:
        file_path = f'../../../data/data_for_modeling/{region}_train.csv'
        output_path = f'../../../data/data_oversampled/ctgan20000/ctgan20000_3_{region}.csv'
        process_region(file_path, output_path, model_save_dir)
317
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_1.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
# ==================== Constants ====================
# Regions processed by this script (one '<region>_train.csv' each).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
TRAIN_YEARS = [2018, 2019]  # years kept from each train CSV
TARGET_SAMPLES_CLASS_0 = 7000  # synthetic rows to draw for class 0
TARGET_SAMPLES_CLASS_1_BASE = 7000  # desired total row count for class 1
RANDOM_STATE = 42

# Optuna optimization settings: number of trials per class.
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Hyperparameter search space for class 0 (smaller networks).
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

# Hyperparameter search space for class 1 (larger networks).
CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns removed before CTGAN fitting; recreated afterwards by
# add_derived_features().
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Report the compute device and silence noisy Optuna warnings."""
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    # Optuna warns about non-primitive categorical choices (tuples); suppress.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """
    Load a region's CSV and split it into feature and target frames.

    Args:
        file_path: path to the CSV (first column is the index).
        train_years: years to keep for training.

    Returns:
        (data, X, y): filtered raw frame, feature frame, multi-class target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data[data['year'].isin(train_years)]

    # Cloud-cover columns are categorical levels; store them as ints.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Drop both targets and the derived columns (recreated after sampling).
    X = data.drop(columns=['multi_class', 'binary_class']).drop(columns=COLUMNS_TO_DROP)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of columns whose dtype is not float64 (treated as discrete)."""
    dtypes = df.dtypes
    return dtypes[dtypes != 'float64'].index.tolist()
81
+
82
+
83
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """
    Build an Optuna objective that scores CTGAN hyperparameters for one class.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.

    Returns:
        An Optuna objective function; it returns -MSE, so higher is better.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample hyperparameters from the configured search space.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        # Build the CTGAN model for this trial.
        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        # Fit on the single-class slice only.
        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Generate twice as many rows as the real slice for evaluation.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Evaluate by comparing the distribution of the continuous 'visi' column.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']

        # Squared error between the first two moments (mean and std).
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        return -mse  # negated so the study maximizes distribution similarity

    return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """
    Tune CTGAN with Optuna, then fit the best model and draw samples.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.
        n_trials: number of Optuna trials.
        target_samples: number of synthetic rows to generate.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    # Build the objective for this class.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Run the search; the objective returns -MSE, so we maximize.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Re-create CTGAN with the best hyperparameters found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    # Final fit on the class slice, then sample.
    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Recreate the derived columns that were dropped before CTGAN fitting.

    Args:
        df: frame with 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        A copy of *df* with the derived columns added.
    """
    out = df.copy()
    # Binary target: class 2 maps to 0, classes 0/1 map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda c: 1 if c != 2 else 0)

    # Cyclical encodings of hour-of-day and month-of-year.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    # Ground/air temperature difference.
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Augment one region's training data with CTGAN-generated samples.

    Classes 0 and 1 each get a tuned CTGAN; generated rows are filtered to
    the class's valid visibility ('visi') range, merged with the originals,
    and class 2 rows are passed through from the original data unchanged.

    Args:
        file_path: input CSV path (a '<region>_train.csv' file).
        output_path: output CSV path for the merged (augmented) data.
        model_save_dir: directory where the fitted CTGAN models are saved.
    """
    # Region name from the file name, e.g. '.../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the region's training data.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Re-attach the target so CTGAN learns it jointly with the features.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Discrete columns for CTGAN.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base target. Clamp at 0 so we never
    # request a negative sample count when the class already exceeds it.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    # Tune and sample CTGAN for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Tune and sample CTGAN for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist both fitted models.
    model_save_dir.mkdir(parents=True, exist_ok=True)

    model_path_0 = model_save_dir / f'ctgan_only_7000_1_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    model_path_1 = model_save_dir / f'ctgan_only_7000_1_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in the class's valid range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the synthetic rows on their own (with derived columns restored).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge originals with the filtered CTGAN samples and restore the
    # derived columns on the combined frame.
    ctgan_data = pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    ctgan_data = add_derived_features(ctgan_data)

    # Report the synthetic-only counts.
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Class 2 is never synthesized: drop any class-2 rows from the merged
    # frame and append the untouched original class-2 rows instead.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Write the final merged dataset.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    # Report the final class balance.
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
# ==================== Main entry point ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # Augment each region's train split in turn.
    for region in REGIONS:
        file_path = f'../../../data/data_for_modeling/{region}_train.csv'
        output_path = f'../../../data/data_oversampled/ctgan7000/ctgan7000_1_{region}.csv'
        process_region(file_path, output_path, model_save_dir)
317
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_2.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
# ==================== Constants ====================
# Regions processed by this script (one '<region>_train.csv' each).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
TRAIN_YEARS = [2018, 2020]  # years kept from each train CSV
TARGET_SAMPLES_CLASS_0 = 7000  # synthetic rows to draw for class 0
TARGET_SAMPLES_CLASS_1_BASE = 7000  # desired total row count for class 1
RANDOM_STATE = 42

# Optuna optimization settings: number of trials per class.
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Hyperparameter search space for class 0 (smaller networks).
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

# Hyperparameter search space for class 1 (larger networks).
CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns removed before CTGAN fitting; recreated afterwards by
# add_derived_features().
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Report the compute device and silence noisy Optuna warnings."""
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    # Optuna warns about non-primitive categorical choices (tuples); suppress.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """
    Load a region's CSV and split it into feature and target frames.

    Args:
        file_path: path to the CSV (first column is the index).
        train_years: years to keep for training.

    Returns:
        (data, X, y): filtered raw frame, feature frame, multi-class target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data[data['year'].isin(train_years)]

    # Cloud-cover columns are categorical levels; store them as ints.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Drop both targets and the derived columns (recreated after sampling).
    X = data.drop(columns=['multi_class', 'binary_class']).drop(columns=COLUMNS_TO_DROP)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of columns whose dtype is not float64 (treated as discrete)."""
    dtypes = df.dtypes
    return dtypes[dtypes != 'float64'].index.tolist()
81
+
82
+
83
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """
    Build an Optuna objective that scores CTGAN hyperparameters for one class.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.

    Returns:
        An Optuna objective function; it returns -MSE, so higher is better.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample hyperparameters from the configured search space.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        # Build the CTGAN model for this trial.
        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        # Fit on the single-class slice only.
        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Generate twice as many rows as the real slice for evaluation.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Evaluate by comparing the distribution of the continuous 'visi' column.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']

        # Squared error between the first two moments (mean and std).
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        return -mse  # negated so the study maximizes distribution similarity

    return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """
    Tune CTGAN with Optuna, then fit the best model and draw samples.

    Args:
        data: training data (must contain a 'multi_class' column).
        class_label: class to model (0 or 1).
        categorical_features: names of discrete columns for CTGAN.
        hp_ranges: per-hyperparameter search space.
        n_trials: number of Optuna trials.
        target_samples: number of synthetic rows to generate.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    # Build the objective for this class.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Run the search; the objective returns -MSE, so we maximize.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Re-create CTGAN with the best hyperparameters found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    # Final fit on the class slice, then sample.
    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Restore the derived columns that were dropped before augmentation.

    Adds 'binary_class' (0 for class 2, else 1), cyclic hour/month
    encodings, and the ground-minus-air temperature difference.

    Args:
        df: Frame with 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        A copy of *df* with the derived columns appended.
    """
    result = df.copy()
    result['binary_class'] = np.where(result['multi_class'] == 2, 0, 1)
    two_pi = 2 * np.pi
    result['hour_sin'] = np.sin(two_pi * result['hour'] / 24)
    result['hour_cos'] = np.cos(two_pi * result['hour'] / 24)
    result['month_sin'] = np.sin(two_pi * result['month'] / 12)
    result['month_cos'] = np.cos(two_pi * result['month'] / 12)
    result['ground_temp - temp_C'] = result['groundtemp'] - result['temp_C']
    return result
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated samples only.

    Optimizes and trains one CTGAN per minority class (0 and 1), filters the
    synthetic rows to each class's visibility range, saves the fitted models,
    the synthetic-only rows, and the merged training set.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged (original + synthetic) data.
        model_save_dir: Directory in which fitted CTGAN models are saved.
    """
    # Region name from the file path, e.g. '.../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Attach the target back onto the feature frame for CTGAN training.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Categorical column names for CTGAN's discrete_columns argument.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base total; clamp at zero so ctgan.sample()
    # is never asked for a negative number of rows when the class is already
    # at or above the target.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    # Optimize and generate for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize and generate for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the fitted models.
    model_save_dir.mkdir(parents=True, exist_ok=True)
    for label, model in ((0, ctgan_model_0), (1, ctgan_model_1)):
        model_path = model_save_dir / f'ctgan_only_7000_2_{region_name}_class{label}.pkl'
        model.save(str(model_path))
        print(f"Saved CTGAN model for class {label}: {model_path}")

    # Keep only samples whose visibility falls inside each class's range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the synthetic rows alone (derived features restored).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Merge originals with the accepted synthetic rows, restore derived features.
    ctgan_data = add_derived_features(
        pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    )

    # Class 2 was never augmented: drop any class-2 rows from the merged frame
    # and append the untouched originals instead.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Write the merged result.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # One input/output pair per region, processed sequentially.
    for region in REGIONS:
        process_region(
            f'../../../data/data_for_modeling/{region}_train.csv',
            f'../../../data/data_oversampled/ctgan7000/ctgan7000_2_{region}.csv',
            model_save_dir,
        )
317
+
Analysis_code/2.make_oversample_data/only_ctgan/ctgan_sample_7000_3.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
import os
from pathlib import Path
import optuna
from ctgan import CTGAN
import torch
import warnings

# ==================== Constants ====================
# Regions whose training files are processed.
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years kept for training.
TRAIN_YEARS = [2019, 2020]
# Number of synthetic rows generated for class 0.
TARGET_SAMPLES_CLASS_0 = 7000
# Class 1 is topped up to this total (generation target = base - current count).
TARGET_SAMPLES_CLASS_1_BASE = 7000
RANDOM_STATE = 42

# Optuna optimization budget per class.
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search ranges.
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns removed before CTGAN training (recomputed afterwards).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Configure the runtime: select the torch device and silence Optuna warnings.

    Returns:
        The selected torch.device ('cuda' when available, otherwise 'cpu').
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device: {device}")
    # Silence Optuna's UserWarnings about distribution choices.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a region's training CSV and split it into features and target.

    Args:
        file_path: Path to the training CSV (first column is the index).
        train_years: Years to keep for training.

    Returns:
        (data, X, y): filtered original frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    # .copy() so the year-filtered slice is an independent frame; assigning
    # columns into a .loc slice otherwise raises pandas'
    # SettingWithCopyWarning and may silently fail to modify the data.
    data = data.loc[data['year'].isin(train_years), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns that CTGAN should not model; they are recomputed
    # later by add_derived_features().
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return the names of all non-float64 (categorical/integer) columns."""
    return [name for name, dtype in df.dtypes.items() if dtype != 'float64']
81
+
82
+
83
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    The objective trains a CTGAN on the rows of *data* belonging to
    *class_label*, samples twice as many rows as the class holds, and scores
    the configuration by how closely the synthetic 'visi' column matches the
    real one in mean and standard deviation (negated squared error, so that
    a maximizing study prefers closer distributions).

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class label to model (0 or 1).
        categorical_features: Names of the categorical columns.
        hp_ranges: Hyperparameter search space.

    Returns:
        A callable usable with study.optimize().
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Draw one candidate configuration; the dict literal keeps the
        # suggest-call order identical to the search-space definition.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            "generator_dim": trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            "pac": trial.suggest_categorical("pac", hp_ranges['pac']),
            "batch_size": trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            "discriminator_steps": trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)

        # Fit on this class only, then oversample it for evaluation.
        model.fit(class_data, discrete_columns=categorical_features)
        synthetic = model.sample(len(class_data) * 2)

        # Distribution mismatch on the continuous 'visi' column.
        real_visi = class_data['visi']
        fake_visi = synthetic['visi']
        mse = ((real_visi.mean() - fake_visi.mean()) ** 2 +
               (real_visi.std() - fake_visi.std()) ** 2)
        return -mse

    return objective
136
+
137
+
138
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN hyperparameters with Optuna, then train and sample.

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class label to model (0 or 1).
        categorical_features: Names of the categorical columns.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials to run.
        target_samples: Number of synthetic rows to draw.

    Returns:
        Tuple of (generated samples DataFrame, fitted CTGAN model).
    """
    # Build the per-class objective and run the hyperparameter search.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        create_ctgan_objective(data, class_label, categorical_features, hp_ranges),
        n_trials=n_trials,
    )

    # Re-train a final model with the best configuration found.
    params = study.best_params
    model = CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"]
    )
    model.set_random_state(RANDOM_STATE)

    # Final fit on this class's rows, then draw the requested sample count.
    subset = data[data['multi_class'] == class_label]
    model.fit(subset, discrete_columns=categorical_features)
    generated = model.sample(target_samples)

    return generated, model
181
+
182
+
183
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Restore the derived columns that were dropped before augmentation.

    Adds 'binary_class' (0 for class 2, else 1), cyclic hour/month
    encodings, and the ground-minus-air temperature difference.

    Args:
        df: Frame with 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        A copy of *df* with the derived columns appended.
    """
    result = df.copy()
    result['binary_class'] = np.where(result['multi_class'] == 2, 0, 1)
    two_pi = 2 * np.pi
    result['hour_sin'] = np.sin(two_pi * result['hour'] / 24)
    result['hour_cos'] = np.cos(two_pi * result['hour'] / 24)
    result['month_sin'] = np.sin(two_pi * result['month'] / 12)
    result['month_cos'] = np.cos(two_pi * result['month'] / 12)
    result['ground_temp - temp_C'] = result['groundtemp'] - result['temp_C']
    return result
201
+
202
+
203
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Augment one region's training data with CTGAN-generated samples only.

    Optimizes and trains one CTGAN per minority class (0 and 1), filters the
    synthetic rows to each class's visibility range, saves the fitted models,
    the synthetic-only rows, and the merged training set.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged (original + synthetic) data.
        model_save_dir: Directory in which fitted CTGAN models are saved.
    """
    # Region name from the file path, e.g. '.../seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Attach the target back onto the feature frame for CTGAN training.
    train_data = X.copy()
    train_data['multi_class'] = y

    # Categorical column names for CTGAN's discrete_columns argument.
    categorical_features = get_categorical_feature_names(train_data)

    # Class 1 is topped up to the base total; clamp at zero so ctgan.sample()
    # is never asked for a negative number of rows when the class is already
    # at or above the target.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = max(0, TARGET_SAMPLES_CLASS_1_BASE - count_class_1)

    # Optimize and generate for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        train_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize and generate for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        train_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the fitted models.
    model_save_dir.mkdir(parents=True, exist_ok=True)
    for label, model in ((0, ctgan_model_0), (1, ctgan_model_1)):
        model_path = model_save_dir / f'ctgan_only_7000_3_{region_name}_class{label}.pkl'
        model.save(str(model_path))
        print(f"Saved CTGAN model for class {label}: {model_path}")

    # Keep only samples whose visibility falls inside each class's range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Save the synthetic rows alone (derived features restored).
    augmented_only = pd.concat([well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Merge originals with the accepted synthetic rows, restore derived features.
    ctgan_data = add_derived_features(
        pd.concat([train_data, well_generated_0, well_generated_1], axis=0)
    )

    # Class 2 was never augmented: drop any class-2 rows from the merged frame
    # and append the untouched originals instead.
    filtered_data = ctgan_data[ctgan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Write the merged result.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
304
+
305
+
306
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # One input/output pair per region, processed sequentially.
    for region in REGIONS:
        process_region(
            f'../../../data/data_for_modeling/{region}_train.csv',
            f'../../../data/data_oversampled/ctgan7000/ctgan7000_3_{region}.csv',
            model_save_dir,
        )
317
+
Analysis_code/2.make_oversample_data/run_ctgan_gpu0.bash ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Run the CTGAN sample-generation scripts sequentially on GPU 0.
# Working directory: /workspace/visibility_prediction/Analysis_code/make_oversample_data

export CUDA_VISIBLE_DEVICES=0

echo "=========================================="
echo "Starting CTGAN sample generation on GPU 0"
echo "=========================================="
echo ""

# Three script variants for each target sample size.
for size in 7000 10000 20000; do
    echo "=== Processing ${size} samples ==="
    for variant in 1 2 3; do
        script="only_ctgan/ctgan_sample_${size}_${variant}.py"
        echo "Running ${script}..."
        python "${script}"
        echo ""
    done
done

echo "=========================================="
echo "All CTGAN sample generation completed!"
echo "=========================================="
+ echo "=========================================="
58
+
Analysis_code/2.make_oversample_data/run_ctgan_gpu1.bash ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Run the SMOTENC+CTGAN sample-generation scripts sequentially on GPU 1.
# Working directory: /workspace/visibility_prediction/Analysis_code/make_oversample_data

export CUDA_VISIBLE_DEVICES=1

echo "=========================================="
echo "Starting SMOTENC+CTGAN sample generation on GPU 1"
echo "=========================================="
echo ""

# Three script variants for each target sample size.
for size in 7000 10000 20000; do
    echo "=== Processing ${size} samples ==="
    for variant in 1 2 3; do
        script="smotenc_ctgan/smotenc_ctgan_sample_${size}_${variant}.py"
        echo "Running ${script}..."
        python "${script}"
        echo ""
    done
done

echo "=========================================="
echo "All SMOTENC+CTGAN sample generation completed!"
echo "=========================================="
57
+ echo "=========================================="
58
+
Analysis_code/2.make_oversample_data/smote_only/smote_sample_1.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Oversample each region's training data with SMOTENC (variant 1: years 2018-2019)."""
import pandas as pd
import numpy as np
from pathlib import Path

from imblearn.over_sampling import SMOTENC

# Per-region input/output paths.
regions = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../../data/data_oversampled/smote/smote_1_{region}.csv' for region in regions]

for file_path, output_path in zip(file_paths, output_paths):
    # Load and keep the training years; .copy() so the later column
    # assignments do not hit pandas' SettingWithCopyWarning on a slice.
    original_data = pd.read_csv(file_path, index_col=0)
    data = original_data.loc[original_data['year'].isin([2018, 2019]), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns; they are recomputed after resampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Column positions of the categorical (non-float64) features for SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Targets: bring classes 0 and 1 up to roughly half of class 2's count.
    # SMOTENC raises ValueError for a target below a class's current count,
    # so clamp each target with the existing count.
    count_class_2 = (y == 2).sum()
    minority_target = int(np.ceil(count_class_2 / 1000) * 500)
    sampling_strategy = {
        0: max(minority_target, int((y == 0).sum())),
        1: max(minority_target, int((y == 1).sum())),
        2: count_class_2
    }

    # Resample with SMOTENC (deterministic via random_state).
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Restore the derived columns dropped before resampling.
    lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']

    # Synthetic rows only: fit_resample appends new samples after the
    # original len(X) rows, so slice them off and drop class 2.
    original_data_count = len(X)
    augmented_only = lerp_data.iloc[original_data_count:].copy()
    augmented_only = augmented_only[augmented_only['multi_class'] != 2].copy()
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Final set: resampled classes 0/1 plus the untouched original class-2 rows.
    filtered_data = lerp_data[lerp_data['multi_class'] != 2]
    original_class_2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
Analysis_code/2.make_oversample_data/smote_only/smote_sample_2.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Oversample each region's training data with SMOTENC (variant 2: years 2018+2020)."""
import pandas as pd
import numpy as np
from pathlib import Path

from imblearn.over_sampling import SMOTENC

# Per-region input/output paths.
regions = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../../data/data_oversampled/smote/smote_2_{region}.csv' for region in regions]

for file_path, output_path in zip(file_paths, output_paths):
    # Load and keep the training years; .copy() so the later column
    # assignments do not hit pandas' SettingWithCopyWarning on a slice.
    original_data = pd.read_csv(file_path, index_col=0)
    data = original_data.loc[original_data['year'].isin([2018, 2020]), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns; they are recomputed after resampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Column positions of the categorical (non-float64) features for SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Targets: bring classes 0 and 1 up to roughly half of class 2's count.
    # SMOTENC raises ValueError for a target below a class's current count,
    # so clamp each target with the existing count.
    count_class_2 = (y == 2).sum()
    minority_target = int(np.ceil(count_class_2 / 1000) * 500)
    sampling_strategy = {
        0: max(minority_target, int((y == 0).sum())),
        1: max(minority_target, int((y == 1).sum())),
        2: count_class_2
    }

    # Resample with SMOTENC (deterministic via random_state).
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Restore the derived columns dropped before resampling.
    lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']

    # Synthetic rows only: fit_resample appends new samples after the
    # original len(X) rows, so slice them off and drop class 2.
    original_data_count = len(X)
    augmented_only = lerp_data.iloc[original_data_count:].copy()
    augmented_only = augmented_only[augmented_only['multi_class'] != 2].copy()
    augmented_only.reset_index(drop=True, inplace=True)
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Final set: resampled classes 0/1 plus the untouched original class-2 rows.
    filtered_data = lerp_data[lerp_data['multi_class'] != 2]
    original_class_2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_data.to_csv(output_path, index=False)

    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
Analysis_code/2.make_oversample_data/smote_only/smote_sample_3.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+
5
+ from imblearn.over_sampling import SMOTENC
6
+
7
+ # ์ง€์—ญ๋ณ„ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
8
+ regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
9
+ file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in regions]
10
+ output_paths = [f'../../../data/data_oversampled/smote/smote_3_{region}.csv' for region in regions]
11
+
12
+ # ์ง€์—ญ๋ณ„ ์ฒ˜๋ฆฌ
13
+ for file_path, output_path in zip(file_paths, output_paths):
14
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
15
+ original_data = pd.read_csv(file_path, index_col=0)
16
+ data = original_data.loc[original_data['year'].isin([2019, 2020]), :]
17
+ data['cloudcover'] = data['cloudcover'].astype('int')
18
+ data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
19
+ X = data.drop(columns=['multi_class', 'binary_class'])
20
+ y = data['multi_class']
21
+
22
+ # ๋ถˆํ•„์š”ํ•œ ์—ด ์ œ๊ฑฐ
23
+ X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)
24
+
25
+ # SMOTENC์—์„œ ์‚ฌ์šฉํ•  ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์—ด ๋ฒˆํ˜ธ ์„ค์ •
26
+ categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']
27
+
28
+ # sampling_strategy ์„ค์ •
29
+ count_class_2 = (y == 2).sum()
30
+ sampling_strategy = {
31
+ 0: int(np.ceil(count_class_2 / 1000) * 500),
32
+ 1: int(np.ceil(count_class_2 / 1000) * 500),
33
+ 2: count_class_2
34
+ }
35
+
36
+ # SMOTENC ์ ์šฉ
37
+ smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
38
+ X_resampled, y_resampled = smotenc.fit_resample(X, y)
39
+
40
+ # Resampled ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
41
+ lerp_data = X_resampled.copy()
42
+ lerp_data['multi_class'] = y_resampled
43
+
44
+ # ์ œ๊ฑฐ๋ณ€์ˆ˜ ๋ณต๊ตฌ
45
+ lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
46
+ lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
47
+ lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
48
+ lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
49
+ lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
50
+ lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']
51
+
52
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ์ €์žฅ (SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„๋งŒ)
53
+ # lerp_data์˜ ์ฒ˜์Œ len(X)๊ฐœ๋Š” ์›๋ณธ ๋ฐ์ดํ„ฐ์ด๋ฏ€๋กœ ์ œ์™ธ
54
+ original_data_count = len(X)
55
+ augmented_only = lerp_data.iloc[original_data_count:].copy() # SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„๋งŒ
56
+ augmented_only = augmented_only[augmented_only['multi_class'] != 2].copy() # ํด๋ž˜์Šค 2 ์ œ์™ธ
57
+ augmented_only.reset_index(drop=True, inplace=True)
58
+ # augmented_only ํด๋”์— ์ €์žฅ
59
+ output_path_obj = Path(output_path)
60
+ augmented_dir = output_path_obj.parent.parent / 'augmented_only'
61
+ augmented_dir.mkdir(parents=True, exist_ok=True)
62
+ augmented_output_path = augmented_dir / output_path_obj.name
63
+ augmented_only.to_csv(augmented_output_path, index=False)
64
+
65
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๊ฒฐ๊ณผ ์ถœ๋ ฅ
66
+ aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
67
+ aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
68
+ print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")
69
+
70
+ # ํด๋ž˜์Šค 2 ์ œ๊ฑฐ ํ›„ ์›๋ณธ ํด๋ž˜์Šค 2 ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
71
+ filtered_data = lerp_data[lerp_data['multi_class'] != 2]
72
+ original_class_2 = data[data['multi_class'] == 2]
73
+ final_data = pd.concat([filtered_data, original_class_2], axis=0)
74
+ final_data.reset_index(drop=True, inplace=True)
75
+
76
+ # ์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
77
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
78
+
79
+ # ๊ฒฐ๊ณผ ์ €์žฅ
80
+ final_data.to_csv(output_path, index=False)
81
+
82
+ # ๊ฒฐ๊ณผ ์ถœ๋ ฅ
83
+ count_0 = len(final_data[final_data['multi_class'] == 0])
84
+ count_1 = len(final_data[final_data['multi_class'] == 1])
85
+ count_2 = len(final_data[final_data['multi_class'] == 2])
86
+ print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_1.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from imblearn.over_sampling import SMOTENC
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
11
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
12
+ TRAIN_YEARS = [2018, 2019]
13
+ TARGET_SAMPLES_CLASS_0 = 10000
14
+ TARGET_SAMPLES_CLASS_1_BASE = 10000
15
+ RANDOM_STATE = 42
16
+
17
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
18
+ CLASS_0_TRIALS = 50
19
+ CLASS_1_TRIALS = 30
20
+
21
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
22
+ CLASS_0_HP_RANGES = {
23
+ 'embedding_dim': (64, 128),
24
+ 'generator_dim': [(64, 64), (128, 128)],
25
+ 'discriminator_dim': [(64, 64), (128, 128)],
26
+ 'pac': [4, 8],
27
+ 'batch_size': [64, 128, 256],
28
+ 'discriminator_steps': (1, 3)
29
+ }
30
+
31
+ CLASS_1_HP_RANGES = {
32
+ 'embedding_dim': (128, 512),
33
+ 'generator_dim': [(128, 128), (256, 256)],
34
+ 'discriminator_dim': [(128, 128), (256, 256)],
35
+ 'pac': [4, 8],
36
+ 'batch_size': [256, 512, 1024],
37
+ 'discriminator_steps': (1, 5)
38
+ }
39
+
40
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
41
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
+ def setup_environment():
46
+ """ํ™˜๊ฒฝ ์„ค์ • (GPU, ๊ฒฝ๊ณ  ๋ฌด์‹œ)"""
47
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
+ print(f"Using device: {device}")
49
+ warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
50
+ return device
51
+
52
+
53
+ def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
54
+ """
55
+ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
56
+
57
+ Args:
58
+ file_path: ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
59
+ train_years: ํ•™์Šต์— ์‚ฌ์šฉํ•  ์—ฐ๋„ ๋ฆฌ์ŠคํŠธ
60
+
61
+ Returns:
62
+ (data, X, y): ์›๋ณธ ๋ฐ์ดํ„ฐ, ํŠน์ง• ๋ฐ์ดํ„ฐ, ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
63
+ """
64
+ data = pd.read_csv(file_path, index_col=0)
65
+ data = data.loc[data['year'].isin(train_years), :]
66
+ data['cloudcover'] = data['cloudcover'].astype('int')
67
+ data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
68
+
69
+ X = data.drop(columns=['multi_class', 'binary_class'])
70
+ y = data['multi_class']
71
+
72
+ # ๋ถˆํ•„์š”ํ•œ ์—ด ์ œ๊ฑฐ
73
+ X.drop(columns=COLUMNS_TO_DROP, inplace=True)
74
+
75
+ return data, X, y
76
+
77
+
78
+ def get_categorical_feature_indices(X: pd.DataFrame) -> list:
79
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ธ๋ฑ์Šค ๋ฐ˜ํ™˜"""
80
+ return [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']
81
+
82
+
83
+ def get_categorical_feature_names(df: pd.DataFrame) -> list:
84
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ด๋ฆ„ ๋ฐ˜ํ™˜"""
85
+ return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']
86
+
87
+
88
+ def calculate_sampling_strategy(y: pd.Series) -> dict:
89
+ """
90
+ SMOTENC๋ฅผ ์œ„ํ•œ sampling_strategy ๊ณ„์‚ฐ
91
+
92
+ Args:
93
+ y: ํƒ€๊ฒŸ ๋ณ€์ˆ˜
94
+
95
+ Returns:
96
+ sampling_strategy ๋”•์…”๋„ˆ๋ฆฌ
97
+ """
98
+ count_class_0 = (y == 0).sum()
99
+ count_class_1 = (y == 1).sum()
100
+ count_class_2 = (y == 2).sum()
101
+
102
+ return {
103
+ 0: 500 if count_class_0 <= 500 else 1000,
104
+ 1: int(np.ceil(count_class_1 / 100) * 100), # ๋ฐฑ์˜ ์ž๋ฆฌ๋กœ ์˜ฌ๋ฆผ
105
+ 2: count_class_2
106
+ }
107
+
108
+
109
+ def apply_smotenc(X: pd.DataFrame, y: pd.Series,
110
+ categorical_features_indices: list,
111
+ sampling_strategy: dict) -> pd.DataFrame:
112
+ """
113
+ SMOTENC ์ ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ•
114
+
115
+ Args:
116
+ X: ํŠน์ง• ๋ฐ์ดํ„ฐ
117
+ y: ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
118
+ categorical_features_indices: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ๋ฑ์Šค
119
+ sampling_strategy: ์ƒ˜ํ”Œ๋ง ์ „๋žต
120
+
121
+ Returns:
122
+ ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ (multi_class ํฌํ•จ)
123
+ """
124
+ smotenc = SMOTENC(
125
+ categorical_features=categorical_features_indices,
126
+ sampling_strategy=sampling_strategy,
127
+ random_state=RANDOM_STATE
128
+ )
129
+ X_resampled, y_resampled = smotenc.fit_resample(X, y)
130
+
131
+ resampled_data = X_resampled.copy()
132
+ resampled_data['multi_class'] = y_resampled
133
+
134
+ return resampled_data
135
+
136
+
137
+ def create_ctgan_objective(data: pd.DataFrame, class_label: int,
138
+ categorical_features: list,
139
+ hp_ranges: dict) -> callable:
140
+ """
141
+ Optuna ์ตœ์ ํ™”๋ฅผ ์œ„ํ•œ ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
142
+
143
+ Args:
144
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
145
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
146
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
147
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
148
+
149
+ Returns:
150
+ Optuna ๋ชฉ์  ํ•จ์ˆ˜
151
+ """
152
+ class_data = data[data['multi_class'] == class_label]
153
+
154
+ def objective(trial):
155
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์„ค์ •
156
+ embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
157
+ generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
158
+ discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
159
+ pac = trial.suggest_categorical("pac", hp_ranges['pac'])
160
+ batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
161
+ discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])
162
+
163
+ # CTGAN ๋ชจ๋ธ ์ƒ์„ฑ
164
+ ctgan = CTGAN(
165
+ embedding_dim=embedding_dim,
166
+ generator_dim=generator_dim,
167
+ discriminator_dim=discriminator_dim,
168
+ batch_size=batch_size,
169
+ discriminator_steps=discriminator_steps,
170
+ pac=pac
171
+ )
172
+ ctgan.set_random_state(RANDOM_STATE)
173
+
174
+ # ๋ชจ๋ธ ํ•™์Šต
175
+ ctgan.fit(class_data, discrete_columns=categorical_features)
176
+
177
+ # ์ƒ˜ํ”Œ ์ƒ์„ฑ
178
+ generated_data = ctgan.sample(len(class_data) * 2)
179
+
180
+ # ํ‰๊ฐ€: ์ƒ˜ํ”Œ์˜ ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„ํฌ ๋น„๊ต
181
+ real_visi = class_data['visi']
182
+ generated_visi = generated_data['visi']
183
+
184
+ # ๋ถ„ํฌ ๊ฐ„ ์ฐจ์ด(MSE) ๊ณ„์‚ฐ
185
+ mse = ((real_visi.mean() - generated_visi.mean())**2 +
186
+ (real_visi.std() - generated_visi.std())**2)
187
+ return -mse
188
+
189
+ return objective
190
+
191
+
192
+ def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
193
+ categorical_features: list,
194
+ hp_ranges: dict, n_trials: int,
195
+ target_samples: int) -> tuple:
196
+ """
197
+ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
198
+
199
+ Args:
200
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
201
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
202
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
203
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
204
+ n_trials: Optuna ์ตœ์ ํ™” ์‹œ๋„ ํšŸ์ˆ˜
205
+ target_samples: ์ƒ์„ฑํ•  ์ƒ˜ํ”Œ ์ˆ˜
206
+
207
+ Returns:
208
+ (์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„, ํ•™์Šต๋œ CTGAN ๋ชจ๋ธ)
209
+ """
210
+ # ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
211
+ objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)
212
+
213
+ # Optuna๋กœ ์ตœ์ ํ™” ์ˆ˜ํ–‰
214
+ study = optuna.create_study(direction="maximize")
215
+ study.optimize(objective, n_trials=n_trials)
216
+
217
+ # ์ตœ์  ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ CTGAN ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
218
+ best_params = study.best_params
219
+ ctgan = CTGAN(
220
+ embedding_dim=best_params["embedding_dim"],
221
+ generator_dim=best_params["generator_dim"],
222
+ discriminator_dim=best_params["discriminator_dim"],
223
+ batch_size=best_params["batch_size"],
224
+ discriminator_steps=best_params["discriminator_steps"],
225
+ pac=best_params["pac"]
226
+ )
227
+ ctgan.set_random_state(RANDOM_STATE)
228
+
229
+ # ์ตœ์ข… ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
230
+ class_data = data[data['multi_class'] == class_label]
231
+ ctgan.fit(class_data, discrete_columns=categorical_features)
232
+ generated_samples = ctgan.sample(target_samples)
233
+
234
+ return generated_samples, ctgan
235
+
236
+
237
+ def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
238
+ """
239
+ ์ œ๊ฑฐํ–ˆ๋˜ ํŒŒ์ƒ ๋ณ€์ˆ˜๋“ค์„ ๋ณต๊ตฌ
240
+
241
+ Args:
242
+ df: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
243
+
244
+ Returns:
245
+ ํŒŒ์ƒ ๋ณ€์ˆ˜๊ฐ€ ์ถ”๊ฐ€๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
246
+ """
247
+ df = df.copy()
248
+ df['binary_class'] = df['multi_class'].apply(lambda x: 0 if x == 2 else 1)
249
+ df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
250
+ df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
251
+ df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
252
+ df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
253
+ df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
254
+ return df
255
+
256
+
257
+ def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
258
+ """
259
+ ํŠน์ • ์ง€์—ญ์˜ ๋ฐ์ดํ„ฐ์— SMOTENC์™€ CTGAN์„ ์ˆœ์ฐจ์ ์œผ๋กœ ์ ์šฉ
260
+
261
+ Args:
262
+ file_path: ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
263
+ output_path: ์ถœ๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
264
+ model_save_dir: ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
265
+ """
266
+ # ์ง€์—ญ๋ช… ์ถ”์ถœ (ํŒŒ์ผ ๊ฒฝ๋กœ์—์„œ)
267
+ region_name = Path(file_path).stem.replace('_train', '')
268
+
269
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
270
+ original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)
271
+
272
+ # SMOTENC ์ ์šฉ
273
+ categorical_features_indices = get_categorical_feature_indices(X)
274
+ sampling_strategy = calculate_sampling_strategy(y)
275
+ smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)
276
+
277
+ # CTGAN์„ ์œ„ํ•œ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ์ถ”์ถœ
278
+ categorical_features = get_categorical_feature_names(smotenc_data)
279
+
280
+ # ํด๋ž˜์Šค๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜ ๊ณ„์‚ฐ
281
+ count_class_1 = (y == 1).sum()
282
+ target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)
283
+
284
+ # ํด๋ž˜์Šค 0์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
285
+ print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
286
+ generated_0, ctgan_model_0 = optimize_and_generate_samples(
287
+ smotenc_data, 0, categorical_features,
288
+ CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
289
+ )
290
+
291
+ # ํด๋ž˜์Šค 1์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
292
+ print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
293
+ generated_1, ctgan_model_1 = optimize_and_generate_samples(
294
+ smotenc_data, 1, categorical_features,
295
+ CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
296
+ )
297
+
298
+ # ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
299
+ model_save_dir.mkdir(parents=True, exist_ok=True)
300
+
301
+ # ํด๋ž˜์Šค 0 ๋ชจ๋ธ ์ €์žฅ
302
+ model_path_0 = model_save_dir / f'smotenc_ctgan_10000_1_{region_name}_class0.pkl'
303
+ ctgan_model_0.save(str(model_path_0))
304
+ print(f"Saved CTGAN model for class 0: {model_path_0}")
305
+
306
+ # ํด๋ž˜์Šค 1 ๋ชจ๋ธ ์ €์žฅ
307
+ model_path_1 = model_save_dir / f'smotenc_ctgan_10000_1_{region_name}_class1.pkl'
308
+ ctgan_model_1.save(str(model_path_1))
309
+ print(f"Saved CTGAN model for class 1: {model_path_1}")
310
+
311
+ # ํด๋ž˜์Šค๋ณ„ ๊ฐ€์‹œ๋„ ๋ฒ”์œ„๋กœ ํ•„ํ„ฐ๋ง
312
+ well_generated_0 = generated_0[
313
+ (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
314
+ ]
315
+ well_generated_1 = generated_1[
316
+ (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
317
+ ]
318
+
319
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ์ถ”์ถœ (SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„ + CTGAN์œผ๋กœ ์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ)
320
+ # smotenc_data์˜ ์ฒ˜์Œ len(X)๊ฐœ๋Š” ์›๋ณธ ๋ฐ์ดํ„ฐ์ด๋ฏ€๋กœ ์ œ์™ธ
321
+ original_data_count = len(X)
322
+ smotenc_augmented = smotenc_data.iloc[original_data_count:].copy() # SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„๋งŒ
323
+
324
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๋ณ‘ํ•ฉ (SMOTENC ์ฆ๊ฐ• + CTGAN ์ฆ๊ฐ•)
325
+ augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
326
+ augmented_only = add_derived_features(augmented_only)
327
+ augmented_only.reset_index(drop=True, inplace=True)
328
+ # augmented_only ํด๋”์— ์ €์žฅ
329
+ output_path_obj = Path(output_path)
330
+ augmented_dir = output_path_obj.parent.parent / 'augmented_only'
331
+ augmented_dir.mkdir(parents=True, exist_ok=True)
332
+ augmented_output_path = augmented_dir / output_path_obj.name
333
+ augmented_only.to_csv(augmented_output_path, index=False)
334
+
335
+ # SMOTENC ๋ฐ์ดํ„ฐ์™€ ํ•„ํ„ฐ๋ง๋œ CTGAN ์ƒ˜ํ”Œ ๋ณ‘ํ•ฉ (์ตœ์ข… ๊ฒฐ๊ณผ์šฉ)
336
+ smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)
337
+
338
+ # ํŒŒ์ƒ ๋ณ€์ˆ˜ ์ถ”๊ฐ€
339
+ smote_gan_data = add_derived_features(smote_gan_data)
340
+
341
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๊ฒฐ๊ณผ ์ถœ๋ ฅ
342
+ aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
343
+ aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
344
+ print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")
345
+
346
+ # ํด๋ž˜์Šค 2 ์ œ๊ฑฐ ํ›„ ์›๋ณธ ํด๋ž˜์Šค 2 ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
347
+ filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
348
+ original_class_2 = original_data[original_data['multi_class'] == 2]
349
+ final_data = pd.concat([filtered_data, original_class_2], axis=0)
350
+ final_data.reset_index(drop=True, inplace=True)
351
+
352
+ # ์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
353
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
354
+
355
+ # ๊ฒฐ๊ณผ ์ €์žฅ
356
+ final_data.to_csv(output_path, index=False)
357
+
358
+ # ๊ฒฐ๊ณผ ์ถœ๋ ฅ
359
+ count_0 = len(final_data[final_data['multi_class'] == 0])
360
+ count_1 = len(final_data[final_data['multi_class'] == 1])
361
+ count_2 = len(final_data[final_data['multi_class'] == 2])
362
+ print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
363
+
364
+
365
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
366
+
367
+ if __name__ == "__main__":
368
+ setup_environment()
369
+
370
+ file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in REGIONS]
371
+ output_paths = [f'../../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_1_{region}.csv' for region in REGIONS]
372
+ model_save_dir = Path('../../save_model/oversampling_models')
373
+
374
+ for file_path, output_path in zip(file_paths, output_paths):
375
+ process_region(file_path, output_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_2.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
12
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
13
+ TRAIN_YEARS = [2018, 2020]
14
+ TARGET_SAMPLES_CLASS_0 = 10000
15
+ TARGET_SAMPLES_CLASS_1_BASE = 10000
16
+ RANDOM_STATE = 42
17
+
18
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
19
+ CLASS_0_TRIALS = 50
20
+ CLASS_1_TRIALS = 30
21
+
22
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
23
+ CLASS_0_HP_RANGES = {
24
+ 'embedding_dim': (64, 128),
25
+ 'generator_dim': [(64, 64), (128, 128)],
26
+ 'discriminator_dim': [(64, 64), (128, 128)],
27
+ 'pac': [4, 8],
28
+ 'batch_size': [64, 128, 256],
29
+ 'discriminator_steps': (1, 3)
30
+ }
31
+
32
+ CLASS_1_HP_RANGES = {
33
+ 'embedding_dim': (128, 512),
34
+ 'generator_dim': [(128, 128), (256, 256)],
35
+ 'discriminator_dim': [(128, 128), (256, 256)],
36
+ 'pac': [4, 8],
37
+ 'batch_size': [256, 512, 1024],
38
+ 'discriminator_steps': (1, 5)
39
+ }
40
+
41
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
42
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
+ def setup_environment():
47
+ """ํ™˜๊ฒฝ ์„ค์ • (GPU, ๊ฒฝ๊ณ  ๋ฌด์‹œ)"""
48
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
+ print(f"Using device: {device}")
50
+ warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
51
+ return device
52
+
53
+
54
+ def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
55
+ """
56
+ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
57
+
58
+ Args:
59
+ file_path: ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
60
+ train_years: ํ•™์Šต์— ์‚ฌ์šฉํ•  ์—ฐ๋„ ๋ฆฌ์ŠคํŠธ
61
+
62
+ Returns:
63
+ (data, X, y): ์›๋ณธ ๋ฐ์ดํ„ฐ, ํŠน์ง• ๋ฐ์ดํ„ฐ, ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
64
+ """
65
+ data = pd.read_csv(file_path, index_col=0)
66
+ data = data.loc[data['year'].isin(train_years), :]
67
+ data['cloudcover'] = data['cloudcover'].astype('int')
68
+ data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
69
+
70
+ X = data.drop(columns=['multi_class', 'binary_class'])
71
+ y = data['multi_class']
72
+
73
+ # ๋ถˆํ•„์š”ํ•œ ์—ด ์ œ๊ฑฐ
74
+ X.drop(columns=COLUMNS_TO_DROP, inplace=True)
75
+
76
+ return data, X, y
77
+
78
+
79
+ def get_categorical_feature_indices(X: pd.DataFrame) -> list:
80
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ธ๋ฑ์Šค ๋ฐ˜ํ™˜"""
81
+ return [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']
82
+
83
+
84
+ def get_categorical_feature_names(df: pd.DataFrame) -> list:
85
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ด๋ฆ„ ๋ฐ˜ํ™˜"""
86
+ return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']
87
+
88
+
89
+ def calculate_sampling_strategy(y: pd.Series) -> dict:
90
+ """
91
+ SMOTENC๋ฅผ ์œ„ํ•œ sampling_strategy ๊ณ„์‚ฐ
92
+
93
+ Args:
94
+ y: ํƒ€๊ฒŸ ๋ณ€์ˆ˜
95
+
96
+ Returns:
97
+ sampling_strategy ๋”•์…”๋„ˆ๋ฆฌ
98
+ """
99
+ count_class_0 = (y == 0).sum()
100
+ count_class_1 = (y == 1).sum()
101
+ count_class_2 = (y == 2).sum()
102
+
103
+ return {
104
+ 0: 500 if count_class_0 <= 500 else 1000,
105
+ 1: int(np.ceil(count_class_1 / 100) * 100), # ๋ฐฑ์˜ ์ž๋ฆฌ๋กœ ์˜ฌ๋ฆผ
106
+ 2: count_class_2
107
+ }
108
+
109
+
110
+ def apply_smotenc(X: pd.DataFrame, y: pd.Series,
111
+ categorical_features_indices: list,
112
+ sampling_strategy: dict) -> pd.DataFrame:
113
+ """
114
+ SMOTENC ์ ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ•
115
+
116
+ Args:
117
+ X: ํŠน์ง• ๋ฐ์ดํ„ฐ
118
+ y: ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
119
+ categorical_features_indices: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ๋ฑ์Šค
120
+ sampling_strategy: ์ƒ˜ํ”Œ๋ง ์ „๋žต
121
+
122
+ Returns:
123
+ ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ (multi_class ํฌํ•จ)
124
+ """
125
+ smotenc = SMOTENC(
126
+ categorical_features=categorical_features_indices,
127
+ sampling_strategy=sampling_strategy,
128
+ random_state=RANDOM_STATE
129
+ )
130
+ X_resampled, y_resampled = smotenc.fit_resample(X, y)
131
+
132
+ resampled_data = X_resampled.copy()
133
+ resampled_data['multi_class'] = y_resampled
134
+
135
+ return resampled_data
136
+
137
+
138
+ def create_ctgan_objective(data: pd.DataFrame, class_label: int,
139
+ categorical_features: list,
140
+ hp_ranges: dict) -> callable:
141
+ """
142
+ Optuna ์ตœ์ ํ™”๋ฅผ ์œ„ํ•œ ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
143
+
144
+ Args:
145
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
146
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
147
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
148
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
149
+
150
+ Returns:
151
+ Optuna ๋ชฉ์  ํ•จ์ˆ˜
152
+ """
153
+ class_data = data[data['multi_class'] == class_label]
154
+
155
+ def objective(trial):
156
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์„ค์ •
157
+ embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
158
+ generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
159
+ discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
160
+ pac = trial.suggest_categorical("pac", hp_ranges['pac'])
161
+ batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
162
+ discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])
163
+
164
+ # CTGAN ๋ชจ๋ธ ์ƒ์„ฑ
165
+ ctgan = CTGAN(
166
+ embedding_dim=embedding_dim,
167
+ generator_dim=generator_dim,
168
+ discriminator_dim=discriminator_dim,
169
+ batch_size=batch_size,
170
+ discriminator_steps=discriminator_steps,
171
+ pac=pac
172
+ )
173
+ ctgan.set_random_state(RANDOM_STATE)
174
+
175
+ # ๋ชจ๋ธ ํ•™์Šต
176
+ ctgan.fit(class_data, discrete_columns=categorical_features)
177
+
178
+ # ์ƒ˜ํ”Œ ์ƒ์„ฑ
179
+ generated_data = ctgan.sample(len(class_data) * 2)
180
+
181
+ # ํ‰๊ฐ€: ์ƒ˜ํ”Œ์˜ ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„ํฌ ๋น„๊ต
182
+ real_visi = class_data['visi']
183
+ generated_visi = generated_data['visi']
184
+
185
+ # ๋ถ„ํฌ ๊ฐ„ ์ฐจ์ด(MSE) ๊ณ„์‚ฐ
186
+ mse = ((real_visi.mean() - generated_visi.mean())**2 +
187
+ (real_visi.std() - generated_visi.std())**2)
188
+ return -mse
189
+
190
+ return objective
191
+
192
+
193
+ def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
194
+ categorical_features: list,
195
+ hp_ranges: dict, n_trials: int,
196
+ target_samples: int) -> tuple:
197
+ """
198
+ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
199
+
200
+ Args:
201
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
202
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
203
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
204
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
205
+ n_trials: Optuna ์ตœ์ ํ™” ์‹œ๋„ ํšŸ์ˆ˜
206
+ target_samples: ์ƒ์„ฑํ•  ์ƒ˜ํ”Œ ์ˆ˜
207
+
208
+ Returns:
209
+ (์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„, ํ•™์Šต๋œ CTGAN ๋ชจ๋ธ)
210
+ """
211
+ # ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
212
+ objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)
213
+
214
+ # Optuna๋กœ ์ตœ์ ํ™” ์ˆ˜ํ–‰
215
+ study = optuna.create_study(direction="maximize")
216
+ study.optimize(objective, n_trials=n_trials)
217
+
218
+ # ์ตœ์  ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ CTGAN ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
219
+ best_params = study.best_params
220
+ ctgan = CTGAN(
221
+ embedding_dim=best_params["embedding_dim"],
222
+ generator_dim=best_params["generator_dim"],
223
+ discriminator_dim=best_params["discriminator_dim"],
224
+ batch_size=best_params["batch_size"],
225
+ discriminator_steps=best_params["discriminator_steps"],
226
+ pac=best_params["pac"]
227
+ )
228
+ ctgan.set_random_state(RANDOM_STATE)
229
+
230
+ # ์ตœ์ข… ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
231
+ class_data = data[data['multi_class'] == class_label]
232
+ ctgan.fit(class_data, discrete_columns=categorical_features)
233
+ generated_samples = ctgan.sample(target_samples)
234
+
235
+ return generated_samples, ctgan
236
+
237
+
238
+ def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
239
+ """
240
+ ์ œ๊ฑฐํ–ˆ๋˜ ํŒŒ์ƒ ๋ณ€์ˆ˜๋“ค์„ ๋ณต๊ตฌ
241
+
242
+ Args:
243
+ df: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
244
+
245
+ Returns:
246
+ ํŒŒ์ƒ ๋ณ€์ˆ˜๊ฐ€ ์ถ”๊ฐ€๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
247
+ """
248
+ df = df.copy()
249
+ df['binary_class'] = df['multi_class'].apply(lambda x: 0 if x == 2 else 1)
250
+ df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
251
+ df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
252
+ df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
253
+ df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
254
+ df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
255
+ return df
256
+
257
+
258
+ def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
259
+ """
260
+ ํŠน์ • ์ง€์—ญ์˜ ๋ฐ์ดํ„ฐ์— SMOTENC์™€ CTGAN์„ ์ˆœ์ฐจ์ ์œผ๋กœ ์ ์šฉ
261
+
262
+ Args:
263
+ file_path: ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
264
+ output_path: ์ถœ๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
265
+ model_save_dir: ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
266
+ """
267
+ # ์ง€์—ญ๋ช… ์ถ”์ถœ (ํŒŒ์ผ ๊ฒฝ๋กœ์—์„œ)
268
+ region_name = Path(file_path).stem.replace('_train', '')
269
+
270
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
271
+ original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)
272
+
273
+ # SMOTENC ์ ์šฉ
274
+ categorical_features_indices = get_categorical_feature_indices(X)
275
+ sampling_strategy = calculate_sampling_strategy(y)
276
+ smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)
277
+
278
+ # CTGAN์„ ์œ„ํ•œ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ์ถ”์ถœ
279
+ categorical_features = get_categorical_feature_names(smotenc_data)
280
+
281
+ # ํด๋ž˜์Šค๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜ ๊ณ„์‚ฐ
282
+ count_class_1 = (y == 1).sum()
283
+ target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)
284
+
285
+ # ํด๋ž˜์Šค 0์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
286
+ print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
287
+ generated_0, ctgan_model_0 = optimize_and_generate_samples(
288
+ smotenc_data, 0, categorical_features,
289
+ CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
290
+ )
291
+
292
+ # ํด๋ž˜์Šค 1์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
293
+ print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
294
+ generated_1, ctgan_model_1 = optimize_and_generate_samples(
295
+ smotenc_data, 1, categorical_features,
296
+ CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
297
+ )
298
+
299
+ # ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
300
+ model_save_dir.mkdir(parents=True, exist_ok=True)
301
+
302
+ # ํด๋ž˜์Šค 0 ๋ชจ๋ธ ์ €์žฅ
303
+ model_path_0 = model_save_dir / f'smotenc_ctgan_10000_2_{region_name}_class0.pkl'
304
+ ctgan_model_0.save(str(model_path_0))
305
+ print(f"Saved CTGAN model for class 0: {model_path_0}")
306
+
307
+ # ํด๋ž˜์Šค 1 ๋ชจ๋ธ ์ €์žฅ
308
+ model_path_1 = model_save_dir / f'smotenc_ctgan_10000_2_{region_name}_class1.pkl'
309
+ ctgan_model_1.save(str(model_path_1))
310
+ print(f"Saved CTGAN model for class 1: {model_path_1}")
311
+
312
+ # ํด๋ž˜์Šค๋ณ„ ๊ฐ€์‹œ๋„ ๋ฒ”์œ„๋กœ ํ•„ํ„ฐ๋ง
313
+ well_generated_0 = generated_0[
314
+ (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
315
+ ]
316
+ well_generated_1 = generated_1[
317
+ (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
318
+ ]
319
+
320
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ์ถ”์ถœ (SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„ + CTGAN์œผ๋กœ ์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ)
321
+ # smotenc_data์˜ ์ฒ˜์Œ len(X)๊ฐœ๋Š” ์›๋ณธ ๋ฐ์ดํ„ฐ์ด๋ฏ€๋กœ ์ œ์™ธ
322
+ original_data_count = len(X)
323
+ smotenc_augmented = smotenc_data.iloc[original_data_count:].copy() # SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„๋งŒ
324
+
325
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๋ณ‘ํ•ฉ (SMOTENC ์ฆ๊ฐ• + CTGAN ์ฆ๊ฐ•)
326
+ augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
327
+ augmented_only = add_derived_features(augmented_only)
328
+ augmented_only.reset_index(drop=True, inplace=True)
329
+ # augmented_only ํด๋”์— ์ €์žฅ
330
+ output_path_obj = Path(output_path)
331
+ augmented_dir = output_path_obj.parent.parent / 'augmented_only'
332
+ augmented_dir.mkdir(parents=True, exist_ok=True)
333
+ augmented_output_path = augmented_dir / output_path_obj.name
334
+ augmented_only.to_csv(augmented_output_path, index=False)
335
+
336
+ # SMOTENC ๋ฐ์ดํ„ฐ์™€ ํ•„ํ„ฐ๋ง๋œ CTGAN ์ƒ˜ํ”Œ ๋ณ‘ํ•ฉ (์ตœ์ข… ๊ฒฐ๊ณผ์šฉ)
337
+ smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)
338
+
339
+ # ํŒŒ์ƒ ๋ณ€์ˆ˜ ์ถ”๊ฐ€
340
+ smote_gan_data = add_derived_features(smote_gan_data)
341
+
342
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๊ฒฐ๊ณผ ์ถœ๋ ฅ
343
+ aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
344
+ aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
345
+ print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")
346
+
347
+ # ํด๋ž˜์Šค 2 ์ œ๊ฑฐ ํ›„ ์›๋ณธ ํด๋ž˜์Šค 2 ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
348
+ filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
349
+ original_class_2 = original_data[original_data['multi_class'] == 2]
350
+ final_data = pd.concat([filtered_data, original_class_2], axis=0)
351
+ final_data.reset_index(drop=True, inplace=True)
352
+
353
+ # ์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
354
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
355
+
356
+ # ๊ฒฐ๊ณผ ์ €์žฅ
357
+ final_data.to_csv(output_path, index=False)
358
+
359
+ # ๊ฒฐ๊ณผ ์ถœ๋ ฅ
360
+ count_0 = len(final_data[final_data['multi_class'] == 0])
361
+ count_1 = len(final_data[final_data['multi_class'] == 1])
362
+ count_2 = len(final_data[final_data['multi_class'] == 2])
363
+ print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
367
+
368
+ if __name__ == "__main__":
369
+ setup_environment()
370
+
371
+ file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in REGIONS]
372
+ output_paths = [f'../../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_2_{region}.csv' for region in REGIONS]
373
+ model_save_dir = Path('../../save_model/oversampling_models')
374
+
375
+ for file_path, output_path in zip(file_paths, output_paths):
376
+ process_region(file_path, output_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_10000_3.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
import os
from pathlib import Path
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# ==================== Constants ====================
# Regions processed by this script (one train CSV per region).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years kept from each train CSV (this variant uses 2019-2020).
TRAIN_YEARS = [2019, 2020]
# Synthetic rows to draw from CTGAN for class 0.
TARGET_SAMPLES_CLASS_0 = 10000
# Base target for class 1; the SMOTENC contribution is subtracted later.
TARGET_SAMPLES_CLASS_1_BASE = 10000
RANDOM_STATE = 42

# Optuna optimization settings (number of trials per class)
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search ranges
# NOTE: generator_dim / discriminator_dim choices are tuples, which Optuna
# warns about for suggest_categorical; that warning is filtered in
# setup_environment().
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before oversampling (recreated afterwards by
# add_derived_features).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
def setup_environment():
    """Configure the runtime: pick a device and silence Optuna warnings.

    Returns:
        torch.device: CUDA device when available, otherwise CPU.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device: {device}")
    # Optuna emits UserWarnings for non-primitive categorical choices
    # (the tuple-valued dim search spaces); they are expected here.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
52
+
53
+
54
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a regional training CSV and split it into features and target.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years to keep for training.

    Returns:
        (data, X, y): filtered raw frame, feature frame, multi_class target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data[data['year'].isin(train_years)]

    # Cloud-cover columns are categorical codes; cast them to int so the
    # dtype-based categorical detection picks them up.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Targets and derived columns are excluded from the feature frame; the
    # derived columns are recreated after oversampling.
    X = data.drop(columns=['multi_class', 'binary_class']).drop(columns=COLUMNS_TO_DROP)

    return data, X, y
77
+
78
+
79
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of non-float (categorical) columns."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
82
+
83
+
84
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of non-float (categorical) columns."""
    is_categorical = df.dtypes != 'float64'
    return list(df.columns[is_categorical])
87
+
88
+
89
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the SMOTENC ``sampling_strategy`` dict from class counts.

    Class 0 is oversampled to 500 (or 1000 when it already exceeds 500),
    class 1 is rounded up to the next hundred, and class 2 keeps its
    original count.

    SMOTENC raises a ValueError when a target is below the existing class
    count, so each minority target is clamped with ``max`` to the observed
    count — a no-op for the small minority classes this script expects,
    but it prevents a crash on unexpectedly large ones.

    Args:
        y: Target series with labels {0, 1, 2}.

    Returns:
        Mapping of class label -> desired sample count.
    """
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())

    target_0 = 500 if count_class_0 <= 500 else 1000
    target_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred

    return {
        0: max(target_0, count_class_0),
        1: max(target_1, count_class_1),  # ceil already >= count; max is defensive
        2: count_class_2
    }
108
+
109
+
110
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample the minority classes with SMOTENC.

    Args:
        X: Feature frame.
        y: Target series (multi_class labels).
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Class label -> desired sample count.

    Returns:
        Resampled feature frame with a ``multi_class`` column appended.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    features, labels = sampler.fit_resample(X, y)

    out = features.copy()
    out['multi_class'] = labels
    return out
136
+
137
+
138
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    Each trial trains a CTGAN on the rows of one class, samples twice the
    class size, and returns the negated squared distance between the real
    and generated ``visi`` mean/std (higher is better under ``maximize``).

    Args:
        data: SMOTENC-augmented training frame.
        class_label: Class to model (0 or 1).
        categorical_features: Names of discrete columns for CTGAN.
        hp_ranges: Search space for each hyperparameter.

    Returns:
        A function suitable for ``study.optimize``.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample one candidate configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        # Train a CTGAN with this configuration on the class subset.
        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(class_data, discrete_columns=categorical_features)

        # Generate twice the class size for the distribution comparison.
        synthetic = model.sample(len(class_data) * 2)

        # Score: squared distance between real and generated 'visi'
        # mean/std, negated so Optuna can maximize.
        real = class_data['visi']
        fake = synthetic['visi']
        distance = ((real.mean() - fake.mean()) ** 2 +
                    (real.std() - fake.std()) ** 2)
        return -distance

    return objective
191
+
192
+
193
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best config, and sample.

    Args:
        data: SMOTENC-augmented training frame.
        class_label: Class to model (0 or 1).
        categorical_features: Names of discrete columns for CTGAN.
        hp_ranges: Search space per hyperparameter.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to generate.

    Returns:
        (generated sample frame, fitted CTGAN model).
    """
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Hyperparameter search.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Rebuild the model with the winning hyperparameters.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"]
    )
    model.set_random_state(RANDOM_STATE)

    # Final fit on the requested class only, then draw the target count.
    class_rows = data[data['multi_class'] == class_label]
    model.fit(class_rows, discrete_columns=categorical_features)
    return model.sample(target_samples), model
236
+
237
+
238
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns dropped before oversampling.

    Adds the binary target, cyclical hour/month encodings, and the
    ground-air temperature difference.

    Args:
        df: Frame with 'multi_class', 'hour', 'month', 'groundtemp' and
            'temp_C' columns.

    Returns:
        Copy of *df* with the derived columns appended.
    """
    out = df.copy()
    # Class 2 (the majority) maps to binary 0; classes 0/1 map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    # Cyclical encodings keep hour 23 adjacent to hour 0 (and Dec to Jan).
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
256
+
257
+
258
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Apply SMOTENC and CTGAN sequentially to one region's training data.

    Pipeline:
      1. Load the train CSV and filter to TRAIN_YEARS.
      2. Oversample minority classes 0/1 with SMOTENC.
      3. Tune (Optuna) and fit one CTGAN per minority class; save models.
      4. Keep only synthetic rows inside each class's visibility band.
      5. Save the augmented-only rows and the merged final dataset.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the final dataset.
        model_save_dir: Directory where fitted CTGAN models are saved.
    """
    # Region name from the file path, e.g. 'seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the data
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Apply SMOTENC
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN's discrete_columns argument
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Class-1 CTGAN target = base target minus the SMOTENC contribution
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # Optimize CTGAN and generate samples for class 0
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize CTGAN and generate samples for class 1
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Create the model save directory
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model
    model_path_0 = model_save_dir / f'smotenc_ctgan_10000_3_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model
    model_path_1 = model_save_dir / f'smotenc_ctgan_10000_3_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only synthetic rows within each class's visibility band
    # (class 0: visi in [0, 100); class 1: visi in [100, 500)).
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Extract only the augmented rows (SMOTENC additions + CTGAN samples).
    # The first len(X) rows of smotenc_data are the original data, so skip them.
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC additions only

    # Merge only the augmented rows (SMOTENC + CTGAN)
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save into the sibling 'augmented_only' folder
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge SMOTENC data with the filtered CTGAN samples (for the final output)
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Restore the derived columns
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report the augmented-only result
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Drop class 2 (possibly touched by derived-feature recomputation),
    # then append the untouched original class-2 rows
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Create the output directory
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the result
    final_data.to_csv(output_path, index=False)

    # Report the final class counts
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # One train/output path pair per region, processed sequentially.
    for region in REGIONS:
        train_path = f'../../../data/data_for_modeling/{region}_train.csv'
        result_path = f'../../../data/data_oversampled/smotenc_ctgan10000/smotenc_ctgan10000_3_{region}.csv'
        process_region(train_path, result_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_1.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from pathlib import Path
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# ==================== Constants ====================
# Regions processed by this script (one train CSV per region).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years kept from each train CSV (this variant uses 2018-2019).
TRAIN_YEARS = [2018, 2019]
# Synthetic rows to draw from CTGAN for class 0.
TARGET_SAMPLES_CLASS_0 = 20000
# Base target for class 1; the SMOTENC contribution is subtracted later.
TARGET_SAMPLES_CLASS_1_BASE = 20000
RANDOM_STATE = 42

# Optuna optimization settings (number of trials per class)
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class hyperparameter search ranges
# NOTE: generator_dim / discriminator_dim choices are tuples, which Optuna
# warns about for suggest_categorical; that warning is filtered in
# setup_environment().
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before oversampling (recreated afterwards by
# add_derived_features).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
42
+
43
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
44
+
45
def setup_environment():
    """Configure the runtime: pick a device and silence Optuna warnings.

    Returns:
        torch.device: CUDA device when available, otherwise CPU.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device: {device}")
    # Optuna emits UserWarnings for non-primitive categorical choices
    # (the tuple-valued dim search spaces); they are expected here.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
51
+
52
+
53
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a regional training CSV and split it into features and target.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years to keep for training.

    Returns:
        (data, X, y): filtered raw frame, feature frame, multi_class target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data[data['year'].isin(train_years)]

    # Cloud-cover columns are categorical codes; cast them to int so the
    # dtype-based categorical detection picks them up.
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    # Targets and derived columns are excluded from the feature frame; the
    # derived columns are recreated after oversampling.
    X = data.drop(columns=['multi_class', 'binary_class']).drop(columns=COLUMNS_TO_DROP)

    return data, X, y
76
+
77
+
78
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of non-float (categorical) columns."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
81
+
82
+
83
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of non-float (categorical) columns."""
    is_categorical = df.dtypes != 'float64'
    return list(df.columns[is_categorical])
86
+
87
+
88
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the SMOTENC ``sampling_strategy`` dict from class counts.

    Class 0 is oversampled to 500 (or 1000 when it already exceeds 500),
    class 1 is rounded up to the next hundred, and class 2 keeps its
    original count.

    SMOTENC raises a ValueError when a target is below the existing class
    count, so each minority target is clamped with ``max`` to the observed
    count — a no-op for the small minority classes this script expects,
    but it prevents a crash on unexpectedly large ones.

    Args:
        y: Target series with labels {0, 1, 2}.

    Returns:
        Mapping of class label -> desired sample count.
    """
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())

    target_0 = 500 if count_class_0 <= 500 else 1000
    target_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred

    return {
        0: max(target_0, count_class_0),
        1: max(target_1, count_class_1),  # ceil already >= count; max is defensive
        2: count_class_2
    }
107
+
108
+
109
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample the minority classes with SMOTENC.

    Args:
        X: Feature frame.
        y: Target series (multi_class labels).
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Class label -> desired sample count.

    Returns:
        Resampled feature frame with a ``multi_class`` column appended.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    features, labels = sampler.fit_resample(X, y)

    out = features.copy()
    out['multi_class'] = labels
    return out
135
+
136
+
137
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    Each trial trains a CTGAN on the rows of one class, samples twice the
    class size, and returns the negated squared distance between the real
    and generated ``visi`` mean/std (higher is better under ``maximize``).

    Args:
        data: SMOTENC-augmented training frame.
        class_label: Class to model (0 or 1).
        categorical_features: Names of discrete columns for CTGAN.
        hp_ranges: Search space for each hyperparameter.

    Returns:
        A function suitable for ``study.optimize``.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample one candidate configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        # Train a CTGAN with this configuration on the class subset.
        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(class_data, discrete_columns=categorical_features)

        # Generate twice the class size for the distribution comparison.
        synthetic = model.sample(len(class_data) * 2)

        # Score: squared distance between real and generated 'visi'
        # mean/std, negated so Optuna can maximize.
        real = class_data['visi']
        fake = synthetic['visi']
        distance = ((real.mean() - fake.mean()) ** 2 +
                    (real.std() - fake.std()) ** 2)
        return -distance

    return objective
190
+
191
+
192
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best config, and sample.

    Args:
        data: SMOTENC-augmented training frame.
        class_label: Class to model (0 or 1).
        categorical_features: Names of discrete columns for CTGAN.
        hp_ranges: Search space per hyperparameter.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to generate.

    Returns:
        (generated sample frame, fitted CTGAN model).
    """
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Hyperparameter search.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Rebuild the model with the winning hyperparameters.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"]
    )
    model.set_random_state(RANDOM_STATE)

    # Final fit on the requested class only, then draw the target count.
    class_rows = data[data['multi_class'] == class_label]
    model.fit(class_rows, discrete_columns=categorical_features)
    return model.sample(target_samples), model
235
+
236
+
237
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns dropped before oversampling.

    Adds the binary target, cyclical hour/month encodings, and the
    ground-air temperature difference.

    Args:
        df: Frame with 'multi_class', 'hour', 'month', 'groundtemp' and
            'temp_C' columns.

    Returns:
        Copy of *df* with the derived columns appended.
    """
    out = df.copy()
    # Class 2 (the majority) maps to binary 0; classes 0/1 map to 1.
    out['binary_class'] = out['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    # Cyclical encodings keep hour 23 adjacent to hour 0 (and Dec to Jan).
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
255
+
256
+
257
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Apply SMOTENC and CTGAN sequentially to one region's training data.

    Pipeline:
      1. Load the train CSV and filter to TRAIN_YEARS.
      2. Oversample minority classes 0/1 with SMOTENC.
      3. Tune (Optuna) and fit one CTGAN per minority class; save models.
      4. Keep only synthetic rows inside each class's visibility band.
      5. Save the augmented-only rows and the merged final dataset.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the final dataset.
        model_save_dir: Directory where fitted CTGAN models are saved.
    """
    # Region name from the file path, e.g. 'seoul_train.csv' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the data
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Apply SMOTENC
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN's discrete_columns argument
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Class-1 CTGAN target = base target minus the SMOTENC contribution
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # Optimize CTGAN and generate samples for class 0
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize CTGAN and generate samples for class 1
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Create the model save directory
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model
    model_path_0 = model_save_dir / f'smotenc_ctgan_20000_1_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model
    model_path_1 = model_save_dir / f'smotenc_ctgan_20000_1_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only synthetic rows within each class's visibility band
    # (class 0: visi in [0, 100); class 1: visi in [100, 500)).
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Extract only the augmented rows (SMOTENC additions + CTGAN samples).
    # The first len(X) rows of smotenc_data are the original data, so skip them.
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC additions only

    # Merge only the augmented rows (SMOTENC + CTGAN)
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save into the sibling 'augmented_only' folder
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge SMOTENC data with the filtered CTGAN samples (for the final output)
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Restore the derived columns
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report the augmented-only result
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Drop class 2 (possibly touched by derived-feature recomputation),
    # then append the untouched original class-2 rows
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Create the output directory
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the result
    final_data.to_csv(output_path, index=False)

    # Report the final class counts
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
363
+
364
+
365
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')

    # One train/output path pair per region, processed sequentially.
    for region in REGIONS:
        train_path = f'../../../data/data_for_modeling/{region}_train.csv'
        result_path = f'../../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_1_{region}.csv'
        process_region(train_path, result_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_2.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
# ==================== Constants ====================
# Regions whose train CSVs are processed (one output per region).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years kept from each train file before oversampling.
TRAIN_YEARS = [2018, 2020]
# Total CTGAN sample targets per minority class.
TARGET_SAMPLES_CLASS_0 = 20000
TARGET_SAMPLES_CLASS_1_BASE = 20000
RANDOM_STATE = 42

# Optuna optimization settings (trials per class).
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class CTGAN hyperparameter search ranges.
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before oversampling (recreated later by add_derived_features).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
def setup_environment():
    """Report the compute device and silence Optuna distribution warnings.

    Returns:
        torch.device: "cuda" when available, otherwise "cpu".
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
52
+
53
+
54
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a region's train CSV and split it into features and target.

    Rows are restricted to *train_years*, cloud-cover columns are cast to int,
    and the derived columns in COLUMNS_TO_DROP are removed from the features.

    Args:
        file_path: Path to the train CSV (first column is the index).
        train_years: Years to keep from the 'year' column.

    Returns:
        tuple: (data, X, y) — filtered full frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    # Cloud cover is ordinal; make sure it is integer-typed so it is treated
    # as categorical downstream (dtype != float64).
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns; they are recomputed after oversampling.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
77
+
78
+
79
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of columns treated as categorical (dtype != float64)."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
82
+
83
+
84
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of columns treated as categorical (dtype != float64)."""
    return [name for name, dtype in df.dtypes.items() if dtype != 'float64']
87
+
88
+
89
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the SMOTENC sampling_strategy dict from current class counts.

    Class 0 is boosted to 500 (or 1000 when it already exceeds 500 samples),
    class 1 is rounded up to the next multiple of 100, and class 2 keeps its
    original count (no oversampling).

    Args:
        y: Target series with labels 0, 1, 2.

    Returns:
        dict: Per-class target sample counts for SMOTENC.
    """
    counts = {label: (y == label).sum() for label in (0, 1, 2)}

    target_class_0 = 500 if counts[0] <= 500 else 1000
    target_class_1 = int(np.ceil(counts[1] / 100) * 100)  # round up to hundreds

    return {0: target_class_0, 1: target_class_1, 2: counts[2]}
108
+
109
+
110
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample with SMOTENC and return one frame with the label attached.

    Args:
        X: Feature frame.
        y: Target series ('multi_class').
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Per-class target counts (see calculate_sampling_strategy).

    Returns:
        pd.DataFrame: Resampled features plus a 'multi_class' column.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    X_res, y_res = sampler.fit_resample(X, y)

    augmented = X_res.copy()
    augmented['multi_class'] = y_res

    return augmented
136
+
137
+
138
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that trains a CTGAN on one class.

    The objective fits a CTGAN on the rows of *data* with the given class
    label, samples twice as many synthetic rows, and scores how closely the
    synthetic 'visi' distribution matches the real one (mean and std).
    Higher is better (Optuna maximizes), so the penalty is negated.

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names passed as discrete_columns.
        hp_ranges: Hyperparameter search space for this class.

    Returns:
        callable: Objective function for optuna.Study.optimize.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Draw one hyperparameter configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        # Train a candidate CTGAN with this configuration.
        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(class_data, discrete_columns=categorical_features)

        # Oversample to stabilize the distribution estimate.
        synthetic = model.sample(len(class_data) * 2)

        # Fidelity score on the key continuous feature 'visi':
        # squared gap in mean plus squared gap in spread.
        real_visi = class_data['visi']
        fake_visi = synthetic['visi']
        penalty = ((real_visi.mean() - fake_visi.mean()) ** 2 +
                   (real_visi.std() - fake_visi.std()) ** 2)
        return -penalty

    return objective
191
+
192
+
193
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best setup, and draw samples.

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names passed as discrete_columns.
        hp_ranges: Hyperparameter search space for this class.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        tuple: (generated samples DataFrame, fitted CTGAN model).
    """
    # Hyperparameter search.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Rebuild the model from the best trial's parameters.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"]
    )
    model.set_random_state(RANDOM_STATE)

    # Final fit on this class's rows, then sample.
    class_data = data[data['multi_class'] == class_label]
    model.fit(class_data, discrete_columns=categorical_features)
    generated_samples = model.sample(target_samples)

    return generated_samples, model
236
+
237
+
238
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns dropped before oversampling.

    Adds 'binary_class' (0 only for multi_class == 2, else 1), cyclic
    hour/month encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        pd.DataFrame: Copy of *df* with the derived columns appended.
    """
    out = df.copy()
    out['binary_class'] = np.where(out['multi_class'] == 2, 0, 1)
    # Cyclic encodings so 23h is adjacent to 0h and December to January.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
256
+
257
+
258
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Apply SMOTENC followed by per-class CTGAN augmentation to one region.

    Pipeline: load/filter the train CSV -> SMOTENC oversampling of classes
    0 and 1 -> CTGAN tuning and sampling per class -> filter samples by
    'visi' range -> save (a) the augmented-only rows and (b) the merged
    training set with untouched class-2 originals.

    Args:
        file_path: Input train CSV path (region name is derived from it).
        output_path: Destination CSV for the merged result.
        model_save_dir: Directory where the fitted CTGAN models are saved.
    """
    # Region name from the file stem, e.g. 'seoul_train' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess data.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # SMOTENC oversampling.
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN's discrete_columns argument.
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Class-1 CTGAN quota: base target minus what SMOTENC already produced
    # (class-1 SMOTENC target is the class count rounded up to the next 100).
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # CTGAN optimization and sample generation for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # CTGAN optimization and sample generation for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the fitted CTGAN models.
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model.
    model_path_0 = model_save_dir / f'smotenc_ctgan_20000_2_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model.
    model_path_1 = model_save_dir / f'smotenc_ctgan_20000_2_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in the class's expected range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # SMOTENC keeps the original rows first, so everything past len(X) is synthetic.
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC-synthetic rows only

    # Augmented-only output: SMOTENC-synthetic rows + filtered CTGAN samples.
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save under a sibling 'augmented_only' directory using the same file name.
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Full merged set: SMOTENC output (originals included) + filtered CTGAN samples.
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Recreate derived columns dropped before oversampling.
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report counts of the augmented-only data.
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Replace any resampled class-2 rows with the untouched originals.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Ensure the output directory exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the merged result.
    final_data.to_csv(output_path, index=False)

    # Report final class counts.
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')
    # Process each region's train file into its oversampled counterpart.
    for region in REGIONS:
        train_path = f'../../../data/data_for_modeling/{region}_train.csv'
        result_path = f'../../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_2_{region}.csv'
        process_region(train_path, result_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_20000_3.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
# ==================== Constants ====================
# Regions whose train CSVs are processed (one output per region).
REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
# Years kept from each train file before oversampling.
TRAIN_YEARS = [2019, 2020]
# Total CTGAN sample targets per minority class.
TARGET_SAMPLES_CLASS_0 = 20000
TARGET_SAMPLES_CLASS_1_BASE = 20000
RANDOM_STATE = 42

# Optuna optimization settings (trials per class).
CLASS_0_TRIALS = 50
CLASS_1_TRIALS = 30

# Per-class CTGAN hyperparameter search ranges.
CLASS_0_HP_RANGES = {
    'embedding_dim': (64, 128),
    'generator_dim': [(64, 64), (128, 128)],
    'discriminator_dim': [(64, 64), (128, 128)],
    'pac': [4, 8],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3)
}

CLASS_1_HP_RANGES = {
    'embedding_dim': (128, 512),
    'generator_dim': [(128, 128), (256, 256)],
    'discriminator_dim': [(128, 128), (256, 256)],
    'pac': [4, 8],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5)
}

# Derived columns dropped before oversampling (recreated later by add_derived_features).
COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
def setup_environment():
    """Report the compute device and silence Optuna distribution warnings.

    Returns:
        torch.device: "cuda" when available, otherwise "cpu".
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
52
+
53
+
54
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load a region's train CSV and split it into features and target.

    Rows are restricted to *train_years*, cloud-cover columns are cast to int,
    and the derived columns in COLUMNS_TO_DROP are removed from the features.

    Args:
        file_path: Path to the train CSV (first column is the index).
        train_years: Years to keep from the 'year' column.

    Returns:
        tuple: (data, X, y) — filtered full frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    # Cloud cover is ordinal; make sure it is integer-typed so it is treated
    # as categorical downstream (dtype != float64).
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns; they are recomputed after oversampling.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
77
+
78
+
79
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of columns treated as categorical (dtype != float64)."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
82
+
83
+
84
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of columns treated as categorical (dtype != float64)."""
    return [name for name, dtype in df.dtypes.items() if dtype != 'float64']
87
+
88
+
89
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the SMOTENC sampling_strategy dict from current class counts.

    Class 0 is boosted to 500 (or 1000 when it already exceeds 500 samples),
    class 1 is rounded up to the next multiple of 100, and class 2 keeps its
    original count (no oversampling).

    Args:
        y: Target series with labels 0, 1, 2.

    Returns:
        dict: Per-class target sample counts for SMOTENC.
    """
    counts = {label: (y == label).sum() for label in (0, 1, 2)}

    target_class_0 = 500 if counts[0] <= 500 else 1000
    target_class_1 = int(np.ceil(counts[1] / 100) * 100)  # round up to hundreds

    return {0: target_class_0, 1: target_class_1, 2: counts[2]}
108
+
109
+
110
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample with SMOTENC and return one frame with the label attached.

    Args:
        X: Feature frame.
        y: Target series ('multi_class').
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Per-class target counts (see calculate_sampling_strategy).

    Returns:
        pd.DataFrame: Resampled features plus a 'multi_class' column.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    X_res, y_res = sampler.fit_resample(X, y)

    augmented = X_res.copy()
    augmented['multi_class'] = y_res

    return augmented
136
+
137
+
138
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that trains a CTGAN on one class.

    The objective fits a CTGAN on the rows of *data* with the given class
    label, samples twice as many synthetic rows, and scores how closely the
    synthetic 'visi' distribution matches the real one (mean and std).
    Higher is better (Optuna maximizes), so the penalty is negated.

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names passed as discrete_columns.
        hp_ranges: Hyperparameter search space for this class.

    Returns:
        callable: Objective function for optuna.Study.optimize.
    """
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Draw one hyperparameter configuration from the search space.
        params = {
            'embedding_dim': trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim']),
            'generator_dim': trial.suggest_categorical("generator_dim", hp_ranges['generator_dim']),
            'discriminator_dim': trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim']),
            'pac': trial.suggest_categorical("pac", hp_ranges['pac']),
            'batch_size': trial.suggest_categorical("batch_size", hp_ranges['batch_size']),
            'discriminator_steps': trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps']),
        }

        # Train a candidate CTGAN with this configuration.
        model = CTGAN(**params)
        model.set_random_state(RANDOM_STATE)
        model.fit(class_data, discrete_columns=categorical_features)

        # Oversample to stabilize the distribution estimate.
        synthetic = model.sample(len(class_data) * 2)

        # Fidelity score on the key continuous feature 'visi':
        # squared gap in mean plus squared gap in spread.
        real_visi = class_data['visi']
        fake_visi = synthetic['visi']
        penalty = ((real_visi.mean() - fake_visi.mean()) ** 2 +
                   (real_visi.std() - fake_visi.std()) ** 2)
        return -penalty

    return objective
191
+
192
+
193
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN with Optuna, retrain with the best setup, and draw samples.

    Args:
        data: Training data containing a 'multi_class' column.
        class_label: Class to model (0 or 1).
        categorical_features: Column names passed as discrete_columns.
        hp_ranges: Hyperparameter search space for this class.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        tuple: (generated samples DataFrame, fitted CTGAN model).
    """
    # Hyperparameter search.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Rebuild the model from the best trial's parameters.
    best = study.best_params
    model = CTGAN(
        embedding_dim=best["embedding_dim"],
        generator_dim=best["generator_dim"],
        discriminator_dim=best["discriminator_dim"],
        batch_size=best["batch_size"],
        discriminator_steps=best["discriminator_steps"],
        pac=best["pac"]
    )
    model.set_random_state(RANDOM_STATE)

    # Final fit on this class's rows, then sample.
    class_data = data[data['multi_class'] == class_label]
    model.fit(class_data, discrete_columns=categorical_features)
    generated_samples = model.sample(target_samples)

    return generated_samples, model
236
+
237
+
238
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Recreate the derived columns dropped before oversampling.

    Adds 'binary_class' (0 only for multi_class == 2, else 1), cyclic
    hour/month encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month', 'groundtemp', 'temp_C'.

    Returns:
        pd.DataFrame: Copy of *df* with the derived columns appended.
    """
    out = df.copy()
    out['binary_class'] = np.where(out['multi_class'] == 2, 0, 1)
    # Cyclic encodings so 23h is adjacent to 0h and December to January.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
256
+
257
+
258
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """
    Apply SMOTENC followed by per-class CTGAN augmentation to one region.

    Pipeline: load/filter the train CSV -> SMOTENC oversampling of classes
    0 and 1 -> CTGAN tuning and sampling per class -> filter samples by
    'visi' range -> save (a) the augmented-only rows and (b) the merged
    training set with untouched class-2 originals.

    Args:
        file_path: Input train CSV path (region name is derived from it).
        output_path: Destination CSV for the merged result.
        model_save_dir: Directory where the fitted CTGAN models are saved.
    """
    # Region name from the file stem, e.g. 'seoul_train' -> 'seoul'.
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess data.
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # SMOTENC oversampling.
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN's discrete_columns argument.
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Class-1 CTGAN quota: base target minus what SMOTENC already produced
    # (class-1 SMOTENC target is the class count rounded up to the next 100).
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # CTGAN optimization and sample generation for class 0.
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # CTGAN optimization and sample generation for class 1.
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Persist the fitted CTGAN models.
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model.
    model_path_0 = model_save_dir / f'smotenc_ctgan_20000_3_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model.
    model_path_1 = model_save_dir / f'smotenc_ctgan_20000_3_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in the class's expected range.
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # SMOTENC keeps the original rows first, so everything past len(X) is synthetic.
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC-synthetic rows only

    # Augmented-only output: SMOTENC-synthetic rows + filtered CTGAN samples.
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save under a sibling 'augmented_only' directory using the same file name.
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Full merged set: SMOTENC output (originals included) + filtered CTGAN samples.
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Recreate derived columns dropped before oversampling.
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report counts of the augmented-only data.
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Replace any resampled class-2 rows with the untouched originals.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Ensure the output directory exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the merged result.
    final_data.to_csv(output_path, index=False)

    # Report final class counts.
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
# ==================== Main ====================

if __name__ == "__main__":
    setup_environment()

    model_save_dir = Path('../../save_model/oversampling_models')
    # Process each region's train file into its oversampled counterpart.
    for region in REGIONS:
        train_path = f'../../../data/data_for_modeling/{region}_train.csv'
        result_path = f'../../../data/data_oversampled/smotenc_ctgan20000/smotenc_ctgan20000_3_{region}.csv'
        process_region(train_path, result_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_1.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
12
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
13
+ TRAIN_YEARS = [2018, 2019]
14
+ TARGET_SAMPLES_CLASS_0 = 7000
15
+ TARGET_SAMPLES_CLASS_1_BASE = 7000
16
+ RANDOM_STATE = 42
17
+
18
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
19
+ CLASS_0_TRIALS = 50
20
+ CLASS_1_TRIALS = 30
21
+
22
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
23
+ CLASS_0_HP_RANGES = {
24
+ 'embedding_dim': (64, 128),
25
+ 'generator_dim': [(64, 64), (128, 128)],
26
+ 'discriminator_dim': [(64, 64), (128, 128)],
27
+ 'pac': [4, 8],
28
+ 'batch_size': [64, 128, 256],
29
+ 'discriminator_steps': (1, 3)
30
+ }
31
+
32
+ CLASS_1_HP_RANGES = {
33
+ 'embedding_dim': (128, 512),
34
+ 'generator_dim': [(128, 128), (256, 256)],
35
+ 'discriminator_dim': [(128, 128), (256, 256)],
36
+ 'pac': [4, 8],
37
+ 'batch_size': [256, 512, 1024],
38
+ 'discriminator_steps': (1, 5)
39
+ }
40
+
41
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
42
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
def setup_environment():
    """Pick the compute device and silence noisy Optuna warnings.

    Returns:
        torch.device: CUDA device when available, otherwise CPU.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
52
+
53
+
54
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load one region's CSV and split it into features and target.

    Keeps only rows whose 'year' is in ``train_years``, casts the two
    cloud-cover columns to int, and drops the derived columns listed in
    the module-level ``COLUMNS_TO_DROP`` from the feature frame.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years to keep for training.

    Returns:
        (data, X, y): filtered frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    X = data.drop(columns=['multi_class', 'binary_class'])

    # Remove derived columns; they are rebuilt later by add_derived_features.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
77
+
78
+
79
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of non-float64 (categorical) columns."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
82
+
83
+
84
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of non-float64 (categorical) columns."""
    return [name for name, dtype in df.dtypes.items() if dtype != 'float64']
87
+
88
+
89
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the per-class target counts for SMOTENC's ``sampling_strategy``.

    SMOTENC requires each target to be >= the current class count, so the
    class-0 target is clamped with ``max`` (the original hard-coded 1000
    raised ValueError whenever class 0 already had more than 1000 rows).

    Args:
        y: Target variable ('multi_class').

    Returns:
        dict: {class label: target sample count} where
            - class 0 is brought up to 500 (small classes) or 1000,
              never below its current count,
            - class 1 is rounded up to the next multiple of 100,
            - class 2 (the majority) is left unchanged.
    """
    count_class_0 = (y == 0).sum()
    count_class_1 = (y == 1).sum()
    count_class_2 = (y == 2).sum()

    target_class_0 = 500 if count_class_0 <= 500 else 1000

    return {
        # Clamp so the target is never below the existing count.
        0: max(count_class_0, target_class_0),
        1: int(np.ceil(count_class_1 / 100) * 100),  # round up to hundreds
        2: count_class_2
    }
108
+
109
+
110
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample the minority classes with SMOTENC.

    Args:
        X: Feature frame.
        y: Target series ('multi_class').
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Per-class target counts.

    Returns:
        pd.DataFrame: Resampled features with a 'multi_class' column appended.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    features_out, labels_out = sampler.fit_resample(X, y)

    augmented = features_out.copy()
    augmented['multi_class'] = labels_out
    return augmented
136
+
137
+
138
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    The objective trains a CTGAN on the rows of one class and returns the
    negative squared error between real and generated 'visi' mean/std
    (higher is better, so the study should maximize).

    Args:
        data: Training data (must contain 'multi_class' and 'visi').
        class_label: Class to model (0 or 1).
        categorical_features: Discrete column names for CTGAN.
        hp_ranges: Hyperparameter search space (see CLASS_*_HP_RANGES).

    Returns:
        Optuna objective function taking a single ``trial`` argument.
    """
    # Fix the class subset once; the closure reuses it for every trial.
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample hyperparameters from the configured search space.
        # NOTE: keep the suggest-call order stable for reproducible studies.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        # Build the CTGAN model for this trial.
        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        # Train on this class's rows only.
        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Draw twice as many samples as real rows for a stabler estimate.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Evaluate: compare the real vs generated 'visi' distribution.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']

        # Squared error of mean and std only — a coarse two-moment proxy,
        # not a full distribution distance.
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        return -mse

    return objective
191
+
192
+
193
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN hyperparameters with Optuna, then generate samples.

    Args:
        data: Training data (SMOTENC output, includes 'multi_class').
        class_label: Class to model (0 or 1).
        categorical_features: Discrete column names for CTGAN.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    # Build the per-class objective.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Run the search; the objective returns -error, hence "maximize".
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Re-instantiate CTGAN with the best hyperparameters found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    # Final fit on this class's rows, then sample the requested count.
    # NOTE(review): a non-positive target_samples would be passed straight
    # to ctgan.sample — confirm upstream guarantees target_samples > 0.
    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan
236
+
237
+
238
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Re-create the derived columns dropped before oversampling.

    Adds 'binary_class' (0 only for multi_class == 2), cyclical hour/month
    encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month',
            'groundtemp' and 'temp_C'.

    Returns:
        pd.DataFrame: Copy of ``df`` with the derived columns added.
    """
    out = df.copy()
    out['binary_class'] = out['multi_class'].map(lambda label: 0 if label == 2 else 1)

    # Cyclical encodings so 23h/0h and Dec/Jan sit next to each other.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
256
+
257
+
258
+
259
+
260
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Apply SMOTENC followed by CTGAN oversampling to one region's data.

    Pipeline: load/filter the training CSV, oversample minority classes with
    SMOTENC, tune and fit one CTGAN per minority class (0 and 1), keep only
    generated rows whose 'visi' falls in each class's expected range, then
    save (a) the augmented-only rows and (b) the final merged dataset.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged dataset.
        model_save_dir: Directory where the trained CTGAN models are saved.
    """
    # Region name from the file stem (e.g. 'seoul_train' -> 'seoul')
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the data
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Apply SMOTENC
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Per-class sample counts.
    # NOTE(review): if class 1 already has >= 7000 rows this difference is
    # <= 0 and is passed to ctgan.sample unchecked — confirm upstream data.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # Optimize CTGAN and generate samples for class 0
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize CTGAN and generate samples for class 1
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Create the model save directory
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model
    model_path_0 = model_save_dir / f'smotenc_ctgan_7000_1_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model
    model_path_1 = model_save_dir / f'smotenc_ctgan_7000_1_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in each class's expected range
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Extract only augmented rows (SMOTENC additions + CTGAN samples);
    # the first len(X) rows of smotenc_data are the originals, so skip them
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC-added rows only

    # Merge the augmented-only data (SMOTENC additions + CTGAN samples)
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save into the sibling 'augmented_only' folder
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge SMOTENC data with the filtered CTGAN samples (final result)
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Restore the derived columns
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report counts for the augmented-only data
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Drop class 2, then append the original class-2 rows back
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Create the output directory
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the result
    final_data.to_csv(output_path, index=False)

    # Report final class counts
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
366
+
367
+
368
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
369
+
370
if __name__ == "__main__":
    setup_environment()

    # Pair each region's training CSV with its oversampled output path.
    io_pairs = [
        (f'../../../data/data_for_modeling/{region}_train.csv',
         f'../../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_1_{region}.csv')
        for region in REGIONS
    ]
    model_save_dir = Path('../../save_model/oversampling_models')

    for train_csv, oversampled_csv in io_pairs:
        process_region(train_csv, oversampled_csv, model_save_dir)
+ process_region(file_path, output_path, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_2.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
12
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
13
+ TRAIN_YEARS = [2018, 2020]
14
+ TARGET_SAMPLES_CLASS_0 = 7000
15
+ TARGET_SAMPLES_CLASS_1_BASE = 7000
16
+ RANDOM_STATE = 42
17
+
18
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
19
+ CLASS_0_TRIALS = 50
20
+ CLASS_1_TRIALS = 30
21
+
22
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
23
+ CLASS_0_HP_RANGES = {
24
+ 'embedding_dim': (64, 128),
25
+ 'generator_dim': [(64, 64), (128, 128)],
26
+ 'discriminator_dim': [(64, 64), (128, 128)],
27
+ 'pac': [4, 8],
28
+ 'batch_size': [64, 128, 256],
29
+ 'discriminator_steps': (1, 3)
30
+ }
31
+
32
+ CLASS_1_HP_RANGES = {
33
+ 'embedding_dim': (128, 512),
34
+ 'generator_dim': [(128, 128), (256, 256)],
35
+ 'discriminator_dim': [(128, 128), (256, 256)],
36
+ 'pac': [4, 8],
37
+ 'batch_size': [256, 512, 1024],
38
+ 'discriminator_steps': (1, 5)
39
+ }
40
+
41
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
42
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
def setup_environment():
    """Pick the compute device and silence noisy Optuna warnings.

    Returns:
        torch.device: CUDA device when available, otherwise CPU.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(f"Using device: {device}")
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
    return device
52
+
53
+
54
def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
    """Load one region's CSV and split it into features and target.

    Keeps only rows whose 'year' is in ``train_years``, casts the two
    cloud-cover columns to int, and drops the derived columns listed in
    the module-level ``COLUMNS_TO_DROP`` from the feature frame.

    Args:
        file_path: Path to the input CSV (first column is the index).
        train_years: Years to keep for training.

    Returns:
        (data, X, y): filtered frame, feature frame, 'multi_class' target.
    """
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(train_years), :]
    for col in ('cloudcover', 'lm_cloudcover'):
        data[col] = data[col].astype('int')

    y = data['multi_class']
    X = data.drop(columns=['multi_class', 'binary_class'])

    # Remove derived columns; they are rebuilt later by add_derived_features.
    X.drop(columns=COLUMNS_TO_DROP, inplace=True)

    return data, X, y
77
+
78
+
79
def get_categorical_feature_indices(X: pd.DataFrame) -> list:
    """Return positional indices of non-float64 (categorical) columns."""
    indices = []
    for position, dtype in enumerate(X.dtypes):
        if dtype != 'float64':
            indices.append(position)
    return indices
82
+
83
+
84
def get_categorical_feature_names(df: pd.DataFrame) -> list:
    """Return names of non-float64 (categorical) columns."""
    return [name for name, dtype in df.dtypes.items() if dtype != 'float64']
87
+
88
+
89
def calculate_sampling_strategy(y: pd.Series) -> dict:
    """Build the per-class target counts for SMOTENC's ``sampling_strategy``.

    SMOTENC requires each target to be >= the current class count, so the
    class-0 target is clamped with ``max`` (the original hard-coded 1000
    raised ValueError whenever class 0 already had more than 1000 rows).

    Args:
        y: Target variable ('multi_class').

    Returns:
        dict: {class label: target sample count} where
            - class 0 is brought up to 500 (small classes) or 1000,
              never below its current count,
            - class 1 is rounded up to the next multiple of 100,
            - class 2 (the majority) is left unchanged.
    """
    count_class_0 = (y == 0).sum()
    count_class_1 = (y == 1).sum()
    count_class_2 = (y == 2).sum()

    target_class_0 = 500 if count_class_0 <= 500 else 1000

    return {
        # Clamp so the target is never below the existing count.
        0: max(count_class_0, target_class_0),
        1: int(np.ceil(count_class_1 / 100) * 100),  # round up to hundreds
        2: count_class_2
    }
108
+
109
+
110
def apply_smotenc(X: pd.DataFrame, y: pd.Series,
                  categorical_features_indices: list,
                  sampling_strategy: dict) -> pd.DataFrame:
    """Oversample the minority classes with SMOTENC.

    Args:
        X: Feature frame.
        y: Target series ('multi_class').
        categorical_features_indices: Positional indices of categorical columns.
        sampling_strategy: Per-class target counts.

    Returns:
        pd.DataFrame: Resampled features with a 'multi_class' column appended.
    """
    sampler = SMOTENC(
        categorical_features=categorical_features_indices,
        sampling_strategy=sampling_strategy,
        random_state=RANDOM_STATE
    )
    features_out, labels_out = sampler.fit_resample(X, y)

    augmented = features_out.copy()
    augmented['multi_class'] = labels_out
    return augmented
136
+
137
+
138
def create_ctgan_objective(data: pd.DataFrame, class_label: int,
                           categorical_features: list,
                           hp_ranges: dict) -> callable:
    """Build an Optuna objective that scores CTGAN hyperparameters.

    The objective trains a CTGAN on the rows of one class and returns the
    negative squared error between real and generated 'visi' mean/std
    (higher is better, so the study should maximize).

    Args:
        data: Training data (must contain 'multi_class' and 'visi').
        class_label: Class to model (0 or 1).
        categorical_features: Discrete column names for CTGAN.
        hp_ranges: Hyperparameter search space (see CLASS_*_HP_RANGES).

    Returns:
        Optuna objective function taking a single ``trial`` argument.
    """
    # Fix the class subset once; the closure reuses it for every trial.
    class_data = data[data['multi_class'] == class_label]

    def objective(trial):
        # Sample hyperparameters from the configured search space.
        # NOTE: keep the suggest-call order stable for reproducible studies.
        embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
        generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
        discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
        pac = trial.suggest_categorical("pac", hp_ranges['pac'])
        batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
        discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])

        # Build the CTGAN model for this trial.
        ctgan = CTGAN(
            embedding_dim=embedding_dim,
            generator_dim=generator_dim,
            discriminator_dim=discriminator_dim,
            batch_size=batch_size,
            discriminator_steps=discriminator_steps,
            pac=pac
        )
        ctgan.set_random_state(RANDOM_STATE)

        # Train on this class's rows only.
        ctgan.fit(class_data, discrete_columns=categorical_features)

        # Draw twice as many samples as real rows for a stabler estimate.
        generated_data = ctgan.sample(len(class_data) * 2)

        # Evaluate: compare the real vs generated 'visi' distribution.
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']

        # Squared error of mean and std only — a coarse two-moment proxy,
        # not a full distribution distance.
        mse = ((real_visi.mean() - generated_visi.mean())**2 +
               (real_visi.std() - generated_visi.std())**2)
        return -mse

    return objective
191
+
192
+
193
def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
                                  categorical_features: list,
                                  hp_ranges: dict, n_trials: int,
                                  target_samples: int) -> tuple:
    """Tune CTGAN hyperparameters with Optuna, then generate samples.

    Args:
        data: Training data (SMOTENC output, includes 'multi_class').
        class_label: Class to model (0 or 1).
        categorical_features: Discrete column names for CTGAN.
        hp_ranges: Hyperparameter search space.
        n_trials: Number of Optuna trials.
        target_samples: Number of synthetic rows to draw from the final model.

    Returns:
        (generated samples DataFrame, fitted CTGAN model)
    """
    # Build the per-class objective.
    objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)

    # Run the search; the objective returns -error, hence "maximize".
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Re-instantiate CTGAN with the best hyperparameters found.
    best_params = study.best_params
    ctgan = CTGAN(
        embedding_dim=best_params["embedding_dim"],
        generator_dim=best_params["generator_dim"],
        discriminator_dim=best_params["discriminator_dim"],
        batch_size=best_params["batch_size"],
        discriminator_steps=best_params["discriminator_steps"],
        pac=best_params["pac"]
    )
    ctgan.set_random_state(RANDOM_STATE)

    # Final fit on this class's rows, then sample the requested count.
    # NOTE(review): a non-positive target_samples would be passed straight
    # to ctgan.sample — confirm upstream guarantees target_samples > 0.
    class_data = data[data['multi_class'] == class_label]
    ctgan.fit(class_data, discrete_columns=categorical_features)
    generated_samples = ctgan.sample(target_samples)

    return generated_samples, ctgan
236
+
237
+
238
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Re-create the derived columns dropped before oversampling.

    Adds 'binary_class' (0 only for multi_class == 2), cyclical hour/month
    encodings, and the ground-air temperature difference.

    Args:
        df: Frame containing 'multi_class', 'hour', 'month',
            'groundtemp' and 'temp_C'.

    Returns:
        pd.DataFrame: Copy of ``df`` with the derived columns added.
    """
    out = df.copy()
    out['binary_class'] = out['multi_class'].map(lambda label: 0 if label == 2 else 1)

    # Cyclical encodings so 23h/0h and Dec/Jan sit next to each other.
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
256
+
257
+
258
def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
    """Apply SMOTENC followed by CTGAN oversampling to one region's data.

    Pipeline: load/filter the training CSV, oversample minority classes with
    SMOTENC, tune and fit one CTGAN per minority class (0 and 1), keep only
    generated rows whose 'visi' falls in each class's expected range, then
    save (a) the augmented-only rows and (b) the final merged dataset.

    Args:
        file_path: Input training CSV path.
        output_path: Output CSV path for the merged dataset.
        model_save_dir: Directory where the trained CTGAN models are saved.
    """
    # Region name from the file stem (e.g. 'seoul_train' -> 'seoul')
    region_name = Path(file_path).stem.replace('_train', '')

    # Load and preprocess the data
    original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)

    # Apply SMOTENC
    categorical_features_indices = get_categorical_feature_indices(X)
    sampling_strategy = calculate_sampling_strategy(y)
    smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)

    # Categorical column names for CTGAN
    categorical_features = get_categorical_feature_names(smotenc_data)

    # Per-class sample counts.
    # NOTE(review): if class 1 already has >= 7000 rows this difference is
    # <= 0 and is passed to ctgan.sample unchecked — confirm upstream data.
    count_class_1 = (y == 1).sum()
    target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)

    # Optimize CTGAN and generate samples for class 0
    print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
    generated_0, ctgan_model_0 = optimize_and_generate_samples(
        smotenc_data, 0, categorical_features,
        CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
    )

    # Optimize CTGAN and generate samples for class 1
    print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
    generated_1, ctgan_model_1 = optimize_and_generate_samples(
        smotenc_data, 1, categorical_features,
        CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
    )

    # Create the model save directory
    model_save_dir.mkdir(parents=True, exist_ok=True)

    # Save the class-0 model
    model_path_0 = model_save_dir / f'smotenc_ctgan_7000_2_{region_name}_class0.pkl'
    ctgan_model_0.save(str(model_path_0))
    print(f"Saved CTGAN model for class 0: {model_path_0}")

    # Save the class-1 model
    model_path_1 = model_save_dir / f'smotenc_ctgan_7000_2_{region_name}_class1.pkl'
    ctgan_model_1.save(str(model_path_1))
    print(f"Saved CTGAN model for class 1: {model_path_1}")

    # Keep only samples whose visibility lies in each class's expected range
    well_generated_0 = generated_0[
        (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
    ]
    well_generated_1 = generated_1[
        (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
    ]

    # Extract only augmented rows (SMOTENC additions + CTGAN samples);
    # the first len(X) rows of smotenc_data are the originals, so skip them
    original_data_count = len(X)
    smotenc_augmented = smotenc_data.iloc[original_data_count:].copy()  # SMOTENC-added rows only

    # Merge the augmented-only data (SMOTENC additions + CTGAN samples)
    augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
    augmented_only = add_derived_features(augmented_only)
    augmented_only.reset_index(drop=True, inplace=True)
    # Save into the sibling 'augmented_only' folder
    output_path_obj = Path(output_path)
    augmented_dir = output_path_obj.parent.parent / 'augmented_only'
    augmented_dir.mkdir(parents=True, exist_ok=True)
    augmented_output_path = augmented_dir / output_path_obj.name
    augmented_only.to_csv(augmented_output_path, index=False)

    # Merge SMOTENC data with the filtered CTGAN samples (final result)
    smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)

    # Restore the derived columns
    smote_gan_data = add_derived_features(smote_gan_data)

    # Report counts for the augmented-only data
    aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
    aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
    print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")

    # Drop class 2, then append the original class-2 rows back
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class_2 = original_data[original_data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class_2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Create the output directory
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the result
    final_data.to_csv(output_path, index=False)

    # Report final class counts
    count_0 = len(final_data[final_data['multi_class'] == 0])
    count_1 = len(final_data[final_data['multi_class'] == 1])
    count_2 = len(final_data[final_data['multi_class'] == 2])
    print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
367
+
368
if __name__ == "__main__":
    setup_environment()

    # Pair each region's training CSV with its oversampled output path.
    io_pairs = [
        (f'../../../data/data_for_modeling/{region}_train.csv',
         f'../../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_2_{region}.csv')
        for region in REGIONS
    ]
    model_save_dir = Path('../../save_model/oversampling_models')

    for train_csv, oversampled_csv in io_pairs:
        process_region(train_csv, oversampled_csv, model_save_dir)
Analysis_code/2.make_oversample_data/smotenc_ctgan/smotenc_ctgan_sample_7000_3.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from pathlib import Path
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
11
+ # ==================== ์ƒ์ˆ˜ ์ •์˜ ====================
12
+ REGIONS = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
13
+ TRAIN_YEARS = [2019, 2020]
14
+ TARGET_SAMPLES_CLASS_0 = 7000
15
+ TARGET_SAMPLES_CLASS_1_BASE = 7000
16
+ RANDOM_STATE = 42
17
+
18
+ # Optuna ์ตœ์ ํ™” ์„ค์ •
19
+ CLASS_0_TRIALS = 50
20
+ CLASS_1_TRIALS = 30
21
+
22
+ # ํด๋ž˜์Šค๋ณ„ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
23
+ CLASS_0_HP_RANGES = {
24
+ 'embedding_dim': (64, 128),
25
+ 'generator_dim': [(64, 64), (128, 128)],
26
+ 'discriminator_dim': [(64, 64), (128, 128)],
27
+ 'pac': [4, 8],
28
+ 'batch_size': [64, 128, 256],
29
+ 'discriminator_steps': (1, 3)
30
+ }
31
+
32
+ CLASS_1_HP_RANGES = {
33
+ 'embedding_dim': (128, 512),
34
+ 'generator_dim': [(128, 128), (256, 256)],
35
+ 'discriminator_dim': [(128, 128), (256, 256)],
36
+ 'pac': [4, 8],
37
+ 'batch_size': [256, 512, 1024],
38
+ 'discriminator_steps': (1, 5)
39
+ }
40
+
41
+ # ์ œ๊ฑฐํ•  ์—ด ๋ชฉ๋ก
42
+ COLUMNS_TO_DROP = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
43
+
44
+ # ==================== ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ====================
45
+
46
+ def setup_environment():
47
+ """ํ™˜๊ฒฝ ์„ค์ • (GPU, ๊ฒฝ๊ณ  ๋ฌด์‹œ)"""
48
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
+ print(f"Using device: {device}")
50
+ warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")
51
+ return device
52
+
53
+
54
+ def load_and_preprocess_data(file_path: str, train_years: list) -> tuple:
55
+ """
56
+ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
57
+
58
+ Args:
59
+ file_path: ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
60
+ train_years: ํ•™์Šต์— ์‚ฌ์šฉํ•  ์—ฐ๋„ ๋ฆฌ์ŠคํŠธ
61
+
62
+ Returns:
63
+ (data, X, y): ์›๋ณธ ๋ฐ์ดํ„ฐ, ํŠน์ง• ๋ฐ์ดํ„ฐ, ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
64
+ """
65
+ data = pd.read_csv(file_path, index_col=0)
66
+ data = data.loc[data['year'].isin(train_years), :]
67
+ data['cloudcover'] = data['cloudcover'].astype('int')
68
+ data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
69
+
70
+ X = data.drop(columns=['multi_class', 'binary_class'])
71
+ y = data['multi_class']
72
+
73
+ # ๋ถˆํ•„์š”ํ•œ ์—ด ์ œ๊ฑฐ
74
+ X.drop(columns=COLUMNS_TO_DROP, inplace=True)
75
+
76
+ return data, X, y
77
+
78
+
79
+ def get_categorical_feature_indices(X: pd.DataFrame) -> list:
80
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ธ๋ฑ์Šค ๋ฐ˜ํ™˜"""
81
+ return [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']
82
+
83
+
84
+ def get_categorical_feature_names(df: pd.DataFrame) -> list:
85
+ """๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์—ด ์ด๋ฆ„ ๋ฐ˜ํ™˜"""
86
+ return [col for col, dtype in zip(df.columns, df.dtypes) if dtype != 'float64']
87
+
88
+
89
+ def calculate_sampling_strategy(y: pd.Series) -> dict:
90
+ """
91
+ SMOTENC๋ฅผ ์œ„ํ•œ sampling_strategy ๊ณ„์‚ฐ
92
+
93
+ Args:
94
+ y: ํƒ€๊ฒŸ ๋ณ€์ˆ˜
95
+
96
+ Returns:
97
+ sampling_strategy ๋”•์…”๋„ˆ๋ฆฌ
98
+ """
99
+ count_class_0 = (y == 0).sum()
100
+ count_class_1 = (y == 1).sum()
101
+ count_class_2 = (y == 2).sum()
102
+
103
+ return {
104
+ 0: 500 if count_class_0 <= 500 else 1000,
105
+ 1: int(np.ceil(count_class_1 / 100) * 100), # ๋ฐฑ์˜ ์ž๋ฆฌ๋กœ ์˜ฌ๋ฆผ
106
+ 2: count_class_2
107
+ }
108
+
109
+
110
+ def apply_smotenc(X: pd.DataFrame, y: pd.Series,
111
+ categorical_features_indices: list,
112
+ sampling_strategy: dict) -> pd.DataFrame:
113
+ """
114
+ SMOTENC ์ ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ•
115
+
116
+ Args:
117
+ X: ํŠน์ง• ๋ฐ์ดํ„ฐ
118
+ y: ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
119
+ categorical_features_indices: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ๋ฑ์Šค
120
+ sampling_strategy: ์ƒ˜ํ”Œ๋ง ์ „๋žต
121
+
122
+ Returns:
123
+ ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ (multi_class ํฌํ•จ)
124
+ """
125
+ smotenc = SMOTENC(
126
+ categorical_features=categorical_features_indices,
127
+ sampling_strategy=sampling_strategy,
128
+ random_state=RANDOM_STATE
129
+ )
130
+ X_resampled, y_resampled = smotenc.fit_resample(X, y)
131
+
132
+ resampled_data = X_resampled.copy()
133
+ resampled_data['multi_class'] = y_resampled
134
+
135
+ return resampled_data
136
+
137
+
138
+ def create_ctgan_objective(data: pd.DataFrame, class_label: int,
139
+ categorical_features: list,
140
+ hp_ranges: dict) -> callable:
141
+ """
142
+ Optuna ์ตœ์ ํ™”๋ฅผ ์œ„ํ•œ ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
143
+
144
+ Args:
145
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
146
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
147
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
148
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
149
+
150
+ Returns:
151
+ Optuna ๋ชฉ์  ํ•จ์ˆ˜
152
+ """
153
+ class_data = data[data['multi_class'] == class_label]
154
+
155
+ def objective(trial):
156
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์„ค์ •
157
+ embedding_dim = trial.suggest_int("embedding_dim", *hp_ranges['embedding_dim'])
158
+ generator_dim = trial.suggest_categorical("generator_dim", hp_ranges['generator_dim'])
159
+ discriminator_dim = trial.suggest_categorical("discriminator_dim", hp_ranges['discriminator_dim'])
160
+ pac = trial.suggest_categorical("pac", hp_ranges['pac'])
161
+ batch_size = trial.suggest_categorical("batch_size", hp_ranges['batch_size'])
162
+ discriminator_steps = trial.suggest_int("discriminator_steps", *hp_ranges['discriminator_steps'])
163
+
164
+ # CTGAN ๋ชจ๋ธ ์ƒ์„ฑ
165
+ ctgan = CTGAN(
166
+ embedding_dim=embedding_dim,
167
+ generator_dim=generator_dim,
168
+ discriminator_dim=discriminator_dim,
169
+ batch_size=batch_size,
170
+ discriminator_steps=discriminator_steps,
171
+ pac=pac
172
+ )
173
+ ctgan.set_random_state(RANDOM_STATE)
174
+
175
+ # ๋ชจ๋ธ ํ•™์Šต
176
+ ctgan.fit(class_data, discrete_columns=categorical_features)
177
+
178
+ # ์ƒ˜ํ”Œ ์ƒ์„ฑ
179
+ generated_data = ctgan.sample(len(class_data) * 2)
180
+
181
+ # ํ‰๊ฐ€: ์ƒ˜ํ”Œ์˜ ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„ํฌ ๋น„๊ต
182
+ real_visi = class_data['visi']
183
+ generated_visi = generated_data['visi']
184
+
185
+ # ๋ถ„ํฌ ๊ฐ„ ์ฐจ์ด(MSE) ๊ณ„์‚ฐ
186
+ mse = ((real_visi.mean() - generated_visi.mean())**2 +
187
+ (real_visi.std() - generated_visi.std())**2)
188
+ return -mse
189
+
190
+ return objective
191
+
192
+
193
+ def optimize_and_generate_samples(data: pd.DataFrame, class_label: int,
194
+ categorical_features: list,
195
+ hp_ranges: dict, n_trials: int,
196
+ target_samples: int) -> tuple:
197
+ """
198
+ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
199
+
200
+ Args:
201
+ data: ํ•™์Šต ๋ฐ์ดํ„ฐ
202
+ class_label: ํด๋ž˜์Šค ๋ ˆ์ด๋ธ” (0 ๋˜๋Š” 1)
203
+ categorical_features: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ๋ฆฌ์ŠคํŠธ
204
+ hp_ranges: ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„
205
+ n_trials: Optuna ์ตœ์ ํ™” ์‹œ๋„ ํšŸ์ˆ˜
206
+ target_samples: ์ƒ์„ฑํ•  ์ƒ˜ํ”Œ ์ˆ˜
207
+
208
+ Returns:
209
+ (์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„, ํ•™์Šต๋œ CTGAN ๋ชจ๋ธ)
210
+ """
211
+ # ๋ชฉ์  ํ•จ์ˆ˜ ์ƒ์„ฑ
212
+ objective = create_ctgan_objective(data, class_label, categorical_features, hp_ranges)
213
+
214
+ # Optuna๋กœ ์ตœ์ ํ™” ์ˆ˜ํ–‰
215
+ study = optuna.create_study(direction="maximize")
216
+ study.optimize(objective, n_trials=n_trials)
217
+
218
+ # ์ตœ์  ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ CTGAN ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
219
+ best_params = study.best_params
220
+ ctgan = CTGAN(
221
+ embedding_dim=best_params["embedding_dim"],
222
+ generator_dim=best_params["generator_dim"],
223
+ discriminator_dim=best_params["discriminator_dim"],
224
+ batch_size=best_params["batch_size"],
225
+ discriminator_steps=best_params["discriminator_steps"],
226
+ pac=best_params["pac"]
227
+ )
228
+ ctgan.set_random_state(RANDOM_STATE)
229
+
230
+ # ์ตœ์ข… ํ•™์Šต ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
231
+ class_data = data[data['multi_class'] == class_label]
232
+ ctgan.fit(class_data, discrete_columns=categorical_features)
233
+ generated_samples = ctgan.sample(target_samples)
234
+
235
+ return generated_samples, ctgan
236
+
237
+
238
+ def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
239
+ """
240
+ ์ œ๊ฑฐํ–ˆ๋˜ ํŒŒ์ƒ ๋ณ€์ˆ˜๋“ค์„ ๋ณต๊ตฌ
241
+
242
+ Args:
243
+ df: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
244
+
245
+ Returns:
246
+ ํŒŒ์ƒ ๋ณ€์ˆ˜๊ฐ€ ์ถ”๊ฐ€๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
247
+ """
248
+ df = df.copy()
249
+ df['binary_class'] = df['multi_class'].apply(lambda x: 0 if x == 2 else 1)
250
+ df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
251
+ df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
252
+ df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
253
+ df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
254
+ df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
255
+ return df
256
+
257
+
258
+ def process_region(file_path: str, output_path: str, model_save_dir: Path) -> None:
259
+ """
260
+ ํŠน์ • ์ง€์—ญ์˜ ๋ฐ์ดํ„ฐ์— SMOTENC์™€ CTGAN์„ ์ˆœ์ฐจ์ ์œผ๋กœ ์ ์šฉ
261
+
262
+ Args:
263
+ file_path: ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
264
+ output_path: ์ถœ๋ ฅ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
265
+ model_save_dir: ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
266
+ """
267
+ # ์ง€์—ญ๋ช… ์ถ”์ถœ (ํŒŒ์ผ ๊ฒฝ๋กœ์—์„œ)
268
+ region_name = Path(file_path).stem.replace('_train', '')
269
+
270
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
271
+ original_data, X, y = load_and_preprocess_data(file_path, TRAIN_YEARS)
272
+
273
+ # SMOTENC ์ ์šฉ
274
+ categorical_features_indices = get_categorical_feature_indices(X)
275
+ sampling_strategy = calculate_sampling_strategy(y)
276
+ smotenc_data = apply_smotenc(X, y, categorical_features_indices, sampling_strategy)
277
+
278
+ # CTGAN์„ ์œ„ํ•œ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ด๋ฆ„ ์ถ”์ถœ
279
+ categorical_features = get_categorical_feature_names(smotenc_data)
280
+
281
+ # ํด๋ž˜์Šค๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜ ๊ณ„์‚ฐ
282
+ count_class_1 = (y == 1).sum()
283
+ target_samples_class_1 = TARGET_SAMPLES_CLASS_1_BASE - int(np.ceil(count_class_1 / 100) * 100)
284
+
285
+ # ํด๋ž˜์Šค 0์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
286
+ print(f"Processing {file_path}: Optimizing CTGAN for class 0...")
287
+ generated_0, ctgan_model_0 = optimize_and_generate_samples(
288
+ smotenc_data, 0, categorical_features,
289
+ CLASS_0_HP_RANGES, CLASS_0_TRIALS, TARGET_SAMPLES_CLASS_0
290
+ )
291
+
292
+ # ํด๋ž˜์Šค 1์— ๋Œ€ํ•œ CTGAN ์ตœ์ ํ™” ๋ฐ ์ƒ˜ํ”Œ ์ƒ์„ฑ
293
+ print(f"Processing {file_path}: Optimizing CTGAN for class 1...")
294
+ generated_1, ctgan_model_1 = optimize_and_generate_samples(
295
+ smotenc_data, 1, categorical_features,
296
+ CLASS_1_HP_RANGES, CLASS_1_TRIALS, target_samples_class_1
297
+ )
298
+
299
+ # ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
300
+ model_save_dir.mkdir(parents=True, exist_ok=True)
301
+
302
+ # ํด๋ž˜์Šค 0 ๋ชจ๋ธ ์ €์žฅ
303
+ model_path_0 = model_save_dir / f'smotenc_ctgan_7000_3_{region_name}_class0.pkl'
304
+ ctgan_model_0.save(str(model_path_0))
305
+ print(f"Saved CTGAN model for class 0: {model_path_0}")
306
+
307
+ # ํด๋ž˜์Šค 1 ๋ชจ๋ธ ์ €์žฅ
308
+ model_path_1 = model_save_dir / f'smotenc_ctgan_7000_3_{region_name}_class1.pkl'
309
+ ctgan_model_1.save(str(model_path_1))
310
+ print(f"Saved CTGAN model for class 1: {model_path_1}")
311
+
312
+ # ํด๋ž˜์Šค๋ณ„ ๊ฐ€์‹œ๋„ ๋ฒ”์œ„๋กœ ํ•„ํ„ฐ๋ง
313
+ well_generated_0 = generated_0[
314
+ (generated_0['visi'] >= 0) & (generated_0['visi'] < 100)
315
+ ]
316
+ well_generated_1 = generated_1[
317
+ (generated_1['visi'] >= 100) & (generated_1['visi'] < 500)
318
+ ]
319
+
320
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ์ถ”์ถœ (SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„ + CTGAN์œผ๋กœ ์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ)
321
+ # smotenc_data์˜ ์ฒ˜์Œ len(X)๊ฐœ๋Š” ์›๋ณธ ๋ฐ์ดํ„ฐ์ด๋ฏ€๋กœ ์ œ์™ธ
322
+ original_data_count = len(X)
323
+ smotenc_augmented = smotenc_data.iloc[original_data_count:].copy() # SMOTENC์œผ๋กœ ์ฆ๊ฐ•๋œ ๋ถ€๋ถ„๋งŒ
324
+
325
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๋ณ‘ํ•ฉ (SMOTENC ์ฆ๊ฐ• + CTGAN ์ฆ๊ฐ•)
326
+ augmented_only = pd.concat([smotenc_augmented, well_generated_0, well_generated_1], axis=0)
327
+ augmented_only = add_derived_features(augmented_only)
328
+ augmented_only.reset_index(drop=True, inplace=True)
329
+ # augmented_only ํด๋”์— ์ €์žฅ
330
+ output_path_obj = Path(output_path)
331
+ augmented_dir = output_path_obj.parent.parent / 'augmented_only'
332
+ augmented_dir.mkdir(parents=True, exist_ok=True)
333
+ augmented_output_path = augmented_dir / output_path_obj.name
334
+ augmented_only.to_csv(augmented_output_path, index=False)
335
+
336
+ # SMOTENC ๋ฐ์ดํ„ฐ์™€ ํ•„ํ„ฐ๋ง๋œ CTGAN ์ƒ˜ํ”Œ ๋ณ‘ํ•ฉ (์ตœ์ข… ๊ฒฐ๊ณผ์šฉ)
337
+ smote_gan_data = pd.concat([smotenc_data, well_generated_0, well_generated_1], axis=0)
338
+
339
+ # ํŒŒ์ƒ ๋ณ€์ˆ˜ ์ถ”๊ฐ€
340
+ smote_gan_data = add_derived_features(smote_gan_data)
341
+
342
+ # ์ฆ๊ฐ•๋œ ๋ฐ์ดํ„ฐ๋งŒ ๊ฒฐ๊ณผ ์ถœ๋ ฅ
343
+ aug_count_0 = len(augmented_only[augmented_only['multi_class'] == 0])
344
+ aug_count_1 = len(augmented_only[augmented_only['multi_class'] == 1])
345
+ print(f"Saved augmented data only {augmented_output_path}: Class 0={aug_count_0} | Class 1={aug_count_1}")
346
+
347
+ # ํด๋ž˜์Šค 2 ์ œ๊ฑฐ ํ›„ ์›๋ณธ ํด๋ž˜์Šค 2 ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
348
+ filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
349
+ original_class_2 = original_data[original_data['multi_class'] == 2]
350
+ final_data = pd.concat([filtered_data, original_class_2], axis=0)
351
+ final_data.reset_index(drop=True, inplace=True)
352
+
353
+ # ์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
354
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
355
+
356
+ # ๊ฒฐ๊ณผ ์ €์žฅ
357
+ final_data.to_csv(output_path, index=False)
358
+
359
+ # ๊ฒฐ๊ณผ ์ถœ๋ ฅ
360
+ count_0 = len(final_data[final_data['multi_class'] == 0])
361
+ count_1 = len(final_data[final_data['multi_class'] == 1])
362
+ count_2 = len(final_data[final_data['multi_class'] == 2])
363
+ print(f"Saved {output_path}: Class 0={count_0} | Class 1={count_1} | Class 2={count_2}")
364
+
365
+
366
+ # ==================== ๋ฉ”์ธ ์‹คํ–‰ ====================
367
+
368
+ if __name__ == "__main__":
369
+ setup_environment()
370
+
371
+ file_paths = [f'../../../data/data_for_modeling/{region}_train.csv' for region in REGIONS]
372
+ output_paths = [f'../../../data/data_oversampled/smotenc_ctgan7000/smotenc_ctgan7000_3_{region}.csv' for region in REGIONS]
373
+ model_save_dir = Path('../../save_model/oversampling_models')
374
+
375
+ for file_path, output_path in zip(file_paths, output_paths):
376
+ process_region(file_path, output_path, model_save_dir)
Analysis_code/3.sampled_data_analysis/make_plot.py ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™” ๋ชจ๋“ˆ: Original๊ณผ Synthetic ๋ฐ์ดํ„ฐ ๋น„๊ต ์‹œ๊ฐํ™”
3
+
4
+ ์ด ๋ชจ๋“ˆ์€ ์›๋ณธ ๋ฐ์ดํ„ฐ์™€ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•˜๊ณ , ์ „์ฒ˜๋ฆฌํ•œ ํ›„
5
+ UMAP์„ ์‚ฌ์šฉํ•˜์—ฌ ์ฐจ์› ์ถ•์†Œ ๋ฐ ์‹œ๊ฐํ™”๋ฅผ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
6
+ """
7
+
8
+ import os
9
+ # TensorFlow ๋กœ๊ทธ ๋ฉ”์‹œ์ง€ ์ˆจ๊ธฐ๊ธฐ
10
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # 0=๋ชจ๋‘, 1=INFO ์ œ์™ธ, 2=INFO/WARNING ์ œ์™ธ, 3=ERROR๋งŒ
11
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # oneDNN ๊ฒฝ๊ณ  ์ˆจ๊ธฐ๊ธฐ
12
+
13
+ import pandas as pd
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ import warnings
18
+ from dataclasses import dataclass
19
+ from typing import List, Tuple, Optional
20
+ from sklearn.preprocessing import StandardScaler
21
+ import umap
22
+ from pathlib import Path
23
+
24
+
25
+ @dataclass
26
+ class PlotConfig:
27
+ """์‹œ๊ฐํ™” ์„ค์ •๊ฐ’์„ ๊ด€๋ฆฌํ•˜๋Š” ํด๋ž˜์Šค"""
28
+ cols_to_drop: List[str] = None
29
+ umap_n_neighbors: int = 30
30
+ umap_min_dist: float = 0.1
31
+ umap_random_state: int = 42
32
+ umap_n_jobs: int = 1 # random_state ์„ค์ • ์‹œ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ๋ถˆ๊ฐ€ (๊ฒฝ๊ณ  ๋ฐฉ์ง€)
33
+ figsize: Tuple[int, int] = (16, 6)
34
+ alpha: float = 0.6 # Original๊ณผ Synthetic ๋ฐ์ดํ„ฐ ๋ชจ๋‘ ๋™์ผํ•œ ํˆฌ๋ช…๋„
35
+ visibility_threshold: int = 500
36
+ scale_on_original_only: bool = True # True: ์›๋ณธ ๊ธฐ์ค€ ์Šค์ผ€์ผ๋ง (๋ฐ์ดํ„ฐ ๋ˆ„์„ค ๋ฐฉ์ง€), False: ํ•ฉ์ณ์„œ ์Šค์ผ€์ผ๋ง
37
+
38
+ def __post_init__(self):
39
+ """๊ธฐ๋ณธ๊ฐ’ ์„ค์ •"""
40
+ if self.cols_to_drop is None:
41
+ self.cols_to_drop = [
42
+ 'wind_dir', # ๋ฌธ์ž์—ด (์—๋Ÿฌ ๋ฐœ์ƒ)
43
+ 'multi_class', # ํƒ€๊ฒŸ ๋ณ€์ˆ˜ (์‹œ๊ฐํ™”์šฉ ์ƒ‰๊น”๋กœ๋งŒ ์‚ฌ์šฉ)
44
+ 'binary_class', # ํƒ€๊ฒŸ ๋ณ€์ˆ˜
45
+ 'year', 'month', 'hour', # sin/cos ๋ณ€์ˆ˜์™€ ์ค‘๋ณต
46
+ 'ground_temp - temp_C', # ๋‹จ์ˆœ ์„ ํ˜• ๊ฒฐํ•ฉ (์ •๋ณด ์ค‘๋ณต)
47
+ 'visi'
48
+ ]
49
+
50
+
51
+ def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
52
+ """
53
+ ์ž…๋ ฅ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— ์‹œ๊ฐ„ ๊ด€๋ จ ํŒŒ์ƒ๋ณ€์ˆ˜๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.
54
+
55
+ Args:
56
+ df: ์ž…๋ ฅ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ (hour, month ์ปฌ๋Ÿผ ํ•„์š”)
57
+
58
+ Returns:
59
+ ์‹œ๊ฐ„ ํŠน์„ฑ์ด ์ถ”๊ฐ€๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
60
+ """
61
+ df = df.copy()
62
+ df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
63
+ df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
64
+ df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
65
+ df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
66
+ return df
67
+
68
+
69
+ def create_binary_class(visi: pd.Series, threshold: int = 500) -> pd.Series:
70
+ """
71
+ ๊ฐ€์‹œ๋„(visi) ๊ฐ’์„ ๊ธฐ๋ฐ˜์œผ๋กœ ์ด์ง„ ๋ถ„๋ฅ˜๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
72
+
73
+ Args:
74
+ visi: ๊ฐ€์‹œ๋„ ๊ฐ’ ์‹œ๋ฆฌ์ฆˆ
75
+ threshold: ์ด์ง„ ๋ถ„๋ฅ˜ ์ž„๊ณ„๊ฐ’ (๊ธฐ๋ณธ๊ฐ’: 500)
76
+
77
+ Returns:
78
+ ์ด์ง„ ๋ถ„๋ฅ˜ ๊ฒฐ๊ณผ (1: < threshold, 0: >= threshold)
79
+ """
80
+ return visi.apply(lambda x: 1 if x < threshold else (0 if x >= threshold else np.nan))
81
+
82
+
83
+ def load_region_data(
84
+ region: str,
85
+ data_dir: str = "../../data/data_for_modeling"
86
+ ) -> pd.DataFrame:
87
+ """
88
+ ํŠน์ • ์ง€์—ญ์˜ ์›๋ณธ ๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
89
+
90
+ Args:
91
+ region: ์ง€์—ญ๋ช… ('incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju')
92
+ data_dir: ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
93
+
94
+ Returns:
95
+ ๋กœ๋“œ๋œ ์ง€์—ญ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
96
+ """
97
+ file_path = f"{data_dir}/{region}_train.csv"
98
+ df = pd.read_csv(file_path)
99
+
100
+ # ํ•„์š”ํ•œ ์ปฌ๋Ÿผ๋งŒ ์„ ํƒ
101
+ required_cols = [
102
+ 'temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',
103
+ 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',
104
+ 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',
105
+ 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'visi', 'multi_class',
106
+ 'binary_class', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
107
+ 'ground_temp - temp_C'
108
+ ]
109
+
110
+ # ์กด์žฌํ•˜๋Š” ์ปฌ๋Ÿผ๋งŒ ์„ ํƒ
111
+ available_cols = [col for col in required_cols if col in df.columns]
112
+ df = df.loc[:, available_cols].copy()
113
+
114
+ return df
115
+
116
+
117
+ def load_and_preprocess_data(
118
+ synthetic_path: str,
119
+ config: PlotConfig,
120
+ region: Optional[str] = None,
121
+ fold_idx: Optional[int] = None,
122
+ data_dir: str = "../../data/data_for_modeling",
123
+ original_path: Optional[str] = None
124
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
125
+ """
126
+ ์›๋ณธ ๋ฐ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•˜๊ณ  ์ „์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค.
127
+
128
+ Args:
129
+ synthetic_path: ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ
130
+ config: PlotConfig ๊ฐ์ฒด
131
+ region: ์ง€์—ญ๋ช… ('incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju')
132
+ original_path๊ฐ€ None์ผ ๋•Œ ์‚ฌ์šฉ
133
+ fold_idx: fold ์ธ๋ฑ์Šค (0, 1, 2 ์ค‘ ํ•˜๋‚˜), None์ด๋ฉด ์ „์ฒด ๋ฐ์ดํ„ฐ
134
+ original_path๊ฐ€ None์ผ ๋•Œ ์‚ฌ์šฉ
135
+ data_dir: ์›๋ณธ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ (region ์‚ฌ์šฉ ์‹œ)
136
+ original_path: ์›๋ณธ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ๊ฒฝ๋กœ (์ง€์ •ํ•˜๋ฉด region/fold ๋ฌด์‹œ)
137
+
138
+ Returns:
139
+ (์ „์ฒ˜๋ฆฌ๋œ ์›๋ณธ ๋ฐ์ดํ„ฐ, ์ „์ฒ˜๋ฆฌ๋œ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ) ํŠœํ”Œ
140
+ """
141
+ # ์›๋ณธ ๋ฐ์ดํ„ฐ ๋กœ๋“œ
142
+ if original_path is not None:
143
+ # ๊ธฐ์กด ๋ฐฉ์‹: ํŒŒ์ผ ๊ฒฝ๋กœ๋กœ ์ง์ ‘ ๋กœ๋“œ
144
+ original_data = pd.read_csv(original_path)
145
+ elif region is not None:
146
+ # ์ƒˆ๋กœ์šด ๋ฐฉ์‹: ์ง€์—ญ๊ณผ fold๋กœ ๋กœ๋“œ
147
+ original_data = load_region_data(region, data_dir)
148
+
149
+ # fold์— ๋”ฐ๋ผ ํ•„ํ„ฐ๋ง
150
+ if fold_idx is not None:
151
+ fold = [[2018, 2019], [2018, 2020], [2019, 2020]]
152
+ if 0 <= fold_idx < len(fold):
153
+ years = fold[fold_idx]
154
+ original_data = original_data.loc[original_data['year'].isin(years), :].copy()
155
+ else:
156
+ raise ValueError("original_path ๋˜๋Š” region์„ ์ง€์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
157
+
158
+ # ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ ๋กœ๋“œ
159
+ synthetic_data = pd.read_csv(synthetic_path)
160
+
161
+ # ์ด์ง„ ๋ถ„๋ฅ˜ ์ƒ์„ฑ
162
+ original_data['binary_class'] = create_binary_class(
163
+ original_data['visi'],
164
+ config.visibility_threshold
165
+ )
166
+ synthetic_data['binary_class'] = create_binary_class(
167
+ synthetic_data['visi'],
168
+ config.visibility_threshold
169
+ )
170
+
171
+ # ์‹œ๊ฐ„ ํŠน์„ฑ ์ถ”๊ฐ€
172
+ original_data = add_time_features(original_data)
173
+ synthetic_data = add_time_features(synthetic_data)
174
+
175
+ # multi_class ํ•„ํ„ฐ๋ง (Original๋งŒ)
176
+ original_data = original_data.loc[original_data['multi_class'].isin([0, 1]), :]
177
+
178
+ # ๋ผ๋ฒจ ์ถ”๊ฐ€
179
+ original_data['Label'] = 'Original'
180
+ synthetic_data['Label'] = 'Synthetic'
181
+
182
+ # ๋ถˆํ•„์š”ํ•œ ์ปฌ๋Ÿผ ์ œ๊ฑฐ
183
+ original_data = original_data.drop(config.cols_to_drop, axis=1)
184
+ synthetic_data = synthetic_data.drop(config.cols_to_drop, axis=1)
185
+
186
+ return original_data, synthetic_data
187
+
188
+
189
+ def prepare_features_for_visualization(
190
+ original_data: pd.DataFrame,
191
+ synthetic_data: pd.DataFrame,
192
+ config: PlotConfig
193
+ ) -> Tuple[np.ndarray, pd.Series, StandardScaler]:
194
+ """
195
+ ์‹œ๊ฐํ™”๋ฅผ ์œ„ํ•œ ํ”ผ์ฒ˜๋ฅผ ์ค€๋น„ํ•˜๊ณ  ์Šค์ผ€์ผ๋งํ•ฉ๋‹ˆ๋‹ค.
196
+
197
+ ์ค‘์š”: ๋ฐ์ดํ„ฐ ๋ˆ„์„ค์„ ๋ฐฉ์ง€ํ•˜๊ธฐ ์œ„ํ•ด ๊ธฐ๋ณธ์ ์œผ๋กœ ์›๋ณธ ๋ฐ์ดํ„ฐ๋กœ๋งŒ scaler๋ฅผ fitํ•˜๊ณ ,
198
+ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๋Š” transform๋งŒ ํ•ฉ๋‹ˆ๋‹ค. ์ด๋ ‡๊ฒŒ ํ•˜๋ฉด ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ์˜ ๋ถ„ํฌ๊ฐ€ ์›๋ณธ ๋ฐ์ดํ„ฐ์˜
199
+ ์Šค์ผ€์ผ๋ง์— ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.
200
+
201
+ Args:
202
+ original_data: ์›๋ณธ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
203
+ synthetic_data: ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
204
+ config: PlotConfig ๊ฐ์ฒด (scale_on_original_only ์„ค์ • ํฌํ•จ)
205
+
206
+ Returns:
207
+ (์Šค์ผ€์ผ๋ง๋œ ํ”ผ์ฒ˜, ๋ผ๋ฒจ, ์Šค์ผ€์ผ๋Ÿฌ) ํŠœํ”Œ
208
+ """
209
+ # ํ”ผ์ฒ˜์™€ ๋ผ๋ฒจ ๋ถ„๋ฆฌ
210
+ original_features = original_data.drop('Label', axis=1)
211
+ synthetic_features = synthetic_data.drop('Label', axis=1)
212
+
213
+ if config.scale_on_original_only:
214
+ # ๋ฐฉ๋ฒ• 1: ์›๋ณธ ๋ฐ์ดํ„ฐ๋กœ๋งŒ scaler fit (๋ฐ์ดํ„ฐ ๋ˆ„์„ค ๋ฐฉ์ง€, ๊ถŒ์žฅ)
215
+ # ์ด ๋ฐฉ๋ฒ•์€ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๊ฐ€ ์›๋ณธ ๋ฐ์ดํ„ฐ์˜ ์Šค์ผ€์ผ๋ง์— ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.
216
+ scaler = StandardScaler()
217
+ scaled_original = scaler.fit_transform(original_features)
218
+ scaled_synthetic = scaler.transform(synthetic_features)
219
+
220
+ # ์Šค์ผ€์ผ๋ง๋œ ๋ฐ์ดํ„ฐ ํ•ฉ์น˜๊ธฐ
221
+ scaled_features = np.vstack([scaled_original, scaled_synthetic])
222
+
223
+ # ๋ผ๋ฒจ ํ•ฉ์น˜๊ธฐ
224
+ labels = pd.concat([
225
+ original_data['Label'],
226
+ synthetic_data['Label']
227
+ ], ignore_index=True)
228
+ else:
229
+ # ๋ฐฉ๋ฒ• 2: ํ•ฉ์ณ์„œ ์Šค์ผ€์ผ๋ง (๋ฐ์ดํ„ฐ ๋ˆ„์„ค ์žˆ์Œ, ๋น„๊ต ๋ชฉ์ ์ผ ๋•Œ๋งŒ ์‚ฌ์šฉ)
230
+ # ์ฃผ์˜: ์ด ๋ฐฉ๋ฒ•์€ ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ์˜ ๋ถ„ํฌ๊ฐ€ ์›๋ณธ ์Šค์ผ€์ผ๋ง์— ์˜ํ–ฅ์„ ์ค๋‹ˆ๋‹ค.
231
+ combined_df = pd.concat([original_data, synthetic_data], ignore_index=True)
232
+ features = combined_df.drop('Label', axis=1)
233
+ labels = combined_df['Label']
234
+
235
+ scaler = StandardScaler()
236
+ scaled_features = scaler.fit_transform(features)
237
+
238
+ return scaled_features, labels, scaler
239
+
240
+
241
+ def plot_umap_comparison(
242
+ scaled_features: np.ndarray,
243
+ labels: pd.Series,
244
+ config: PlotConfig,
245
+ region: Optional[str] = None,
246
+ fold_idx: Optional[int] = None,
247
+ ax: Optional[plt.Axes] = None
248
+ ) -> plt.Figure:
249
+ """
250
+ UMAP์„ ์‚ฌ์šฉํ•˜์—ฌ ์ฐจ์› ์ถ•์†Œ ํ›„ Original๊ณผ Synthetic ๋ฐ์ดํ„ฐ๋ฅผ ๋น„๊ต ์‹œ๊ฐํ™”ํ•ฉ๋‹ˆ๋‹ค.
251
+
252
+ ํ•ต์‹ฌ: ์›๋ณธ ๋ฐ์ดํ„ฐ๊ฐ€ ์ •์˜ํ•œ ๊ณต๊ฐ„(Manifold) ์œ„์— ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๋ฅผ ํˆฌ์˜ํ•ฉ๋‹ˆ๋‹ค.
253
+ - Original ๋ฐ์ดํ„ฐ๋กœ๋งŒ UMAP์„ fitํ•˜์—ฌ ๊ณต๊ฐ„ ๊ตฌ์กฐ๋ฅผ ํ•™์Šต
254
+ - Synthetic ๋ฐ์ดํ„ฐ๋Š” ํ•™์Šต๋œ ๊ณต๊ฐ„์— transform๋งŒ ์ ์šฉ
255
+ - ์ด๋ ‡๊ฒŒ ํ•˜๋ฉด ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๊ฐ€ ์›๋ณธ ๋ฐ์ดํ„ฐ์˜ ๊ณต๊ฐ„ ํ˜•์„ฑ์— ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.
256
+
257
+ Args:
258
+ scaled_features: ์Šค์ผ€์ผ๋ง๋œ ํ”ผ์ฒ˜ ๋ฐฐ์—ด
259
+ labels: ๋ฐ์ดํ„ฐ ๋ผ๋ฒจ (Original/Synthetic)
260
+ config: PlotConfig ๊ฐ์ฒด
261
+ region: ์ง€์—ญ๋ช… (ํ‘œ์‹œ์šฉ)
262
+ fold_idx: fold ์ธ๋ฑ์Šค (ํ‘œ์‹œ์šฉ)
263
+ ax: matplotlib axes ๊ฐ์ฒด (None์ด๏ฟฝ๏ฟฝ๏ฟฝ ์ƒˆ figure ์ƒ์„ฑ)
264
+
265
+ Returns:
266
+ matplotlib Figure ๊ฐ์ฒด
267
+ """
268
+ print("UMAP ์‹คํ–‰ ์ค‘... (Original ๊ธฐ์ค€ ํ•™์Šต ํ›„ Synthetic ๋ณ€ํ™˜)")
269
+
270
+ # 1. ๋ฐ์ดํ„ฐ ๋ถ„๋ฆฌ (Labels๋ฅผ ์ด์šฉํ•ด์„œ ๋‹ค์‹œ ๋‚˜๋ˆ”)
271
+ is_original = labels == 'Original'
272
+ original_data = scaled_features[is_original]
273
+ synthetic_data = scaled_features[~is_original]
274
+
275
+ # 2. UMAP ๋ชจ๋ธ ์ƒ์„ฑ
276
+ umap_model = umap.UMAP(
277
+ n_neighbors=config.umap_n_neighbors,
278
+ min_dist=config.umap_min_dist,
279
+ random_state=config.umap_random_state,
280
+ n_jobs=config.umap_n_jobs
281
+ )
282
+
283
+ # 3. [ํ•ต์‹ฌ] Original ๋ฐ์ดํ„ฐ๋กœ๋งŒ ๊ณต๊ฐ„ ํ•™์Šต (Fit)
284
+ # ์›๋ณธ ๋ฐ์ดํ„ฐ์˜ ๊ตฌ์กฐ(Manifold)๋งŒ ํ•™์Šตํ•ฉ๋‹ˆ๋‹ค.
285
+ original_embedding = umap_model.fit_transform(original_data)
286
+
287
+ # 4. [ํ•ต์‹ฌ] ํ•™์Šต๋œ ๊ณต๊ฐ„์— Synthetic ๋ฐ์ดํ„ฐ ํˆฌ์˜ (Transform)
288
+ # ํ•ฉ์„ฑ ๋ฐ์ดํ„ฐ๋Š” ๊ณต๊ฐ„ ํ˜•์„ฑ์— ๊ด€์—ฌํ•˜์ง€ ์•Š๊ณ , ์ด๋ฏธ ๋งŒ๋“ค์–ด์ง„ ๊ณต๊ฐ„์— ์œ„์น˜๋งŒ ์ฐพ์Šต๋‹ˆ๋‹ค.
289
+ synthetic_embedding = umap_model.transform(synthetic_data)
290
+
291
+ # 5. ๊ฒฐ๊ณผ ํ•ฉ์น˜๊ธฐ (์‹œ๊ฐํ™”๋ฅผ ์œ„ํ•ด)
292
+ umap_results = np.vstack([original_embedding, synthetic_embedding])
293
+
294
+ # ์ˆœ์„œ ๋ณด์žฅ์„ ์œ„ํ•ด ๋ผ๋ฒจ๋„ ๋‹ค์‹œ ์ •๋ฆฌ (Original์ด ์•ž, Synthetic์ด ๋’ค)
295
+ combined_labels = pd.concat([
296
+ labels[is_original],
297
+ labels[~is_original]
298
+ ], ignore_index=True)
299
+
300
+ # ๊ฒฐ๊ณผ๋ฅผ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๋ณ€ํ™˜
301
+ df_umap = pd.DataFrame(umap_results, columns=['UMAP1', 'UMAP2'])
302
+ df_umap['Label'] = combined_labels
303
+
304
+ # ์ง€์—ญ ๋ฐ fold ์ •๋ณด ๋ฌธ์ž์—ด ์ƒ์„ฑ (title์— ์‚ฌ์šฉ)
305
+ title_parts = ["UMAP: Original vs Synthetic"]
306
+ if region is not None:
307
+ if fold_idx is not None:
308
+ fold = [[2018, 2019], [2018, 2020], [2019, 2020]]
309
+ if 0 <= fold_idx < len(fold):
310
+ years = fold[fold_idx]
311
+ fold_display = fold_idx + 1 # fold๋ฅผ +1ํ•ด์„œ ํ‘œ์‹œ
312
+ title_parts.append(f"Region: {region.upper()} | Fold {fold_display}: {years[0]}-{years[1]}")
313
+ else:
314
+ title_parts.append(f"Region: {region.upper()}")
315
+ else:
316
+ title_parts.append(f"Region: {region.upper()}")
317
+ elif fold_idx is not None:
318
+ fold = [[2018, 2019], [2018, 2020], [2019, 2020]]
319
+ if 0 <= fold_idx < len(fold):
320
+ years = fold[fold_idx]
321
+ fold_display = fold_idx + 1 # fold๋ฅผ +1ํ•ด์„œ ํ‘œ์‹œ
322
+ title_parts.append(f"Fold {fold_display}: {years[0]}-{years[1]}")
323
+ title_str = " - ".join(title_parts)
324
+
325
+ # Figure ๋ฐ Axes ์„ค์ • (๋‹จ์ผ ํ”Œ๋กฏ)
326
+ if ax is None:
327
+ fig, ax = plt.subplots(1, 1, figsize=(10, 8))
328
+ else:
329
+ fig = ax.figure if hasattr(ax, 'figure') else plt.gcf()
330
+
331
+ # ์ „์ฒด ๋ฐ์ดํ„ฐ์˜ UMAP ๋ฒ”์œ„ ๊ณ„์‚ฐ
332
+ x_min = df_umap['UMAP1'].min() - 1
333
+ x_max = df_umap['UMAP1'].max() + 1
334
+ y_min = df_umap['UMAP2'].min() - 1
335
+ y_max = df_umap['UMAP2'].max() + 1
336
+
337
+ # Synthetic ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™” (๋นจ๊ฐ„์ƒ‰, ๋จผ์ € ๊ทธ๋ ค์„œ ๋’ค์— ์œ„์น˜)
338
+ sns.scatterplot(
339
+ data=df_umap.loc[df_umap['Label'] == 'Synthetic'],
340
+ x='UMAP1', y='UMAP2',
341
+ color='red',
342
+ alpha=config.alpha,
343
+ label='Synthetic',
344
+ ax=ax,
345
+ s=30
346
+ )
347
+
348
+ # Original ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™” (ํŒŒ๋ž€์ƒ‰, ๋‚˜์ค‘์— ๊ทธ๋ ค์„œ ์•ž์— ์œ„์น˜ํ•˜์—ฌ ๋” ์ž˜ ๋ณด์ด๊ฒŒ)
349
+ sns.scatterplot(
350
+ data=df_umap.loc[df_umap['Label'] == 'Original'],
351
+ x='UMAP1', y='UMAP2',
352
+ color='blue',
353
+ alpha=config.alpha,
354
+ label='Original',
355
+ ax=ax,
356
+ s=30
357
+ )
358
+
359
+ ax.set_xlim(x_min, x_max)
360
+ ax.set_ylim(y_min, y_max)
361
+ ax.set_xlabel('UMAP1', fontsize=12)
362
+ ax.set_ylabel('UMAP2', fontsize=12)
363
+ ax.set_title(title_str, fontsize=14, fontweight='bold')
364
+ ax.legend(title='Label', loc='best')
365
+ ax.grid(True, alpha=0.3)
366
+
367
+ plt.tight_layout()
368
+ return fig
369
+
370
+
371
def generate_synthetic_path(
    method: str,
    region: str,
    sample_size: Optional[int] = None,
    fold_idx: Optional[int] = None,
    base_dir: str = "../../data/data_oversampled"
) -> str:
    """Build the CSV path of an oversampled (synthetic) data file.

    Args:
        method: Augmentation method ('ctgan', 'smotenc_ctgan', 'smote').
        region: Region name (e.g. 'busan').
        sample_size: Sample count. Required for 'ctgan'/'smotenc_ctgan'
            (one of 7000, 10000, 20000); ignored for 'smote'.
        fold_idx: Fold index (0, 1 or 2). None is treated as 0.
        base_dir: Base directory that holds the oversampled data.

    Returns:
        Path string of the synthetic data CSV file.

    Raises:
        ValueError: Unknown method, or missing/invalid sample_size for a
            method that requires one.
    """
    # File names are 1-based while fold_idx is 0-based.
    fold_num = (0 if fold_idx is None else fold_idx) + 1

    if method in ('ctgan', 'smotenc_ctgan'):
        # Both CTGAN variants share identical validation and naming; only the
        # method prefix differs, so the messages/paths stay byte-identical.
        if sample_size is None:
            raise ValueError(f"{method} 방법은 sample_size가 필요합니다 (7000, 10000, 20000 중 선택)")
        if sample_size not in (7000, 10000, 20000):
            raise ValueError(f"sample_size는 7000, 10000, 20000 중 하나여야 합니다. 입력값: {sample_size}")
        return f"{base_dir}/augmented_only/{method}{sample_size}_{fold_num}_{region}.csv"

    if method == 'smote':
        # SMOTE files carry no sample size in their name; sample_size is
        # deliberately ignored here.
        return f"{base_dir}/augmented_only/smote_{fold_num}_{region}.csv"

    raise ValueError(f"지원하지 않는 method입니다: {method}. 'ctgan', 'smotenc_ctgan', 'smote' 중 하나를 선택하세요.")
417
+
418
+
419
def main(
    method: str = "ctgan",
    sample_size: Optional[int] = 7000,
    config: Optional[PlotConfig] = None,
    region: Optional[str] = "busan",
    fold_idx: Optional[int] = 0,
    data_dir: str = "../../data/data_for_modeling",
    original_path: Optional[str] = None,
    synthetic_path: Optional[str] = None,
    base_dir: str = "../../data/data_oversampled"
) -> None:
    """Run the full pipeline for one plot: load, scale, and draw the UMAP view.

    Args:
        method: Augmentation method ('ctgan', 'smotenc_ctgan', 'smote').
        sample_size: Sample count (7000/10000/20000 for the CTGAN variants;
            ignored for 'smote').
        config: PlotConfig instance; defaults are used when None.
        region: Region name ('incheon', 'seoul', 'busan', 'daegu',
            'daejeon', 'gwangju'). Used when original_path is None.
        fold_idx: Fold index (0, 1 or 2); None means the full data set.
            Used when original_path is None.
        data_dir: Directory of the original data (used with region).
        original_path: Explicit original-data file path (overrides
            region/fold selection).
        synthetic_path: Explicit synthetic-data file path (overrides
            method/sample_size selection).
        base_dir: Base directory of the synthetic data.
    """
    cfg = PlotConfig() if config is None else config

    # Derive the synthetic-data path unless the caller supplied one.
    if synthetic_path is None:
        if region is None:
            raise ValueError("synthetic_path를 지정하지 않으면 region이 필요합니다.")
        synthetic_path = generate_synthetic_path(method, region, sample_size, fold_idx, base_dir)

    # Load and preprocess both data sets.
    original_data, synthetic_data = load_and_preprocess_data(
        synthetic_path=synthetic_path,
        config=cfg,
        region=region,
        fold_idx=fold_idx,
        data_dir=data_dir,
        original_path=original_path,
    )

    # Prepare and scale the feature matrix for embedding.
    scaled_features, labels, _scaler = prepare_features_for_visualization(
        original_data, synthetic_data, cfg
    )

    # Draw the UMAP comparison and show it interactively.
    plot_umap_comparison(
        scaled_features,
        labels,
        cfg,
        region=region,
        fold_idx=fold_idx,
    )
    plt.show()
479
+
480
+
481
def _render_and_save_plot(
    method: str,
    sample_size: Optional[int],
    region: str,
    fold_idx: int,
    config: "PlotConfig",
    data_dir: str,
    base_dir: str,
    output_path: Path,
    years: list,
) -> Path:
    """Render one UMAP comparison figure and save it as a PNG.

    Shared by every method so the save settings stay consistent. Returns the
    path of the written file.
    """
    # Locate the synthetic data for this combination.
    synthetic_path = generate_synthetic_path(method, region, sample_size, fold_idx, base_dir)

    # Load and preprocess original + synthetic data.
    original_data, synthetic_data = load_and_preprocess_data(
        synthetic_path=synthetic_path,
        config=config,
        region=region,
        fold_idx=fold_idx,
        data_dir=data_dir,
        original_path=None,
    )

    # Scale features for embedding.
    scaled_features, labels, _scaler = prepare_features_for_visualization(
        original_data, synthetic_data, config
    )

    # UMAP comparison figure.
    fig = plot_umap_comparison(
        scaled_features,
        labels,
        config,
        region=region,
        fold_idx=fold_idx,
    )

    # File name: method[_sample_size]_region_foldN_yearA-yearB.png
    size_part = f"{sample_size}_" if sample_size is not None else ""
    filename = f"{method}_{size_part}{region}_fold{fold_idx + 1}_{years[0]}-{years[1]}.png"
    filepath = output_path / filename

    # Publication-quality save. Settings unified across all methods
    # (previously the CTGAN loop used dpi=600 and transparent=True, which
    # contradicted facecolor='white' and the 300-dpi journal note).
    fig.savefig(
        filepath,
        dpi=300,               # 300 dpi meets most journal requirements
        bbox_inches='tight',   # trim surrounding whitespace
        pad_inches=0.1,        # keep a small margin for readability
        facecolor='white',     # white background
        edgecolor='none',      # no border
        format='png',          # switch to 'pdf' if needed
        transparent=False,     # keep the white facecolor
    )
    plt.close(fig)
    return filepath


def generate_all_plots(
    output_dir: str = "images",
    config: Optional[PlotConfig] = None,
    data_dir: str = "../../data/data_for_modeling",
    base_dir: str = "../../data/data_oversampled"
) -> None:
    """Generate and save every plot combination for the paper.

    Combinations:
        - Regions: incheon, seoul, busan, daegu, daejeon, gwangju (6)
        - Folds: 0, 1, 2 (3)
        - Methods: ctgan, smotenc_ctgan (each with sample sizes
          7000/10000/20000) and smote (no sample size)

    Total: (6 regions x 3 folds x 3 sizes x 2 methods)
           + (6 regions x 3 folds x 1 smote) = 126 plots.

    Args:
        output_dir: Directory the PNG files are written to.
        config: PlotConfig instance; defaults are used when None.
        data_dir: Directory of the original data.
        base_dir: Base directory of the synthetic data.
    """
    if config is None:
        config = PlotConfig()

    # Ensure the output directory exists.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    regions = ['incheon', 'seoul', 'busan', 'daegu', 'daejeon', 'gwangju']
    fold_indices = [0, 1, 2]
    # (method, sample_size) pairs; sample_size is None for methods without one.
    # Order matters: CTGAN variants first, then SMOTE (matches prior runs).
    combos = [
        ('ctgan', 7000),
        ('ctgan', 10000),
        ('ctgan', 20000),
        ('smotenc_ctgan', 7000),
        ('smotenc_ctgan', 10000),
        ('smotenc_ctgan', 20000),
        ('smote', None),
    ]

    # fold_idx -> the two training years encoded in the file name.
    fold_years = [[2018, 2019], [2018, 2020], [2019, 2020]]

    total_plots = len(regions) * len(fold_indices) * len(combos)
    current_plot = 0

    print(f"총 {total_plots}개의 plot을 생성합니다...")
    print("=" * 60)

    for method, sample_size in combos:
        for region in regions:
            for fold_idx in fold_indices:
                current_plot += 1
                try:
                    label = f"{method} (size={sample_size})" if sample_size is not None else method
                    print(f"[{current_plot}/{total_plots}] {label} - {region.upper()} - Fold {fold_idx + 1} 생성 중...")

                    filepath = _render_and_save_plot(
                        method=method,
                        sample_size=sample_size,
                        region=region,
                        fold_idx=fold_idx,
                        config=config,
                        data_dir=data_dir,
                        base_dir=base_dir,
                        output_path=output_path,
                        years=fold_years[fold_idx],
                    )

                    # Report the actual saved path (the original printed a
                    # placeholder here instead of the file name).
                    print(f" ✓ 저장 완료: {filepath}")

                except Exception as e:
                    # One failed combination must not abort the whole batch.
                    print(f" ✗ 오류 발생: {str(e)}")
                    continue

    print("=" * 60)
    print(f"모든 plot 생성 완료! 총 {current_plot}개 파일이 {output_dir}에 저장되었습니다.")
652
+
653
+
654
if __name__ == "__main__":
    # Single-plot run (default behaviour):
    # main()

    # Generate every method/region/fold combination (for the paper):
    generate_all_plots(output_dir="images")
Analysis_code/3.sampled_data_analysis/oversampling_model_hyperparameter.ipynb ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "829c34fa",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "\"\"\"\n",
11
+ "CTGAN ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ ๋ฐ ์ •๋ฆฌ\n",
12
+ "๋…ผ๋ฌธ ์ž‘์„ฑ์šฉ์œผ๋กœ ๋ชจ๋“  ์ €์žฅ๋œ ๋ชจ๋ธ์˜ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.\n",
13
+ "\"\"\"\n",
14
+ "\n",
15
+ "import pandas as pd\n",
16
+ "import numpy as np\n",
17
+ "from pathlib import Path\n",
18
+ "from ctgan import CTGAN\n",
19
+ "import re\n",
20
+ "from typing import Dict, Any\n",
21
+ "import warnings\n",
22
+ "warnings.filterwarnings('ignore')\n"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 3,
28
+ "id": "98679ba3",
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "์ด 216๊ฐœ์˜ ๋ชจ๋ธ ํŒŒ์ผ์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.\n",
36
+ "\n",
37
+ "์ฒ˜์Œ 5๊ฐœ ํŒŒ์ผ ์˜ˆ์‹œ:\n",
38
+ " - ctgan_only_10000_1_busan_class0.pkl\n",
39
+ " - ctgan_only_10000_1_busan_class1.pkl\n",
40
+ " - ctgan_only_10000_1_daegu_class0.pkl\n",
41
+ " - ctgan_only_10000_1_daegu_class1.pkl\n",
42
+ " - ctgan_only_10000_1_daejeon_class0.pkl\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "# ๋ชจ๋ธ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •\n",
48
+ "model_dir = Path(\"../save_model/oversampling_models\")\n",
49
+ "\n",
50
+ "# ๋ชจ๋ธ ํŒŒ์ผ ๋ชฉ๋ก ํ™•์ธ\n",
51
+ "model_files = sorted(list(model_dir.glob(\"*.pkl\")))\n",
52
+ "print(f\"์ด {len(model_files)}๊ฐœ์˜ ๋ชจ๋ธ ํŒŒ์ผ์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.\")\n",
53
+ "print(f\"\\n์ฒ˜์Œ 5๊ฐœ ํŒŒ์ผ ์˜ˆ์‹œ:\")\n",
54
+ "for f in model_files[:5]:\n",
55
+ " print(f\" - {f.name}\")\n"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "id": "97cde9e3",
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "CTGAN ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ:\n",
69
+ " embedding_dim: 64\n",
70
+ " generator_dim: (64, 64)\n",
71
+ " discriminator_dim: (128, 128)\n",
72
+ " batch_size: 256\n",
73
+ " epochs: 300\n",
74
+ " pac: 8\n",
75
+ " discriminator_steps: 2\n",
76
+ " generator_lr: 0.0002\n",
77
+ " discriminator_lr: 0.0002\n",
78
+ " generator_decay: 1e-06\n",
79
+ " discriminator_decay: 1e-06\n",
80
+ "\n",
81
+ "๋”•์…”๋„ˆ๋ฆฌ ํ˜•ํƒœ:\n",
82
+ "{'embedding_dim': 64, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'batch_size': 256, 'epochs': 300, 'pac': 8, 'discriminator_steps': 2, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002, 'generator_decay': 1e-06, 'discriminator_decay': 1e-06}\n"
83
+ ]
84
+ }
85
+ ],
86
+ "source": [
87
+ "# CTGAN ๋ชจ๋ธ ๋กœ๋“œ ๋ฐ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํ™•์ธ ์˜ˆ์ œ\n",
88
+ "model = CTGAN.load(\"../save_model/oversampling_models/ctgan_only_10000_1_busan_class0.pkl\")\n",
89
+ "\n",
90
+ "# CTGAN ๋ชจ๋ธ์˜ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋Š” ๋‚ด๋ถ€ ์†์„ฑ(_๋กœ ์‹œ์ž‘)์— ์ €์žฅ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค\n",
91
+ "print(\"CTGAN ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ:\")\n",
92
+ "print(f\" embedding_dim: {model._embedding_dim}\")\n",
93
+ "print(f\" generator_dim: {model._generator_dim}\")\n",
94
+ "print(f\" discriminator_dim: {model._discriminator_dim}\")\n",
95
+ "print(f\" batch_size: {model._batch_size}\")\n",
96
+ "print(f\" epochs: {model._epochs}\")\n",
97
+ "print(f\" pac: {model.pac}\") # pac๋Š” ๊ณต๊ฐœ ์†์„ฑ์œผ๋กœ๋„ ์ ‘๊ทผ ๊ฐ€๋Šฅ\n",
98
+ "print(f\" discriminator_steps: {model._discriminator_steps}\")\n",
99
+ "print(f\" generator_lr: {model._generator_lr}\")\n",
100
+ "print(f\" discriminator_lr: {model._discriminator_lr}\")\n",
101
+ "print(f\" generator_decay: {model._generator_decay}\")\n",
102
+ "print(f\" discriminator_decay: {model._discriminator_decay}\")\n",
103
+ "\n",
104
+ "# ๋ชจ๋“  ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ๋”•์…”๋„ˆ๋ฆฌ๋กœ ์ถ”์ถœํ•˜๋Š” ๋ฐฉ๋ฒ•\n",
105
+ "hyperparams = {\n",
106
+ " 'embedding_dim': model._embedding_dim,\n",
107
+ " 'generator_dim': model._generator_dim,\n",
108
+ " 'discriminator_dim': model._discriminator_dim,\n",
109
+ " 'batch_size': model._batch_size,\n",
110
+ " 'epochs': model._epochs,\n",
111
+ " 'pac': model.pac,\n",
112
+ " 'discriminator_steps': model._discriminator_steps,\n",
113
+ " 'generator_lr': model._generator_lr,\n",
114
+ " 'discriminator_lr': model._discriminator_lr,\n",
115
+ " 'generator_decay': model._generator_decay,\n",
116
+ " 'discriminator_decay': model._discriminator_decay,\n",
117
+ "}\n",
118
+ "print(\"\\n๋”•์…”๋„ˆ๋ฆฌ ํ˜•ํƒœ:\")\n",
119
+ "print(hyperparams)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 5,
125
+ "id": "e3631f3b",
126
+ "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "name": "stdout",
130
+ "output_type": "stream",
131
+ "text": [
132
+ "ํ…Œ์ŠคํŠธ ํŒŒ์ผ: ctgan_only_10000_1_busan_class0.pkl\n",
133
+ "ํŒŒ์‹ฑ ๊ฒฐ๊ณผ: {'method': 'ctgan', 'sample_size': 10000, 'fold': 1, 'region': 'busan', 'class': 0}\n",
134
+ "ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ: {'embedding_dim': 64, 'generator_dim': '(64, 64)', 'discriminator_dim': '(128, 128)', 'pac': 8, 'batch_size': 256, 'discriminator_steps': 2, 'epochs': 300, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002, 'generator_decay': 1e-06, 'discriminator_decay': 1e-06}\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "def parse_model_filename(filename: str) -> Dict[str, Any]:\n",
140
+ " \"\"\"\n",
141
+ " ๋ชจ๋ธ ํŒŒ์ผ๋ช…์—์„œ ์ •๋ณด๋ฅผ ํŒŒ์‹ฑํ•ฉ๋‹ˆ๋‹ค.\n",
142
+ " \n",
143
+ " ํŒŒ์ผ๋ช… ํŒจํ„ด:\n",
144
+ " - ctgan_only_{sample_size}_{fold}_{region}_class{0|1}.pkl\n",
145
+ " - smotenc_ctgan_{sample_size}_{fold}_{region}_class{0|1}.pkl\n",
146
+ " \n",
147
+ " Returns:\n",
148
+ " ํŒŒ์‹ฑ๋œ ์ •๋ณด ๋”•์…”๋„ˆ๋ฆฌ\n",
149
+ " \"\"\"\n",
150
+ " # ํŒŒ์ผ๋ช…์—์„œ ํ™•์žฅ์ž ์ œ๊ฑฐ\n",
151
+ " name = filename.replace('.pkl', '')\n",
152
+ " \n",
153
+ " # ํŒจํ„ด ๋งค์นญ\n",
154
+ " if name.startswith('ctgan_only_'):\n",
155
+ " method = 'ctgan'\n",
156
+ " parts = name.replace('ctgan_only_', '').split('_')\n",
157
+ " elif name.startswith('smotenc_ctgan_'):\n",
158
+ " method = 'smotenc_ctgan'\n",
159
+ " parts = name.replace('smotenc_ctgan_', '').split('_')\n",
160
+ " else:\n",
161
+ " return None\n",
162
+ " \n",
163
+ " # sample_size, fold, region, class ์ถ”์ถœ\n",
164
+ " sample_size = int(parts[0])\n",
165
+ " fold = int(parts[1])\n",
166
+ " region = parts[2]\n",
167
+ " class_label = int(parts[3].replace('class', ''))\n",
168
+ " \n",
169
+ " return {\n",
170
+ " 'method': method,\n",
171
+ " 'sample_size': sample_size,\n",
172
+ " 'fold': fold,\n",
173
+ " 'region': region,\n",
174
+ " 'class': class_label\n",
175
+ " }\n",
176
+ "\n",
177
+ "\n",
178
+ "def extract_hyperparameters(model_path: Path) -> Dict[str, Any]:\n",
179
+ " \"\"\"\n",
180
+ " CTGAN ๋ชจ๋ธ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.\n",
181
+ " \n",
182
+ " CTGAN ๋ชจ๋ธ์˜ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋Š” ๋‚ด๋ถ€ ์†์„ฑ(_๋กœ ์‹œ์ž‘)์— ์ €์žฅ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค:\n",
183
+ " - _embedding_dim: ์ž„๋ฒ ๋”ฉ ์ฐจ์›\n",
184
+ " - _generator_dim: ์ƒ์„ฑ๊ธฐ ๋„คํŠธ์›Œํฌ ์ฐจ์› (ํŠœํ”Œ)\n",
185
+ " - _discriminator_dim: ํŒ๋ณ„๊ธฐ ๋„คํŠธ์›Œํฌ ์ฐจ์› (ํŠœํ”Œ)\n",
186
+ " - _batch_size: ๋ฐฐ์น˜ ํฌ๊ธฐ\n",
187
+ " - _epochs: ์—ํฌํฌ ์ˆ˜\n",
188
+ " - _pac: PAC ํŒŒ๋ผ๋ฏธํ„ฐ (๋˜๋Š” pac ์†์„ฑ์œผ๋กœ ์ ‘๊ทผ ๊ฐ€๋Šฅ)\n",
189
+ " - _generator_lr: ์ƒ์„ฑ๊ธฐ ํ•™์Šต๋ฅ \n",
190
+ " - _discriminator_lr: ํŒ๋ณ„๊ธฐ ํ•™์Šต๋ฅ \n",
191
+ " - _discriminator_steps: ํŒ๋ณ„๊ธฐ ์—…๋ฐ์ดํŠธ ์Šคํ… ์ˆ˜\n",
192
+ " \n",
193
+ " Args:\n",
194
+ " model_path: ๋ชจ๋ธ ํŒŒ์ผ ๊ฒฝ๋กœ\n",
195
+ " \n",
196
+ " Returns:\n",
197
+ " ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ\n",
198
+ " \"\"\"\n",
199
+ " try:\n",
200
+ " # ๋ชจ๋ธ ๋กœ๋“œ\n",
201
+ " model = CTGAN.load(str(model_path))\n",
202
+ " \n",
203
+ " # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ (๋‚ด๋ถ€ ์†์„ฑ ์‚ฌ์šฉ)\n",
204
+ " hyperparams = {\n",
205
+ " 'embedding_dim': getattr(model, '_embedding_dim', None),\n",
206
+ " 'generator_dim': str(getattr(model, '_generator_dim', None)), # ํŠœํ”Œ์„ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜\n",
207
+ " 'discriminator_dim': str(getattr(model, '_discriminator_dim', None)), # ํŠœํ”Œ์„ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜\n",
208
+ " 'pac': getattr(model, 'pac', None) or getattr(model, '_pac', None), # pac ์†์„ฑ ๋˜๋Š” _pac ์†์„ฑ\n",
209
+ " 'batch_size': getattr(model, '_batch_size', None),\n",
210
+ " 'discriminator_steps': getattr(model, '_discriminator_steps', None),\n",
211
+ " 'epochs': getattr(model, '_epochs', None),\n",
212
+ " 'generator_lr': getattr(model, '_generator_lr', None),\n",
213
+ " 'discriminator_lr': getattr(model, '_discriminator_lr', None),\n",
214
+ " 'generator_decay': getattr(model, '_generator_decay', None),\n",
215
+ " 'discriminator_decay': getattr(model, '_discriminator_decay', None),\n",
216
+ " }\n",
217
+ " \n",
218
+ " return hyperparams\n",
219
+ " except Exception as e:\n",
220
+ " print(f\"Error loading {model_path.name}: {str(e)}\")\n",
221
+ " import traceback\n",
222
+ " print(traceback.format_exc())\n",
223
+ " return None\n",
224
+ "\n",
225
+ "\n",
226
+ "# ํ…Œ์ŠคํŠธ: ์ฒซ ๋ฒˆ์งธ ๋ชจ๋ธ ํŒŒ์ผ๋กœ ํ…Œ์ŠคํŠธ\n",
227
+ "if len(model_files) > 0:\n",
228
+ " test_file = model_files[0]\n",
229
+ " print(f\"ํ…Œ์ŠคํŠธ ํŒŒ์ผ: {test_file.name}\")\n",
230
+ " parsed = parse_model_filename(test_file.name)\n",
231
+ " print(f\"ํŒŒ์‹ฑ ๊ฒฐ๊ณผ: {parsed}\")\n",
232
+ " hyperparams = extract_hyperparameters(test_file)\n",
233
+ " print(f\"ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ: {hyperparams}\")\n"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": 6,
239
+ "id": "9fc03ebe",
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "๋ชจ๋“  ๋ชจ๋ธ ํŒŒ์ผ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ ์ค‘...\n",
247
+ "================================================================================\n",
248
+ "[20/216] ์ง„ํ–‰ ์ค‘... (20๊ฐœ ์„ฑ๊ณต)\n",
249
+ "[40/216] ์ง„ํ–‰ ์ค‘... (40๊ฐœ ์„ฑ๊ณต)\n",
250
+ "[60/216] ์ง„ํ–‰ ์ค‘... (60๊ฐœ ์„ฑ๊ณต)\n",
251
+ "[80/216] ์ง„ํ–‰ ์ค‘... (80๊ฐœ ์„ฑ๊ณต)\n",
252
+ "[100/216] ์ง„ํ–‰ ์ค‘... (100๊ฐœ ์„ฑ๊ณต)\n",
253
+ "[120/216] ์ง„ํ–‰ ์ค‘... (120๊ฐœ ์„ฑ๊ณต)\n",
254
+ "[140/216] ์ง„ํ–‰ ์ค‘... (140๊ฐœ ์„ฑ๊ณต)\n",
255
+ "[160/216] ์ง„ํ–‰ ์ค‘... (160๊ฐœ ์„ฑ๊ณต)\n",
256
+ "[180/216] ์ง„ํ–‰ ์ค‘... (180๊ฐœ ์„ฑ๊ณต)\n",
257
+ "[200/216] ์ง„ํ–‰ ์ค‘... (200๊ฐœ ์„ฑ๊ณต)\n",
258
+ "================================================================================\n",
259
+ "์™„๋ฃŒ! ์ด 216๊ฐœ์˜ ๋ชจ๋ธ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ถ”์ถœํ–ˆ์Šต๋‹ˆ๋‹ค.\n"
260
+ ]
261
+ }
262
+ ],
263
+ "source": [
264
+ "# ๋ชจ๋“  ๋ชจ๋ธ ํŒŒ์ผ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ\n",
265
+ "all_results = []\n",
266
+ "\n",
267
+ "print(\"๋ชจ๋“  ๋ชจ๋ธ ํŒŒ์ผ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ ์ค‘...\")\n",
268
+ "print(\"=\" * 80)\n",
269
+ "\n",
270
+ "for i, model_file in enumerate(model_files, 1):\n",
271
+ " # ํŒŒ์ผ๋ช… ํŒŒ์‹ฑ\n",
272
+ " parsed_info = parse_model_filename(model_file.name)\n",
273
+ " if parsed_info is None:\n",
274
+ " print(f\"[{i}/{len(model_files)}] ์Šคํ‚ต: {model_file.name} (ํŒŒ์ผ๋ช… ํŒจํ„ด ๋ถˆ์ผ์น˜)\")\n",
275
+ " continue\n",
276
+ " \n",
277
+ " # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ\n",
278
+ " hyperparams = extract_hyperparameters(model_file)\n",
279
+ " if hyperparams is None:\n",
280
+ " print(f\"[{i}/{len(model_files)}] ์‹คํŒจ: {model_file.name}\")\n",
281
+ " continue\n",
282
+ " \n",
283
+ " # ์ •๋ณด ํ•ฉ์น˜๊ธฐ\n",
284
+ " result = {**parsed_info, **hyperparams}\n",
285
+ " result['filename'] = model_file.name\n",
286
+ " all_results.append(result)\n",
287
+ " \n",
288
+ " if i % 20 == 0:\n",
289
+ " print(f\"[{i}/{len(model_files)}] ์ง„ํ–‰ ์ค‘... ({len(all_results)}๊ฐœ ์„ฑ๊ณต)\")\n",
290
+ "\n",
291
+ "print(\"=\" * 80)\n",
292
+ "print(f\"์™„๋ฃŒ! ์ด {len(all_results)}๊ฐœ์˜ ๋ชจ๋ธ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ถ”์ถœํ–ˆ์Šต๋‹ˆ๋‹ค.\")\n"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 7,
298
+ "id": "223e2b49",
299
+ "metadata": {},
300
+ "outputs": [
301
+ {
302
+ "name": "stdout",
303
+ "output_type": "stream",
304
+ "text": [
305
+ "์ด 216๊ฐœ์˜ ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๊ฐ€ ์ •๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n",
306
+ "\n",
307
+ "์ปฌ๋Ÿผ: ['method', 'sample_size', 'fold', 'region', 'class', 'embedding_dim', 'generator_dim', 'discriminator_dim', 'pac', 'batch_size', 'discriminator_steps', 'epochs', 'generator_lr', 'discriminator_lr', 'filename']\n",
308
+ "\n",
309
+ "์ฒ˜์Œ 5๊ฐœ ํ–‰:\n"
310
+ ]
311
+ },
312
+ {
313
+ "data": {
314
+ "text/html": [
315
+ "<div>\n",
316
+ "<style scoped>\n",
317
+ " .dataframe tbody tr th:only-of-type {\n",
318
+ " vertical-align: middle;\n",
319
+ " }\n",
320
+ "\n",
321
+ " .dataframe tbody tr th {\n",
322
+ " vertical-align: top;\n",
323
+ " }\n",
324
+ "\n",
325
+ " .dataframe thead th {\n",
326
+ " text-align: right;\n",
327
+ " }\n",
328
+ "</style>\n",
329
+ "<table border=\"1\" class=\"dataframe\">\n",
330
+ " <thead>\n",
331
+ " <tr style=\"text-align: right;\">\n",
332
+ " <th></th>\n",
333
+ " <th>method</th>\n",
334
+ " <th>sample_size</th>\n",
335
+ " <th>fold</th>\n",
336
+ " <th>region</th>\n",
337
+ " <th>class</th>\n",
338
+ " <th>embedding_dim</th>\n",
339
+ " <th>generator_dim</th>\n",
340
+ " <th>discriminator_dim</th>\n",
341
+ " <th>pac</th>\n",
342
+ " <th>batch_size</th>\n",
343
+ " <th>discriminator_steps</th>\n",
344
+ " <th>epochs</th>\n",
345
+ " <th>generator_lr</th>\n",
346
+ " <th>discriminator_lr</th>\n",
347
+ " <th>filename</th>\n",
348
+ " </tr>\n",
349
+ " </thead>\n",
350
+ " <tbody>\n",
351
+ " <tr>\n",
352
+ " <th>0</th>\n",
353
+ " <td>ctgan</td>\n",
354
+ " <td>7000</td>\n",
355
+ " <td>1</td>\n",
356
+ " <td>busan</td>\n",
357
+ " <td>0</td>\n",
358
+ " <td>78</td>\n",
359
+ " <td>(128, 128)</td>\n",
360
+ " <td>(128, 128)</td>\n",
361
+ " <td>8</td>\n",
362
+ " <td>256</td>\n",
363
+ " <td>3</td>\n",
364
+ " <td>300</td>\n",
365
+ " <td>0.0002</td>\n",
366
+ " <td>0.0002</td>\n",
367
+ " <td>ctgan_only_7000_1_busan_class0.pkl</td>\n",
368
+ " </tr>\n",
369
+ " <tr>\n",
370
+ " <th>1</th>\n",
371
+ " <td>ctgan</td>\n",
372
+ " <td>7000</td>\n",
373
+ " <td>1</td>\n",
374
+ " <td>busan</td>\n",
375
+ " <td>1</td>\n",
376
+ " <td>269</td>\n",
377
+ " <td>(256, 256)</td>\n",
378
+ " <td>(128, 128)</td>\n",
379
+ " <td>4</td>\n",
380
+ " <td>1024</td>\n",
381
+ " <td>1</td>\n",
382
+ " <td>300</td>\n",
383
+ " <td>0.0002</td>\n",
384
+ " <td>0.0002</td>\n",
385
+ " <td>ctgan_only_7000_1_busan_class1.pkl</td>\n",
386
+ " </tr>\n",
387
+ " <tr>\n",
388
+ " <th>2</th>\n",
389
+ " <td>ctgan</td>\n",
390
+ " <td>7000</td>\n",
391
+ " <td>1</td>\n",
392
+ " <td>daegu</td>\n",
393
+ " <td>0</td>\n",
394
+ " <td>121</td>\n",
395
+ " <td>(128, 128)</td>\n",
396
+ " <td>(64, 64)</td>\n",
397
+ " <td>4</td>\n",
398
+ " <td>64</td>\n",
399
+ " <td>2</td>\n",
400
+ " <td>300</td>\n",
401
+ " <td>0.0002</td>\n",
402
+ " <td>0.0002</td>\n",
403
+ " <td>ctgan_only_7000_1_daegu_class0.pkl</td>\n",
404
+ " </tr>\n",
405
+ " <tr>\n",
406
+ " <th>3</th>\n",
407
+ " <td>ctgan</td>\n",
408
+ " <td>7000</td>\n",
409
+ " <td>1</td>\n",
410
+ " <td>daegu</td>\n",
411
+ " <td>1</td>\n",
412
+ " <td>217</td>\n",
413
+ " <td>(128, 128)</td>\n",
414
+ " <td>(128, 128)</td>\n",
415
+ " <td>4</td>\n",
416
+ " <td>256</td>\n",
417
+ " <td>5</td>\n",
418
+ " <td>300</td>\n",
419
+ " <td>0.0002</td>\n",
420
+ " <td>0.0002</td>\n",
421
+ " <td>ctgan_only_7000_1_daegu_class1.pkl</td>\n",
422
+ " </tr>\n",
423
+ " <tr>\n",
424
+ " <th>4</th>\n",
425
+ " <td>ctgan</td>\n",
426
+ " <td>7000</td>\n",
427
+ " <td>1</td>\n",
428
+ " <td>daejeon</td>\n",
429
+ " <td>0</td>\n",
430
+ " <td>101</td>\n",
431
+ " <td>(128, 128)</td>\n",
432
+ " <td>(128, 128)</td>\n",
433
+ " <td>4</td>\n",
434
+ " <td>128</td>\n",
435
+ " <td>2</td>\n",
436
+ " <td>300</td>\n",
437
+ " <td>0.0002</td>\n",
438
+ " <td>0.0002</td>\n",
439
+ " <td>ctgan_only_7000_1_daejeon_class0.pkl</td>\n",
440
+ " </tr>\n",
441
+ " </tbody>\n",
442
+ "</table>\n",
443
+ "</div>"
444
+ ],
445
+ "text/plain": [
446
+ " method sample_size fold region class embedding_dim generator_dim \\\n",
447
+ "0 ctgan 7000 1 busan 0 78 (128, 128) \n",
448
+ "1 ctgan 7000 1 busan 1 269 (256, 256) \n",
449
+ "2 ctgan 7000 1 daegu 0 121 (128, 128) \n",
450
+ "3 ctgan 7000 1 daegu 1 217 (128, 128) \n",
451
+ "4 ctgan 7000 1 daejeon 0 101 (128, 128) \n",
452
+ "\n",
453
+ " discriminator_dim pac batch_size discriminator_steps epochs \\\n",
454
+ "0 (128, 128) 8 256 3 300 \n",
455
+ "1 (128, 128) 4 1024 1 300 \n",
456
+ "2 (64, 64) 4 64 2 300 \n",
457
+ "3 (128, 128) 4 256 5 300 \n",
458
+ "4 (128, 128) 4 128 2 300 \n",
459
+ "\n",
460
+ " generator_lr discriminator_lr filename \n",
461
+ "0 0.0002 0.0002 ctgan_only_7000_1_busan_class0.pkl \n",
462
+ "1 0.0002 0.0002 ctgan_only_7000_1_busan_class1.pkl \n",
463
+ "2 0.0002 0.0002 ctgan_only_7000_1_daegu_class0.pkl \n",
464
+ "3 0.0002 0.0002 ctgan_only_7000_1_daegu_class1.pkl \n",
465
+ "4 0.0002 0.0002 ctgan_only_7000_1_daejeon_class0.pkl "
466
+ ]
467
+ },
468
+ "execution_count": 7,
469
+ "metadata": {},
470
+ "output_type": "execute_result"
471
+ }
472
+ ],
473
+ "source": [
474
+ "# DataFrame์œผ๋กœ ๋ณ€ํ™˜\n",
475
+ "df_hyperparams = pd.DataFrame(all_results)\n",
476
+ "\n",
477
+ "# ์ปฌ๋Ÿผ ์ˆœ์„œ ์ •๋ฆฌ\n",
478
+ "column_order = [\n",
479
+ " 'method', 'sample_size', 'fold', 'region', 'class',\n",
480
+ " 'embedding_dim', 'generator_dim', 'discriminator_dim',\n",
481
+ " 'pac', 'batch_size', 'discriminator_steps',\n",
482
+ " 'epochs', 'generator_lr', 'discriminator_lr',\n",
483
+ " 'filename'\n",
484
+ "]\n",
485
+ "df_hyperparams = df_hyperparams[column_order]\n",
486
+ "\n",
487
+ "# ์ •๋ ฌ: method -> sample_size -> fold -> region -> class\n",
488
+ "df_hyperparams = df_hyperparams.sort_values(\n",
489
+ " ['method', 'sample_size', 'fold', 'region', 'class']\n",
490
+ ").reset_index(drop=True)\n",
491
+ "\n",
492
+ "print(f\"์ด {len(df_hyperparams)}๊ฐœ์˜ ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๊ฐ€ ์ •๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\")\n",
493
+ "print(f\"\\n์ปฌ๋Ÿผ: {list(df_hyperparams.columns)}\")\n",
494
+ "print(f\"\\n์ฒ˜์Œ 5๊ฐœ ํ–‰:\")\n",
495
+ "df_hyperparams.head()\n"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 17,
501
+ "id": "9d3a8a65",
502
+ "metadata": {},
503
+ "outputs": [],
504
+ "source": [
505
+ "df_hyperparams.sort_values(by=['region','method','sample_size','fold','class'], inplace=True)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 24,
511
+ "id": "f92f352e",
512
+ "metadata": {},
513
+ "outputs": [
514
+ {
515
+ "name": "stdout",
516
+ "output_type": "stream",
517
+ "text": [
518
+ "ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋ฐ์ดํ„ฐ๊ฐ€ 'oversampling_models_hyperparameters_all.csv'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n"
519
+ ]
520
+ }
521
+ ],
522
+ "source": [
523
+ "# CSV๋กœ ์ €์žฅ (์„ ํƒ์‚ฌํ•ญ)\n",
524
+ "output_csv = \"oversampling_models_hyperparameters_all.csv\"\n",
525
+ "df_hyperparams.to_csv(output_csv, index=False, encoding='utf-8-sig')\n",
526
+ "print(f\"ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋ฐ์ดํ„ฐ๊ฐ€ '{output_csv}'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\")"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "code",
531
+ "execution_count": 25,
532
+ "id": "8ee1c56a",
533
+ "metadata": {},
534
+ "outputs": [
535
+ {
536
+ "data": {
537
+ "text/plain": [
538
+ "ctgan 108\n",
539
+ "smotenc_ctgan 108\n",
540
+ "Name: method, dtype: int64"
541
+ ]
542
+ },
543
+ "execution_count": 25,
544
+ "metadata": {},
545
+ "output_type": "execute_result"
546
+ }
547
+ ],
548
+ "source": [
549
+ "df_hyperparams['method'].value_counts()"
550
+ ]
551
+ }
552
+ ],
553
+ "metadata": {
554
+ "kernelspec": {
555
+ "display_name": "py39",
556
+ "language": "python",
557
+ "name": "python3"
558
+ },
559
+ "language_info": {
560
+ "codemirror_mode": {
561
+ "name": "ipython",
562
+ "version": 3
563
+ },
564
+ "file_extension": ".py",
565
+ "mimetype": "text/x-python",
566
+ "name": "python",
567
+ "nbconvert_exporter": "python",
568
+ "pygments_lexer": "ipython3",
569
+ "version": "3.9.18"
570
+ }
571
+ },
572
+ "nbformat": 4,
573
+ "nbformat_minor": 5
574
+ }
Analysis_code/4.sampling_data_test/analysis.ipynb ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "70effd7a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 3,
17
+ "id": "f38ce7d1",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "df= pd.read_csv(\"../../data/oversampled_data_test_for_model/combined_sampled_data_test.csv\")"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 4,
27
+ "id": "2bae91e4",
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "data": {
32
+ "text/html": [
33
+ "<div>\n",
34
+ "<style scoped>\n",
35
+ " .dataframe tbody tr th:only-of-type {\n",
36
+ " vertical-align: middle;\n",
37
+ " }\n",
38
+ "\n",
39
+ " .dataframe tbody tr th {\n",
40
+ " vertical-align: top;\n",
41
+ " }\n",
42
+ "\n",
43
+ " .dataframe thead th {\n",
44
+ " text-align: right;\n",
45
+ " }\n",
46
+ "</style>\n",
47
+ "<table border=\"1\" class=\"dataframe\">\n",
48
+ " <thead>\n",
49
+ " <tr style=\"text-align: right;\">\n",
50
+ " <th></th>\n",
51
+ " <th>region</th>\n",
52
+ " <th>model</th>\n",
53
+ " <th>data_sample</th>\n",
54
+ " <th>CSI</th>\n",
55
+ " <th>MCC</th>\n",
56
+ " <th>Accuracy</th>\n",
57
+ " <th>fold_csi</th>\n",
58
+ " </tr>\n",
59
+ " </thead>\n",
60
+ " <tbody>\n",
61
+ " <tr>\n",
62
+ " <th>0</th>\n",
63
+ " <td>seoul</td>\n",
64
+ " <td>LightGBM</td>\n",
65
+ " <td>pure</td>\n",
66
+ " <td>0.505041</td>\n",
67
+ " <td>0.646992</td>\n",
68
+ " <td>0.936174</td>\n",
69
+ " <td>[[0.46595932802825235, 0.5771195097037204, 0.4...</td>\n",
70
+ " </tr>\n",
71
+ " <tr>\n",
72
+ " <th>1</th>\n",
73
+ " <td>busan</td>\n",
74
+ " <td>LightGBM</td>\n",
75
+ " <td>pure</td>\n",
76
+ " <td>0.430188</td>\n",
77
+ " <td>0.600801</td>\n",
78
+ " <td>0.956971</td>\n",
79
+ " <td>[[0.32824427480911017, 0.4782608695651431, 0.4...</td>\n",
80
+ " </tr>\n",
81
+ " </tbody>\n",
82
+ "</table>\n",
83
+ "</div>"
84
+ ],
85
+ "text/plain": [
86
+ " region model data_sample CSI MCC Accuracy \\\n",
87
+ "0 seoul LightGBM pure 0.505041 0.646992 0.936174 \n",
88
+ "1 busan LightGBM pure 0.430188 0.600801 0.956971 \n",
89
+ "\n",
90
+ " fold_csi \n",
91
+ "0 [[0.46595932802825235, 0.5771195097037204, 0.4... \n",
92
+ "1 [[0.32824427480911017, 0.4782608695651431, 0.4... "
93
+ ]
94
+ },
95
+ "execution_count": 4,
96
+ "metadata": {},
97
+ "output_type": "execute_result"
98
+ }
99
+ ],
100
+ "source": [
101
+ "df.head(2)"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 5,
107
+ "id": "6893a958",
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "data": {
112
+ "text/html": [
113
+ "<div>\n",
114
+ "<style scoped>\n",
115
+ " .dataframe tbody tr th:only-of-type {\n",
116
+ " vertical-align: middle;\n",
117
+ " }\n",
118
+ "\n",
119
+ " .dataframe tbody tr th {\n",
120
+ " vertical-align: top;\n",
121
+ " }\n",
122
+ "\n",
123
+ " .dataframe thead th {\n",
124
+ " text-align: right;\n",
125
+ " }\n",
126
+ "</style>\n",
127
+ "<table border=\"1\" class=\"dataframe\">\n",
128
+ " <thead>\n",
129
+ " <tr style=\"text-align: right;\">\n",
130
+ " <th></th>\n",
131
+ " <th>region</th>\n",
132
+ " <th>model</th>\n",
133
+ " <th>data_sample</th>\n",
134
+ " <th>CSI</th>\n",
135
+ " </tr>\n",
136
+ " </thead>\n",
137
+ " <tbody>\n",
138
+ " <tr>\n",
139
+ " <th>0</th>\n",
140
+ " <td>busan</td>\n",
141
+ " <td>LightGBM</td>\n",
142
+ " <td>ctgan10000</td>\n",
143
+ " <td>0.467663</td>\n",
144
+ " </tr>\n",
145
+ " <tr>\n",
146
+ " <th>1</th>\n",
147
+ " <td>daegu</td>\n",
148
+ " <td>XGBoost</td>\n",
149
+ " <td>smote</td>\n",
150
+ " <td>0.454066</td>\n",
151
+ " </tr>\n",
152
+ " <tr>\n",
153
+ " <th>2</th>\n",
154
+ " <td>daejeon</td>\n",
155
+ " <td>LightGBM</td>\n",
156
+ " <td>smote</td>\n",
157
+ " <td>0.521335</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>3</th>\n",
161
+ " <td>gwangju</td>\n",
162
+ " <td>LightGBM</td>\n",
163
+ " <td>smote</td>\n",
164
+ " <td>0.522731</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>4</th>\n",
168
+ " <td>incheon</td>\n",
169
+ " <td>XGBoost</td>\n",
170
+ " <td>smote</td>\n",
171
+ " <td>0.589146</td>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>5</th>\n",
175
+ " <td>seoul</td>\n",
176
+ " <td>XGBoost</td>\n",
177
+ " <td>smote</td>\n",
178
+ " <td>0.582266</td>\n",
179
+ " </tr>\n",
180
+ " </tbody>\n",
181
+ "</table>\n",
182
+ "</div>"
183
+ ],
184
+ "text/plain": [
185
+ " region model data_sample CSI\n",
186
+ "0 busan LightGBM ctgan10000 0.467663\n",
187
+ "1 daegu XGBoost smote 0.454066\n",
188
+ "2 daejeon LightGBM smote 0.521335\n",
189
+ "3 gwangju LightGBM smote 0.522731\n",
190
+ "4 incheon XGBoost smote 0.589146\n",
191
+ "5 seoul XGBoost smote 0.582266"
192
+ ]
193
+ },
194
+ "execution_count": 5,
195
+ "metadata": {},
196
+ "output_type": "execute_result"
197
+ }
198
+ ],
199
+ "source": [
200
+ "# ์ง€์—ญ๋ณ„๋กœ CSI๊ฐ€ ๊ฐ€์žฅ ๋†’์€ model๊ณผ data_sample ์กฐํ•ฉ ๋ณด๊ธฐ\n",
201
+ "top_csi_per_region = df.loc[df.groupby('region')['CSI'].idxmax()][['region', 'model', 'data_sample', 'CSI']]\n",
202
+ "top_csi_per_region = top_csi_per_region.sort_values('region').reset_index(drop=True)\n",
203
+ "top_csi_per_region"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "2942ba86",
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": []
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "id": "d55af59c",
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": []
221
+ }
222
+ ],
223
+ "metadata": {
224
+ "kernelspec": {
225
+ "display_name": "py39",
226
+ "language": "python",
227
+ "name": "python3"
228
+ },
229
+ "language_info": {
230
+ "codemirror_mode": {
231
+ "name": "ipython",
232
+ "version": 3
233
+ },
234
+ "file_extension": ".py",
235
+ "mimetype": "text/x-python",
236
+ "name": "python",
237
+ "nbconvert_exporter": "python",
238
+ "pygments_lexer": "ipython3",
239
+ "version": "3.9.18"
240
+ }
241
+ },
242
+ "nbformat": 4,
243
+ "nbformat_minor": 5
244
+ }
Analysis_code/4.sampling_data_test/lgb_sampled_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/4.sampling_data_test/xgb_sampled_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_busan.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="busan"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_busan_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="busan",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_daegu.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="daegu"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_daegu_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="daegu",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_daejeon.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="daejeon"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_daejeon_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="daejeon",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_gwangju.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="gwangju"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_gwangju_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="gwangju",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_incheon.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="incheon"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_incheon_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="incheon",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/deepgbm_pure_seoul.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *

# Fix Python and NumPy seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


def print_trial_callback(study, trial):
    """Print a detailed summary after each trial, including the best value so far."""
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f"  Value (CSI): {trial.value:.6f}")
    else:
        print(f"  Value: {trial.value}")
    print(f"  Parameters: {trial.params}")
    # BUG FIX: study.best_value / study.best_trial raise ValueError while no
    # trial has completed successfully (e.g. the very first trial failed), and
    # best_value is never None — so the former `is not None` guard could not
    # prevent a crash. Guard the access with the exception Optuna documents.
    try:
        print(f"  Best Value (CSI): {study.best_value:.6f}")
        print(f"  Best Trial: {study.best_trial.number}")
        print(f"  Best Parameters: {study.best_params}")
    except ValueError:
        print("  Best Value: (no completed trial yet)")
    print(f"{'='*80}\n")


# 1. Create the study with direction='maximize' (higher CSI is better).
study = optuna.create_study(
    direction="maximize",
    # Observe the first 10 warm-up steps before pruning kicks in.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)

# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="seoul"),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Summarize the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every successful trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f"  - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f"  - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f"  - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f"  - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f"  - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f"  - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f"  - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the study object; the base directory is resolved relative to
    # this file (not the CWD) so the script works from any launch directory.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_pure_seoul_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Retrain and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_paths = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="seoul",
        data_sample='pure',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ:")
    for path in model_paths:
        print(f"  - {path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# ์ •์ƒ ์ข…๋ฃŒ — exit 0 so batch runners treat this script as successful.
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_pure/utils.py ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ import numpy as np
5
+ import random
6
+ import os
7
+ import copy
8
+ from sklearn.preprocessing import QuantileTransformer, LabelEncoder
9
+ from torch.utils.data import DataLoader, TensorDataset
10
+ from sklearn.metrics import confusion_matrix
11
+ from sklearn.utils.class_weight import compute_class_weight
12
+ import pandas as pd
13
+ import optuna
14
+ from sklearn.metrics import accuracy_score, f1_score
15
+ import joblib
16
+
17
+
18
+ import sys
19
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ models ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
20
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
21
+ models_path = os.path.abspath(os.path.join(current_file_dir, '../../models'))
22
+ sys.path.insert(0, models_path)
23
+ from ft_transformer import FTTransformer
24
+ from resnet_like import ResNetLike
25
+ from deepgbm import DeepGBM
26
+ import warnings
27
+ warnings.filterwarnings('ignore')
28
+
29
+
30
# Fix Python and NumPy RNG seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Fix PyTorch seeds (CPU and every GPU in multi-GPU setups).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Deterministic cuDNN behaviour. benchmark mode auto-tunes convolution
# algorithms per input shape and may pick different kernels between runs,
# which defeats determinism — it must stay disabled together with
# deterministic=True (the original code enabled it, contradicting the
# reproducibility intent stated above).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
43
+
44
+
45
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Restore the derived feature columns that were previously removed.

    Adds cyclic sine/cosine encodings for the hour-of-day and
    month-of-year columns, plus the ground/air temperature difference.

    Args:
        df: Dataframe containing 'hour', 'month', 'groundtemp' and
            'temp_C' columns.

    Returns:
        A copy of ``df`` with the derived columns appended; the input
        dataframe is left untouched.
    """
    out = df.copy()
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['ground_temp - temp_C'] = out['groundtemp'] - out['temp_C']
    return out
62
+
63
def preprocessing(df):
    """Prepare a raw dataframe for modelling.

    Casts the time columns to int, rebuilds the derived features, maps the
    calm wind-direction marker to "0" so the column can become numeric, and
    restricts the frame to the fixed set of modelling columns.

    Args:
        df: Raw dataframe as loaded from CSV.

    Returns:
        Preprocessed dataframe containing only the modelling columns.
    """
    out = df[df.columns].copy()
    for time_col in ('year', 'month', 'hour'):
        out[time_col] = out[time_col].astype('int')
    out = add_derived_features(out).copy()
    out['multi_class'] = out['multi_class'].astype('int')
    # '์ •์˜จ' marks calm wind; encode it as direction 0 before the int cast.
    out.loc[out['wind_dir'] == '์ •์˜จ', 'wind_dir'] = "0"
    out['wind_dir'] = out['wind_dir'].astype('int')
    keep_cols = ['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',
                 'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',
                 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',
                 'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',
                 'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',
                 'month_sin', 'month_cos', 'multi_class']
    return out[keep_cols].copy()
87
+
88
+
89
+ # ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ ํ•จ์ˆ˜
90
+ def prepare_dataset(region, data_sample='pure', target='multi', fold=3):
91
+
92
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
93
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
94
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
95
+
96
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
97
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
98
+ if data_sample == 'pure':
99
+ train_path = dat_path
100
+ else:
101
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
102
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
103
+ drop_col = ['multi_class','year']
104
+ target_col = f'{target}_class'
105
+
106
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
107
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
108
+ if data_sample == 'pure':
109
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
110
+ else:
111
+ region_train = preprocessing(pd.read_csv(train_path))
112
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
113
+ region_test = preprocessing(pd.read_csv(test_path))
114
+
115
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
116
+ common_columns = region_train.columns.to_list()
117
+ train_data = region_train[common_columns]
118
+ val_data = region_val[common_columns]
119
+ test_data = region_test[common_columns]
120
+
121
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
122
+ X_train = train_data.drop(columns=drop_col)
123
+ y_train = train_data[target_col]
124
+ X_val = val_data.drop(columns=drop_col)
125
+ y_val = val_data[target_col]
126
+ X_test = test_data.drop(columns=drop_col)
127
+ y_test = test_data[target_col]
128
+
129
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
130
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
131
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
132
+
133
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
134
+ label_encoders = {}
135
+ for col in categorical_cols:
136
+ le = LabelEncoder()
137
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
138
+ label_encoders[col] = le
139
+
140
+ # ๋ณ€ํ™˜ ์ ์šฉ
141
+ for col in categorical_cols:
142
+ X_train[col] = label_encoders[col].transform(X_train[col])
143
+ X_val[col] = label_encoders[col].transform(X_val[col])
144
+ X_test[col] = label_encoders[col].transform(X_test[col])
145
+
146
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
147
+ scaler = QuantileTransformer(output_distribution='normal')
148
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
149
+
150
+ # ๋ณ€ํ™˜ ์ ์šฉ
151
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
152
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
153
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
154
+
155
+ return X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, numerical_cols
156
+
157
+
158
+
159
+ # ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ ๋ฐ dataloader ์ƒ์„ฑ ํ•จ์ˆ˜
160
+ def prepare_dataloader(region, data_sample='pure', target='multi', fold=3, random_state=None):
161
+
162
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
163
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
164
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
165
+
166
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
167
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
168
+ if data_sample == 'pure':
169
+ train_path = dat_path
170
+ else:
171
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
172
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
173
+ drop_col = ['multi_class','year']
174
+ target_col = f'{target}_class'
175
+
176
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
177
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
178
+ if data_sample == 'pure':
179
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
180
+ else:
181
+ region_train = preprocessing(pd.read_csv(train_path))
182
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
183
+ region_test = preprocessing(pd.read_csv(test_path))
184
+
185
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
186
+ common_columns = region_train.columns.to_list()
187
+ train_data = region_train[common_columns]
188
+ val_data = region_val[common_columns]
189
+ test_data = region_test[common_columns]
190
+
191
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
192
+ X_train = train_data.drop(columns=drop_col)
193
+ y_train = train_data[target_col]
194
+ X_val = val_data.drop(columns=drop_col)
195
+ y_val = val_data[target_col]
196
+ X_test = test_data.drop(columns=drop_col)
197
+ y_test = test_data[target_col]
198
+
199
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
200
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
201
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
202
+
203
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
204
+ label_encoders = {}
205
+ for col in categorical_cols:
206
+ le = LabelEncoder()
207
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
208
+ label_encoders[col] = le
209
+
210
+ # ๋ณ€ํ™˜ ์ ์šฉ
211
+ for col in categorical_cols:
212
+ X_train[col] = label_encoders[col].transform(X_train[col])
213
+ X_val[col] = label_encoders[col].transform(X_val[col])
214
+ X_test[col] = label_encoders[col].transform(X_test[col])
215
+
216
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
217
+ scaler = QuantileTransformer(output_distribution='normal')
218
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
219
+
220
+ # ๋ณ€ํ™˜ ์ ์šฉ
221
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
222
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
223
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
224
+
225
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜์™€ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
226
+ X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)
227
+ X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)
228
+
229
+ X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)
230
+ X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)
231
+
232
+ X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)
233
+ X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)
234
+
235
+ # ๋ ˆ์ด๋ธ” ๋ณ€ํ™˜
236
+ if target == "binary":
237
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32) # ์ด์ง„ ๋ถ„๋ฅ˜ โ†’ float32
238
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
239
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
240
+ elif target == "multi":
241
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) # ๋‹ค์ค‘ ๋ถ„๋ฅ˜ โ†’ long
242
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
243
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
244
+ else:
245
+ raise ValueError("target must be 'binary' or 'multi'")
246
+
247
+ # TensorDataset ์ƒ์„ฑ
248
+ train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)
249
+ val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)
250
+ test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)
251
+
252
+ # DataLoader ์ƒ์„ฑ
253
+ if random_state == None:
254
+ train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
255
+ else:
256
+ train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(random_state))
257
+ val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
258
+ test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
259
+
260
+ return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader
261
+
262
+ # ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ ๋ฐ dataloader ์ƒ์„ฑ ํ•จ์ˆ˜ (batch_size ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”๊ฐ€ ๋ฒ„์ „)
263
+ def prepare_dataloader_with_batchsize(region, data_sample='pure', target='multi', fold=3, random_state=None, batch_size=64):
264
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
265
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
266
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
267
+
268
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
269
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
270
+ if data_sample == 'pure':
271
+ train_path = dat_path
272
+ else:
273
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
274
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
275
+ drop_col = ['multi_class','year']
276
+ target_col = f'{target}_class'
277
+
278
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
279
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
280
+ if data_sample == 'pure':
281
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
282
+ else:
283
+ region_train = preprocessing(pd.read_csv(train_path))
284
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
285
+ region_test = preprocessing(pd.read_csv(test_path))
286
+
287
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
288
+ common_columns = region_train.columns.to_list()
289
+ train_data = region_train[common_columns]
290
+ val_data = region_val[common_columns]
291
+ test_data = region_test[common_columns]
292
+
293
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
294
+ X_train = train_data.drop(columns=drop_col)
295
+ y_train = train_data[target_col]
296
+ X_val = val_data.drop(columns=drop_col)
297
+ y_val = val_data[target_col]
298
+ X_test = test_data.drop(columns=drop_col)
299
+ y_test = test_data[target_col]
300
+
301
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
302
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
303
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
304
+
305
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
306
+ label_encoders = {}
307
+ for col in categorical_cols:
308
+ le = LabelEncoder()
309
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
310
+ label_encoders[col] = le
311
+
312
+ # ๋ณ€ํ™˜ ์ ์šฉ
313
+ for col in categorical_cols:
314
+ X_train[col] = label_encoders[col].transform(X_train[col])
315
+ X_val[col] = label_encoders[col].transform(X_val[col])
316
+ X_test[col] = label_encoders[col].transform(X_test[col])
317
+
318
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
319
+ scaler = QuantileTransformer(output_distribution='normal')
320
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
321
+
322
+ # ๋ณ€ํ™˜ ์ ์šฉ
323
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
324
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
325
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
326
+
327
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜์™€ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
328
+ X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)
329
+ X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)
330
+
331
+ X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)
332
+ X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)
333
+
334
+ X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)
335
+ X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)
336
+
337
+ # ๋ ˆ์ด๋ธ” ๋ณ€ํ™˜
338
+ if target == "binary":
339
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32) # ์ด์ง„ ๋ถ„๋ฅ˜ โ†’ float32
340
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
341
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
342
+ elif target == "multi":
343
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) # ๋‹ค์ค‘ ๋ถ„๋ฅ˜ โ†’ long
344
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
345
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
346
+ else:
347
+ raise ValueError("target must be 'binary' or 'multi'")
348
+
349
+ # TensorDataset ์ƒ์„ฑ
350
+ train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)
351
+ val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)
352
+ test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)
353
+
354
+ # DataLoader ์ƒ์„ฑ (batch_size ํŒŒ๋ผ๋ฏธํ„ฐ ์‚ฌ์šฉ)
355
+ if random_state == None:
356
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
357
+ else:
358
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator().manual_seed(random_state))
359
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
360
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
361
+
362
+ return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader, y_train, scaler
363
+
364
+
365
def calculate_csi(y_true, pred):
    """Compute the Critical Success Index (CSI) for the 3-class problem.

    Classes 0 and 1 are treated as event classes: hits (H) are correct
    predictions of class 0 or 1, false alarms (F) are any wrong prediction
    into columns 0/1 of the confusion matrix, and misses (M) are class-0/1
    samples predicted as class 2. CSI = H / (H + F + M).

    The confusion matrix is built with a fixed 3x3 shape. The previous
    sklearn-based implementation produced a smaller matrix whenever a class
    was absent from the batch, so indexing ``cm[2, ...]`` raised an
    IndexError; that can no longer happen.

    Args:
        y_true: Iterable of true labels in {0, 1, 2}.
        pred: Iterable of predicted labels in {0, 1, 2}.

    Returns:
        CSI score in [0, 1]; a small epsilon in the denominator avoids
        division by zero when the batch is empty.
    """
    t = np.asarray(y_true, dtype=np.int64)
    p = np.asarray(pred, dtype=np.int64)

    # Fixed-shape confusion matrix: rows = true class, columns = predicted.
    # np.add.at performs unbuffered accumulation, so repeated (t, p) pairs
    # are all counted.
    cm = np.zeros((3, 3), dtype=np.int64)
    np.add.at(cm, (t, p), 1)

    H = (cm[0, 0] + cm[1, 1])

    F = (cm[1, 0] + cm[2, 0] +
         cm[0, 1] + cm[2, 1])

    M = (cm[0, 2] + cm[1, 2])

    CSI = H / (H + F + M + 1e-10)
    return CSI
379
+
380
def sample_weight(y_train):
    """Return a balanced per-sample weight for every label in ``y_train``.

    Uses the 'balanced' heuristic n_samples / (n_classes * class_count),
    identical to sklearn's ``compute_class_weight(class_weight='balanced')``.

    The previous implementation indexed the class-weight array directly with
    the label *value* (``class_weights[label]``), which silently mis-assigns
    weights — or raises IndexError — whenever the label set is not the
    contiguous range 0..n-1 (e.g. labels {0, 2}). Weights are now looked up
    by label via a mapping, which is correct for any label set.

    Args:
        y_train: 1-D array-like of class labels.

    Returns:
        np.ndarray of per-sample weights aligned with ``y_train``.
    """
    y = np.asarray(y_train)
    classes, counts = np.unique(y, return_counts=True)
    # 'balanced' formula: rare classes get proportionally larger weights.
    class_weights = len(y) / (len(classes) * counts)
    weight_by_label = dict(zip(classes, class_weights))
    sample_weights = np.array([weight_by_label[label] for label in y])

    return sample_weights
389
+
390
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ตœ์ ํ™” ํ•จ์ˆ˜ ์ •์˜
391
+ def objective(trial, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=42):
392
+ # GPU ์‚ฌ์šฉ ๊ฐ€๋Šฅ ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ device ์„ค์ •
393
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
394
+ val_scores = []
395
+
396
+ # --- 1. ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์ •์˜ (์ˆ˜์ •๋จ) ---
397
+ if model_choose == "ft_transformer":
398
+ d_token = trial.suggest_int("d_token", 64, 256, step=32)
399
+ n_blocks = trial.suggest_int("n_blocks", 2, 6) # ๊นŠ์ด ์ถ•์†Œ๋กœ ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
400
+ n_heads = trial.suggest_categorical("n_heads", [4, 8])
401
+ # d_token์€ n_heads์˜ ๋ฐฐ์ˆ˜์—ฌ์•ผ ํ•จ (FT-Transformer์˜ ๊ตฌ์กฐ์  ์ œ์•ฝ ๋Œ€์‘)
402
+ if d_token % n_heads != 0:
403
+ d_token = (d_token // n_heads) * n_heads
404
+
405
+ attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.4)
406
+ ffn_dropout = trial.suggest_float("ffn_dropout", 0.1, 0.4)
407
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True) # ๋ฒ”์œ„ ํ™•๋Œ€
408
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
409
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
410
+
411
+ elif model_choose == 'resnet_like':
412
+ d_main = trial.suggest_int("d_main", 64, 256, step=32)
413
+ d_hidden = trial.suggest_int("d_hidden", 64, 512, step=64)
414
+ n_blocks = trial.suggest_int("n_blocks", 2, 5) # ๋„ˆ๋ฌด ๊นŠ์ง€ ์•Š๊ฒŒ ์กฐ์ ˆ
415
+ dropout_first = trial.suggest_float("dropout_first", 0.1, 0.4)
416
+ dropout_second = trial.suggest_float("dropout_second", 0.0, 0.2)
417
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
418
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
419
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
420
+
421
+ elif model_choose == 'deepgbm':
422
+ # DeepGBM์˜ ๊ฒฝ์šฐ ๋ชจ๋ธ ํŠน์„ฑ์— ๋งž์ถฐ ResNet ๋ธ”๋ก ๋ฐ ์ž„๋ฒ ๋”ฉ ์ฐจ์› ์กฐ์ ˆ
423
+ d_main = trial.suggest_int("d_main", 64, 256, step=32)
424
+ d_hidden = trial.suggest_int("d_hidden", 64, 256, step=64)
425
+ n_blocks = trial.suggest_int("n_blocks", 2, 6)
426
+ dropout = trial.suggest_float("dropout", 0.1, 0.4)
427
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
428
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
429
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
430
+
431
+ # --- 2. Fold๋ณ„ ํ•™์Šต ๋ฐ ๊ต์ฐจ ๊ฒ€์ฆ ---
432
+ for fold in range(1, n_folds + 1):
433
+ X_train_df, categorical_cols, numerical_cols, train_loader, val_loader, _, y_train, _ = prepare_dataloader_with_batchsize(
434
+ region, data_sample=data_sample, target=target, fold=fold, random_state=random_state, batch_size=batch_size
435
+ )
436
+
437
+ # ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
438
+ if model_choose == "ft_transformer":
439
+ model = FTTransformer(
440
+ num_features=len(numerical_cols),
441
+ cat_cardinalities=[len(X_train_df[col].unique()) for col in categorical_cols],
442
+ d_token=d_token,
443
+ n_blocks=n_blocks,
444
+ n_heads=n_heads,
445
+ attention_dropout=attention_dropout,
446
+ ffn_dropout=ffn_dropout,
447
+ num_classes=3
448
+ ).to(device)
449
+ elif model_choose == 'resnet_like':
450
+ input_dim = len(numerical_cols) + len(categorical_cols)
451
+ model = ResNetLike(
452
+ input_dim=input_dim,
453
+ d_main=d_main,
454
+ d_hidden=d_hidden,
455
+ n_blocks=n_blocks,
456
+ dropout_first=dropout_first,
457
+ dropout_second=dropout_second,
458
+ num_classes=3
459
+ ).to(device)
460
+ elif model_choose == 'deepgbm':
461
+ model = DeepGBM(
462
+ num_features=len(numerical_cols),
463
+ cat_features=[len(X_train_df[col].unique()) for col in categorical_cols],
464
+ d_main=d_main,
465
+ d_hidden=d_hidden,
466
+ n_blocks=n_blocks,
467
+ dropout=dropout,
468
+ num_classes=3
469
+ ).to(device)
470
+
471
+ # ํด๋ž˜์Šค ๊ฐ€์ค‘์น˜ ๊ณ„์‚ฐ ๋ฐ ์†์‹ค ํ•จ์ˆ˜ ์„ค์ • (Label Smoothing ์ ์šฉ)
472
+ if target == 'multi':
473
+ class_weights = compute_class_weight(
474
+ class_weight='balanced',
475
+ classes=np.unique(y_train),
476
+ y=y_train
477
+ )
478
+ # ํด๋ž˜์Šค๋ณ„ ๊ฐ€์ค‘์น˜ ๋กœ๊ทธ ์ถœ๋ ฅ
479
+ unique_classes = np.unique(y_train)
480
+ class_counts = {cls: np.sum(y_train == cls) for cls in unique_classes}
481
+ print(f" Fold {fold} - ํด๋ž˜์Šค๋ณ„ ๊ฐ€์ค‘์น˜: {dict(zip(unique_classes, class_weights))} (ํด๋ž˜์Šค๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜: {class_counts})")
482
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
483
+ criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.0) # Label Smoothing ์ถ”๊ฐ€
484
+ else:
485
+ criterion = nn.BCEWithLogitsLoss()
486
+ optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
487
+
488
+ # ํ•™์Šต๋ฅ  ์Šค์ผ€์ค„๋Ÿฌ ์ถ”๊ฐ€: ์„ฑ๋Šฅ ์ •์ฒด ์‹œ LR์„ 0.5๋ฐฐ ๊ฐ์†Œ (๊ฒ€์ฆ CSI ๊ธฐ์ค€)
489
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
490
+
491
+ # ํ•™์Šต ์„ค์ • (์—ํญ ๋ฐ ํŽ˜์ด์…˜์Šค ์ƒํ–ฅ)
492
+ epochs = 200
493
+ patience = 12 # ๋”ฅ๋Ÿฌ๋‹์˜ ์ •์ฒด ๊ตฌ๊ฐ„์„ ๊ณ ๋ คํ•˜์—ฌ ์†Œํญ ์ƒํ–ฅ
494
+ best_fold_csi = 0
495
+ counter = 0
496
+
497
+ for epoch in range(epochs):
498
+ model.train()
499
+ for x_num_batch, x_cat_batch, y_batch in train_loader:
500
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
501
+
502
+ optimizer.zero_grad()
503
+ y_pred = model(x_num_batch, x_cat_batch)
504
+ loss = criterion(y_pred, y_batch if target == 'multi' else y_batch.float())
505
+ loss.backward()
506
+ optimizer.step()
507
+
508
+ # Validation ํ‰๊ฐ€
509
+ model.eval()
510
+ y_pred_val, y_true_val = [], []
511
+ with torch.no_grad():
512
+ for x_num_batch, x_cat_batch, y_batch in val_loader:
513
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
514
+ output = model(x_num_batch, x_cat_batch)
515
+ pred = output.argmax(dim=1) if target == 'multi' else (torch.sigmoid(output) >= 0.5).long()
516
+
517
+ y_pred_val.extend(pred.cpu().numpy())
518
+ y_true_val.extend(y_batch.cpu().numpy())
519
+
520
+ # CSI ๊ณ„์‚ฐ ๋ฐ ์Šค์ผ€์ค„๋Ÿฌ ์—…๋ฐ์ดํŠธ
521
+ val_csi = calculate_csi(y_true_val, y_pred_val)
522
+ scheduler.step(val_csi)
523
+
524
+ # Optuna Pruning ์ ์šฉ (์ฒซ ๋ฒˆ์งธ Fold์—์„œ ์กฐ๊ธฐ ์ข…๋ฃŒ ํŒ๋‹จ ๊ฐ•ํ™”)
525
+ trial.report(val_csi, epoch)
526
+ if trial.should_prune():
527
+ raise optuna.exceptions.TrialPruned()
528
+
529
+ # Early Stopping ์ฒดํฌ
530
+ if val_csi > best_fold_csi:
531
+ best_fold_csi = val_csi
532
+ counter = 0
533
+ else:
534
+ counter += 1
535
+
536
+ if counter >= patience:
537
+ break
538
+
539
+ val_scores.append(best_fold_csi)
540
+
541
+ # ๋ชจ๋“  fold์˜ ํ‰๊ท  ์„ฑ๋Šฅ ๋ฐ˜ํ™˜
542
+ return np.mean(val_scores)
543
+
544
+
545
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ํ•จ์ˆ˜
546
+ def train_final_model(best_params, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=42):
547
+ """
548
+ ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ์„ ํ•™์Šตํ•˜๊ณ  ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
549
+
550
+ Args:
551
+ best_params: ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ
552
+ model_choose: ๋ชจ๋ธ ์„ ํƒ ('ft_transformer', 'resnet_like', 'deepgbm')
553
+ region: ์ง€์—ญ๋ช…
554
+ data_sample: ๋ฐ์ดํ„ฐ ์ƒ˜ํ”Œ ํƒ€์ž… ('pure', 'smote', etc.)
555
+ target: ํƒ€๊ฒŸ ํƒ€์ž… ('multi', 'binary')
556
+ n_folds: ๊ต์ฐจ ๊ฒ€์ฆ fold ์ˆ˜
557
+ random_state: ๋žœ๋ค ์‹œ๋“œ
558
+
559
+ Returns:
560
+ ์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ ๋ฆฌ์ŠคํŠธ
561
+ """
562
+ # GPU ์‚ฌ์šฉ ๊ฐ€๋Šฅ ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ device ์„ค์ •
563
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
564
+
565
+ models = []
566
+ scalers = [] # scaler ๋ฆฌ์ŠคํŠธ ์ถ”๊ฐ€
567
+
568
+ print("์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘...")
569
+
570
+ for fold in range(1, n_folds + 1):
571
+ print(f"Fold {fold} ํ•™์Šต ์ค‘...")
572
+
573
+ # ์ตœ์ ํ™”๋œ batch_size ์‚ฌ์šฉ
574
+ batch_size = best_params.get("batch_size", 64)
575
+ X_train_df, categorical_cols, numerical_cols, train_loader, val_loader, _, y_train, scaler = prepare_dataloader_with_batchsize(
576
+ region, data_sample=data_sample, target=target, fold=fold, random_state=random_state, batch_size=batch_size
577
+ )
578
+
579
+ # ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
580
+ if model_choose == "ft_transformer":
581
+ d_token = best_params["d_token"]
582
+ n_heads = best_params.get("n_heads", 8)
583
+ # d_token์€ n_heads์˜ ๋ฐฐ์ˆ˜์—ฌ์•ผ ํ•จ (FT-Transformer์˜ ๊ตฌ์กฐ์  ์ œ์•ฝ ๋Œ€์‘)
584
+ if d_token % n_heads != 0:
585
+ d_token = (d_token // n_heads) * n_heads
586
+
587
+ model = FTTransformer(
588
+ num_features=len(numerical_cols),
589
+ cat_cardinalities=[len(X_train_df[col].unique()) for col in categorical_cols],
590
+ d_token=d_token,
591
+ n_blocks=best_params["n_blocks"],
592
+ n_heads=n_heads,
593
+ attention_dropout=best_params["attention_dropout"],
594
+ ffn_dropout=best_params["ffn_dropout"],
595
+ num_classes=3
596
+ ).to(device)
597
+ elif model_choose == 'resnet_like':
598
+ input_dim = len(numerical_cols) + len(categorical_cols)
599
+ model = ResNetLike(
600
+ input_dim=input_dim,
601
+ d_main=best_params["d_main"],
602
+ d_hidden=best_params["d_hidden"],
603
+ n_blocks=best_params["n_blocks"],
604
+ dropout_first=best_params["dropout_first"],
605
+ dropout_second=best_params["dropout_second"],
606
+ num_classes=3
607
+ ).to(device)
608
+ elif model_choose == 'deepgbm':
609
+ model = DeepGBM(
610
+ num_features=len(numerical_cols),
611
+ cat_features=[len(X_train_df[col].unique()) for col in categorical_cols],
612
+ d_main=best_params["d_main"],
613
+ d_hidden=best_params["d_hidden"],
614
+ n_blocks=best_params["n_blocks"],
615
+ dropout=best_params["dropout"],
616
+ num_classes=3
617
+ ).to(device)
618
+ else:
619
+ raise ValueError(f"Unknown model_choose: {model_choose}")
620
+
621
+ # ํด๋ž˜์Šค ๊ฐ€์ค‘์น˜ ๊ณ„์‚ฐ ๋ฐ ์†์‹ค ํ•จ์ˆ˜ ์„ค์ • (Label Smoothing ์ ์šฉ)
622
+ if target == 'multi':
623
+ class_weights = compute_class_weight(
624
+ class_weight='balanced',
625
+ classes=np.unique(y_train),
626
+ y=y_train
627
+ )
628
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
629
+ criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.0) # Label Smoothing ์ถ”๊ฐ€
630
+ else:
631
+ criterion = nn.BCEWithLogitsLoss()
632
+ optimizer = optim.AdamW(model.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"])
633
+
634
+ # ํ•™์Šต๋ฅ  ์Šค์ผ€์ค„๋Ÿฌ
635
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
636
+
637
+ # ํ•™์Šต ์„ค์ •
638
+ epochs = 200
639
+ patience = 12
640
+ best_fold_csi = 0
641
+ counter = 0
642
+ best_model = None
643
+
644
+ for epoch in range(epochs):
645
+ model.train()
646
+ for x_num_batch, x_cat_batch, y_batch in train_loader:
647
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
648
+
649
+ optimizer.zero_grad()
650
+ y_pred = model(x_num_batch, x_cat_batch)
651
+ loss = criterion(y_pred, y_batch if target == 'multi' else y_batch.float())
652
+ loss.backward()
653
+ optimizer.step()
654
+
655
+ # Validation ํ‰๊ฐ€
656
+ model.eval()
657
+ y_pred_val, y_true_val = [], []
658
+ with torch.no_grad():
659
+ for x_num_batch, x_cat_batch, y_batch in val_loader:
660
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
661
+ output = model(x_num_batch, x_cat_batch)
662
+ pred = output.argmax(dim=1) if target == 'multi' else (torch.sigmoid(output) >= 0.5).long()
663
+
664
+ y_pred_val.extend(pred.cpu().numpy())
665
+ y_true_val.extend(y_batch.cpu().numpy())
666
+
667
+ # CSI ๊ณ„์‚ฐ ๋ฐ ์Šค์ผ€์ค„๋Ÿฌ ์—…๋ฐ์ดํŠธ
668
+ val_csi = calculate_csi(y_true_val, y_pred_val)
669
+ scheduler.step(val_csi)
670
+
671
+ # Early Stopping ์ฒดํฌ
672
+ if val_csi > best_fold_csi:
673
+ best_fold_csi = val_csi
674
+ counter = 0
675
+ best_model = copy.deepcopy(model)
676
+ else:
677
+ counter += 1
678
+
679
+ if counter >= patience:
680
+ print(f" Early stopping at epoch {epoch+1}, Best CSI: {best_fold_csi:.4f}")
681
+ break
682
+
683
+ if best_model is None:
684
+ best_model = model
685
+
686
+ scalers.append(scaler) # scaler ์ €์žฅ (fold ์ˆœ์„œ๋Œ€๋กœ)
687
+ models.append(best_model)
688
+ print(f" Fold {fold} ํ•™์Šต ์™„๋ฃŒ (๊ฒ€์ฆ CSI: {best_fold_csi:.4f})")
689
+
690
+ # ๋ชจ๋ธ ์ €์žฅ ๊ฒฝ๋กœ ์„ค์ •
691
+ save_dir = f'../save_model/{model_choose}_optima'
692
+ os.makedirs(save_dir, exist_ok=True)
693
+
694
+ # ํŒŒ์ผ๋ช… ์ƒ์„ฑ
695
+ if data_sample == 'pure':
696
+ model_filename = f'{model_choose}_pure_{region}.pkl'
697
+ else:
698
+ model_filename = f'{model_choose}_{data_sample}_{region}.pkl'
699
+
700
+ model_path = f'{save_dir}/{model_filename}'
701
+
702
+ # ๋ฆฌ์ŠคํŠธ์— ๋‹ด์•„ ํ•œ ๋ฒˆ์— ์ €์žฅ
703
+ joblib.dump(models, model_path)
704
+ print(f"\n๋ชจ๋“  ๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ: {model_path} (์ด {len(models)}๊ฐœ fold)")
705
+
706
+ # Scaler ๋ณ„๋„ ์ €์žฅ
707
+ scaler_save_dir = f'../save_model/{model_choose}_optima/scaler'
708
+ os.makedirs(scaler_save_dir, exist_ok=True)
709
+
710
+ # ํŒŒ์ผ๋ช… ์ƒ์„ฑ (๋ชจ๋ธ๊ณผ ๋™์ผํ•œ ํŒจํ„ด)
711
+ if data_sample == 'pure':
712
+ scaler_filename = f'{model_choose}_pure_{region}_scaler.pkl'
713
+ else:
714
+ scaler_filename = f'{model_choose}_{data_sample}_{region}_scaler.pkl'
715
+
716
+ scaler_path = f'{scaler_save_dir}/{scaler_filename}'
717
+ joblib.dump(scalers, scaler_path)
718
+ print(f"Scaler ์ €์žฅ ์™„๋ฃŒ: {scaler_path} (์ด {len(scalers)}๊ฐœ fold)")
719
+
720
+ return model_path
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_busan.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *
# Fix the Python and NumPy RNG seeds for reproducibility.
seed = 42
random.seed(seed)
np.random.seed(seed)


# 1. Create the study with direction='maximize'.
study = optuna.create_study(
    direction="maximize",  # higher CSI is better, so maximize
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)  # observe the first 10 epochs, then allow pruning
)
20
# Callback printing a detailed summary after every finished trial.
def print_trial_callback(study, trial):
    """Optuna callback: print the finished trial and the study-wide best so far.

    ``study.best_value`` / ``best_trial`` / ``best_params`` raise ValueError
    while no trial has completed (e.g. the very first trial is pruned), which
    previously crashed the whole optimization from inside this callback.
    The best-so-far section is therefore guarded instead of assumed.
    """
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    if trial.value is not None:
        print(f" Value (CSI): {trial.value:.6f}")
    else:
        print(f" Value: {trial.value}")
    print(f" Parameters: {trial.params}")
    try:
        # These accessors raise ValueError until at least one trial completes.
        print(f" Best Value (CSI): {study.best_value:.6f}")
        print(f" Best Trial: {study.best_trial.number}")
        print(f" Best Parameters: {study.best_params}")
    except ValueError:
        print(" Best Value: (no completed trials yet)")
    print(f"{'='*80}\n")
31
+
32
+
33
+
34
# 2. Run the optimization.
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="busan", data_sample='smote'),
    n_trials=100,
    callbacks=[print_trial_callback]
)

# 3. Inspect and summarise the results.
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect the CSI score of every finished (non-pruned) trial.
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Persist the Study object.
    # Resolve the base directory relative to this file's location.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # the 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_busan_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Train and save the final model with the optimized hyperparameters.
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_path = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="busan",
        data_sample='smote',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")

except Exception as e:
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# Exit cleanly.
import sys
sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_daegu.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import joblib
6
+ import os
7
+ import torch
8
+ from utils import *
9
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
10
+ seed = 42
11
+ random.seed(seed)
12
+ np.random.seed(seed)
13
+
14
+
15
+ # 1. Study ์ƒ์„ฑ ์‹œ 'maximize'๋กœ ์„ค์ •
16
+ study = optuna.create_study(
17
+ direction="maximize", # CSI ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์œผ๋ฏ€๋กœ maximize
18
+ pruner=optuna.pruners.MedianPruner(n_warmup_steps=10) # ์ดˆ๋ฐ˜ 10์—ํญ์€ ์ง€์ผœ๋ณด๊ณ  ์ดํ›„ ๊ฐ€์ง€์น˜๊ธฐ
19
+ )
20
+ # Trial ์™„๋ฃŒ ์‹œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅํ•˜๋Š” callback ํ•จ์ˆ˜
21
+ def print_trial_callback(study, trial):
22
+ """๊ฐ trial ์™„๋ฃŒ ์‹œ best value๋ฅผ ํฌํ•จํ•œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅ"""
23
+ print(f"\n{'='*80}")
24
+ print(f"Trial {trial.number} ์™„๋ฃŒ")
25
+ print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
26
+ print(f" Parameters: {trial.params}")
27
+ print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
28
+ print(f" Best Trial: {study.best_trial.number}")
29
+ print(f" Best Parameters: {study.best_params}")
30
+ print(f"{'='*80}\n")
31
+
32
+
33
+
34
+ # 2. ์ตœ์ ํ™” ์‹คํ–‰
35
+ study.optimize(
36
+ lambda trial: objective(trial, model_choose="deepgbm", region="daegu", data_sample='smote'),
37
+ n_trials=100
38
+ ,
39
+ callbacks=[print_trial_callback]
40
+ )
41
+
42
+ # 3. ๊ฒฐ๊ณผ ํ™•์ธ ๋ฐ ์š”์•ฝ
43
+ print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
44
+ print(f"Best CSI Score: {study.best_value:.4f}")
45
+ print(f"Best Hyperparameters: {study.best_params}")
46
+
47
+ try:
48
+ # ๋ชจ๋“  trial์˜ CSI ์ ์ˆ˜ ์ถ”์ถœ
49
+ csi_scores = [trial.value for trial in study.trials if trial.value is not None]
50
+
51
+ if len(csi_scores) > 0:
52
+ print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
53
+ print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
54
+ print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
55
+ print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
56
+ print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
57
+ print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
58
+ print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
59
+ print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")
60
+
61
+ # Study ๊ฐ์ฒด ์ €์žฅ
62
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ base ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
63
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
64
+ base_dir = os.path.dirname(os.path.dirname(current_file_dir)) # 5.optima ๋””๋ ‰ํ† ๋ฆฌ
65
+ os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
66
+ study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_daegu_trials.pkl")
67
+ joblib.dump(study, study_path)
68
+ print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
69
+
70
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ
71
+ print("\n" + "="*50)
72
+ print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
73
+ print("="*50)
74
+
75
+ best_params = study.best_params
76
+ model_path = train_final_model(
77
+ best_params=best_params,
78
+ model_choose="deepgbm",
79
+ region="daegu",
80
+ data_sample='smote',
81
+ target='multi',
82
+ n_folds=3,
83
+ random_state=seed
84
+ )
85
+
86
+ print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
87
+ print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")
88
+
89
+ except Exception as e:
90
+ print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+
94
+ # ์ •์ƒ ์ข…๋ฃŒ
95
+ import sys
96
+ sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_daejeon.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import joblib
6
+ import os
7
+ import torch
8
+ from utils import *
9
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
10
+ seed = 42
11
+ random.seed(seed)
12
+ np.random.seed(seed)
13
+
14
+
15
+ # 1. Study ์ƒ์„ฑ ์‹œ 'maximize'๋กœ ์„ค์ •
16
+ study = optuna.create_study(
17
+ direction="maximize", # CSI ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์œผ๋ฏ€๋กœ maximize
18
+ pruner=optuna.pruners.MedianPruner(n_warmup_steps=10) # ์ดˆ๋ฐ˜ 10์—ํญ์€ ์ง€์ผœ๋ณด๊ณ  ์ดํ›„ ๊ฐ€์ง€์น˜๊ธฐ
19
+ )
20
+ # Trial ์™„๋ฃŒ ์‹œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅํ•˜๋Š” callback ํ•จ์ˆ˜
21
+ def print_trial_callback(study, trial):
22
+ """๊ฐ trial ์™„๋ฃŒ ์‹œ best value๋ฅผ ํฌํ•จํ•œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅ"""
23
+ print(f"\n{'='*80}")
24
+ print(f"Trial {trial.number} ์™„๋ฃŒ")
25
+ print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
26
+ print(f" Parameters: {trial.params}")
27
+ print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
28
+ print(f" Best Trial: {study.best_trial.number}")
29
+ print(f" Best Parameters: {study.best_params}")
30
+ print(f"{'='*80}\n")
31
+
32
+
33
+
34
+ # 2. ์ตœ์ ํ™” ์‹คํ–‰
35
+ study.optimize(
36
+ lambda trial: objective(trial, model_choose="deepgbm", region="daejeon", data_sample='smote'),
37
+ n_trials=100
38
+ ,
39
+ callbacks=[print_trial_callback]
40
+ )
41
+
42
+ # 3. ๊ฒฐ๊ณผ ํ™•์ธ ๋ฐ ์š”์•ฝ
43
+ print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
44
+ print(f"Best CSI Score: {study.best_value:.4f}")
45
+ print(f"Best Hyperparameters: {study.best_params}")
46
+
47
+ try:
48
+ # ๋ชจ๋“  trial์˜ CSI ์ ์ˆ˜ ์ถ”์ถœ
49
+ csi_scores = [trial.value for trial in study.trials if trial.value is not None]
50
+
51
+ if len(csi_scores) > 0:
52
+ print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
53
+ print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
54
+ print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
55
+ print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
56
+ print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
57
+ print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
58
+ print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
59
+ print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")
60
+
61
+ # Study ๊ฐ์ฒด ์ €์žฅ
62
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ base ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
63
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
64
+ base_dir = os.path.dirname(os.path.dirname(current_file_dir)) # 5.optima ๋””๋ ‰ํ† ๋ฆฌ
65
+ os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
66
+ study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_daejeon_trials.pkl")
67
+ joblib.dump(study, study_path)
68
+ print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
69
+
70
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ
71
+ print("\n" + "="*50)
72
+ print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
73
+ print("="*50)
74
+
75
+ best_params = study.best_params
76
+ model_path = train_final_model(
77
+ best_params=best_params,
78
+ model_choose="deepgbm",
79
+ region="daejeon",
80
+ data_sample='smote',
81
+ target='multi',
82
+ n_folds=3,
83
+ random_state=seed
84
+ )
85
+
86
+ print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
87
+ print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")
88
+
89
+ except Exception as e:
90
+ print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+
94
+ # ์ •์ƒ ์ข…๋ฃŒ
95
+ import sys
96
+ sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_gwangju.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import joblib
6
+ import os
7
+ import torch
8
+ from utils import *
9
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
10
+ seed = 42
11
+ random.seed(seed)
12
+ np.random.seed(seed)
13
+
14
+
15
+ # 1. Study ์ƒ์„ฑ ์‹œ 'maximize'๋กœ ์„ค์ •
16
+ study = optuna.create_study(
17
+ direction="maximize", # CSI ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์œผ๋ฏ€๋กœ maximize
18
+ pruner=optuna.pruners.MedianPruner(n_warmup_steps=10) # ์ดˆ๋ฐ˜ 10์—ํญ์€ ์ง€์ผœ๋ณด๊ณ  ์ดํ›„ ๊ฐ€์ง€์น˜๊ธฐ
19
+ )
20
+ # Trial ์™„๋ฃŒ ์‹œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅํ•˜๋Š” callback ํ•จ์ˆ˜
21
+ def print_trial_callback(study, trial):
22
+ """๊ฐ trial ์™„๋ฃŒ ์‹œ best value๋ฅผ ํฌํ•จํ•œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅ"""
23
+ print(f"\n{'='*80}")
24
+ print(f"Trial {trial.number} ์™„๋ฃŒ")
25
+ print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
26
+ print(f" Parameters: {trial.params}")
27
+ print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
28
+ print(f" Best Trial: {study.best_trial.number}")
29
+ print(f" Best Parameters: {study.best_params}")
30
+ print(f"{'='*80}\n")
31
+
32
+
33
+
34
+ # 2. ์ตœ์ ํ™” ์‹คํ–‰
35
+ study.optimize(
36
+ lambda trial: objective(trial, model_choose="deepgbm", region="gwangju", data_sample='smote'),
37
+ n_trials=100
38
+ ,
39
+ callbacks=[print_trial_callback]
40
+ )
41
+
42
+ # 3. ๊ฒฐ๊ณผ ํ™•์ธ ๋ฐ ์š”์•ฝ
43
+ print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
44
+ print(f"Best CSI Score: {study.best_value:.4f}")
45
+ print(f"Best Hyperparameters: {study.best_params}")
46
+
47
+ try:
48
+ # ๋ชจ๋“  trial์˜ CSI ์ ์ˆ˜ ์ถ”์ถœ
49
+ csi_scores = [trial.value for trial in study.trials if trial.value is not None]
50
+
51
+ if len(csi_scores) > 0:
52
+ print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
53
+ print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
54
+ print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
55
+ print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
56
+ print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
57
+ print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
58
+ print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
59
+ print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")
60
+
61
+ # Study ๊ฐ์ฒด ์ €์žฅ
62
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ base ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
63
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
64
+ base_dir = os.path.dirname(os.path.dirname(current_file_dir)) # 5.optima ๋””๋ ‰ํ† ๋ฆฌ
65
+ os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
66
+ study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_gwangju_trials.pkl")
67
+ joblib.dump(study, study_path)
68
+ print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
69
+
70
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ
71
+ print("\n" + "="*50)
72
+ print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
73
+ print("="*50)
74
+
75
+ best_params = study.best_params
76
+ model_path = train_final_model(
77
+ best_params=best_params,
78
+ model_choose="deepgbm",
79
+ region="gwangju",
80
+ data_sample='smote',
81
+ target='multi',
82
+ n_folds=3,
83
+ random_state=seed
84
+ )
85
+
86
+ print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
87
+ print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")
88
+
89
+ except Exception as e:
90
+ print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+
94
+ # ์ •์ƒ ์ข…๋ฃŒ
95
+ import sys
96
+ sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_incheon.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import joblib
6
+ import os
7
+ import torch
8
+ from utils import *
9
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
10
+ seed = 42
11
+ random.seed(seed)
12
+ np.random.seed(seed)
13
+
14
+
15
+ # 1. Study ์ƒ์„ฑ ์‹œ 'maximize'๋กœ ์„ค์ •
16
+ study = optuna.create_study(
17
+ direction="maximize", # CSI ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์œผ๋ฏ€๋กœ maximize
18
+ pruner=optuna.pruners.MedianPruner(n_warmup_steps=10) # ์ดˆ๋ฐ˜ 10์—ํญ์€ ์ง€์ผœ๋ณด๊ณ  ์ดํ›„ ๊ฐ€์ง€์น˜๊ธฐ
19
+ )
20
+ # Trial ์™„๋ฃŒ ์‹œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅํ•˜๋Š” callback ํ•จ์ˆ˜
21
+ def print_trial_callback(study, trial):
22
+ """๊ฐ trial ์™„๋ฃŒ ์‹œ best value๋ฅผ ํฌํ•จํ•œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅ"""
23
+ print(f"\n{'='*80}")
24
+ print(f"Trial {trial.number} ์™„๋ฃŒ")
25
+ print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
26
+ print(f" Parameters: {trial.params}")
27
+ print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
28
+ print(f" Best Trial: {study.best_trial.number}")
29
+ print(f" Best Parameters: {study.best_params}")
30
+ print(f"{'='*80}\n")
31
+
32
+
33
+
34
+ # 2. ์ตœ์ ํ™” ์‹คํ–‰
35
+ study.optimize(
36
+ lambda trial: objective(trial, model_choose="deepgbm", region="incheon", data_sample='smote'),
37
+ n_trials=100
38
+ ,
39
+ callbacks=[print_trial_callback]
40
+ )
41
+
42
+ # 3. ๊ฒฐ๊ณผ ํ™•์ธ ๋ฐ ์š”์•ฝ
43
+ print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
44
+ print(f"Best CSI Score: {study.best_value:.4f}")
45
+ print(f"Best Hyperparameters: {study.best_params}")
46
+
47
+ try:
48
+ # ๋ชจ๋“  trial์˜ CSI ์ ์ˆ˜ ์ถ”์ถœ
49
+ csi_scores = [trial.value for trial in study.trials if trial.value is not None]
50
+
51
+ if len(csi_scores) > 0:
52
+ print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
53
+ print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
54
+ print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
55
+ print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
56
+ print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
57
+ print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
58
+ print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
59
+ print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")
60
+
61
+ # Study ๊ฐ์ฒด ์ €์žฅ
62
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ base ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
63
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
64
+ base_dir = os.path.dirname(os.path.dirname(current_file_dir)) # 5.optima ๋””๋ ‰ํ† ๋ฆฌ
65
+ os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
66
+ study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_incheon_trials.pkl")
67
+ joblib.dump(study, study_path)
68
+ print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
69
+
70
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ
71
+ print("\n" + "="*50)
72
+ print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
73
+ print("="*50)
74
+
75
+ best_params = study.best_params
76
+ model_path = train_final_model(
77
+ best_params=best_params,
78
+ model_choose="deepgbm",
79
+ region="incheon",
80
+ data_sample='smote',
81
+ target='multi',
82
+ n_folds=3,
83
+ random_state=seed
84
+ )
85
+
86
+ print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
87
+ print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")
88
+
89
+ except Exception as e:
90
+ print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+
94
+ # ์ •์ƒ ์ข…๋ฃŒ
95
+ import sys
96
+ sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/deepgbm_smote_seoul.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import optuna
2
+ import numpy as np
3
+ import random
4
+ import pandas as pd
5
+ import joblib
6
+ import os
7
+ import torch
8
+ from utils import *
9
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
10
+ seed = 42
11
+ random.seed(seed)
12
+ np.random.seed(seed)
13
+
14
+
15
+ # 1. Study ์ƒ์„ฑ ์‹œ 'maximize'๋กœ ์„ค์ •
16
+ study = optuna.create_study(
17
+ direction="maximize", # CSI ์ ์ˆ˜๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์œผ๋ฏ€๋กœ maximize
18
+ pruner=optuna.pruners.MedianPruner(n_warmup_steps=10) # ์ดˆ๋ฐ˜ 10์—ํญ์€ ์ง€์ผœ๋ณด๊ณ  ์ดํ›„ ๊ฐ€์ง€์น˜๊ธฐ
19
+ )
20
+ # Trial ์™„๋ฃŒ ์‹œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅํ•˜๋Š” callback ํ•จ์ˆ˜
21
+ def print_trial_callback(study, trial):
22
+ """๊ฐ trial ์™„๋ฃŒ ์‹œ best value๋ฅผ ํฌํ•จํ•œ ์ƒ์„ธ ์ •๋ณด ์ถœ๋ ฅ"""
23
+ print(f"\n{'='*80}")
24
+ print(f"Trial {trial.number} ์™„๋ฃŒ")
25
+ print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
26
+ print(f" Parameters: {trial.params}")
27
+ print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
28
+ print(f" Best Trial: {study.best_trial.number}")
29
+ print(f" Best Parameters: {study.best_params}")
30
+ print(f"{'='*80}\n")
31
+
32
+
33
+
34
+ # 2. ์ตœ์ ํ™” ์‹คํ–‰
35
+ study.optimize(
36
+ lambda trial: objective(trial, model_choose="deepgbm", region="seoul", data_sample='smote'),
37
+ n_trials=100
38
+ ,
39
+ callbacks=[print_trial_callback]
40
+ )
41
+
42
+ # 3. ๊ฒฐ๊ณผ ํ™•์ธ ๋ฐ ์š”์•ฝ
43
+ print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
44
+ print(f"Best CSI Score: {study.best_value:.4f}")
45
+ print(f"Best Hyperparameters: {study.best_params}")
46
+
47
+ try:
48
+ # ๋ชจ๋“  trial์˜ CSI ์ ์ˆ˜ ์ถ”์ถœ
49
+ csi_scores = [trial.value for trial in study.trials if trial.value is not None]
50
+
51
+ if len(csi_scores) > 0:
52
+ print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
53
+ print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
54
+ print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
55
+ print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
56
+ print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
57
+ print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
58
+ print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
59
+ print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")
60
+
61
+ # Study ๊ฐ์ฒด ์ €์žฅ
62
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ base ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
63
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
64
+ base_dir = os.path.dirname(os.path.dirname(current_file_dir)) # 5.optima ๋””๋ ‰ํ† ๋ฆฌ
65
+ os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
66
+ study_path = os.path.join(base_dir, "optimization_history/deepgbm_smote_seoul_trials.pkl")
67
+ joblib.dump(study, study_path)
68
+ print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
69
+
70
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ
71
+ print("\n" + "="*50)
72
+ print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
73
+ print("="*50)
74
+
75
+ best_params = study.best_params
76
+ model_path = train_final_model(
77
+ best_params=best_params,
78
+ model_choose="deepgbm",
79
+ region="seoul",
80
+ data_sample='smote',
81
+ target='multi',
82
+ n_folds=3,
83
+ random_state=seed
84
+ )
85
+
86
+ print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
87
+ print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")
88
+
89
+ except Exception as e:
90
+ print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+
94
+ # ์ •์ƒ ์ข…๋ฃŒ
95
+ import sys
96
+ sys.exit(0)
97
+
Analysis_code/5.optima/deepgbm_smote/utils.py ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ import numpy as np
5
+ import random
6
+ import os
7
+ import copy
8
+ from sklearn.preprocessing import QuantileTransformer, LabelEncoder
9
+ from torch.utils.data import DataLoader, TensorDataset
10
+ from sklearn.metrics import confusion_matrix
11
+ from sklearn.utils.class_weight import compute_class_weight
12
+ import pandas as pd
13
+ import optuna
14
+ from sklearn.metrics import accuracy_score, f1_score
15
+ import joblib
16
+
17
+
18
+ import sys
19
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ models ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
20
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
21
+ models_path = os.path.abspath(os.path.join(current_file_dir, '../../models'))
22
+ sys.path.insert(0, models_path)
23
+ from ft_transformer import FTTransformer
24
+ from resnet_like import ResNetLike
25
+ from deepgbm import DeepGBM
26
+ import warnings
27
+ warnings.filterwarnings('ignore')
28
+
29
+
30
+ # Python ๋ฐ Numpy ์‹œ๋“œ ๊ณ ์ •
31
+ seed = 42
32
+ random.seed(seed)
33
+ np.random.seed(seed)
34
+
35
+ # PyTorch ์‹œ๋“œ ๊ณ ์ •
36
+ torch.manual_seed(seed)
37
+ torch.cuda.manual_seed(seed)
38
+ torch.cuda.manual_seed_all(seed) # Multi-GPU ํ™˜๊ฒฝ์—์„œ ๋™์ผํ•œ ์‹œ๋“œ ์ ์šฉ
39
+
40
+ # PyTorch ์—ฐ์‚ฐ์˜ ๊ฒฐ์ •์  ๋ชจ๋“œ ์„ค์ •
41
+ torch.backends.cudnn.deterministic = True # ์‹คํ–‰๋งˆ๋‹ค ๋™์ผํ•œ ๊ฒฐ๊ณผ๋ฅผ ๋ณด์žฅ
42
+ torch.backends.cudnn.benchmark = True # ์„ฑ๋Šฅ ์ตœ์ ํ™”๋ฅผ ํ™œ์„ฑํ™” (๊ฐ€๋Šฅํ•œ ํ•œ ๋น ๋ฅธ ์—ฐ์‚ฐ ์ˆ˜ํ–‰)
43
+
44
+
45
+ def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
46
+ """
47
+ ์ œ๊ฑฐํ–ˆ๋˜ ํŒŒ์ƒ ๋ณ€์ˆ˜๋“ค์„ ๋ณต๊ตฌ
48
+
49
+ Args:
50
+ df: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
51
+
52
+ Returns:
53
+ ํŒŒ์ƒ ๋ณ€์ˆ˜๊ฐ€ ์ถ”๊ฐ€๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
54
+ """
55
+ df = df.copy()
56
+ df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
57
+ df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
58
+ df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
59
+ df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
60
+ df['ground_temp - temp_C'] = df['groundtemp'] - df['temp_C']
61
+ return df
62
+
63
+ def preprocessing(df):
64
+ """๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ํ•จ์ˆ˜.
65
+
66
+ Args:
67
+ df: ์›๋ณธ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
68
+
69
+ Returns:
70
+ ์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
71
+ """
72
+ df = df[df.columns].copy()
73
+ df['year'] = df['year'].astype('int')
74
+ df['month'] = df['month'].astype('int')
75
+ df['hour'] = df['hour'].astype('int')
76
+ df = add_derived_features(df).copy()
77
+ df['multi_class'] = df['multi_class'].astype('int')
78
+ df.loc[df['wind_dir']=='์ •์˜จ', 'wind_dir'] = "0"
79
+ df['wind_dir'] = df['wind_dir'].astype('int')
80
+ df = df[['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm',
81
+ 'vap_pressure', 'dewpoint_C', 'loc_pressure', 'sea_pressure',
82
+ 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',
83
+ 'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year',
84
+ 'month', 'hour', 'ground_temp - temp_C', 'hour_sin', 'hour_cos',
85
+ 'month_sin', 'month_cos','multi_class']].copy()
86
+ return df
87
+
88
+
89
+ # ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ ํ•จ์ˆ˜
90
+ def prepare_dataset(region, data_sample='pure', target='multi', fold=3):
91
+
92
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
93
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
94
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
95
+
96
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
97
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
98
+ if data_sample == 'pure':
99
+ train_path = dat_path
100
+ else:
101
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
102
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
103
+ drop_col = ['multi_class','year']
104
+ target_col = f'{target}_class'
105
+
106
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
107
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
108
+ if data_sample == 'pure':
109
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
110
+ else:
111
+ region_train = preprocessing(pd.read_csv(train_path))
112
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
113
+ region_test = preprocessing(pd.read_csv(test_path))
114
+
115
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
116
+ common_columns = region_train.columns.to_list()
117
+ train_data = region_train[common_columns]
118
+ val_data = region_val[common_columns]
119
+ test_data = region_test[common_columns]
120
+
121
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
122
+ X_train = train_data.drop(columns=drop_col)
123
+ y_train = train_data[target_col]
124
+ X_val = val_data.drop(columns=drop_col)
125
+ y_val = val_data[target_col]
126
+ X_test = test_data.drop(columns=drop_col)
127
+ y_test = test_data[target_col]
128
+
129
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
130
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
131
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
132
+
133
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
134
+ label_encoders = {}
135
+ for col in categorical_cols:
136
+ le = LabelEncoder()
137
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
138
+ label_encoders[col] = le
139
+
140
+ # ๋ณ€ํ™˜ ์ ์šฉ
141
+ for col in categorical_cols:
142
+ X_train[col] = label_encoders[col].transform(X_train[col])
143
+ X_val[col] = label_encoders[col].transform(X_val[col])
144
+ X_test[col] = label_encoders[col].transform(X_test[col])
145
+
146
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
147
+ scaler = QuantileTransformer(output_distribution='normal')
148
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
149
+
150
+ # ๋ณ€ํ™˜ ์ ์šฉ
151
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
152
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
153
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
154
+
155
+ return X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, numerical_cols
156
+
157
+
158
+
159
+ # ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ ๋ฐ dataloader ์ƒ์„ฑ ํ•จ์ˆ˜
160
+ def prepare_dataloader(region, data_sample='pure', target='multi', fold=3, random_state=None):
161
+
162
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
163
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
164
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
165
+
166
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
167
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
168
+ if data_sample == 'pure':
169
+ train_path = dat_path
170
+ else:
171
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
172
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
173
+ drop_col = ['multi_class','year']
174
+ target_col = f'{target}_class'
175
+
176
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
177
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
178
+ if data_sample == 'pure':
179
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
180
+ else:
181
+ region_train = preprocessing(pd.read_csv(train_path))
182
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
183
+ region_test = preprocessing(pd.read_csv(test_path))
184
+
185
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
186
+ common_columns = region_train.columns.to_list()
187
+ train_data = region_train[common_columns]
188
+ val_data = region_val[common_columns]
189
+ test_data = region_test[common_columns]
190
+
191
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
192
+ X_train = train_data.drop(columns=drop_col)
193
+ y_train = train_data[target_col]
194
+ X_val = val_data.drop(columns=drop_col)
195
+ y_val = val_data[target_col]
196
+ X_test = test_data.drop(columns=drop_col)
197
+ y_test = test_data[target_col]
198
+
199
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
200
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
201
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
202
+
203
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
204
+ label_encoders = {}
205
+ for col in categorical_cols:
206
+ le = LabelEncoder()
207
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
208
+ label_encoders[col] = le
209
+
210
+ # ๋ณ€ํ™˜ ์ ์šฉ
211
+ for col in categorical_cols:
212
+ X_train[col] = label_encoders[col].transform(X_train[col])
213
+ X_val[col] = label_encoders[col].transform(X_val[col])
214
+ X_test[col] = label_encoders[col].transform(X_test[col])
215
+
216
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
217
+ scaler = QuantileTransformer(output_distribution='normal')
218
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
219
+
220
+ # ๋ณ€ํ™˜ ์ ์šฉ
221
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
222
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
223
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
224
+
225
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜์™€ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
226
+ X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)
227
+ X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)
228
+
229
+ X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)
230
+ X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)
231
+
232
+ X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)
233
+ X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)
234
+
235
+ # ๋ ˆ์ด๋ธ” ๋ณ€ํ™˜
236
+ if target == "binary":
237
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32) # ์ด์ง„ ๋ถ„๋ฅ˜ โ†’ float32
238
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
239
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
240
+ elif target == "multi":
241
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) # ๋‹ค์ค‘ ๋ถ„๋ฅ˜ โ†’ long
242
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
243
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
244
+ else:
245
+ raise ValueError("target must be 'binary' or 'multi'")
246
+
247
+ # TensorDataset ์ƒ์„ฑ
248
+ train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)
249
+ val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)
250
+ test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)
251
+
252
+ # DataLoader ์ƒ์„ฑ
253
+ if random_state == None:
254
+ train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
255
+ else:
256
+ train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(random_state))
257
+ val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
258
+ test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
259
+
260
+ return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader
261
+
262
+ # ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ ๋ฐ dataloader ์ƒ์„ฑ ํ•จ์ˆ˜ (batch_size ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”๊ฐ€ ๋ฒ„์ „)
263
+ def prepare_dataloader_with_batchsize(region, data_sample='pure', target='multi', fold=3, random_state=None, batch_size=64):
264
+ # ํŒŒ์ผ ์œ„์น˜ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฐ์ดํ„ฐ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ ์„ค์ •
265
+ current_file_dir = os.path.dirname(os.path.abspath(__file__))
266
+ data_base_dir = os.path.abspath(os.path.join(current_file_dir, '../../../data'))
267
+
268
+ # ๋ฐ์ดํ„ฐ ๊ฒฝ๋กœ ์ง€์ •
269
+ dat_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_train.csv")
270
+ if data_sample == 'pure':
271
+ train_path = dat_path
272
+ else:
273
+ train_path = os.path.join(data_base_dir, f'data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv')
274
+ test_path = os.path.join(data_base_dir, f"data_for_modeling/{region}_test.csv")
275
+ drop_col = ['multi_class','year']
276
+ target_col = f'{target}_class'
277
+
278
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
279
+ region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
280
+ if data_sample == 'pure':
281
+ region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]
282
+ else:
283
+ region_train = preprocessing(pd.read_csv(train_path))
284
+ region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]
285
+ region_test = preprocessing(pd.read_csv(test_path))
286
+
287
+ # ์ปฌ๋Ÿผ ์ •๋ ฌ (์ผ๊ด€์„ฑ ์œ ์ง€)
288
+ common_columns = region_train.columns.to_list()
289
+ train_data = region_train[common_columns]
290
+ val_data = region_val[common_columns]
291
+ test_data = region_test[common_columns]
292
+
293
+ # ์„ค๋ช…๋ณ€์ˆ˜ & ํƒ€๊ฒŸ ๋ถ„๋ฆฌ
294
+ X_train = train_data.drop(columns=drop_col)
295
+ y_train = train_data[target_col]
296
+ X_val = val_data.drop(columns=drop_col)
297
+ y_val = val_data[target_col]
298
+ X_test = test_data.drop(columns=drop_col)
299
+ y_test = test_data[target_col]
300
+
301
+ # ๋ฒ”์ฃผํ˜• & ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
302
+ categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
303
+ numerical_cols = X_train.select_dtypes(include=['float64']).columns
304
+
305
+ # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ Label Encoding
306
+ label_encoders = {}
307
+ for col in categorical_cols:
308
+ le = LabelEncoder()
309
+ le.fit(X_train[col]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
310
+ label_encoders[col] = le
311
+
312
+ # ๋ณ€ํ™˜ ์ ์šฉ
313
+ for col in categorical_cols:
314
+ X_train[col] = label_encoders[col].transform(X_train[col])
315
+ X_val[col] = label_encoders[col].transform(X_val[col])
316
+ X_test[col] = label_encoders[col].transform(X_test[col])
317
+
318
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ Quantile Transformation
319
+ scaler = QuantileTransformer(output_distribution='normal')
320
+ scaler.fit(X_train[numerical_cols]) # Train ๋ฐ์ดํ„ฐ ๊ธฐ์ค€์œผ๋กœ ํ•™์Šต
321
+
322
+ # ๋ณ€ํ™˜ ์ ์šฉ
323
+ X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
324
+ X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
325
+ X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
326
+
327
+ # ์—ฐ์†ํ˜• ๋ณ€์ˆ˜์™€ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ถ„๋ฆฌ
328
+ X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)
329
+ X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)
330
+
331
+ X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)
332
+ X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)
333
+
334
+ X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)
335
+ X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)
336
+
337
+ # ๋ ˆ์ด๋ธ” ๋ณ€ํ™˜
338
+ if target == "binary":
339
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32) # ์ด์ง„ ๋ถ„๋ฅ˜ โ†’ float32
340
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
341
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
342
+ elif target == "multi":
343
+ y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) # ๋‹ค์ค‘ ๋ถ„๋ฅ˜ โ†’ long
344
+ y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
345
+ y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
346
+ else:
347
+ raise ValueError("target must be 'binary' or 'multi'")
348
+
349
+ # TensorDataset ์ƒ์„ฑ
350
+ train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)
351
+ val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)
352
+ test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)
353
+
354
+ # DataLoader ์ƒ์„ฑ (batch_size ํŒŒ๋ผ๋ฏธํ„ฐ ์‚ฌ์šฉ)
355
+ if random_state == None:
356
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
357
+ else:
358
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator().manual_seed(random_state))
359
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
360
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
361
+
362
+ return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader, y_train, scaler
363
+
364
+
365
def calculate_csi(y_true, pred):
    """Compute a Critical Success Index (CSI) over a fixed 3-class layout.

    Classes 0 and 1 appear to be the "event" categories and class 2 the
    non-event (correct rejections cm[2, 2] are excluded) — confirm against
    the label definitions used upstream.

    Counts taken from the 3x3 confusion matrix (rows = true, cols = predicted):
      H (hits):          correctly predicted class 0 or 1.
      F (false alarms):  any wrong prediction of class 0 or 1.
      M (misses):        true class 0 or 1 predicted as class 2.

    Args:
        y_true: True labels in {0, 1, 2}.
        pred: Predicted labels in {0, 1, 2}.

    Returns:
        CSI = H / (H + F + M), with a small epsilon guarding division by zero.
    """
    # BUGFIX: pin labels=[0, 1, 2] so the matrix is always 3x3. Without it,
    # confusion_matrix shrinks when a class is absent from y_true/pred and
    # the fixed indices below raise IndexError.
    cm = confusion_matrix(y_true, pred, labels=[0, 1, 2])

    H = (cm[0, 0] + cm[1, 1])

    F = (cm[1, 0] + cm[2, 0] +
         cm[0, 1] + cm[2, 1])

    M = (cm[0, 2] + cm[1, 2])

    # Epsilon avoids 0/0 when no class-0/1 samples or predictions exist
    CSI = H / (H + F + M + 1e-10)
    return CSI
380
def sample_weight(y_train):
    """Return per-sample weights from sklearn's 'balanced' class weighting.

    Each sample receives the balanced weight of its class
    (n_samples / (n_classes * class_count)).

    Args:
        y_train: Iterable of class labels for the training samples.

    Returns:
        np.ndarray of shape (len(y_train),) with one weight per sample.
    """
    classes = np.unique(y_train)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,   # unique classes, sorted
        y=y_train          # training labels
    )
    # BUGFIX: map each class label to its weight explicitly. The original
    # indexed the weight array by the raw label value, which is only correct
    # when labels are exactly 0..k-1; a dict lookup is correct for any label
    # set (e.g. non-contiguous or non-zero-based labels).
    weight_by_class = dict(zip(classes, class_weights))
    sample_weights = np.array([weight_by_class[label] for label in y_train])

    return sample_weights
390
+ # ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ์ตœ์ ํ™” ํ•จ์ˆ˜ ์ •์˜
391
+ def objective(trial, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=42):
392
+ # GPU ์‚ฌ์šฉ ๊ฐ€๋Šฅ ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ device ์„ค์ •
393
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
394
+ val_scores = []
395
+
396
+ # --- 1. ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํƒ์ƒ‰ ๋ฒ”์œ„ ์ •์˜ (์ˆ˜์ •๋จ) ---
397
+ if model_choose == "ft_transformer":
398
+ d_token = trial.suggest_int("d_token", 64, 256, step=32)
399
+ n_blocks = trial.suggest_int("n_blocks", 2, 6) # ๊นŠ์ด ์ถ•์†Œ๋กœ ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
400
+ n_heads = trial.suggest_categorical("n_heads", [4, 8])
401
+ # d_token์€ n_heads์˜ ๋ฐฐ์ˆ˜์—ฌ์•ผ ํ•จ (FT-Transformer์˜ ๊ตฌ์กฐ์  ์ œ์•ฝ ๋Œ€์‘)
402
+ if d_token % n_heads != 0:
403
+ d_token = (d_token // n_heads) * n_heads
404
+
405
+ attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.4)
406
+ ffn_dropout = trial.suggest_float("ffn_dropout", 0.1, 0.4)
407
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True) # ๋ฒ”์œ„ ํ™•๋Œ€
408
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
409
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
410
+
411
+ elif model_choose == 'resnet_like':
412
+ d_main = trial.suggest_int("d_main", 64, 256, step=32)
413
+ d_hidden = trial.suggest_int("d_hidden", 64, 512, step=64)
414
+ n_blocks = trial.suggest_int("n_blocks", 2, 5) # ๋„ˆ๋ฌด ๊นŠ์ง€ ์•Š๊ฒŒ ์กฐ์ ˆ
415
+ dropout_first = trial.suggest_float("dropout_first", 0.1, 0.4)
416
+ dropout_second = trial.suggest_float("dropout_second", 0.0, 0.2)
417
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
418
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
419
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
420
+
421
+ elif model_choose == 'deepgbm':
422
+ # DeepGBM์˜ ๊ฒฝ์šฐ ๋ชจ๋ธ ํŠน์„ฑ์— ๋งž์ถฐ ResNet ๋ธ”๋ก ๋ฐ ์ž„๋ฒ ๋”ฉ ์ฐจ์› ์กฐ์ ˆ
423
+ d_main = trial.suggest_int("d_main", 64, 256, step=32)
424
+ d_hidden = trial.suggest_int("d_hidden", 64, 256, step=64)
425
+ n_blocks = trial.suggest_int("n_blocks", 2, 6)
426
+ dropout = trial.suggest_float("dropout", 0.1, 0.4)
427
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
428
+ weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True) # ๋” ๊ณต๊ฒฉ์ ์ธ ๋ฒ”์œ„๋กœ ํ™•์žฅ
429
+ batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256]) # Batch Size ์ถ”๊ฐ€
430
+
431
+ # --- 2. Fold๋ณ„ ํ•™์Šต ๋ฐ ๊ต์ฐจ ๊ฒ€์ฆ ---
432
+ for fold in range(1, n_folds + 1):
433
+ X_train_df, categorical_cols, numerical_cols, train_loader, val_loader, _, y_train, _ = prepare_dataloader_with_batchsize(
434
+ region, data_sample=data_sample, target=target, fold=fold, random_state=random_state, batch_size=batch_size
435
+ )
436
+
437
+ # ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
438
+ if model_choose == "ft_transformer":
439
+ model = FTTransformer(
440
+ num_features=len(numerical_cols),
441
+ cat_cardinalities=[len(X_train_df[col].unique()) for col in categorical_cols],
442
+ d_token=d_token,
443
+ n_blocks=n_blocks,
444
+ n_heads=n_heads,
445
+ attention_dropout=attention_dropout,
446
+ ffn_dropout=ffn_dropout,
447
+ num_classes=3
448
+ ).to(device)
449
+ elif model_choose == 'resnet_like':
450
+ input_dim = len(numerical_cols) + len(categorical_cols)
451
+ model = ResNetLike(
452
+ input_dim=input_dim,
453
+ d_main=d_main,
454
+ d_hidden=d_hidden,
455
+ n_blocks=n_blocks,
456
+ dropout_first=dropout_first,
457
+ dropout_second=dropout_second,
458
+ num_classes=3
459
+ ).to(device)
460
+ elif model_choose == 'deepgbm':
461
+ model = DeepGBM(
462
+ num_features=len(numerical_cols),
463
+ cat_features=[len(X_train_df[col].unique()) for col in categorical_cols],
464
+ d_main=d_main,
465
+ d_hidden=d_hidden,
466
+ n_blocks=n_blocks,
467
+ dropout=dropout,
468
+ num_classes=3
469
+ ).to(device)
470
+
471
+ # ํด๋ž˜์Šค ๊ฐ€์ค‘์น˜ ๊ณ„์‚ฐ ๋ฐ ์†์‹ค ํ•จ์ˆ˜ ์„ค์ • (Label Smoothing ์ ์šฉ)
472
+ if target == 'multi':
473
+ class_weights = compute_class_weight(
474
+ class_weight='balanced',
475
+ classes=np.unique(y_train),
476
+ y=y_train
477
+ )
478
+ # ํด๋ž˜์Šค๋ณ„ ๊ฐ€์ค‘์น˜ ๋กœ๊ทธ ์ถœ๋ ฅ
479
+ unique_classes = np.unique(y_train)
480
+ class_counts = {cls: np.sum(y_train == cls) for cls in unique_classes}
481
+ print(f" Fold {fold} - ํด๋ž˜์Šค๋ณ„ ๊ฐ€์ค‘์น˜: {dict(zip(unique_classes, class_weights))} (ํด๋ž˜์Šค๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜: {class_counts})")
482
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
483
+ criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.0) # Label Smoothing ์ถ”๊ฐ€
484
+ else:
485
+ criterion = nn.BCEWithLogitsLoss()
486
+ optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
487
+
488
+ # ํ•™์Šต๋ฅ  ์Šค์ผ€์ค„๋Ÿฌ ์ถ”๊ฐ€: ์„ฑ๋Šฅ ์ •์ฒด ์‹œ LR์„ 0.5๋ฐฐ ๊ฐ์†Œ (๊ฒ€์ฆ CSI ๊ธฐ์ค€)
489
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
490
+
491
+ # ํ•™์Šต ์„ค์ • (์—ํญ ๋ฐ ํŽ˜์ด์…˜์Šค ์ƒํ–ฅ)
492
+ epochs = 200
493
+ patience = 12 # ๋”ฅ๋Ÿฌ๋‹์˜ ์ •์ฒด ๊ตฌ๊ฐ„์„ ๊ณ ๋ คํ•˜์—ฌ ์†Œํญ ์ƒํ–ฅ
494
+ best_fold_csi = 0
495
+ counter = 0
496
+
497
+ for epoch in range(epochs):
498
+ model.train()
499
+ for x_num_batch, x_cat_batch, y_batch in train_loader:
500
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
501
+
502
+ optimizer.zero_grad()
503
+ y_pred = model(x_num_batch, x_cat_batch)
504
+ loss = criterion(y_pred, y_batch if target == 'multi' else y_batch.float())
505
+ loss.backward()
506
+ optimizer.step()
507
+
508
+ # Validation ํ‰๊ฐ€
509
+ model.eval()
510
+ y_pred_val, y_true_val = [], []
511
+ with torch.no_grad():
512
+ for x_num_batch, x_cat_batch, y_batch in val_loader:
513
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
514
+ output = model(x_num_batch, x_cat_batch)
515
+ pred = output.argmax(dim=1) if target == 'multi' else (torch.sigmoid(output) >= 0.5).long()
516
+
517
+ y_pred_val.extend(pred.cpu().numpy())
518
+ y_true_val.extend(y_batch.cpu().numpy())
519
+
520
+ # CSI ๊ณ„์‚ฐ ๋ฐ ์Šค์ผ€์ค„๋Ÿฌ ์—…๋ฐ์ดํŠธ
521
+ val_csi = calculate_csi(y_true_val, y_pred_val)
522
+ scheduler.step(val_csi)
523
+
524
+ # Optuna Pruning ์ ์šฉ (์ฒซ ๋ฒˆ์งธ Fold์—์„œ ์กฐ๊ธฐ ์ข…๋ฃŒ ํŒ๋‹จ ๊ฐ•ํ™”)
525
+ trial.report(val_csi, epoch)
526
+ if trial.should_prune():
527
+ raise optuna.exceptions.TrialPruned()
528
+
529
+ # Early Stopping ์ฒดํฌ
530
+ if val_csi > best_fold_csi:
531
+ best_fold_csi = val_csi
532
+ counter = 0
533
+ else:
534
+ counter += 1
535
+
536
+ if counter >= patience:
537
+ break
538
+
539
+ val_scores.append(best_fold_csi)
540
+
541
+ # ๋ชจ๋“  fold์˜ ํ‰๊ท  ์„ฑ๋Šฅ ๋ฐ˜ํ™˜
542
+ return np.mean(val_scores)
543
+
544
+
545
+ # ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ํ•จ์ˆ˜
546
+ def train_final_model(best_params, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=42):
547
+ """
548
+ ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ์„ ํ•™์Šตํ•˜๊ณ  ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
549
+
550
+ Args:
551
+ best_params: ์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ
552
+ model_choose: ๋ชจ๋ธ ์„ ํƒ ('ft_transformer', 'resnet_like', 'deepgbm')
553
+ region: ์ง€์—ญ๋ช…
554
+ data_sample: ๋ฐ์ดํ„ฐ ์ƒ˜ํ”Œ ํƒ€์ž… ('pure', 'smote', etc.)
555
+ target: ํƒ€๊ฒŸ ํƒ€์ž… ('multi', 'binary')
556
+ n_folds: ๊ต์ฐจ ๊ฒ€์ฆ fold ์ˆ˜
557
+ random_state: ๋žœ๋ค ์‹œ๋“œ
558
+
559
+ Returns:
560
+ ์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ ๋ฆฌ์ŠคํŠธ
561
+ """
562
+ # GPU ์‚ฌ์šฉ ๊ฐ€๋Šฅ ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ device ์„ค์ •
563
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
564
+
565
+ models = []
566
+ scalers = [] # scaler ๋ฆฌ์ŠคํŠธ ์ถ”๊ฐ€
567
+
568
+ print("์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘...")
569
+
570
+ for fold in range(1, n_folds + 1):
571
+ print(f"Fold {fold} ํ•™์Šต ์ค‘...")
572
+
573
+ # ์ตœ์ ํ™”๋œ batch_size ์‚ฌ์šฉ
574
+ batch_size = best_params.get("batch_size", 64)
575
+ X_train_df, categorical_cols, numerical_cols, train_loader, val_loader, _, y_train, scaler = prepare_dataloader_with_batchsize(
576
+ region, data_sample=data_sample, target=target, fold=fold, random_state=random_state, batch_size=batch_size
577
+ )
578
+
579
+ # ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
580
+ if model_choose == "ft_transformer":
581
+ d_token = best_params["d_token"]
582
+ n_heads = best_params.get("n_heads", 8)
583
+ # d_token์€ n_heads์˜ ๋ฐฐ์ˆ˜์—ฌ์•ผ ํ•จ (FT-Transformer์˜ ๊ตฌ์กฐ์  ์ œ์•ฝ ๋Œ€์‘)
584
+ if d_token % n_heads != 0:
585
+ d_token = (d_token // n_heads) * n_heads
586
+
587
+ model = FTTransformer(
588
+ num_features=len(numerical_cols),
589
+ cat_cardinalities=[len(X_train_df[col].unique()) for col in categorical_cols],
590
+ d_token=d_token,
591
+ n_blocks=best_params["n_blocks"],
592
+ n_heads=n_heads,
593
+ attention_dropout=best_params["attention_dropout"],
594
+ ffn_dropout=best_params["ffn_dropout"],
595
+ num_classes=3
596
+ ).to(device)
597
+ elif model_choose == 'resnet_like':
598
+ input_dim = len(numerical_cols) + len(categorical_cols)
599
+ model = ResNetLike(
600
+ input_dim=input_dim,
601
+ d_main=best_params["d_main"],
602
+ d_hidden=best_params["d_hidden"],
603
+ n_blocks=best_params["n_blocks"],
604
+ dropout_first=best_params["dropout_first"],
605
+ dropout_second=best_params["dropout_second"],
606
+ num_classes=3
607
+ ).to(device)
608
+ elif model_choose == 'deepgbm':
609
+ model = DeepGBM(
610
+ num_features=len(numerical_cols),
611
+ cat_features=[len(X_train_df[col].unique()) for col in categorical_cols],
612
+ d_main=best_params["d_main"],
613
+ d_hidden=best_params["d_hidden"],
614
+ n_blocks=best_params["n_blocks"],
615
+ dropout=best_params["dropout"],
616
+ num_classes=3
617
+ ).to(device)
618
+ else:
619
+ raise ValueError(f"Unknown model_choose: {model_choose}")
620
+
621
+ # ํด๋ž˜์Šค ๊ฐ€์ค‘์น˜ ๊ณ„์‚ฐ ๋ฐ ์†์‹ค ํ•จ์ˆ˜ ์„ค์ • (Label Smoothing ์ ์šฉ)
622
+ if target == 'multi':
623
+ class_weights = compute_class_weight(
624
+ class_weight='balanced',
625
+ classes=np.unique(y_train),
626
+ y=y_train
627
+ )
628
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
629
+ criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.0) # Label Smoothing ์ถ”๊ฐ€
630
+ else:
631
+ criterion = nn.BCEWithLogitsLoss()
632
+ optimizer = optim.AdamW(model.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"])
633
+
634
+ # ํ•™์Šต๋ฅ  ์Šค์ผ€์ค„๋Ÿฌ
635
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
636
+
637
+ # ํ•™์Šต ์„ค์ •
638
+ epochs = 200
639
+ patience = 12
640
+ best_fold_csi = 0
641
+ counter = 0
642
+ best_model = None
643
+
644
+ for epoch in range(epochs):
645
+ model.train()
646
+ for x_num_batch, x_cat_batch, y_batch in train_loader:
647
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
648
+
649
+ optimizer.zero_grad()
650
+ y_pred = model(x_num_batch, x_cat_batch)
651
+ loss = criterion(y_pred, y_batch if target == 'multi' else y_batch.float())
652
+ loss.backward()
653
+ optimizer.step()
654
+
655
+ # Validation ํ‰๊ฐ€
656
+ model.eval()
657
+ y_pred_val, y_true_val = [], []
658
+ with torch.no_grad():
659
+ for x_num_batch, x_cat_batch, y_batch in val_loader:
660
+ x_num_batch, x_cat_batch, y_batch = x_num_batch.to(device), x_cat_batch.to(device), y_batch.to(device)
661
+ output = model(x_num_batch, x_cat_batch)
662
+ pred = output.argmax(dim=1) if target == 'multi' else (torch.sigmoid(output) >= 0.5).long()
663
+
664
+ y_pred_val.extend(pred.cpu().numpy())
665
+ y_true_val.extend(y_batch.cpu().numpy())
666
+
667
+ # CSI ๊ณ„์‚ฐ ๋ฐ ์Šค์ผ€์ค„๋Ÿฌ ์—…๋ฐ์ดํŠธ
668
+ val_csi = calculate_csi(y_true_val, y_pred_val)
669
+ scheduler.step(val_csi)
670
+
671
+ # Early Stopping ์ฒดํฌ
672
+ if val_csi > best_fold_csi:
673
+ best_fold_csi = val_csi
674
+ counter = 0
675
+ best_model = copy.deepcopy(model)
676
+ else:
677
+ counter += 1
678
+
679
+ if counter >= patience:
680
+ print(f" Early stopping at epoch {epoch+1}, Best CSI: {best_fold_csi:.4f}")
681
+ break
682
+
683
+ if best_model is None:
684
+ best_model = model
685
+
686
+ scalers.append(scaler) # scaler ์ €์žฅ (fold ์ˆœ์„œ๋Œ€๋กœ)
687
+ models.append(best_model)
688
+ print(f" Fold {fold} ํ•™์Šต ์™„๋ฃŒ (๊ฒ€์ฆ CSI: {best_fold_csi:.4f})")
689
+
690
+ # ๋ชจ๋ธ ์ €์žฅ ๊ฒฝ๋กœ ์„ค์ •
691
+ save_dir = f'../save_model/{model_choose}_optima'
692
+ os.makedirs(save_dir, exist_ok=True)
693
+
694
+ # ํŒŒ์ผ๋ช… ์ƒ์„ฑ
695
+ if data_sample == 'pure':
696
+ model_filename = f'{model_choose}_pure_{region}.pkl'
697
+ else:
698
+ model_filename = f'{model_choose}_{data_sample}_{region}.pkl'
699
+
700
+ model_path = f'{save_dir}/{model_filename}'
701
+
702
+ # ๋ฆฌ์ŠคํŠธ์— ๋‹ด์•„ ํ•œ ๋ฒˆ์— ์ €์žฅ
703
+ joblib.dump(models, model_path)
704
+ print(f"\n๋ชจ๋“  ๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ: {model_path} (์ด {len(models)}๊ฐœ fold)")
705
+
706
+ # Scaler ๋ณ„๋„ ์ €์žฅ
707
+ scaler_save_dir = f'../save_model/{model_choose}_optima/scaler'
708
+ os.makedirs(scaler_save_dir, exist_ok=True)
709
+
710
+ # ํŒŒ์ผ๋ช… ์ƒ์„ฑ (๋ชจ๋ธ๊ณผ ๋™์ผํ•œ ํŒจํ„ด)
711
+ if data_sample == 'pure':
712
+ scaler_filename = f'{model_choose}_pure_{region}_scaler.pkl'
713
+ else:
714
+ scaler_filename = f'{model_choose}_{data_sample}_{region}_scaler.pkl'
715
+
716
+ scaler_path = f'{scaler_save_dir}/{scaler_filename}'
717
+ joblib.dump(scalers, scaler_path)
718
+ print(f"Scaler ์ €์žฅ ์™„๋ฃŒ: {scaler_path} (์ด {len(scalers)}๊ฐœ fold)")
719
+
720
+ return model_path
Analysis_code/5.optima/deepgbm_smotenc_ctgan20000/deepgbm_smotenc_ctgan20000_busan.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Driver script: Optuna hyperparameter search for DeepGBM on the busan region
# with the smotenc_ctgan20000 oversampled dataset, followed by final training.
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *
# Fix Python and NumPy seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)


# 1. Create the study with direction='maximize' (higher CSI is better)
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)  # observe the first 10 epochs, then prune
)
# Callback printing detailed info after each completed trial
def print_trial_callback(study, trial):
    """Print trial value, parameters, and the study's running best after each trial."""
    # NOTE(review): study.best_value raises ValueError when no trial has
    # completed yet (e.g. the first trial is pruned) — confirm this cannot
    # happen before the first successful trial.
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
    print(f" Parameters: {trial.params}")
    print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
    print(f" Best Trial: {study.best_trial.number}")
    print(f" Best Parameters: {study.best_params}")
    print(f"{'='*80}\n")



# 2. Run the optimization (objective comes from utils)
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="busan", data_sample='smotenc_ctgan20000'),
    n_trials=100
    ,
    callbacks=[print_trial_callback]
)

# 3. Report and summarize the results
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect CSI scores of all completed (non-pruned, non-failed) trials
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Save the Study object for later inspection.
    # The base directory is resolved relative to this file's location.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # the 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_smotenc_ctgan20000_busan_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Train and persist the final per-fold models with the best hyperparameters
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_path = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="busan",
        data_sample='smotenc_ctgan20000',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")

except Exception as e:
    # Report, but do not crash on, any post-optimization failure
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# Exit cleanly so batch runners treat this script as successful
import sys
sys.exit(0)
Analysis_code/5.optima/deepgbm_smotenc_ctgan20000/deepgbm_smotenc_ctgan20000_daegu.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Driver script: Optuna hyperparameter search for DeepGBM on the daegu region
# with the smotenc_ctgan20000 oversampled dataset, followed by final training.
import optuna
import numpy as np
import random
import pandas as pd
import joblib
import os
import torch
from utils import *
# Fix Python and NumPy seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)


# 1. Create the study with direction='maximize' (higher CSI is better)
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)  # observe the first 10 epochs, then prune
)
# Callback printing detailed info after each completed trial
def print_trial_callback(study, trial):
    """Print trial value, parameters, and the study's running best after each trial."""
    # NOTE(review): study.best_value raises ValueError when no trial has
    # completed yet (e.g. the first trial is pruned) — confirm this cannot
    # happen before the first successful trial.
    print(f"\n{'='*80}")
    print(f"Trial {trial.number} ์™„๋ฃŒ")
    print(f" Value (CSI): {trial.value:.6f}" if trial.value is not None else f" Value: {trial.value}")
    print(f" Parameters: {trial.params}")
    print(f" Best Value (CSI): {study.best_value:.6f}" if study.best_value is not None else f" Best Value: {study.best_value}")
    print(f" Best Trial: {study.best_trial.number}")
    print(f" Best Parameters: {study.best_params}")
    print(f"{'='*80}\n")



# 2. Run the optimization (objective comes from utils)
study.optimize(
    lambda trial: objective(trial, model_choose="deepgbm", region="daegu", data_sample='smotenc_ctgan20000'),
    n_trials=100
    ,
    callbacks=[print_trial_callback]
)

# 3. Report and summarize the results
print(f"\n์ตœ์ ํ™” ์™„๋ฃŒ.")
print(f"Best CSI Score: {study.best_value:.4f}")
print(f"Best Hyperparameters: {study.best_params}")

try:
    # Collect CSI scores of all completed (non-pruned, non-failed) trials
    csi_scores = [trial.value for trial in study.trials if trial.value is not None]

    if len(csi_scores) > 0:
        print(f"\n์ตœ์ ํ™” ๊ณผ์ • ์š”์•ฝ:")
        print(f" - ์ด ์‹œ๋„ ํšŸ์ˆ˜: {len(study.trials)}")
        print(f" - ์„ฑ๊ณตํ•œ ์‹œ๋„: {len(csi_scores)}")
        print(f" - ์ตœ์ดˆ CSI: {csi_scores[0]:.4f}")
        print(f" - ์ตœ์ข… CSI: {csi_scores[-1]:.4f}")
        print(f" - ์ตœ๊ณ  CSI: {max(csi_scores):.4f}")
        print(f" - ์ตœ์ € CSI: {min(csi_scores):.4f}")
        print(f" - ํ‰๊ท  CSI: {np.mean(csi_scores):.4f}")

    # Save the Study object for later inspection.
    # The base directory is resolved relative to this file's location.
    current_file_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(os.path.dirname(current_file_dir))  # the 5.optima directory
    os.makedirs(os.path.join(base_dir, "optimization_history"), exist_ok=True)
    study_path = os.path.join(base_dir, "optimization_history/deepgbm_smotenc_ctgan20000_daegu_trials.pkl")
    joblib.dump(study, study_path)
    print(f"\n์ตœ์ ํ™” Study ๊ฐ์ฒด๊ฐ€ {study_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    # Train and persist the final per-fold models with the best hyperparameters
    print("\n" + "="*50)
    print("์ตœ์ ํ™”๋œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘")
    print("="*50)

    best_params = study.best_params
    model_path = train_final_model(
        best_params=best_params,
        model_choose="deepgbm",
        region="daegu",
        data_sample='smotenc_ctgan20000',
        target='multi',
        n_folds=3,
        random_state=seed
    )

    print(f"\n์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ์ €์žฅ ์™„๋ฃŒ!")
    print(f"์ €์žฅ๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ: {model_path}")

except Exception as e:
    # Report, but do not crash on, any post-optimization failure
    print(f"\nโš ๏ธ ์ตœ์ ํ™” ๊ฒฐ๊ณผ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    import traceback
    traceback.print_exc()

# Exit cleanly so batch runners treat this script as successful
import sys
sys.exit(0)