bong9513 commited on
Commit
9e55601
·
verified ·
1 Parent(s): 7a72c1c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +144 -0
  2. Analysis_code/0.air_data_merge.ipynb +1029 -0
  3. Analysis_code/1.data_merge.ipynb +0 -0
  4. Analysis_code/2.eda_preproccesing.ipynb +3 -0
  5. Analysis_code/3.oversampling.ipynb +0 -0
  6. Analysis_code/__pycache__/code.cpython-312.pyc +0 -0
  7. Analysis_code/__pycache__/deepgbm.cpython-312.pyc +0 -0
  8. Analysis_code/__pycache__/deepgbm.cpython-38.pyc +0 -0
  9. Analysis_code/__pycache__/deepgbm.cpython-39.pyc +0 -0
  10. Analysis_code/__pycache__/ft_transformer.cpython-312.pyc +0 -0
  11. Analysis_code/__pycache__/ft_transformer.cpython-38.pyc +0 -0
  12. Analysis_code/__pycache__/ft_transformer.cpython-39.pyc +0 -0
  13. Analysis_code/__pycache__/resnet_like.cpython-312.pyc +0 -0
  14. Analysis_code/__pycache__/resnet_like.cpython-38.pyc +0 -0
  15. Analysis_code/__pycache__/resnet_like.cpython-39.pyc +0 -0
  16. Analysis_code/best_deepgbm_model.pth +3 -0
  17. Analysis_code/best_model_f1.pth +3 -0
  18. Analysis_code/deepgbm.py +47 -0
  19. Analysis_code/deeplearning_model_binary.ipynb +0 -0
  20. Analysis_code/deeplearning_model_multi.ipynb +0 -0
  21. Analysis_code/final_test/final.ipynb +1143 -0
  22. Analysis_code/find_reason/ busan_trend.ipynb +0 -0
  23. Analysis_code/find_reason/ daegu_trend.ipynb +0 -0
  24. Analysis_code/find_reason/ gwangju_trend.ipynb +0 -0
  25. Analysis_code/find_reason/ incheon_trend.ipynb +0 -0
  26. Analysis_code/find_reason/ seoul_trend.ipynb +0 -0
  27. Analysis_code/find_reason/busan.ipynb +0 -0
  28. Analysis_code/find_reason/daegu.ipynb +0 -0
  29. Analysis_code/find_reason/daejeon.ipynb +0 -0
  30. Analysis_code/find_reason/daejeon_trend.ipynb +0 -0
  31. Analysis_code/find_reason/gwangju.ipynb +0 -0
  32. Analysis_code/find_reason/incheon.ipynb +0 -0
  33. Analysis_code/find_reason/seoul.ipynb +0 -0
  34. Analysis_code/ft_transformer.py +56 -0
  35. Analysis_code/make_oversample_data/gan_sample_10000_1.py +181 -0
  36. Analysis_code/make_oversample_data/gan_sample_10000_2.py +182 -0
  37. Analysis_code/make_oversample_data/gan_sample_10000_3.py +182 -0
  38. Analysis_code/make_oversample_data/gan_sample_20000_1.py +183 -0
  39. Analysis_code/make_oversample_data/gan_sample_20000_2.py +183 -0
  40. Analysis_code/make_oversample_data/gan_sample_20000_3.py +183 -0
  41. Analysis_code/make_oversample_data/gan_sample_7000_1.py +180 -0
  42. Analysis_code/make_oversample_data/gan_sample_7000_2.py +182 -0
  43. Analysis_code/make_oversample_data/gan_sample_7000_3.py +182 -0
  44. Analysis_code/make_oversample_data/oversampling_code.py +355 -0
  45. Analysis_code/make_oversample_data/smote_sample_1.py +53 -0
  46. Analysis_code/make_oversample_data/smote_sample_2.py +53 -0
  47. Analysis_code/make_oversample_data/smote_sample_3.py +53 -0
  48. Analysis_code/make_train_test.ipynb +1099 -0
  49. Analysis_code/model_result/best_sample/ensemble_best_sample.csv +157 -0
  50. Analysis_code/model_result/deepgbm_sampled_data_test.csv +31 -0
.gitattributes CHANGED
@@ -33,3 +33,147 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Analysis_code/2.eda_preproccesing.ipynb filter=lfs diff=lfs merge=lfs -text
37
+ data/ASOS/2018.csv filter=lfs diff=lfs merge=lfs -text
38
+ data/ASOS/2019.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/ASOS/2020.csv filter=lfs diff=lfs merge=lfs -text
40
+ data/ASOS/2021.csv filter=lfs diff=lfs merge=lfs -text
41
+ data/ASOS/2022.csv filter=lfs diff=lfs merge=lfs -text
42
+ data/ASOS/2023.csv filter=lfs diff=lfs merge=lfs -text
43
+ data/ASOS/asos.feather filter=lfs diff=lfs merge=lfs -text
44
+ data/ASOS/asos_dup.feather filter=lfs diff=lfs merge=lfs -text
45
+ data/ASOS/asos_fill_위경.feather filter=lfs diff=lfs merge=lfs -text
46
+ data/ASOS/asos_merge.feather filter=lfs diff=lfs merge=lfs -text
47
+ data/asos_air_merge.feather filter=lfs diff=lfs merge=lfs -text
48
+ data/data_for_demo/df_busan.feather filter=lfs diff=lfs merge=lfs -text
49
+ data/data_for_demo/df_daegu.feather filter=lfs diff=lfs merge=lfs -text
50
+ data/data_for_demo/df_daejeon.feather filter=lfs diff=lfs merge=lfs -text
51
+ data/data_for_demo/df_gwangju.feather filter=lfs diff=lfs merge=lfs -text
52
+ data/data_for_demo/df_incheon.feather filter=lfs diff=lfs merge=lfs -text
53
+ data/data_for_demo/df_seoul.feather filter=lfs diff=lfs merge=lfs -text
54
+ data/data_for_modeling/df_busan.feather filter=lfs diff=lfs merge=lfs -text
55
+ data/data_for_modeling/df_daegu.feather filter=lfs diff=lfs merge=lfs -text
56
+ data/data_for_modeling/df_daejeon.feather filter=lfs diff=lfs merge=lfs -text
57
+ data/data_for_modeling/df_gwangju.feather filter=lfs diff=lfs merge=lfs -text
58
+ data/data_for_modeling/df_incheon.feather filter=lfs diff=lfs merge=lfs -text
59
+ data/data_for_modeling/df_seoul.feather filter=lfs diff=lfs merge=lfs -text
60
+ data/data_oversampled/ctgan10000/ctgan10000_1_busan.csv filter=lfs diff=lfs merge=lfs -text
61
+ data/data_oversampled/ctgan10000/ctgan10000_1_daegu.csv filter=lfs diff=lfs merge=lfs -text
62
+ data/data_oversampled/ctgan10000/ctgan10000_1_daejeon.csv filter=lfs diff=lfs merge=lfs -text
63
+ data/data_oversampled/ctgan10000/ctgan10000_1_gwangju.csv filter=lfs diff=lfs merge=lfs -text
64
+ data/data_oversampled/ctgan10000/ctgan10000_1_incheon.csv filter=lfs diff=lfs merge=lfs -text
65
+ data/data_oversampled/ctgan10000/ctgan10000_1_seoul.csv filter=lfs diff=lfs merge=lfs -text
66
+ data/data_oversampled/ctgan10000/ctgan10000_2_busan.csv filter=lfs diff=lfs merge=lfs -text
67
+ data/data_oversampled/ctgan10000/ctgan10000_2_daegu.csv filter=lfs diff=lfs merge=lfs -text
68
+ data/data_oversampled/ctgan10000/ctgan10000_2_daejeon.csv filter=lfs diff=lfs merge=lfs -text
69
+ data/data_oversampled/ctgan10000/ctgan10000_2_gwangju.csv filter=lfs diff=lfs merge=lfs -text
70
+ data/data_oversampled/ctgan10000/ctgan10000_2_incheon.csv filter=lfs diff=lfs merge=lfs -text
71
+ data/data_oversampled/ctgan10000/ctgan10000_2_seoul.csv filter=lfs diff=lfs merge=lfs -text
72
+ data/data_oversampled/ctgan10000/ctgan10000_3_busan.csv filter=lfs diff=lfs merge=lfs -text
73
+ data/data_oversampled/ctgan10000/ctgan10000_3_daegu.csv filter=lfs diff=lfs merge=lfs -text
74
+ data/data_oversampled/ctgan10000/ctgan10000_3_daejeon.csv filter=lfs diff=lfs merge=lfs -text
75
+ data/data_oversampled/ctgan10000/ctgan10000_3_gwangju.csv filter=lfs diff=lfs merge=lfs -text
76
+ data/data_oversampled/ctgan10000/ctgan10000_3_incheon.csv filter=lfs diff=lfs merge=lfs -text
77
+ data/data_oversampled/ctgan10000/ctgan10000_3_seoul.csv filter=lfs diff=lfs merge=lfs -text
78
+ data/data_oversampled/ctgan20000/ctgan20000_1_busan.csv filter=lfs diff=lfs merge=lfs -text
79
+ data/data_oversampled/ctgan20000/ctgan20000_1_daegu.csv filter=lfs diff=lfs merge=lfs -text
80
+ data/data_oversampled/ctgan20000/ctgan20000_1_daejeon.csv filter=lfs diff=lfs merge=lfs -text
81
+ data/data_oversampled/ctgan20000/ctgan20000_1_gwangju.csv filter=lfs diff=lfs merge=lfs -text
82
+ data/data_oversampled/ctgan20000/ctgan20000_1_incheon.csv filter=lfs diff=lfs merge=lfs -text
83
+ data/data_oversampled/ctgan20000/ctgan20000_1_seoul.csv filter=lfs diff=lfs merge=lfs -text
84
+ data/data_oversampled/ctgan20000/ctgan20000_2_busan.csv filter=lfs diff=lfs merge=lfs -text
85
+ data/data_oversampled/ctgan20000/ctgan20000_2_daegu.csv filter=lfs diff=lfs merge=lfs -text
86
+ data/data_oversampled/ctgan20000/ctgan20000_2_daejeon.csv filter=lfs diff=lfs merge=lfs -text
87
+ data/data_oversampled/ctgan20000/ctgan20000_2_gwangju.csv filter=lfs diff=lfs merge=lfs -text
88
+ data/data_oversampled/ctgan20000/ctgan20000_2_incheon.csv filter=lfs diff=lfs merge=lfs -text
89
+ data/data_oversampled/ctgan20000/ctgan20000_2_seoul.csv filter=lfs diff=lfs merge=lfs -text
90
+ data/data_oversampled/ctgan20000/ctgan20000_3_busan.csv filter=lfs diff=lfs merge=lfs -text
91
+ data/data_oversampled/ctgan20000/ctgan20000_3_daegu.csv filter=lfs diff=lfs merge=lfs -text
92
+ data/data_oversampled/ctgan20000/ctgan20000_3_daejeon.csv filter=lfs diff=lfs merge=lfs -text
93
+ data/data_oversampled/ctgan20000/ctgan20000_3_gwangju.csv filter=lfs diff=lfs merge=lfs -text
94
+ data/data_oversampled/ctgan20000/ctgan20000_3_incheon.csv filter=lfs diff=lfs merge=lfs -text
95
+ data/data_oversampled/ctgan20000/ctgan20000_3_seoul.csv filter=lfs diff=lfs merge=lfs -text
96
+ data/local_selected.feather filter=lfs diff=lfs merge=lfs -text
97
+ data/대기질/2017_12.xlsx filter=lfs diff=lfs merge=lfs -text
98
+ data/대기질/2018/2018_1.xlsx filter=lfs diff=lfs merge=lfs -text
99
+ data/대기질/2018/2018_10.xlsx filter=lfs diff=lfs merge=lfs -text
100
+ data/대기질/2018/2018_11.xlsx filter=lfs diff=lfs merge=lfs -text
101
+ data/대기질/2018/2018_12.xlsx filter=lfs diff=lfs merge=lfs -text
102
+ data/대기질/2018/2018_2.xlsx filter=lfs diff=lfs merge=lfs -text
103
+ data/대기질/2018/2018_3.xlsx filter=lfs diff=lfs merge=lfs -text
104
+ data/대기질/2018/2018_4.xlsx filter=lfs diff=lfs merge=lfs -text
105
+ data/대기질/2018/2018_5.xlsx filter=lfs diff=lfs merge=lfs -text
106
+ data/대기질/2018/2018_6.xlsx filter=lfs diff=lfs merge=lfs -text
107
+ data/대기질/2018/2018_7.xlsx filter=lfs diff=lfs merge=lfs -text
108
+ data/대기질/2018/2018_8.xlsx filter=lfs diff=lfs merge=lfs -text
109
+ data/대기질/2018/2018_9.xlsx filter=lfs diff=lfs merge=lfs -text
110
+ data/대기질/2018.feather filter=lfs diff=lfs merge=lfs -text
111
+ data/대기질/2019/2019_01.xlsx filter=lfs diff=lfs merge=lfs -text
112
+ data/대기질/2019/2019_02.xlsx filter=lfs diff=lfs merge=lfs -text
113
+ data/대기질/2019/2019_03.xlsx filter=lfs diff=lfs merge=lfs -text
114
+ data/대기질/2019/2019_04.xlsx filter=lfs diff=lfs merge=lfs -text
115
+ data/대기질/2019/2019_05.xlsx filter=lfs diff=lfs merge=lfs -text
116
+ data/대기질/2019/2019_06.xlsx filter=lfs diff=lfs merge=lfs -text
117
+ data/대기질/2019/2019_07.xlsx filter=lfs diff=lfs merge=lfs -text
118
+ data/대기질/2019/2019_08.xlsx filter=lfs diff=lfs merge=lfs -text
119
+ data/대기질/2019/2019_09.xlsx filter=lfs diff=lfs merge=lfs -text
120
+ data/대기질/2019/2019_10.xlsx filter=lfs diff=lfs merge=lfs -text
121
+ data/대기질/2019/2019_11.xlsx filter=lfs diff=lfs merge=lfs -text
122
+ data/대기질/2019/2019_12.xlsx filter=lfs diff=lfs merge=lfs -text
123
+ data/대기질/2019.feather filter=lfs diff=lfs merge=lfs -text
124
+ data/대기질/2020/2020_1.xlsx filter=lfs diff=lfs merge=lfs -text
125
+ data/대기질/2020/2020_10.xlsx filter=lfs diff=lfs merge=lfs -text
126
+ data/대기질/2020/2020_11.xlsx filter=lfs diff=lfs merge=lfs -text
127
+ data/대기질/2020/2020_12.xlsx filter=lfs diff=lfs merge=lfs -text
128
+ data/대기질/2020/2020_2.xlsx filter=lfs diff=lfs merge=lfs -text
129
+ data/대기질/2020/2020_3.xlsx filter=lfs diff=lfs merge=lfs -text
130
+ data/대기질/2020/2020_4.xlsx filter=lfs diff=lfs merge=lfs -text
131
+ data/대기질/2020/2020_5.xlsx filter=lfs diff=lfs merge=lfs -text
132
+ data/대기질/2020/2020_6.xlsx filter=lfs diff=lfs merge=lfs -text
133
+ data/대기질/2020/2020_7.xlsx filter=lfs diff=lfs merge=lfs -text
134
+ data/대기질/2020/2020_8.xlsx filter=lfs diff=lfs merge=lfs -text
135
+ data/대기질/2020/2020_9.xlsx filter=lfs diff=lfs merge=lfs -text
136
+ data/대기질/2020.feather filter=lfs diff=lfs merge=lfs -text
137
+ data/대기질/2021/2021_1.xlsx filter=lfs diff=lfs merge=lfs -text
138
+ data/대기질/2021/2021_10.xlsx filter=lfs diff=lfs merge=lfs -text
139
+ data/대기질/2021/2021_11.xlsx filter=lfs diff=lfs merge=lfs -text
140
+ data/대기질/2021/2021_12.xlsx filter=lfs diff=lfs merge=lfs -text
141
+ data/대기질/2021/2021_2.xlsx filter=lfs diff=lfs merge=lfs -text
142
+ data/대기질/2021/2021_3.xlsx filter=lfs diff=lfs merge=lfs -text
143
+ data/대기질/2021/2021_4.xlsx filter=lfs diff=lfs merge=lfs -text
144
+ data/대기질/2021/2021_5.xlsx filter=lfs diff=lfs merge=lfs -text
145
+ data/대기질/2021/2021_6.xlsx filter=lfs diff=lfs merge=lfs -text
146
+ data/대기질/2021/2021_7.xlsx filter=lfs diff=lfs merge=lfs -text
147
+ data/대기질/2021/2021_8.xlsx filter=lfs diff=lfs merge=lfs -text
148
+ data/대기질/2021/2021_9.xlsx filter=lfs diff=lfs merge=lfs -text
149
+ data/대기질/2021.feather filter=lfs diff=lfs merge=lfs -text
150
+ data/대기질/2022/2022_1.xlsx filter=lfs diff=lfs merge=lfs -text
151
+ data/대기질/2022/2022_10.xlsx filter=lfs diff=lfs merge=lfs -text
152
+ data/대기질/2022/2022_11.xlsx filter=lfs diff=lfs merge=lfs -text
153
+ data/대기질/2022/2022_12.xlsx filter=lfs diff=lfs merge=lfs -text
154
+ data/대기질/2022/2022_2.xlsx filter=lfs diff=lfs merge=lfs -text
155
+ data/대기질/2022/2022_3.xlsx filter=lfs diff=lfs merge=lfs -text
156
+ data/대기질/2022/2022_4.xlsx filter=lfs diff=lfs merge=lfs -text
157
+ data/대기질/2022/2022_5.xlsx filter=lfs diff=lfs merge=lfs -text
158
+ data/대기질/2022/2022_6.xlsx filter=lfs diff=lfs merge=lfs -text
159
+ data/대기질/2022/2022_7.xlsx filter=lfs diff=lfs merge=lfs -text
160
+ data/대기질/2022/2022_8.xlsx filter=lfs diff=lfs merge=lfs -text
161
+ data/대기질/2022/2022_9.xlsx filter=lfs diff=lfs merge=lfs -text
162
+ data/대기질/2022.feather filter=lfs diff=lfs merge=lfs -text
163
+ data/대기질/2023/2023_1.xlsx filter=lfs diff=lfs merge=lfs -text
164
+ data/대기질/2023/2023_10.xlsx filter=lfs diff=lfs merge=lfs -text
165
+ data/대기질/2023/2023_11.xlsx filter=lfs diff=lfs merge=lfs -text
166
+ data/대기질/2023/2023_12.xlsx filter=lfs diff=lfs merge=lfs -text
167
+ data/대기질/2023/2023_2.xlsx filter=lfs diff=lfs merge=lfs -text
168
+ data/대기질/2023/2023_3.xlsx filter=lfs diff=lfs merge=lfs -text
169
+ data/대기질/2023/2023_4.xlsx filter=lfs diff=lfs merge=lfs -text
170
+ data/대기질/2023/2023_5.xlsx filter=lfs diff=lfs merge=lfs -text
171
+ data/대기질/2023/2023_6.xlsx filter=lfs diff=lfs merge=lfs -text
172
+ data/대기질/2023/2023_7.xlsx filter=lfs diff=lfs merge=lfs -text
173
+ data/대기질/2023/2023_8.xlsx filter=lfs diff=lfs merge=lfs -text
174
+ data/대기질/2023/2023_9.xlsx filter=lfs diff=lfs merge=lfs -text
175
+ data/대기질/2023.feather filter=lfs diff=lfs merge=lfs -text
176
+ data/대기질/air.feather filter=lfs diff=lfs merge=lfs -text
177
+ data/대기질/air_add_위도경도.feather filter=lfs diff=lfs merge=lfs -text
178
+ data/대기질/air_dataon_merged.feather filter=lfs diff=lfs merge=lfs -text
179
+ data/대기질/air_dup.feather filter=lfs diff=lfs merge=lfs -text
Analysis_code/0.air_data_merge.ipynb ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/opt/conda/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import os\n",
19
+ "import numpy as np\n",
20
+ "import pandas as pd\n",
21
+ "import natsort\n",
22
+ "from datetime import datetime\n",
23
+ "from tqdm.auto import tqdm"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 2,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "def get_data(year):\n",
33
+ " files = natsort.natsorted(os.listdir(f'../data/대기질/{year}/'))\n",
34
+ " data = []\n",
35
+ " for file in tqdm(files, desc=f\"Reading files...({len(files)})\"):\n",
36
+ " data.append(pd.read_excel(f'../data/대기질/{year}/{file}', usecols=[\"지역\", '망', \"측정소코드\", \"측정소명\", \"측정일시\", \"O3\", \"NO2\", \"PM10\", \"PM25\", \"주소\"]))\n",
37
+ "\n",
38
+ " return pd.concat(data)"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# 합친 데이터에 날짜 정보를 추가한다.\n",
48
+ "def add_date(df):\n",
49
+ "\n",
50
+ " df[\"측정일시\"] = df[\"측정일시\"].astype(str).str[:10]\n",
51
+ " df[\"측정일시\"] = pd.to_datetime(df[\"측정일시\"], format='%Y%m%d%H', errors=\"coerce\")\n",
52
+ "\n",
53
+ " df[\"year\"] = df[\"측정일시\"].dt.year\n",
54
+ " df[\"month\"] = df[\"측정일시\"].dt.month\n",
55
+ " df[\"day\"] = df[\"측정일시\"].dt.day\n",
56
+ " df[\"hour\"] = df[\"측정일시\"].dt.hour\n",
57
+ "\n",
58
+ " return df"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 4,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ " 0%| | 0/6 [00:00<?, ?it/s]\n",
71
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
72
+ "Reading files...(13): 8%|▊ | 1/13 [00:34<06:57, 34.80s/it]\u001b[A\n",
73
+ "Reading files...(13): 15%|█▌ | 2/13 [01:12<06:41, 36.47s/it]\u001b[A\n",
74
+ "Reading files...(13): 23%|██▎ | 3/13 [01:47<05:58, 35.89s/it]\u001b[A\n",
75
+ "Reading files...(13): 31%|███ | 4/13 [02:23<05:23, 35.96s/it]\u001b[A\n",
76
+ "Reading files...(13): 38%|███▊ | 5/13 [02:59<04:47, 35.92s/it]\u001b[A\n",
77
+ "Reading files...(13): 46%|████▌ | 6/13 [03:35<04:12, 36.09s/it]\u001b[A\n",
78
+ "Reading files...(13): 62%|██████▏ | 8/13 [04:12<02:16, 27.35s/it]\u001b[A\n",
79
+ "Reading files...(13): 69%|██████▉ | 9/13 [04:46<01:56, 29.05s/it]\u001b[A\n",
80
+ "Reading files...(13): 77%|███████▋ | 10/13 [05:21<01:31, 30.55s/it]\u001b[A\n",
81
+ "Reading files...(13): 85%|████████▍ | 11/13 [05:58<01:04, 32.46s/it]\u001b[A\n",
82
+ "Reading files...(13): 92%|█████████▏| 12/13 [06:37<00:34, 34.28s/it]\u001b[A\n",
83
+ "Reading files...(13): 100%|██████████| 13/13 [07:08<00:00, 32.93s/it]\u001b[A\n",
84
+ " 17%|█▋ | 1/6 [07:18<36:30, 438.18s/it]\n",
85
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
86
+ "Reading files...(13): 8%|▊ | 1/13 [00:43<08:41, 43.43s/it]\u001b[A\n",
87
+ "Reading files...(13): 15%|█▌ | 2/13 [01:26<07:56, 43.29s/it]\u001b[A\n",
88
+ "Reading files...(13): 23%|██▎ | 3/13 [02:07<07:02, 42.22s/it]\u001b[A\n",
89
+ "Reading files...(13): 31%|███ | 4/13 [02:50<06:23, 42.66s/it]\u001b[A\n",
90
+ "Reading files...(13): 38%|███▊ | 5/13 [03:28<05:27, 40.90s/it]\u001b[A\n",
91
+ "Reading files...(13): 46%|████▌ | 6/13 [04:15<04:59, 42.79s/it]\u001b[A\n",
92
+ "Reading files...(13): 54%|█████▍ | 7/13 [04:58<04:18, 43.14s/it]\u001b[A\n",
93
+ "Reading files...(13): 62%|██████▏ | 8/13 [05:43<03:37, 43.47s/it]\u001b[A\n",
94
+ "Reading files...(13): 69%|██████▉ | 9/13 [06:28<02:55, 43.96s/it]\u001b[A\n",
95
+ "Reading files...(13): 77%|███████▋ | 10/13 [07:12<02:12, 44.01s/it]\u001b[A\n",
96
+ "Reading files...(13): 85%|████████▍ | 11/13 [07:52<01:25, 42.90s/it]\u001b[A\n",
97
+ "Reading files...(13): 100%|██████████| 13/13 [08:34<00:00, 39.61s/it]\u001b[A\n",
98
+ " 33%|███▎ | 2/6 [16:05<32:42, 490.55s/it]\n",
99
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
100
+ "Reading files...(13): 8%|▊ | 1/13 [00:49<09:56, 49.74s/it]\u001b[A\n",
101
+ "Reading files...(13): 15%|█▌ | 2/13 [01:43<09:31, 51.98s/it]\u001b[A\n",
102
+ "Reading files...(13): 23%|██▎ | 3/13 [02:33<08:29, 50.96s/it]\u001b[A\n",
103
+ "Reading files...(13): 31%|███ | 4/13 [03:23<07:38, 50.95s/it]\u001b[A\n",
104
+ "Reading files...(13): 38%|███▊ | 5/13 [04:13<06:43, 50.46s/it]\u001b[A\n",
105
+ "Reading files...(13): 46%|████▌ | 6/13 [04:58<05:40, 48.71s/it]\u001b[A\n",
106
+ "Reading files...(13): 54%|█████▍ | 7/13 [05:50<04:57, 49.66s/it]\u001b[A\n",
107
+ "Reading files...(13): 62%|██████▏ | 8/13 [06:45<04:16, 51.29s/it]\u001b[A\n",
108
+ "Reading files...(13): 77%|███████▋ | 10/13 [07:38<01:58, 39.46s/it]\u001b[A\n",
109
+ "Reading files...(13): 85%|████████▍ | 11/13 [08:30<01:25, 42.79s/it]\u001b[A\n",
110
+ "Reading files...(13): 92%|█████████▏| 12/13 [09:26<00:46, 46.32s/it]\u001b[A\n",
111
+ "Reading files...(13): 100%|██████████| 13/13 [10:13<00:00, 47.19s/it]\u001b[A\n",
112
+ " 50%|█████ | 3/6 [26:32<27:38, 552.96s/it]\n",
113
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
114
+ "Reading files...(13): 8%|▊ | 1/13 [00:59<11:48, 59.01s/it]\u001b[A\n",
115
+ "Reading files...(13): 15%|█▌ | 2/13 [01:56<10:40, 58.19s/it]\u001b[A\n",
116
+ "Reading files...(13): 23%|██▎ | 3/13 [02:53<09:37, 57.77s/it]\u001b[A\n",
117
+ "Reading files...(13): 31%|███ | 4/13 [03:52<08:41, 58.00s/it]\u001b[A\n",
118
+ "Reading files...(13): 38%|███▊ | 5/13 [04:44<07:26, 55.77s/it]\u001b[A\n",
119
+ "Reading files...(13): 46%|████▌ | 6/13 [05:40<06:32, 56.05s/it]\u001b[A\n",
120
+ "Reading files...(13): 54%|█████▍ | 7/13 [06:36<05:36, 56.06s/it]\u001b[A\n",
121
+ "Reading files...(13): 62%|██████▏ | 8/13 [07:33<04:42, 56.42s/it]\u001b[A\n",
122
+ "Reading files...(13): 69%|██████▉ | 9/13 [08:34<03:51, 57.76s/it]\u001b[A\n",
123
+ "Reading files...(13): 77%|███████▋ | 10/13 [09:35<02:56, 58.75s/it]\u001b[A\n",
124
+ "Reading files...(13): 92%|█████████▏| 12/13 [10:33<00:44, 44.84s/it]\u001b[A\n",
125
+ "Reading files...(13): 100%|██████████| 13/13 [11:32<00:00, 53.29s/it]\u001b[A\n",
126
+ " 67%|██████▋ | 4/6 [38:20<20:28, 614.26s/it]\n",
127
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
128
+ "Reading files...(13): 8%|▊ | 1/13 [00:59<11:57, 59.79s/it]\u001b[A\n",
129
+ "Reading files...(13): 15%|█▌ | 2/13 [02:01<11:07, 60.67s/it]\u001b[A\n",
130
+ "Reading files...(13): 23%|██▎ | 3/13 [03:02<10:10, 61.02s/it]\u001b[A\n",
131
+ "Reading files...(13): 31%|███ | 4/13 [03:57<08:48, 58.74s/it]\u001b[A\n",
132
+ "Reading files...(13): 38%|███▊ | 5/13 [04:57<07:53, 59.18s/it]\u001b[A\n",
133
+ "Reading files...(13): 46%|████▌ | 6/13 [06:00<07:03, 60.45s/it]\u001b[A\n",
134
+ "Reading files...(13): 54%|█████▍ | 7/13 [07:00<06:02, 60.38s/it]\u001b[A\n",
135
+ "Reading files...(13): 62%|██████▏ | 8/13 [08:02<05:04, 60.85s/it]\u001b[A\n",
136
+ "Reading files...(13): 69%|██████▉ | 9/13 [09:04<04:04, 61.03s/it]\u001b[A\n",
137
+ "Reading files...(13): 77%|███████▋ | 10/13 [10:04<03:02, 60.67s/it]\u001b[A\n",
138
+ "Reading files...(13): 92%|█████████▏| 12/13 [11:06<00:46, 46.76s/it]\u001b[A\n",
139
+ "Reading files...(13): 100%|██████████| 13/13 [12:09<00:00, 56.08s/it]\u001b[A\n",
140
+ " 83%|████████▎ | 5/6 [50:46<11:01, 661.78s/it]\n",
141
+ "Reading files...(13): 0%| | 0/13 [00:00<?, ?it/s]\u001b[A\n",
142
+ "Reading files...(13): 8%|▊ | 1/13 [01:03<12:46, 63.88s/it]\u001b[A\n",
143
+ "Reading files...(13): 15%|█▌ | 2/13 [02:08<11:50, 64.56s/it]\u001b[A\n",
144
+ "Reading files...(13): 23%|██▎ | 3/13 [03:10<10:32, 63.22s/it]\u001b[A\n",
145
+ "Reading files...(13): 31%|███ | 4/13 [04:07<09:05, 60.63s/it]\u001b[A\n",
146
+ "Reading files...(13): 38%|███▊ | 5/13 [05:09<08:11, 61.41s/it]\u001b[A\n",
147
+ "Reading files...(13): 46%|████▌ | 6/13 [06:12<07:13, 61.92s/it]\u001b[A\n",
148
+ "Reading files...(13): 54%|█████▍ | 7/13 [07:13<06:09, 61.50s/it]\u001b[A\n",
149
+ "Reading files...(13): 62%|██████▏ | 8/13 [08:15<05:08, 61.64s/it]\u001b[A\n",
150
+ "Reading files...(13): 69%|██████▉ | 9/13 [09:17<04:07, 61.81s/it]\u001b[A\n",
151
+ "Reading files...(13): 77%|███████▋ | 10/13 [10:19<03:05, 61.96s/it]\u001b[A\n",
152
+ "Reading files...(13): 92%|█████████▏| 12/13 [11:23<00:47, 47.75s/it]\u001b[A\n",
153
+ "Reading files...(13): 100%|██████████| 13/13 [12:27<00:00, 57.50s/it]\u001b[A\n",
154
+ "100%|██████████| 6/6 [1:03:31<00:00, 635.28s/it]\n"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "import os\n",
160
+ "import pandas as pd\n",
161
+ "from tqdm.auto import tqdm\n",
162
+ "\n",
163
+ "# 대기질 데이터를 불러와서 하나의 파일로 합친다.\n",
164
+ "def get_data(year):\n",
165
+ " directory = f'../data/대기질/{year}/'\n",
166
+ " files = os.listdir(directory)\n",
167
+ " data = []\n",
168
+ " \n",
169
+ " # 파일 목록에서 디렉토리를 제외하고 오직 Excel 파일만 처리\n",
170
+ " for file in tqdm(files, desc=f\"Reading files...({len(files)})\"):\n",
171
+ " file_path = os.path.join(directory, file)\n",
172
+ " if os.path.isfile(file_path) and file_path.endswith(('.xls', '.xlsx')): # Excel 파일 확장자만 허용\n",
173
+ " data.append(pd.read_excel(file_path, usecols=[\"지역\", '망', \"측정소코드\", \"측정소명\", \"측정일시\", \"O3\", \"NO2\", \"PM10\", \"PM25\", \"주소\"]))\n",
174
+ " \n",
175
+ " return pd.concat(data)\n",
176
+ "\n",
177
+ "years = [2018, 2019, 2020,2021,2022,2023] # 2018년부터 2023년까지의 데이터를 합친다.\n",
178
+ "for year in tqdm(years):\n",
179
+ " data = get_data(year)\n",
180
+ " data = add_date(data)\n",
181
+ " data.reset_index(drop=True, inplace=True)\n",
182
+ " data.to_feather(f\"../data/대기질/{year}.feather\")\n"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 6,
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>지역</th>\n",
212
+ " <th>망</th>\n",
213
+ " <th>측정소코드</th>\n",
214
+ " <th>측정소명</th>\n",
215
+ " <th>측정일시</th>\n",
216
+ " <th>O3</th>\n",
217
+ " <th>NO2</th>\n",
218
+ " <th>PM10</th>\n",
219
+ " <th>PM25</th>\n",
220
+ " <th>주소</th>\n",
221
+ " <th>year</th>\n",
222
+ " <th>month</th>\n",
223
+ " <th>day</th>\n",
224
+ " <th>hour</th>\n",
225
+ " </tr>\n",
226
+ " </thead>\n",
227
+ " <tbody>\n",
228
+ " <tr>\n",
229
+ " <th>0</th>\n",
230
+ " <td>서울 중구</td>\n",
231
+ " <td>도시대기</td>\n",
232
+ " <td>111121</td>\n",
233
+ " <td>중구</td>\n",
234
+ " <td>2023-07-01 01:00:00</td>\n",
235
+ " <td>0.0249</td>\n",
236
+ " <td>0.0188</td>\n",
237
+ " <td>21.0</td>\n",
238
+ " <td>19.0</td>\n",
239
+ " <td>서울 중구 덕수궁길 15</td>\n",
240
+ " <td>2023.0</td>\n",
241
+ " <td>7.0</td>\n",
242
+ " <td>1.0</td>\n",
243
+ " <td>1.0</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>1</th>\n",
247
+ " <td>서울 중구</td>\n",
248
+ " <td>도시대기</td>\n",
249
+ " <td>111121</td>\n",
250
+ " <td>중구</td>\n",
251
+ " <td>2023-07-01 02:00:00</td>\n",
252
+ " <td>0.0263</td>\n",
253
+ " <td>0.0163</td>\n",
254
+ " <td>18.0</td>\n",
255
+ " <td>15.0</td>\n",
256
+ " <td>서울 중구 덕수궁길 15</td>\n",
257
+ " <td>2023.0</td>\n",
258
+ " <td>7.0</td>\n",
259
+ " <td>1.0</td>\n",
260
+ " <td>2.0</td>\n",
261
+ " </tr>\n",
262
+ " <tr>\n",
263
+ " <th>2</th>\n",
264
+ " <td>서울 중구</td>\n",
265
+ " <td>도시대기</td>\n",
266
+ " <td>111121</td>\n",
267
+ " <td>중구</td>\n",
268
+ " <td>2023-07-01 03:00:00</td>\n",
269
+ " <td>0.0218</td>\n",
270
+ " <td>0.0192</td>\n",
271
+ " <td>24.0</td>\n",
272
+ " <td>21.0</td>\n",
273
+ " <td>서울 중구 덕수궁길 15</td>\n",
274
+ " <td>2023.0</td>\n",
275
+ " <td>7.0</td>\n",
276
+ " <td>1.0</td>\n",
277
+ " <td>3.0</td>\n",
278
+ " </tr>\n",
279
+ " <tr>\n",
280
+ " <th>3</th>\n",
281
+ " <td>서울 중구</td>\n",
282
+ " <td>도시대기</td>\n",
283
+ " <td>111121</td>\n",
284
+ " <td>중구</td>\n",
285
+ " <td>2023-07-01 04:00:00</td>\n",
286
+ " <td>0.0131</td>\n",
287
+ " <td>0.0214</td>\n",
288
+ " <td>25.0</td>\n",
289
+ " <td>19.0</td>\n",
290
+ " <td>서울 중구 덕수궁길 15</td>\n",
291
+ " <td>2023.0</td>\n",
292
+ " <td>7.0</td>\n",
293
+ " <td>1.0</td>\n",
294
+ " <td>4.0</td>\n",
295
+ " </tr>\n",
296
+ " <tr>\n",
297
+ " <th>4</th>\n",
298
+ " <td>서울 중구</td>\n",
299
+ " <td>도시대기</td>\n",
300
+ " <td>111121</td>\n",
301
+ " <td>중구</td>\n",
302
+ " <td>2023-07-01 05:00:00</td>\n",
303
+ " <td>0.0131</td>\n",
304
+ " <td>0.0160</td>\n",
305
+ " <td>25.0</td>\n",
306
+ " <td>21.0</td>\n",
307
+ " <td>서울 중구 덕수궁길 15</td>\n",
308
+ " <td>2023.0</td>\n",
309
+ " <td>7.0</td>\n",
310
+ " <td>1.0</td>\n",
311
+ " <td>5.0</td>\n",
312
+ " </tr>\n",
313
+ " <tr>\n",
314
+ " <th>5</th>\n",
315
+ " <td>서울 중구</td>\n",
316
+ " <td>도시대기</td>\n",
317
+ " <td>111121</td>\n",
318
+ " <td>중구</td>\n",
319
+ " <td>2023-07-01 06:00:00</td>\n",
320
+ " <td>0.0115</td>\n",
321
+ " <td>0.0196</td>\n",
322
+ " <td>23.0</td>\n",
323
+ " <td>18.0</td>\n",
324
+ " <td>서울 중구 덕수궁길 15</td>\n",
325
+ " <td>2023.0</td>\n",
326
+ " <td>7.0</td>\n",
327
+ " <td>1.0</td>\n",
328
+ " <td>6.0</td>\n",
329
+ " </tr>\n",
330
+ " <tr>\n",
331
+ " <th>6</th>\n",
332
+ " <td>서울 중구</td>\n",
333
+ " <td>도시대기</td>\n",
334
+ " <td>111121</td>\n",
335
+ " <td>중구</td>\n",
336
+ " <td>2023-07-01 07:00:00</td>\n",
337
+ " <td>0.0094</td>\n",
338
+ " <td>0.0230</td>\n",
339
+ " <td>26.0</td>\n",
340
+ " <td>21.0</td>\n",
341
+ " <td>서울 중구 덕수궁길 15</td>\n",
342
+ " <td>2023.0</td>\n",
343
+ " <td>7.0</td>\n",
344
+ " <td>1.0</td>\n",
345
+ " <td>7.0</td>\n",
346
+ " </tr>\n",
347
+ " <tr>\n",
348
+ " <th>7</th>\n",
349
+ " <td>서울 중구</td>\n",
350
+ " <td>도시대기</td>\n",
351
+ " <td>111121</td>\n",
352
+ " <td>중구</td>\n",
353
+ " <td>2023-07-01 08:00:00</td>\n",
354
+ " <td>0.0222</td>\n",
355
+ " <td>0.0175</td>\n",
356
+ " <td>26.0</td>\n",
357
+ " <td>20.0</td>\n",
358
+ " <td>서울 중구 덕수궁길 15</td>\n",
359
+ " <td>2023.0</td>\n",
360
+ " <td>7.0</td>\n",
361
+ " <td>1.0</td>\n",
362
+ " <td>8.0</td>\n",
363
+ " </tr>\n",
364
+ " <tr>\n",
365
+ " <th>8</th>\n",
366
+ " <td>서울 중구</td>\n",
367
+ " <td>도시대기</td>\n",
368
+ " <td>111121</td>\n",
369
+ " <td>중구</td>\n",
370
+ " <td>2023-07-01 09:00:00</td>\n",
371
+ " <td>0.0396</td>\n",
372
+ " <td>0.0153</td>\n",
373
+ " <td>27.0</td>\n",
374
+ " <td>20.0</td>\n",
375
+ " <td>서울 중구 덕수궁길 15</td>\n",
376
+ " <td>2023.0</td>\n",
377
+ " <td>7.0</td>\n",
378
+ " <td>1.0</td>\n",
379
+ " <td>9.0</td>\n",
380
+ " </tr>\n",
381
+ " <tr>\n",
382
+ " <th>9</th>\n",
383
+ " <td>서울 중구</td>\n",
384
+ " <td>도시대기</td>\n",
385
+ " <td>111121</td>\n",
386
+ " <td>중구</td>\n",
387
+ " <td>2023-07-01 10:00:00</td>\n",
388
+ " <td>0.0530</td>\n",
389
+ " <td>0.0105</td>\n",
390
+ " <td>19.0</td>\n",
391
+ " <td>16.0</td>\n",
392
+ " <td>서울 중구 덕수궁길 15</td>\n",
393
+ " <td>2023.0</td>\n",
394
+ " <td>7.0</td>\n",
395
+ " <td>1.0</td>\n",
396
+ " <td>10.0</td>\n",
397
+ " </tr>\n",
398
+ " <tr>\n",
399
+ " <th>10</th>\n",
400
+ " <td>서울 중구</td>\n",
401
+ " <td>도시대기</td>\n",
402
+ " <td>111121</td>\n",
403
+ " <td>중구</td>\n",
404
+ " <td>2023-07-01 11:00:00</td>\n",
405
+ " <td>0.0607</td>\n",
406
+ " <td>0.0090</td>\n",
407
+ " <td>20.0</td>\n",
408
+ " <td>20.0</td>\n",
409
+ " <td>서울 중구 덕수궁길 15</td>\n",
410
+ " <td>2023.0</td>\n",
411
+ " <td>7.0</td>\n",
412
+ " <td>1.0</td>\n",
413
+ " <td>11.0</td>\n",
414
+ " </tr>\n",
415
+ " <tr>\n",
416
+ " <th>11</th>\n",
417
+ " <td>서울 중구</td>\n",
418
+ " <td>도시대기</td>\n",
419
+ " <td>111121</td>\n",
420
+ " <td>중구</td>\n",
421
+ " <td>2023-07-01 12:00:00</td>\n",
422
+ " <td>0.0688</td>\n",
423
+ " <td>0.0114</td>\n",
424
+ " <td>20.0</td>\n",
425
+ " <td>17.0</td>\n",
426
+ " <td>서울 중구 덕수궁길 15</td>\n",
427
+ " <td>2023.0</td>\n",
428
+ " <td>7.0</td>\n",
429
+ " <td>1.0</td>\n",
430
+ " <td>12.0</td>\n",
431
+ " </tr>\n",
432
+ " <tr>\n",
433
+ " <th>12</th>\n",
434
+ " <td>서울 중구</td>\n",
435
+ " <td>도시대기</td>\n",
436
+ " <td>111121</td>\n",
437
+ " <td>중구</td>\n",
438
+ " <td>2023-07-01 13:00:00</td>\n",
439
+ " <td>0.0758</td>\n",
440
+ " <td>0.0101</td>\n",
441
+ " <td>23.0</td>\n",
442
+ " <td>17.0</td>\n",
443
+ " <td>서울 중구 덕수궁길 15</td>\n",
444
+ " <td>2023.0</td>\n",
445
+ " <td>7.0</td>\n",
446
+ " <td>1.0</td>\n",
447
+ " <td>13.0</td>\n",
448
+ " </tr>\n",
449
+ " <tr>\n",
450
+ " <th>13</th>\n",
451
+ " <td>서울 중구</td>\n",
452
+ " <td>도시대기</td>\n",
453
+ " <td>111121</td>\n",
454
+ " <td>중구</td>\n",
455
+ " <td>2023-07-01 14:00:00</td>\n",
456
+ " <td>0.0743</td>\n",
457
+ " <td>0.0093</td>\n",
458
+ " <td>20.0</td>\n",
459
+ " <td>17.0</td>\n",
460
+ " <td>서울 중구 덕수궁길 15</td>\n",
461
+ " <td>2023.0</td>\n",
462
+ " <td>7.0</td>\n",
463
+ " <td>1.0</td>\n",
464
+ " <td>14.0</td>\n",
465
+ " </tr>\n",
466
+ " <tr>\n",
467
+ " <th>14</th>\n",
468
+ " <td>서울 중구</td>\n",
469
+ " <td>도시대기</td>\n",
470
+ " <td>111121</td>\n",
471
+ " <td>중구</td>\n",
472
+ " <td>2023-07-01 15:00:00</td>\n",
473
+ " <td>0.0749</td>\n",
474
+ " <td>0.0100</td>\n",
475
+ " <td>19.0</td>\n",
476
+ " <td>11.0</td>\n",
477
+ " <td>서울 중구 덕수궁길 15</td>\n",
478
+ " <td>2023.0</td>\n",
479
+ " <td>7.0</td>\n",
480
+ " <td>1.0</td>\n",
481
+ " <td>15.0</td>\n",
482
+ " </tr>\n",
483
+ " <tr>\n",
484
+ " <th>15</th>\n",
485
+ " <td>서울 중구</td>\n",
486
+ " <td>도시대기</td>\n",
487
+ " <td>111121</td>\n",
488
+ " <td>중구</td>\n",
489
+ " <td>2023-07-01 16:00:00</td>\n",
490
+ " <td>0.0716</td>\n",
491
+ " <td>0.0092</td>\n",
492
+ " <td>19.0</td>\n",
493
+ " <td>15.0</td>\n",
494
+ " <td>서울 중구 덕수궁길 15</td>\n",
495
+ " <td>2023.0</td>\n",
496
+ " <td>7.0</td>\n",
497
+ " <td>1.0</td>\n",
498
+ " <td>16.0</td>\n",
499
+ " </tr>\n",
500
+ " <tr>\n",
501
+ " <th>16</th>\n",
502
+ " <td>서울 중구</td>\n",
503
+ " <td>도시대기</td>\n",
504
+ " <td>111121</td>\n",
505
+ " <td>중구</td>\n",
506
+ " <td>2023-07-01 17:00:00</td>\n",
507
+ " <td>0.0613</td>\n",
508
+ " <td>0.0099</td>\n",
509
+ " <td>18.0</td>\n",
510
+ " <td>15.0</td>\n",
511
+ " <td>서울 중구 덕수궁길 15</td>\n",
512
+ " <td>2023.0</td>\n",
513
+ " <td>7.0</td>\n",
514
+ " <td>1.0</td>\n",
515
+ " <td>17.0</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <th>17</th>\n",
519
+ " <td>서울 중구</td>\n",
520
+ " <td>도시대기</td>\n",
521
+ " <td>111121</td>\n",
522
+ " <td>중구</td>\n",
523
+ " <td>2023-07-01 18:00:00</td>\n",
524
+ " <td>0.0496</td>\n",
525
+ " <td>0.0098</td>\n",
526
+ " <td>18.0</td>\n",
527
+ " <td>14.0</td>\n",
528
+ " <td>서울 중구 덕수궁길 15</td>\n",
529
+ " <td>2023.0</td>\n",
530
+ " <td>7.0</td>\n",
531
+ " <td>1.0</td>\n",
532
+ " <td>18.0</td>\n",
533
+ " </tr>\n",
534
+ " <tr>\n",
535
+ " <th>18</th>\n",
536
+ " <td>서울 중구</td>\n",
537
+ " <td>도시대기</td>\n",
538
+ " <td>111121</td>\n",
539
+ " <td>중구</td>\n",
540
+ " <td>2023-07-01 19:00:00</td>\n",
541
+ " <td>0.0473</td>\n",
542
+ " <td>0.0124</td>\n",
543
+ " <td>17.0</td>\n",
544
+ " <td>17.0</td>\n",
545
+ " <td>서울 중구 덕수궁길 15</td>\n",
546
+ " <td>2023.0</td>\n",
547
+ " <td>7.0</td>\n",
548
+ " <td>1.0</td>\n",
549
+ " <td>19.0</td>\n",
550
+ " </tr>\n",
551
+ " <tr>\n",
552
+ " <th>19</th>\n",
553
+ " <td>서울 중구</td>\n",
554
+ " <td>도시대기</td>\n",
555
+ " <td>111121</td>\n",
556
+ " <td>중구</td>\n",
557
+ " <td>2023-07-01 20:00:00</td>\n",
558
+ " <td>0.0498</td>\n",
559
+ " <td>0.0170</td>\n",
560
+ " <td>17.0</td>\n",
561
+ " <td>15.0</td>\n",
562
+ " <td>서울 중구 덕수궁길 15</td>\n",
563
+ " <td>2023.0</td>\n",
564
+ " <td>7.0</td>\n",
565
+ " <td>1.0</td>\n",
566
+ " <td>20.0</td>\n",
567
+ " </tr>\n",
568
+ " <tr>\n",
569
+ " <th>20</th>\n",
570
+ " <td>서울 중구</td>\n",
571
+ " <td>도시대기</td>\n",
572
+ " <td>111121</td>\n",
573
+ " <td>중구</td>\n",
574
+ " <td>2023-07-01 21:00:00</td>\n",
575
+ " <td>0.0616</td>\n",
576
+ " <td>0.0134</td>\n",
577
+ " <td>23.0</td>\n",
578
+ " <td>20.0</td>\n",
579
+ " <td>서울 중구 덕수궁길 15</td>\n",
580
+ " <td>2023.0</td>\n",
581
+ " <td>7.0</td>\n",
582
+ " <td>1.0</td>\n",
583
+ " <td>21.0</td>\n",
584
+ " </tr>\n",
585
+ " <tr>\n",
586
+ " <th>21</th>\n",
587
+ " <td>서울 중구</td>\n",
588
+ " <td>도시대기</td>\n",
589
+ " <td>111121</td>\n",
590
+ " <td>중구</td>\n",
591
+ " <td>2023-07-01 22:00:00</td>\n",
592
+ " <td>0.0543</td>\n",
593
+ " <td>0.0109</td>\n",
594
+ " <td>18.0</td>\n",
595
+ " <td>16.0</td>\n",
596
+ " <td>서울 중구 덕수궁길 15</td>\n",
597
+ " <td>2023.0</td>\n",
598
+ " <td>7.0</td>\n",
599
+ " <td>1.0</td>\n",
600
+ " <td>22.0</td>\n",
601
+ " </tr>\n",
602
+ " <tr>\n",
603
+ " <th>22</th>\n",
604
+ " <td>서울 중구</td>\n",
605
+ " <td>도시대기</td>\n",
606
+ " <td>111121</td>\n",
607
+ " <td>중구</td>\n",
608
+ " <td>2023-07-01 23:00:00</td>\n",
609
+ " <td>0.0507</td>\n",
610
+ " <td>0.0113</td>\n",
611
+ " <td>17.0</td>\n",
612
+ " <td>16.0</td>\n",
613
+ " <td>서울 중구 덕수궁길 15</td>\n",
614
+ " <td>2023.0</td>\n",
615
+ " <td>7.0</td>\n",
616
+ " <td>1.0</td>\n",
617
+ " <td>23.0</td>\n",
618
+ " </tr>\n",
619
+ " <tr>\n",
620
+ " <th>23</th>\n",
621
+ " <td>서울 중구</td>\n",
622
+ " <td>도시대기</td>\n",
623
+ " <td>111121</td>\n",
624
+ " <td>중구</td>\n",
625
+ " <td>NaT</td>\n",
626
+ " <td>0.0427</td>\n",
627
+ " <td>0.0125</td>\n",
628
+ " <td>17.0</td>\n",
629
+ " <td>16.0</td>\n",
630
+ " <td>서울 중구 덕수궁길 15</td>\n",
631
+ " <td>NaN</td>\n",
632
+ " <td>NaN</td>\n",
633
+ " <td>NaN</td>\n",
634
+ " <td>NaN</td>\n",
635
+ " </tr>\n",
636
+ " <tr>\n",
637
+ " <th>24</th>\n",
638
+ " <td>서울 중구</td>\n",
639
+ " <td>도시대기</td>\n",
640
+ " <td>111121</td>\n",
641
+ " <td>중구</td>\n",
642
+ " <td>2023-07-02 01:00:00</td>\n",
643
+ " <td>0.0334</td>\n",
644
+ " <td>0.0148</td>\n",
645
+ " <td>21.0</td>\n",
646
+ " <td>20.0</td>\n",
647
+ " <td>서울 중구 덕수궁길 15</td>\n",
648
+ " <td>2023.0</td>\n",
649
+ " <td>7.0</td>\n",
650
+ " <td>2.0</td>\n",
651
+ " <td>1.0</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <th>25</th>\n",
655
+ " <td>서울 중구</td>\n",
656
+ " <td>도시대기</td>\n",
657
+ " <td>111121</td>\n",
658
+ " <td>중구</td>\n",
659
+ " <td>2023-07-02 02:00:00</td>\n",
660
+ " <td>0.0337</td>\n",
661
+ " <td>0.0133</td>\n",
662
+ " <td>22.0</td>\n",
663
+ " <td>18.0</td>\n",
664
+ " <td>서울 중구 덕수궁길 15</td>\n",
665
+ " <td>2023.0</td>\n",
666
+ " <td>7.0</td>\n",
667
+ " <td>2.0</td>\n",
668
+ " <td>2.0</td>\n",
669
+ " </tr>\n",
670
+ " <tr>\n",
671
+ " <th>26</th>\n",
672
+ " <td>서울 중구</td>\n",
673
+ " <td>도시대기</td>\n",
674
+ " <td>111121</td>\n",
675
+ " <td>중구</td>\n",
676
+ " <td>2023-07-02 03:00:00</td>\n",
677
+ " <td>0.0260</td>\n",
678
+ " <td>0.0162</td>\n",
679
+ " <td>25.0</td>\n",
680
+ " <td>20.0</td>\n",
681
+ " <td>서울 중구 덕수궁길 15</td>\n",
682
+ " <td>2023.0</td>\n",
683
+ " <td>7.0</td>\n",
684
+ " <td>2.0</td>\n",
685
+ " <td>3.0</td>\n",
686
+ " </tr>\n",
687
+ " <tr>\n",
688
+ " <th>27</th>\n",
689
+ " <td>서울 중구</td>\n",
690
+ " <td>도시대기</td>\n",
691
+ " <td>111121</td>\n",
692
+ " <td>중구</td>\n",
693
+ " <td>2023-07-02 04:00:00</td>\n",
694
+ " <td>0.0195</td>\n",
695
+ " <td>0.0179</td>\n",
696
+ " <td>22.0</td>\n",
697
+ " <td>18.0</td>\n",
698
+ " <td>서울 중구 덕수궁길 15</td>\n",
699
+ " <td>2023.0</td>\n",
700
+ " <td>7.0</td>\n",
701
+ " <td>2.0</td>\n",
702
+ " <td>4.0</td>\n",
703
+ " </tr>\n",
704
+ " <tr>\n",
705
+ " <th>28</th>\n",
706
+ " <td>서울 중구</td>\n",
707
+ " <td>도시대기</td>\n",
708
+ " <td>111121</td>\n",
709
+ " <td>중구</td>\n",
710
+ " <td>2023-07-02 05:00:00</td>\n",
711
+ " <td>0.0171</td>\n",
712
+ " <td>0.0170</td>\n",
713
+ " <td>19.0</td>\n",
714
+ " <td>17.0</td>\n",
715
+ " <td>서울 중구 덕수궁길 15</td>\n",
716
+ " <td>2023.0</td>\n",
717
+ " <td>7.0</td>\n",
718
+ " <td>2.0</td>\n",
719
+ " <td>5.0</td>\n",
720
+ " </tr>\n",
721
+ " <tr>\n",
722
+ " <th>29</th>\n",
723
+ " <td>서울 중구</td>\n",
724
+ " <td>도시대기</td>\n",
725
+ " <td>111121</td>\n",
726
+ " <td>중구</td>\n",
727
+ " <td>2023-07-02 06:00:00</td>\n",
728
+ " <td>0.0181</td>\n",
729
+ " <td>0.0145</td>\n",
730
+ " <td>14.0</td>\n",
731
+ " <td>10.0</td>\n",
732
+ " <td>서울 중구 덕수궁길 15</td>\n",
733
+ " <td>2023.0</td>\n",
734
+ " <td>7.0</td>\n",
735
+ " <td>2.0</td>\n",
736
+ " <td>6.0</td>\n",
737
+ " </tr>\n",
738
+ " <tr>\n",
739
+ " <th>30</th>\n",
740
+ " <td>서울 중구</td>\n",
741
+ " <td>도시대기</td>\n",
742
+ " <td>111121</td>\n",
743
+ " <td>중구</td>\n",
744
+ " <td>2023-07-02 07:00:00</td>\n",
745
+ " <td>0.0174</td>\n",
746
+ " <td>0.0156</td>\n",
747
+ " <td>11.0</td>\n",
748
+ " <td>10.0</td>\n",
749
+ " <td>서울 중구 덕수궁길 15</td>\n",
750
+ " <td>2023.0</td>\n",
751
+ " <td>7.0</td>\n",
752
+ " <td>2.0</td>\n",
753
+ " <td>7.0</td>\n",
754
+ " </tr>\n",
755
+ " <tr>\n",
756
+ " <th>31</th>\n",
757
+ " <td>서울 중구</td>\n",
758
+ " <td>도시대기</td>\n",
759
+ " <td>111121</td>\n",
760
+ " <td>중구</td>\n",
761
+ " <td>2023-07-02 08:00:00</td>\n",
762
+ " <td>0.0213</td>\n",
763
+ " <td>0.0147</td>\n",
764
+ " <td>12.0</td>\n",
765
+ " <td>9.0</td>\n",
766
+ " <td>서울 중구 덕수궁길 15</td>\n",
767
+ " <td>2023.0</td>\n",
768
+ " <td>7.0</td>\n",
769
+ " <td>2.0</td>\n",
770
+ " <td>8.0</td>\n",
771
+ " </tr>\n",
772
+ " <tr>\n",
773
+ " <th>32</th>\n",
774
+ " <td>서울 중구</td>\n",
775
+ " <td>도시대기</td>\n",
776
+ " <td>111121</td>\n",
777
+ " <td>중구</td>\n",
778
+ " <td>2023-07-02 09:00:00</td>\n",
779
+ " <td>0.0267</td>\n",
780
+ " <td>0.0143</td>\n",
781
+ " <td>11.0</td>\n",
782
+ " <td>10.0</td>\n",
783
+ " <td>서울 중구 덕수궁길 15</td>\n",
784
+ " <td>2023.0</td>\n",
785
+ " <td>7.0</td>\n",
786
+ " <td>2.0</td>\n",
787
+ " <td>9.0</td>\n",
788
+ " </tr>\n",
789
+ " <tr>\n",
790
+ " <th>33</th>\n",
791
+ " <td>서울 중구</td>\n",
792
+ " <td>도시대기</td>\n",
793
+ " <td>111121</td>\n",
794
+ " <td>중구</td>\n",
795
+ " <td>2023-07-02 10:00:00</td>\n",
796
+ " <td>0.0289</td>\n",
797
+ " <td>0.0155</td>\n",
798
+ " <td>12.0</td>\n",
799
+ " <td>9.0</td>\n",
800
+ " <td>서울 중구 덕수궁길 15</td>\n",
801
+ " <td>2023.0</td>\n",
802
+ " <td>7.0</td>\n",
803
+ " <td>2.0</td>\n",
804
+ " <td>10.0</td>\n",
805
+ " </tr>\n",
806
+ " <tr>\n",
807
+ " <th>34</th>\n",
808
+ " <td>서울 중구</td>\n",
809
+ " <td>도시대기</td>\n",
810
+ " <td>111121</td>\n",
811
+ " <td>중구</td>\n",
812
+ " <td>2023-07-02 11:00:00</td>\n",
813
+ " <td>0.0381</td>\n",
814
+ " <td>0.0108</td>\n",
815
+ " <td>13.0</td>\n",
816
+ " <td>13.0</td>\n",
817
+ " <td>서울 중구 덕수궁길 15</td>\n",
818
+ " <td>2023.0</td>\n",
819
+ " <td>7.0</td>\n",
820
+ " <td>2.0</td>\n",
821
+ " <td>11.0</td>\n",
822
+ " </tr>\n",
823
+ " <tr>\n",
824
+ " <th>35</th>\n",
825
+ " <td>서울 중구</td>\n",
826
+ " <td>도시대기</td>\n",
827
+ " <td>111121</td>\n",
828
+ " <td>중구</td>\n",
829
+ " <td>2023-07-02 12:00:00</td>\n",
830
+ " <td>0.0441</td>\n",
831
+ " <td>0.0079</td>\n",
832
+ " <td>13.0</td>\n",
833
+ " <td>12.0</td>\n",
834
+ " <td>서울 중구 덕수궁길 15</td>\n",
835
+ " <td>2023.0</td>\n",
836
+ " <td>7.0</td>\n",
837
+ " <td>2.0</td>\n",
838
+ " <td>12.0</td>\n",
839
+ " </tr>\n",
840
+ " <tr>\n",
841
+ " <th>36</th>\n",
842
+ " <td>서울 중구</td>\n",
843
+ " <td>도시대기</td>\n",
844
+ " <td>111121</td>\n",
845
+ " <td>중구</td>\n",
846
+ " <td>2023-07-02 13:00:00</td>\n",
847
+ " <td>0.0489</td>\n",
848
+ " <td>0.0067</td>\n",
849
+ " <td>8.0</td>\n",
850
+ " <td>10.0</td>\n",
851
+ " <td>서울 중구 덕수궁길 15</td>\n",
852
+ " <td>2023.0</td>\n",
853
+ " <td>7.0</td>\n",
854
+ " <td>2.0</td>\n",
855
+ " <td>13.0</td>\n",
856
+ " </tr>\n",
857
+ " <tr>\n",
858
+ " <th>37</th>\n",
859
+ " <td>서울 중구</td>\n",
860
+ " <td>도시대기</td>\n",
861
+ " <td>111121</td>\n",
862
+ " <td>중구</td>\n",
863
+ " <td>2023-07-02 14:00:00</td>\n",
864
+ " <td>0.0498</td>\n",
865
+ " <td>0.0072</td>\n",
866
+ " <td>11.0</td>\n",
867
+ " <td>10.0</td>\n",
868
+ " <td>서울 중구 덕수궁길 15</td>\n",
869
+ " <td>2023.0</td>\n",
870
+ " <td>7.0</td>\n",
871
+ " <td>2.0</td>\n",
872
+ " <td>14.0</td>\n",
873
+ " </tr>\n",
874
+ " <tr>\n",
875
+ " <th>38</th>\n",
876
+ " <td>서울 중구</td>\n",
877
+ " <td>도시대기</td>\n",
878
+ " <td>111121</td>\n",
879
+ " <td>중구</td>\n",
880
+ " <td>2023-07-02 15:00:00</td>\n",
881
+ " <td>0.0459</td>\n",
882
+ " <td>0.0073</td>\n",
883
+ " <td>14.0</td>\n",
884
+ " <td>12.0</td>\n",
885
+ " <td>서울 중구 덕수궁길 15</td>\n",
886
+ " <td>2023.0</td>\n",
887
+ " <td>7.0</td>\n",
888
+ " <td>2.0</td>\n",
889
+ " <td>15.0</td>\n",
890
+ " </tr>\n",
891
+ " <tr>\n",
892
+ " <th>39</th>\n",
893
+ " <td>서울 중구</td>\n",
894
+ " <td>도시대기</td>\n",
895
+ " <td>111121</td>\n",
896
+ " <td>중구</td>\n",
897
+ " <td>2023-07-02 16:00:00</td>\n",
898
+ " <td>0.0474</td>\n",
899
+ " <td>0.0079</td>\n",
900
+ " <td>12.0</td>\n",
901
+ " <td>11.0</td>\n",
902
+ " <td>서울 중구 덕수궁길 15</td>\n",
903
+ " <td>2023.0</td>\n",
904
+ " <td>7.0</td>\n",
905
+ " <td>2.0</td>\n",
906
+ " <td>16.0</td>\n",
907
+ " </tr>\n",
908
+ " </tbody>\n",
909
+ "</table>\n",
910
+ "</div>"
911
+ ],
912
+ "text/plain": [
913
+ " 지역 망 측정소코드 측정소명 측정일시 O3 NO2 PM10 PM25 \\\n",
914
+ "0 서울 중구 도시대기 111121 중구 2023-07-01 01:00:00 0.0249 0.0188 21.0 19.0 \n",
915
+ "1 서울 중구 도시대기 111121 중구 2023-07-01 02:00:00 0.0263 0.0163 18.0 15.0 \n",
916
+ "2 서울 중구 도시대기 111121 중구 2023-07-01 03:00:00 0.0218 0.0192 24.0 21.0 \n",
917
+ "3 서울 중구 도시대기 111121 중구 2023-07-01 04:00:00 0.0131 0.0214 25.0 19.0 \n",
918
+ "4 서울 중구 도시대기 111121 중구 2023-07-01 05:00:00 0.0131 0.0160 25.0 21.0 \n",
919
+ "5 서울 중구 도시대기 111121 중구 2023-07-01 06:00:00 0.0115 0.0196 23.0 18.0 \n",
920
+ "6 서울 중구 도시대기 111121 중구 2023-07-01 07:00:00 0.0094 0.0230 26.0 21.0 \n",
921
+ "7 서울 중구 도시대기 111121 중구 2023-07-01 08:00:00 0.0222 0.0175 26.0 20.0 \n",
922
+ "8 서울 중구 도시대기 111121 중구 2023-07-01 09:00:00 0.0396 0.0153 27.0 20.0 \n",
923
+ "9 서울 중구 도시대기 111121 중구 2023-07-01 10:00:00 0.0530 0.0105 19.0 16.0 \n",
924
+ "10 서울 중구 도시대기 111121 중구 2023-07-01 11:00:00 0.0607 0.0090 20.0 20.0 \n",
925
+ "11 서울 중구 도시대기 111121 중구 2023-07-01 12:00:00 0.0688 0.0114 20.0 17.0 \n",
926
+ "12 서울 중구 도시대기 111121 중구 2023-07-01 13:00:00 0.0758 0.0101 23.0 17.0 \n",
927
+ "13 서울 중구 도시대기 111121 중구 2023-07-01 14:00:00 0.0743 0.0093 20.0 17.0 \n",
928
+ "14 서울 중구 도시대기 111121 중구 2023-07-01 15:00:00 0.0749 0.0100 19.0 11.0 \n",
929
+ "15 서울 중구 도시대기 111121 중구 2023-07-01 16:00:00 0.0716 0.0092 19.0 15.0 \n",
930
+ "16 서울 중구 도시대기 111121 중구 2023-07-01 17:00:00 0.0613 0.0099 18.0 15.0 \n",
931
+ "17 서울 중구 도시대기 111121 중구 2023-07-01 18:00:00 0.0496 0.0098 18.0 14.0 \n",
932
+ "18 서울 중구 도시대기 111121 중구 2023-07-01 19:00:00 0.0473 0.0124 17.0 17.0 \n",
933
+ "19 서울 중구 도시대기 111121 중구 2023-07-01 20:00:00 0.0498 0.0170 17.0 15.0 \n",
934
+ "20 서울 중구 도시대기 111121 중구 2023-07-01 21:00:00 0.0616 0.0134 23.0 20.0 \n",
935
+ "21 서울 중구 도시대기 111121 중구 2023-07-01 22:00:00 0.0543 0.0109 18.0 16.0 \n",
936
+ "22 서울 중구 도시대기 111121 중구 2023-07-01 23:00:00 0.0507 0.0113 17.0 16.0 \n",
937
+ "23 서울 중구 도시대기 111121 중구 NaT 0.0427 0.0125 17.0 16.0 \n",
938
+ "24 서울 중구 도시대기 111121 중구 2023-07-02 01:00:00 0.0334 0.0148 21.0 20.0 \n",
939
+ "25 서울 중구 도시대기 111121 중구 2023-07-02 02:00:00 0.0337 0.0133 22.0 18.0 \n",
940
+ "26 서울 중구 도시대기 111121 중구 2023-07-02 03:00:00 0.0260 0.0162 25.0 20.0 \n",
941
+ "27 서울 중구 도시대기 111121 중구 2023-07-02 04:00:00 0.0195 0.0179 22.0 18.0 \n",
942
+ "28 서울 중구 도시대기 111121 중구 2023-07-02 05:00:00 0.0171 0.0170 19.0 17.0 \n",
943
+ "29 서울 중구 도시대기 111121 중구 2023-07-02 06:00:00 0.0181 0.0145 14.0 10.0 \n",
944
+ "30 서울 중구 도시대기 111121 중구 2023-07-02 07:00:00 0.0174 0.0156 11.0 10.0 \n",
945
+ "31 서울 중구 도시대기 111121 중구 2023-07-02 08:00:00 0.0213 0.0147 12.0 9.0 \n",
946
+ "32 서울 중구 도시대기 111121 중구 2023-07-02 09:00:00 0.0267 0.0143 11.0 10.0 \n",
947
+ "33 서울 중구 도시대기 111121 중구 2023-07-02 10:00:00 0.0289 0.0155 12.0 9.0 \n",
948
+ "34 서울 중구 도시대기 111121 중구 2023-07-02 11:00:00 0.0381 0.0108 13.0 13.0 \n",
949
+ "35 서울 중구 도시대기 111121 중구 2023-07-02 12:00:00 0.0441 0.0079 13.0 12.0 \n",
950
+ "36 서울 중구 도시대기 111121 중구 2023-07-02 13:00:00 0.0489 0.0067 8.0 10.0 \n",
951
+ "37 서울 중구 도시대기 111121 중구 2023-07-02 14:00:00 0.0498 0.0072 11.0 10.0 \n",
952
+ "38 서울 중구 도시대기 111121 중구 2023-07-02 15:00:00 0.0459 0.0073 14.0 12.0 \n",
953
+ "39 서울 중구 도시대기 111121 중구 2023-07-02 16:00:00 0.0474 0.0079 12.0 11.0 \n",
954
+ "\n",
955
+ " 주소 year month day hour \n",
956
+ "0 서울 중구 덕수궁길 15 2023.0 7.0 1.0 1.0 \n",
957
+ "1 서울 중구 덕수궁길 15 2023.0 7.0 1.0 2.0 \n",
958
+ "2 서울 중구 덕수궁길 15 2023.0 7.0 1.0 3.0 \n",
959
+ "3 서울 중구 덕수궁길 15 2023.0 7.0 1.0 4.0 \n",
960
+ "4 서울 중구 덕수궁길 15 2023.0 7.0 1.0 5.0 \n",
961
+ "5 서울 중구 덕수궁길 15 2023.0 7.0 1.0 6.0 \n",
962
+ "6 서울 중구 덕수궁길 15 2023.0 7.0 1.0 7.0 \n",
963
+ "7 서울 중구 덕수궁길 15 2023.0 7.0 1.0 8.0 \n",
964
+ "8 서울 중구 덕수궁길 15 2023.0 7.0 1.0 9.0 \n",
965
+ "9 서울 중구 덕수궁길 15 2023.0 7.0 1.0 10.0 \n",
966
+ "10 서울 중구 덕수궁길 15 2023.0 7.0 1.0 11.0 \n",
967
+ "11 서울 중구 덕수궁길 15 2023.0 7.0 1.0 12.0 \n",
968
+ "12 서울 중구 덕수궁길 15 2023.0 7.0 1.0 13.0 \n",
969
+ "13 서울 중구 덕수궁길 15 2023.0 7.0 1.0 14.0 \n",
970
+ "14 서울 중구 덕수궁길 15 2023.0 7.0 1.0 15.0 \n",
971
+ "15 서울 중구 덕수궁길 15 2023.0 7.0 1.0 16.0 \n",
972
+ "16 서울 중구 덕수궁길 15 2023.0 7.0 1.0 17.0 \n",
973
+ "17 서울 중구 덕수궁길 15 2023.0 7.0 1.0 18.0 \n",
974
+ "18 서울 중구 덕수궁길 15 2023.0 7.0 1.0 19.0 \n",
975
+ "19 서울 중구 덕수궁길 15 2023.0 7.0 1.0 20.0 \n",
976
+ "20 서울 중구 덕수궁길 15 2023.0 7.0 1.0 21.0 \n",
977
+ "21 서울 중구 덕수궁길 15 2023.0 7.0 1.0 22.0 \n",
978
+ "22 서울 중구 덕수궁길 15 2023.0 7.0 1.0 23.0 \n",
979
+ "23 서울 중구 덕수궁길 15 NaN NaN NaN NaN \n",
980
+ "24 서울 중구 덕수궁길 15 2023.0 7.0 2.0 1.0 \n",
981
+ "25 서울 중구 덕수궁길 15 2023.0 7.0 2.0 2.0 \n",
982
+ "26 서울 중구 덕수궁길 15 2023.0 7.0 2.0 3.0 \n",
983
+ "27 서울 중구 덕수궁길 15 2023.0 7.0 2.0 4.0 \n",
984
+ "28 서울 중구 덕수궁길 15 2023.0 7.0 2.0 5.0 \n",
985
+ "29 서울 중구 덕수궁길 15 2023.0 7.0 2.0 6.0 \n",
986
+ "30 서울 중구 덕수궁길 15 2023.0 7.0 2.0 7.0 \n",
987
+ "31 서울 중구 덕수궁길 15 2023.0 7.0 2.0 8.0 \n",
988
+ "32 서울 중구 덕수궁길 15 2023.0 7.0 2.0 9.0 \n",
989
+ "33 서울 중구 덕수궁길 15 2023.0 7.0 2.0 10.0 \n",
990
+ "34 서울 중구 덕수궁길 15 2023.0 7.0 2.0 11.0 \n",
991
+ "35 서울 중구 덕수궁길 15 2023.0 7.0 2.0 12.0 \n",
992
+ "36 서울 중구 덕수궁길 15 2023.0 7.0 2.0 13.0 \n",
993
+ "37 서울 중구 덕수궁길 15 2023.0 7.0 2.0 14.0 \n",
994
+ "38 서울 중구 덕수궁길 15 2023.0 7.0 2.0 15.0 \n",
995
+ "39 서울 중구 덕수궁길 15 2023.0 7.0 2.0 16.0 "
996
+ ]
997
+ },
998
+ "execution_count": 6,
999
+ "metadata": {},
1000
+ "output_type": "execute_result"
1001
+ }
1002
+ ],
1003
+ "source": [
1004
+ "data.head(40)"
1005
+ ]
1006
+ }
1007
+ ],
1008
+ "metadata": {
1009
+ "kernelspec": {
1010
+ "display_name": "Python 3 (ipykernel)",
1011
+ "language": "python",
1012
+ "name": "python3"
1013
+ },
1014
+ "language_info": {
1015
+ "codemirror_mode": {
1016
+ "name": "ipython",
1017
+ "version": 3
1018
+ },
1019
+ "file_extension": ".py",
1020
+ "mimetype": "text/x-python",
1021
+ "name": "python",
1022
+ "nbconvert_exporter": "python",
1023
+ "pygments_lexer": "ipython3",
1024
+ "version": "3.8.13"
1025
+ }
1026
+ },
1027
+ "nbformat": 4,
1028
+ "nbformat_minor": 4
1029
+ }
Analysis_code/1.data_merge.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/2.eda_preproccesing.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8fe49fd26e48bcada89076a0b3c8ffe45e3d2e8407fe953b1558df9bfcfcddb
3
+ size 41518580
Analysis_code/3.oversampling.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/__pycache__/code.cpython-312.pyc ADDED
Binary file (13.8 kB). View file
 
Analysis_code/__pycache__/deepgbm.cpython-312.pyc ADDED
Binary file (2.67 kB). View file
 
Analysis_code/__pycache__/deepgbm.cpython-38.pyc ADDED
Binary file (1.88 kB). View file
 
Analysis_code/__pycache__/deepgbm.cpython-39.pyc ADDED
Binary file (1.82 kB). View file
 
Analysis_code/__pycache__/ft_transformer.cpython-312.pyc ADDED
Binary file (2.71 kB). View file
 
Analysis_code/__pycache__/ft_transformer.cpython-38.pyc ADDED
Binary file (1.99 kB). View file
 
Analysis_code/__pycache__/ft_transformer.cpython-39.pyc ADDED
Binary file (1.93 kB). View file
 
Analysis_code/__pycache__/resnet_like.cpython-312.pyc ADDED
Binary file (2.26 kB). View file
 
Analysis_code/__pycache__/resnet_like.cpython-38.pyc ADDED
Binary file (1.54 kB). View file
 
Analysis_code/__pycache__/resnet_like.cpython-39.pyc ADDED
Binary file (1.48 kB). View file
 
Analysis_code/best_deepgbm_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:471ce03d818bdefc9631121537afd4b771d85d872f72d74634e5cee824de2b62
3
+ size 988522
Analysis_code/best_model_f1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c6e86e039528b07c4cc6e90e3033da7b02d67c33539cbb81d280da281f46c3
3
+ size 8999933
Analysis_code/deepgbm.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class DeepGBM(nn.Module):
    """Residual feed-forward classifier over numeric and categorical inputs.

    Numeric columns are projected to ``d_main`` by a single linear layer and
    every categorical column is embedded to the same width. The projections
    are summed element-wise and refined by ``n_blocks`` residual blocks before
    a final linear layer produces the logits (no activation is applied).

    Args:
        num_features: Number of numeric input columns (input width of the
            numeric projection).
        cat_features: Iterable with one entry per categorical column giving
            that column's number of categories. May be empty.
        num_classes: Number of target classes, must be at least 2. With
            exactly 2 classes the model emits a single logit per row,
            otherwise one logit per class.
        d_main: Width of the shared representation.
        d_hidden: Hidden width inside each residual block.
        n_blocks: Number of residual blocks.
        dropout: Dropout probability used inside each residual block.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2. Previously such a
            model was built without an output layer and only failed later,
            inside ``forward``, with an ``AttributeError``.
    """

    def __init__(self, num_features, cat_features, num_classes, d_main=128, d_hidden=64, n_blocks=4, dropout=0.2):
        super(DeepGBM, self).__init__()

        if num_classes < 2:
            raise ValueError(
                f"num_classes must be >= 2, got {num_classes}"
            )

        self.num_classes = num_classes

        # Numeric (continuous) features: one linear projection to d_main.
        self.num_linear = nn.Linear(num_features, d_main)

        # Categorical features: one embedding table per column, all of width
        # d_main so they can be summed with the numeric projection.
        self.cat_embedding = nn.ModuleList([
            nn.Embedding(cat_size, d_main) for cat_size in cat_features
        ])

        # ResNet-like blocks; the skip connection itself is added in forward().
        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d_main, d_hidden),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(d_hidden, d_main),
                nn.ReLU()
            ) for _ in range(n_blocks)
        ])

        if num_classes == 2:
            # Binary classification: a single logit per row.
            self.output_layer = nn.Linear(d_main, 1)
        else:
            # Multi-class classification: one logit per class.
            self.output_layer = nn.Linear(d_main, num_classes)

    def forward(self, x_num, x_cat=None):
        """Compute logits for a batch.

        Args:
            x_num: Numeric inputs whose last dimension is ``num_features``.
            x_cat: Integer category indices, indexed as ``x_cat[:, i]`` for
                the i-th categorical column. Ignored (and may be ``None``)
                when the model was built without categorical columns.

        Returns:
            Tensor of logits with last dimension 1 when ``num_classes == 2``
            and ``num_classes`` otherwise.

        Raises:
            ValueError: If the model has categorical embeddings but ``x_cat``
                is ``None``.
        """
        x = self.num_linear(x_num)

        # torch.stack() rejects an empty list, so the categorical term is only
        # built when at least one embedding table exists.
        if len(self.cat_embedding) > 0:
            if x_cat is None:
                raise ValueError("x_cat is required when the model has categorical features")
            # Embed every categorical column, then sum the embeddings.
            embedded = [embed(x_cat[:, i]) for i, embed in enumerate(self.cat_embedding)]
            x = x + torch.stack(embedded, dim=1).sum(dim=1)  # numeric + categorical

        for block in self.blocks:
            x = x + block(x)  # residual connection

        return self.output_layer(x)
Analysis_code/deeplearning_model_binary.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/deeplearning_model_multi.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/final_test/final.ipynb ADDED
@@ -0,0 +1,1143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
12
+ "import torch\n",
13
+ "from torch.utils.data import DataLoader, TensorDataset\n",
14
+ "import random\n",
15
+ "from collections import Counter\n",
16
+ "import sys\n",
17
+ "sys.path.append('../../../../../../../../mnt/workspace/LightGBM/python-package')\n",
18
+ "from lightgbm import LGBMClassifier\n",
19
+ "import numpy as np\n",
20
+ "from sklearn.model_selection import train_test_split\n",
21
+ "from sklearn.inspection import permutation_importance\n",
22
+ "from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score\n",
23
+ "from sklearn.model_selection import StratifiedKFold\n",
24
+ "from xgboost import XGBClassifier\n",
25
+ "from warnings import filterwarnings\n",
26
+ "filterwarnings('ignore')\n",
27
+ "import sys\n",
28
+ "sys.path.append('../')\n",
29
+ "import torch\n",
30
+ "import torch.nn as nn\n",
31
+ "import torch.optim as optim\n",
32
+ "import optuna\n",
33
+ "import pandas as pd\n",
34
+ "import numpy as np\n",
35
+ "import random\n",
36
+ "from ft_transformer import FTTransformer\n",
37
+ "from resnet_like import ResNetLike\n",
38
+ "from deepgbm import DeepGBM\n",
39
+ "from pytorch_tabnet.tab_model import TabNetClassifier\n",
40
+ "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 2,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "# Python 및 Numpy 시드 고정\n",
50
+ "seed = 42\n",
51
+ "random.seed(seed)\n",
52
+ "np.random.seed(seed)\n",
53
+ "\n",
54
+ "# PyTorch 시드 고정\n",
55
+ "torch.manual_seed(seed)\n",
56
+ "torch.cuda.manual_seed(seed)\n",
57
+ "torch.cuda.manual_seed_all(seed) # Multi-GPU 환경에서 동일한 시드 적용\n",
58
+ "\n",
59
+ "# PyTorch 연산의 결정적 모드 설정\n",
60
+ "torch.backends.cudnn.deterministic = True # 실행마다 동일한 결과를 보장\n",
61
+ "torch.backends.cudnn.benchmark = True # 성능 최적화를 활성화 (가능한 한 빠른 연산 수행)\n",
62
+ "\n",
63
+ "# 전처리 함수\n",
64
+ "def preprocessing(df):\n",
65
+ " df = df[df.columns].copy()\n",
66
+ " df.loc[df['wind_dir']=='정온', 'wind_dir'] = \"0\"\n",
67
+ " df['wind_dir'] = df['wind_dir'].astype('int')\n",
68
+ " df['lm_cloudcover'] = df['lm_cloudcover'].astype('int')\n",
69
+ " df['cloudcover'] = df['cloudcover'].astype('int')\n",
70
+ " return df\n",
71
+ "\n",
72
+ "# 데이터셋 준비 함수\n",
73
+ "def prepare_dataset(region, data_sample='pure', target='multi', fold=3):\n",
74
+ "\n",
75
+ " # 데이터 경로 지정\n",
76
+ " dat_path = f\"../../data/data_for_modeling/{region}_train.csv\"\n",
77
+ " if data_sample == 'pure':\n",
78
+ " train_path = dat_path\n",
79
+ " else:\n",
80
+ " train_path = f'../../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'\n",
81
+ " train_path = f'../../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'\n",
82
+ " test_path = f\"../../data/data_for_modeling/{region}_test.csv\"\n",
83
+ " drop_col = ['binary_class','multi_class','visi','year']\n",
84
+ " target_col = f'{target}_class'\n",
85
+ " \n",
86
+ " # 데이터 로드\n",
87
+ " region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))\n",
88
+ " if data_sample == 'pure':\n",
89
+ " region_train = region_dat.loc[~region_dat['year'].isin([2021-fold]), :]\n",
90
+ " else:\n",
91
+ " region_train = preprocessing(pd.read_csv(train_path))\n",
92
+ " region_val = region_dat.loc[region_dat['year'].isin([2021-fold]), :]\n",
93
+ " region_test = preprocessing(pd.read_csv(test_path))\n",
94
+ "\n",
95
+ " # 컬럼 정렬 (일관성 유지)\n",
96
+ " common_columns = region_train.columns.to_list()\n",
97
+ " train_data = region_train[common_columns]\n",
98
+ " val_data = region_val[common_columns]\n",
99
+ " test_data = region_test[common_columns]\n",
100
+ "\n",
101
+ " # 설명변수 & 타겟 분리\n",
102
+ " X_train = train_data.drop(columns=drop_col)\n",
103
+ " y_train = train_data[target_col]\n",
104
+ " X_val = val_data.drop(columns=drop_col)\n",
105
+ " y_val = val_data[target_col]\n",
106
+ " X_test = test_data.drop(columns=drop_col)\n",
107
+ " y_test = test_data[target_col]\n",
108
+ "\n",
109
+ " # 범주형 & 연속형 변수 분리\n",
110
+ " categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns\n",
111
+ " numerical_cols = X_train.select_dtypes(include=['float64']).columns\n",
112
+ "\n",
113
+ " # 범주형 변수 Label Encoding\n",
114
+ " label_encoders = {}\n",
115
+ " for col in categorical_cols:\n",
116
+ " le = LabelEncoder()\n",
117
+ " le.fit(X_train[col]) # Train 데이터 기준으로 학습\n",
118
+ " label_encoders[col] = le\n",
119
+ "\n",
120
+ " # 변환 적용\n",
121
+ " for col in categorical_cols:\n",
122
+ " X_train[col] = label_encoders[col].transform(X_train[col])\n",
123
+ " X_val[col] = label_encoders[col].transform(X_val[col])\n",
124
+ " X_test[col] = label_encoders[col].transform(X_test[col])\n",
125
+ "\n",
126
+ " # 연속형 변수 Standard Scaling\n",
127
+ " scaler = StandardScaler()\n",
128
+ " scaler.fit(X_train[numerical_cols]) # Train 데이터 기준으로 학습\n",
129
+ "\n",
130
+ " # 변환 적용\n",
131
+ " X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])\n",
132
+ " X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])\n",
133
+ " X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])\n",
134
+ "\n",
135
+ " return X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, numerical_cols\n",
136
+ "\n",
137
+ "# 데이터 변환 및 dataloader 생성 함수\n",
138
+ "def prepare_dataloader(region, data_sample='pure', target='multi', fold=3, random_state=None):\n",
139
+ "    \"\"\"Build train/validation/test DataLoaders for one region.\n",
140
+ "\n",
141
+ "    Validation rows are the rows of {region}_train.csv whose year equals 2021-fold. Training rows are the\n",
142
+ "    remaining years ('pure') or the oversampled CSV selected by data_sample/fold. Encoders and the scaler\n",
143
+ "    are fit on the training features only.\n",
144
+ "    Returns (X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader).\n",
145
+ "    Raises ValueError when target is neither 'binary' nor 'multi'.\n",
146
+ "    \"\"\"\n",
147
+ "    # Validate up front: an invalid target used to fail on the target column lookup before reaching this check.\n",
148
+ "    if target not in (\"binary\", \"multi\"):\n",
149
+ "        raise ValueError(\"target must be 'binary' or 'multi'\")\n",
150
+ "    label_dtype = torch.float32 if target == \"binary\" else torch.long\n",
151
+ "\n",
152
+ "    # Data paths\n",
153
+ "    dat_path = f\"../../data/data_for_modeling/{region}_train.csv\"\n",
154
+ "    train_path = dat_path if data_sample == 'pure' else f'../../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'\n",
155
+ "    test_path = f\"../../data/data_for_modeling/{region}_test.csv\"\n",
156
+ "    drop_col = ['binary_class','multi_class','visi','year']\n",
157
+ "    target_col = f'{target}_class'\n",
158
+ "\n",
159
+ "    # Load data\n",
160
+ "    region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))\n",
161
+ "    is_val_year = region_dat['year'].isin([2021-fold])\n",
162
+ "    if data_sample == 'pure':\n",
163
+ "        region_train = region_dat.loc[~is_val_year, :]\n",
164
+ "    else:\n",
165
+ "        region_train = preprocessing(pd.read_csv(train_path))\n",
166
+ "    region_val = region_dat.loc[is_val_year, :]\n",
167
+ "    region_test = preprocessing(pd.read_csv(test_path))\n",
168
+ "\n",
169
+ "    # Align validation/test columns to the training frame's column order\n",
170
+ "    common_columns = region_train.columns.to_list()\n",
171
+ "    train_data = region_train[common_columns]\n",
172
+ "    val_data = region_val[common_columns]\n",
173
+ "    test_data = region_test[common_columns]\n",
174
+ "\n",
175
+ "    # Split features and target\n",
176
+ "    X_train = train_data.drop(columns=drop_col)\n",
177
+ "    y_train = train_data[target_col]\n",
178
+ "    X_val = val_data.drop(columns=drop_col)\n",
179
+ "    y_val = val_data[target_col]\n",
180
+ "    X_test = test_data.drop(columns=drop_col)\n",
181
+ "    y_test = test_data[target_col]\n",
182
+ "\n",
183
+ "    # Split categorical (object/category/int64) and continuous (float64) columns\n",
184
+ "    categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns\n",
185
+ "    numerical_cols = X_train.select_dtypes(include=['float64']).columns\n",
186
+ "\n",
187
+ "    # Label-encode categorical columns, fitting on the training data only\n",
188
+ "    label_encoders = {}\n",
189
+ "    for col in categorical_cols:\n",
190
+ "        le = LabelEncoder()\n",
191
+ "        le.fit(X_train[col])\n",
192
+ "        label_encoders[col] = le\n",
193
+ "\n",
194
+ "    # Apply the fitted encoders to every split\n",
195
+ "    for col in categorical_cols:\n",
196
+ "        X_train[col] = label_encoders[col].transform(X_train[col])\n",
197
+ "        X_val[col] = label_encoders[col].transform(X_val[col])\n",
198
+ "        X_test[col] = label_encoders[col].transform(X_test[col])\n",
199
+ "\n",
200
+ "    # Standard-scale continuous columns, fitting on the training data only\n",
201
+ "    scaler = StandardScaler()\n",
202
+ "    scaler.fit(X_train[numerical_cols])\n",
203
+ "\n",
204
+ "    # Apply the fitted scaler to every split\n",
205
+ "    X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])\n",
206
+ "    X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])\n",
207
+ "    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])\n",
208
+ "\n",
209
+ "    # Continuous (float32) and categorical (long) feature tensors\n",
210
+ "    X_train_num = torch.tensor(X_train[numerical_cols].values, dtype=torch.float32)\n",
211
+ "    X_train_cat = torch.tensor(X_train[categorical_cols].values, dtype=torch.long)\n",
212
+ "    X_val_num = torch.tensor(X_val[numerical_cols].values, dtype=torch.float32)\n",
213
+ "    X_val_cat = torch.tensor(X_val[categorical_cols].values, dtype=torch.long)\n",
214
+ "    X_test_num = torch.tensor(X_test[numerical_cols].values, dtype=torch.float32)\n",
215
+ "    X_test_cat = torch.tensor(X_test[categorical_cols].values, dtype=torch.long)\n",
216
+ "\n",
217
+ "    # Label tensors: float32 for binary, long for multi-class\n",
218
+ "    y_train_tensor = torch.tensor(y_train.values, dtype=label_dtype)\n",
219
+ "    y_val_tensor = torch.tensor(y_val.values, dtype=label_dtype)\n",
220
+ "    y_test_tensor = torch.tensor(y_test.values, dtype=label_dtype)\n",
221
+ "\n",
222
+ "    # TensorDatasets of (numeric features, categorical features, label)\n",
223
+ "    train_dataset = TensorDataset(X_train_num, X_train_cat, y_train_tensor)\n",
224
+ "    val_dataset = TensorDataset(X_val_num, X_val_cat, y_val_tensor)\n",
225
+ "    test_dataset = TensorDataset(X_test_num, X_test_cat, y_test_tensor)\n",
226
+ "\n",
227
+ "    # DataLoaders; when random_state is given, a seeded generator drives the training shuffle\n",
228
+ "    if random_state is None:\n",
229
+ "        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
230
+ "    else:\n",
231
+ "        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(random_state))\n",
232
+ "    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)\n",
233
+ "    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)\n",
234
+ "\n",
235
+ "    return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 3,
241
+ "metadata": {},
242
+ "outputs": [],
243
+ "source": [
244
+ "import os\n",
245
+ "import torch\n",
246
+ "# 디바이스 설정 (CUDA 사용 가능하면 GPU로, 아니면 CPU로)\n",
247
+ "import glob\n",
248
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
249
+ "\n",
250
+ "def calculate_csi(Y_test, pred):\n",
250
+ "    \"\"\"Return CSI = H / (H + F + M) from the 3-class confusion matrix (rows: true, columns: predicted).\"\"\"\n",
251
+ "\n",
252
+ "    # Pin the labels so the matrix stays 3x3 even when a class is absent from both inputs.\n",
253
+ "    cm = confusion_matrix(Y_test, pred, labels=[0, 1, 2])\n",
254
+ "    # H: classes 0 and 1 predicted correctly.\n",
255
+ "    H = cm[0, 0] + cm[1, 1]\n",
256
+ "    # F: class 0 or 1 predicted while the true class was a different one.\n",
257
+ "    F = cm[1, 0] + cm[2, 0] + cm[0, 1] + cm[2, 1]\n",
258
+ "    # M: true class 0 or 1 predicted as class 2.\n",
259
+ "    M = cm[0, 2] + cm[1, 2]\n",
260
+ "    # The 1e-10 term keeps the division defined when H, F and M are all zero.\n",
261
+ "    CSI = H / (H + F + M + 1e-10)\n",
262
+ "    return CSI\n",
264
+ "\n",
265
+ "def csi_metric(y_true, pred):\n",
265
+ "    \"\"\"Return ('CSI', score, True); the trailing True flags higher-is-better (looks like a LightGBM-style feval tuple - confirm).\"\"\"\n",
266
+ "    predicted_labels = np.argmax(pred, axis=1)  # class with the largest score in each row\n",
267
+ "    return 'CSI', calculate_csi(y_true, predicted_labels), True\n",
269
+ "\n",
270
+ "\n",
271
+ "def eval_metric_csi(y_true, pred_prob):\n",
271
+ "    \"\"\"Return the CSI multiplied by -1 (presumably so a minimising search maximises CSI - confirm).\n",
272
+ "\n",
273
+ "    pred_prob is reduced to class labels with argmax over axis 1 before scoring.\n",
274
+ "    \"\"\"\n",
275
+ "    predicted_labels = np.argmax(pred_prob, axis=1)\n",
276
+ "    return -1*calculate_csi(y_true, predicted_labels)\n",
278
+ "\n",
279
+ "\n",
280
+ "from sklearn.metrics import matthews_corrcoef, accuracy_score\n",
281
+ "\n",
282
+ "def multiclass_mcc(y_val, y_pred):\n",
283
+ "    \"\"\"Matthews correlation coefficient; sklearn's matthews_corrcoef can be used as-is for multiclass labels.\"\"\"\n",
284
+ "    # Arguments are forwarded positionally: true labels first, predictions second.\n",
285
+ "    mcc_value = matthews_corrcoef(y_val, y_pred)\n",
286
+ "    return mcc_value"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 4,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "import os\n",
296
+ "import torch\n",
297
+ "# 디바이스 설정 (CUDA 사용 가능하면 GPU로, 아니면 CPU로)\n",
298
+ "import glob\n",
299
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
300
+ "import warnings\n",
301
+ "warnings.filterwarnings('ignore')\n",
302
+ "\n",
303
+ "def calculate_csi(Y_test, pred):\n",
303
+ "    \"\"\"Return CSI = H / (H + F + M) from the 3-class confusion matrix (rows: true, columns: predicted).\"\"\"\n",
304
+ "\n",
305
+ "    # Pin the labels so the matrix stays 3x3 even when a class is absent from both inputs.\n",
306
+ "    cm = confusion_matrix(Y_test, pred, labels=[0, 1, 2])\n",
307
+ "    # H: classes 0 and 1 predicted correctly.\n",
308
+ "    H = cm[0, 0] + cm[1, 1]\n",
309
+ "    # F: class 0 or 1 predicted while the true class was a different one.\n",
310
+ "    F = cm[1, 0] + cm[2, 0] + cm[0, 1] + cm[2, 1]\n",
311
+ "    # M: true class 0 or 1 predicted as class 2.\n",
312
+ "    M = cm[0, 2] + cm[1, 2]\n",
313
+ "    # The 1e-10 term keeps the division defined when H, F and M are all zero.\n",
314
+ "    CSI = H / (H + F + M + 1e-10)\n",
315
+ "    return CSI\n",
317
+ "\n",
318
+ "# Soft Voting 앙상블\n",
319
+ "def get_proba(region, model_choose, data_sample, fold, target='multi'):\n",
319
+ "    \"\"\"Return per-sample softmax outputs on the test split from the checkpoint selected by fold.\n",
320
+ "\n",
321
+ "    Checkpoints are the ../save_model/{model_choose}/{data_sample}/*.pth files whose path contains region.\n",
322
+ "    \"\"\"\n",
323
+ "    # Only the test loader is needed here; the other return values are discarded.\n",
324
+ "    *_, test_loader = prepare_dataloader(region=region, data_sample=data_sample, target=target, fold=fold, random_state=120)\n",
325
+ "    folder_path = f'../save_model/{model_choose}/{data_sample}'\n",
326
+ "    # glob order is filesystem-dependent; sort so fold-1 always picks the same file (assumes file names sort by fold - TODO confirm).\n",
327
+ "    model_paths = sorted(path for path in glob.glob(f'{folder_path}/*.pth') if f'{region}' in path)\n",
328
+ "    # NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted checkpoints.\n",
329
+ "    model = torch.load(model_paths[fold-1], weights_only=False).to(device)\n",
330
+ "    model.eval()\n",
331
+ "    test_preds = []\n",
332
+ "    with torch.no_grad():\n",
333
+ "        for x_num_batch, x_cat_batch, _ in test_loader:\n",
334
+ "            output = model(x_num_batch.to(device), x_cat_batch.to(device))\n",
335
+ "            # Softmax over dim=1 normalises each row of model outputs before it is collected.\n",
336
+ "            test_preds.extend(torch.softmax(output, dim=1).cpu().numpy())\n",
337
+ "    return test_preds\n"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 5,
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "df_seoul = pd.read_csv(\"../../data/data_for_modeling/seoul_train.csv\")\n",
348
+ "df_seoul_test = pd.read_csv(\"../../data/data_for_modeling/seoul_test.csv\")\n",
349
+ "\n",
350
+ "df_busan = pd.read_csv(\"../../data/data_for_modeling/busan_train.csv\")\n",
351
+ "df_busan_test = pd.read_csv(\"../../data/data_for_modeling/busan_test.csv\")\n",
352
+ "\n",
353
+ "df_daegu = pd.read_csv(\"../../data/data_for_modeling/daegu_train.csv\")\n",
354
+ "df_daegu_test = pd.read_csv(\"../../data/data_for_modeling/daegu_test.csv\")\n",
355
+ "\n",
356
+ "df_daejeon = pd.read_csv(\"../../data/data_for_modeling/daejeon_train.csv\")\n",
357
+ "df_daejeon_test = pd.read_csv(\"../../data/data_for_modeling/daejeon_test.csv\")\n",
358
+ "\n",
359
+ "df_incheon = pd.read_csv(\"../../data/data_for_modeling/incheon_train.csv\")\n",
360
+ "df_incheon_test = pd.read_csv(\"../../data/data_for_modeling/incheon_test.csv\")\n",
361
+ "\n",
362
+ "df_gwangju = pd.read_csv(\"../../data/data_for_modeling/gwangju_train.csv\")\n",
363
+ "df_gwangju_test = pd.read_csv(\"../../data/data_for_modeling/gwangju_test.csv\")"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 6,
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "def preprocessing_df(df):\n",
372
+ "    \"\"\"Cast calendar/label columns to int, make wind_dir an integer column and keep the selected columns.\n",
373
+ "\n",
374
+ "    Operates on a copy of df; the returned frame has 'multi_class' as its last column.\n",
375
+ "    \"\"\"\n",
376
+ "    int_columns = ['year', 'month', 'hour', 'binary_class', 'multi_class']\n",
377
+ "    frame = df[df.columns].copy()\n",
378
+ "    for name in int_columns:\n",
379
+ "        frame[name] = frame[name].astype('int')\n",
380
+ "    # Rows whose wind_dir is the string '정온' (calm) are set to \"0\" so the column can be cast to int.\n",
381
+ "    frame.loc[frame['wind_dir']=='정온', 'wind_dir'] = \"0\"\n",
382
+ "    frame['wind_dir'] = frame['wind_dir'].astype('int')\n",
383
+ "    selected = ['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure', 'dewpoint_C',\n",
384
+ "                'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm', 'cloudcover', 'lm_cloudcover', 'low_cloudbase',\n",
385
+ "                'groundtemp', 'O3', 'NO2', 'PM10', 'PM25', 'year', 'month', 'hour', 'ground_temp - temp_C',\n",
386
+ "                'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class']\n",
387
+ "    return frame[selected]\n",
389
+ "\n"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 7,
395
+ "metadata": {},
396
+ "outputs": [],
397
+ "source": [
398
+ "df_seoul_test= preprocessing_df(df_seoul_test).copy()\n",
399
+ "df_busan_test= preprocessing_df(df_busan_test).copy()\n",
400
+ "df_daegu_test= preprocessing_df(df_daegu_test).copy()\n",
401
+ "df_gwangju_test= preprocessing_df(df_gwangju_test).copy()\n",
402
+ "df_daejeon_test= preprocessing_df(df_daejeon_test).copy()\n",
403
+ "df_incheon_test= preprocessing_df(df_incheon_test).copy()\n",
404
+ "\n",
405
+ "df_seoul= preprocessing_df(df_seoul).copy()\n",
406
+ "df_busan= preprocessing_df(df_busan).copy()\n",
407
+ "df_daegu= preprocessing_df(df_daegu).copy()\n",
408
+ "df_gwangju= preprocessing_df(df_gwangju).copy()\n",
409
+ "df_daejeon= preprocessing_df(df_daejeon).copy()\n",
410
+ "df_incheon= preprocessing_df(df_incheon).copy()\n",
411
+ "\n",
412
+ "df_seoul_test.drop(columns=['year'], inplace=True)\n",
413
+ "df_busan_test.drop(columns=['year'], inplace=True)\n",
414
+ "df_daegu_test.drop(columns=['year'], inplace=True)\n",
415
+ "df_daejeon_test.drop(columns=['year'], inplace=True)\n",
416
+ "df_incheon_test.drop(columns=['year'], inplace=True)\n",
417
+ "df_gwangju_test.drop(columns=['year'], inplace=True)"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": 8,
423
+ "metadata": {},
424
+ "outputs": [],
425
+ "source": [
426
+ "import joblib\n",
427
+ "\n",
428
+ "lgb_seoul= joblib.load('../save_model/LGB_optima/lgb_seoul_smote.pkl')\n",
429
+ "lgb_busan= joblib.load('../save_model/LGB_optima/lgb_busan_smote.pkl')\n",
430
+ "lgb_incheon= joblib.load('../save_model/LGB_optima/lgb_incheon_smote.pkl')\n",
431
+ "lgb_daegu= joblib.load('../save_model/LGB_optima/lgb_daegu_smote.pkl')\n",
432
+ "lgb_daejeon= joblib.load('../save_model/LGB_optima/lgb_daejeon_smote.pkl')\n",
433
+ "lgb_gwangju= joblib.load('../save_model/LGB_optima/lgb_gwangju_smote.pkl')\n",
434
+ "\n",
435
+ "xgb_seoul= joblib.load('../save_model/XGB_optima/xgb_seoul_smote.pkl')\n",
436
+ "xgb_busan= joblib.load('../save_model/XGB_optima/xgb_busan_ctgan20000.pkl')\n",
437
+ "xgb_incheon= joblib.load('../save_model/XGB_optima/xgb_incheon_smote.pkl')\n",
438
+ "xgb_daegu= joblib.load('../save_model/XGB_optima/xgb_daegu_smote.pkl')\n",
439
+ "xgb_daejeon= joblib.load('../save_model/XGB_optima/xgb_daejeon_smote.pkl')\n",
440
+ "xgb_gwangju= joblib.load('../save_model/XGB_optima/xgb_gwangju_smote.pkl')\n",
441
+ "\n",
442
+ "lgb_seoul_1= lgb_seoul[0]\n",
443
+ "lgb_seoul_2= lgb_seoul[1]\n",
444
+ "lgb_seoul_3= lgb_seoul[2]\n",
445
+ "\n",
446
+ "lgb_busan_1= lgb_busan[0]\n",
447
+ "lgb_busan_2= lgb_busan[1]\n",
448
+ "lgb_busan_3= lgb_busan[2]\n",
449
+ "\n",
450
+ "lgb_incheon_1= lgb_incheon[0]\n",
451
+ "lgb_incheon_2= lgb_incheon[1]\n",
452
+ "lgb_incheon_3= lgb_incheon[2]\n",
453
+ "\n",
454
+ "lgb_daegu_1= lgb_daegu[0]\n",
455
+ "lgb_daegu_2= lgb_daegu[1]\n",
456
+ "lgb_daegu_3= lgb_daegu[2]\n",
457
+ "\n",
458
+ "lgb_daejeon_1= lgb_daejeon[0]\n",
459
+ "lgb_daejeon_2= lgb_daejeon[1]\n",
460
+ "lgb_daejeon_3= lgb_daejeon[2]\n",
461
+ "\n",
462
+ "lgb_gwangju_1= lgb_gwangju[0]\n",
463
+ "lgb_gwangju_2= lgb_gwangju[1]\n",
464
+ "lgb_gwangju_3= lgb_gwangju[2]\n",
465
+ "\n",
466
+ "\n",
467
+ "xgb_seoul_1= xgb_seoul[0]\n",
468
+ "xgb_seoul_2= xgb_seoul[1]\n",
469
+ "xgb_seoul_3= xgb_seoul[2]\n",
470
+ "\n",
471
+ "xgb_busan_1= xgb_busan[0]\n",
472
+ "xgb_busan_2= xgb_busan[1]\n",
473
+ "xgb_busan_3= xgb_busan[2]\n",
474
+ "\n",
475
+ "xgb_incheon_1= xgb_incheon[0]\n",
476
+ "xgb_incheon_2= xgb_incheon[1]\n",
477
+ "xgb_incheon_3= xgb_incheon[2]\n",
478
+ "\n",
479
+ "xgb_daegu_1= xgb_daegu[0]\n",
480
+ "xgb_daegu_2= xgb_daegu[1]\n",
481
+ "xgb_daegu_3= xgb_daegu[2]\n",
482
+ "\n",
483
+ "xgb_daejeon_1= xgb_daejeon[0]\n",
484
+ "xgb_daejeon_2= xgb_daejeon[1]\n",
485
+ "xgb_daejeon_3= xgb_daejeon[2]\n",
486
+ "\n",
487
+ "xgb_gwangju_1= xgb_gwangju[0]\n",
488
+ "xgb_gwangju_2= xgb_gwangju[1]\n",
489
+ "xgb_gwangju_3= xgb_gwangju[2]\n"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "markdown",
494
+ "metadata": {},
495
+ "source": [
496
+ "## **Soft Voting**"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "markdown",
501
+ "metadata": {},
502
+ "source": [
503
+ "## **서울**"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": 9,
509
+ "metadata": {},
510
+ "outputs": [],
511
+ "source": [
512
+ "voting = []\n",
513
+ "mcc = []\n",
514
+ "accuracy = []\n"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": 10,
520
+ "metadata": {},
521
+ "outputs": [
522
+ {
523
+ "data": {
524
+ "text/plain": [
525
+ "Index(['temp_C', 'precip_mm', 'wind_speed', 'wind_dir', 'hm', 'vap_pressure',\n",
526
+ " 'dewpoint_C', 'loc_pressure', 'sea_pressure', 'solarRad', 'snow_cm',\n",
527
+ " 'cloudcover', 'lm_cloudcover', 'low_cloudbase', 'groundtemp', 'O3',\n",
528
+ " 'NO2', 'PM10', 'PM25', 'month', 'hour', 'ground_temp - temp_C',\n",
529
+ " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'multi_class'],\n",
530
+ " dtype='object')"
531
+ ]
532
+ },
533
+ "execution_count": 10,
534
+ "metadata": {},
535
+ "output_type": "execute_result"
536
+ }
537
+ ],
538
+ "source": [
539
+ "df_seoul_test.columns"
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": 11,
545
+ "metadata": {},
546
+ "outputs": [
547
+ {
548
+ "name": "stdout",
549
+ "output_type": "stream",
550
+ "text": [
551
+ "CSI score of soft(test) : 0.3248062015503624\n"
552
+ ]
553
+ }
554
+ ],
555
+ "source": [
556
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
557
+ "\n",
558
+ "probas = []\n",
559
+ "\n",
560
+ "# 1 Fold\n",
561
+ "test_preds = get_proba('seoul', 'deepgbm', 'ctgan20000', 1)\n",
562
+ "probas.append(test_preds)\n",
563
+ "test_preds = get_proba('seoul', 'resnet_like', 'smote', 1)\n",
564
+ "probas.append(test_preds)\n",
565
+ "# probas.append(xgb_seoul_1.predict_proba(df_seoul_test.iloc[:,:-1]))\n",
566
+ "\n",
567
+ "# 2 Fold\n",
568
+ "test_preds = get_proba('seoul', 'deepgbm', 'ctgan20000', 2)\n",
569
+ "probas.append(test_preds)\n",
570
+ "test_preds = get_proba('seoul', 'resnet_like', 'smote', 2)\n",
571
+ "probas.append(test_preds)\n",
572
+ "# probas.append(xgb_seoul_2.predict_proba(df_seoul_test.iloc[:,:-1]))\n",
573
+ "\n",
574
+ "# 3 Fold\n",
575
+ "test_preds = get_proba('seoul', 'deepgbm', 'ctgan20000', 3)\n",
576
+ "probas.append(test_preds)\n",
577
+ "test_preds = get_proba('seoul', 'resnet_like', 'smote', 3)\n",
578
+ "probas.append(test_preds)\n",
579
+ "# probas.append(xgb_seoul_3.predict_proba(df_seoul_test.iloc[:,:-1]))\n",
580
+ "\n",
581
+ "voting.append(calculate_csi(df_seoul_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
582
+ "mcc.append(multiclass_mcc(df_seoul_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
583
+ "accuracy.append(accuracy_score(df_seoul_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
584
+ "\n",
585
+ "\n",
586
+ "print(\"CSI score of soft(test) :\", calculate_csi(df_seoul_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n"
587
+ ]
588
+ },
589
+ {
590
+ "cell_type": "code",
591
+ "execution_count": 12,
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "data": {
596
+ "text/plain": [
597
+ "array([[ 2, 11, 0],\n",
598
+ " [ 6, 417, 58],\n",
599
+ " [ 7, 789, 7470]])"
600
+ ]
601
+ },
602
+ "execution_count": 12,
603
+ "metadata": {},
604
+ "output_type": "execute_result"
605
+ }
606
+ ],
607
+ "source": [
608
+ "confusion_matrix(df_seoul_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1))"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "markdown",
613
+ "metadata": {},
614
+ "source": [
615
+ "## **부산**"
616
+ ]
617
+ },
618
+ {
619
+ "cell_type": "code",
620
+ "execution_count": 13,
621
+ "metadata": {},
622
+ "outputs": [
623
+ {
624
+ "name": "stdout",
625
+ "output_type": "stream",
626
+ "text": [
627
+ "CSI score of soft(test) : 0.46608315098458075\n"
628
+ ]
629
+ }
630
+ ],
631
+ "source": [
632
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
633
+ "\n",
634
+ "probas = []\n",
635
+ "\n",
636
+ "# 1 Fold\n",
637
+ "test_preds = get_proba('busan', 'deepgbm', 'pure', 1)\n",
638
+ "probas.append(test_preds)\n",
639
+ "test_preds = get_proba('busan', 'resnet_like', 'ctgan10000', 1)\n",
640
+ "probas.append(test_preds)\n",
641
+ "\n",
642
+ "\n",
643
+ "# 2 Fold\n",
644
+ "test_preds = get_proba('busan', 'deepgbm', 'pure', 2)\n",
645
+ "probas.append(test_preds)\n",
646
+ "test_preds = get_proba('busan', 'resnet_like', 'ctgan10000', 2)\n",
647
+ "probas.append(test_preds)\n",
648
+ "\n",
649
+ "\n",
650
+ "# 3 Fold\n",
651
+ "test_preds = get_proba('busan', 'deepgbm', 'pure', 3)\n",
652
+ "probas.append(test_preds)\n",
653
+ "test_preds = get_proba('busan', 'resnet_like', 'ctgan10000', 3)\n",
654
+ "probas.append(test_preds)\n",
655
+ "\n",
656
+ "\n",
657
+ "voting.append(calculate_csi(df_busan_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
658
+ "mcc.append(multiclass_mcc(df_busan_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
659
+ "accuracy.append(accuracy_score(df_busan_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
660
+ "\n",
661
+ "print(\"CSI score of soft(test) :\", calculate_csi(df_busan_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "code",
666
+ "execution_count": 14,
667
+ "metadata": {},
668
+ "outputs": [
669
+ {
670
+ "data": {
671
+ "text/plain": [
672
+ "array([[ 11, 13, 0],\n",
673
+ " [ 11, 202, 68],\n",
674
+ " [ 2, 150, 8303]])"
675
+ ]
676
+ },
677
+ "execution_count": 14,
678
+ "metadata": {},
679
+ "output_type": "execute_result"
680
+ }
681
+ ],
682
+ "source": [
683
+ "confusion_matrix(df_busan_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1))"
684
+ ]
685
+ },
686
+ {
687
+ "cell_type": "markdown",
688
+ "metadata": {},
689
+ "source": [
690
+ "## **인천**"
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "code",
695
+ "execution_count": 15,
696
+ "metadata": {},
697
+ "outputs": [
698
+ {
699
+ "name": "stdout",
700
+ "output_type": "stream",
701
+ "text": [
702
+ "CSI score of hard(test) : 0.572269457161506\n"
703
+ ]
704
+ }
705
+ ],
706
+ "source": [
707
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
708
+ "\n",
709
+ "\n",
710
+ "# 1 Fold\n",
711
+ "probas = []\n",
712
+ "test_preds = get_proba('incheon', 'deepgbm', 'pure', 1)\n",
713
+ "probas.append(test_preds)\n",
714
+ "test_preds = get_proba('incheon', 'resnet_like', 'smote', 1)\n",
715
+ "probas.append(test_preds)\n",
716
+ "test_preds = get_proba('incheon', 'ft_transformer', 'pure', 1)\n",
717
+ "probas.append(test_preds)\n",
718
+ "\n",
719
+ "probas.append(lgb_incheon_1.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
720
+ "probas.append(xgb_incheon_1.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
721
+ "\n",
722
+ "# 2 Fold\n",
723
+ "test_preds = get_proba('incheon', 'deepgbm', 'pure', 2)\n",
724
+ "probas.append(test_preds)\n",
725
+ "test_preds = get_proba('incheon', 'resnet_like', 'smote', 2)\n",
726
+ "probas.append(test_preds)\n",
727
+ "test_preds = get_proba('incheon', 'ft_transformer', 'pure', 2)\n",
728
+ "probas.append(test_preds)\n",
729
+ "\n",
730
+ "probas.append(lgb_incheon_2.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
731
+ "probas.append(xgb_incheon_2.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
732
+ "\n",
733
+ "# 3 Fold\n",
734
+ "test_preds = get_proba('incheon', 'deepgbm', 'pure', 3)\n",
735
+ "probas.append(test_preds)\n",
736
+ "test_preds = get_proba('incheon', 'resnet_like', 'smote', 3)\n",
737
+ "probas.append(test_preds)\n",
738
+ "test_preds = get_proba('incheon', 'ft_transformer', 'pure', 3)\n",
739
+ "probas.append(test_preds)\n",
740
+ "\n",
741
+ "probas.append(lgb_incheon_3.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
742
+ "probas.append(xgb_incheon_3.predict_proba(df_incheon_test.iloc[:,:-1]))\n",
743
+ "\n",
744
+ "\n",
745
+ "\n",
746
+ "voting.append(calculate_csi(df_incheon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
747
+ "mcc.append(multiclass_mcc(df_incheon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
748
+ "accuracy.append(accuracy_score(df_incheon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
749
+ "\n",
750
+ "\n",
751
+ "print(\"CSI score of hard(test) :\", calculate_csi(df_incheon_test.iloc[:,-1],mode(np.argmax(probas, axis=2), axis=0).mode[0]))\n"
752
+ ]
753
+ },
754
+ {
755
+ "cell_type": "code",
756
+ "execution_count": 16,
757
+ "metadata": {},
758
+ "outputs": [
759
+ {
760
+ "data": {
761
+ "text/plain": [
762
+ "array([[ 87, 74, 21],\n",
763
+ " [ 22, 788, 395],\n",
764
+ " [ 2, 140, 7231]])"
765
+ ]
766
+ },
767
+ "execution_count": 16,
768
+ "metadata": {},
769
+ "output_type": "execute_result"
770
+ }
771
+ ],
772
+ "source": [
773
+ "confusion_matrix(df_incheon_test.iloc[:,-1],mode(np.argmax(probas, axis=2), axis=0).mode[0])"
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "markdown",
778
+ "metadata": {},
779
+ "source": [
780
+ "## **대구**"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 17,
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "CSI score of soft(test) : 0.2852112676055334\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
798
+ "\n",
799
+ "probas= []\n",
800
+ "\n",
801
+ "# 1 Fold\n",
802
+ "test_preds = get_proba('daegu', 'deepgbm', 'smote', 1)\n",
803
+ "probas.append(test_preds)\n",
804
+ "test_preds = get_proba('daegu', 'ft_transformer', 'pure', 1)\n",
805
+ "probas.append(test_preds)\n",
806
+ "\n",
807
+ "# 2 Fold\n",
808
+ "test_preds = get_proba('daegu', 'deepgbm', 'smote', 2)\n",
809
+ "probas.append(test_preds)\n",
810
+ "test_preds = get_proba('daegu', 'ft_transformer', 'pure', 2)\n",
811
+ "probas.append(test_preds)\n",
812
+ "\n",
813
+ "# 3 Fold\n",
814
+ "test_preds = get_proba('daegu', 'deepgbm', 'smote', 3)\n",
815
+ "probas.append(test_preds)\n",
816
+ "test_preds = get_proba('daegu', 'ft_transformer', 'pure', 3)\n",
817
+ "probas.append(test_preds)\n",
818
+ "\n",
819
+ "voting.append(calculate_csi(df_daegu_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
820
+ "mcc.append(multiclass_mcc(df_daegu_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
821
+ "accuracy.append(accuracy_score(df_daegu_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
822
+ "\n",
823
+ "print(\"CSI score of soft(test) :\", calculate_csi(df_daegu_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n"
824
+ ]
825
+ },
826
+ {
827
+ "cell_type": "code",
828
+ "execution_count": 18,
829
+ "metadata": {},
830
+ "outputs": [
831
+ {
832
+ "data": {
833
+ "text/plain": [
834
+ "array([[ 1, 0, 0],\n",
835
+ " [ 1, 80, 47],\n",
836
+ " [ 2, 153, 8476]])"
837
+ ]
838
+ },
839
+ "execution_count": 18,
840
+ "metadata": {},
841
+ "output_type": "execute_result"
842
+ }
843
+ ],
844
+ "source": [
845
+ "confusion_matrix(df_daegu_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1))"
846
+ ]
847
+ },
848
+ {
849
+ "cell_type": "markdown",
850
+ "metadata": {},
851
+ "source": [
852
+ "## **대전**"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": 19,
858
+ "metadata": {},
859
+ "outputs": [
860
+ {
861
+ "name": "stdout",
862
+ "output_type": "stream",
863
+ "text": [
864
+ "CSI score of soft(test) : 0.31884057971011603\n"
865
+ ]
866
+ }
867
+ ],
868
+ "source": [
869
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
870
+ "\n",
871
+ "probas = []\n",
872
+ "\n",
873
+ "# 1 Fold\n",
874
+ "test_preds = get_proba('daejeon', 'deepgbm', 'pure', 1)\n",
875
+ "probas.append(test_preds)\n",
876
+ "test_preds = get_proba('daejeon', 'ft_transformer', 'pure', 1)\n",
877
+ "probas.append(test_preds)\n",
878
+ "\n",
879
+ "\n",
880
+ "# 2 Fold\n",
881
+ "test_preds = get_proba('daejeon', 'deepgbm', 'pure', 2)\n",
882
+ "probas.append(test_preds)\n",
883
+ "test_preds = get_proba('daejeon', 'ft_transformer', 'pure', 2)\n",
884
+ "probas.append(test_preds)\n",
885
+ "\n",
886
+ "\n",
887
+ "# 3 Fold\n",
888
+ "test_preds = get_proba('daejeon', 'deepgbm', 'pure', 3)\n",
889
+ "probas.append(test_preds)\n",
890
+ "test_preds = get_proba('daejeon', 'ft_transformer', 'pure', 3)\n",
891
+ "probas.append(test_preds)\n",
892
+ "\n",
893
+ "voting.append(calculate_csi(df_daejeon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
894
+ "mcc.append(multiclass_mcc(df_daejeon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
895
+ "accuracy.append(accuracy_score(df_daejeon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
896
+ "\n",
897
+ "\n",
898
+ "print(\"CSI score of soft(test) :\", calculate_csi(df_daejeon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": 20,
904
+ "metadata": {},
905
+ "outputs": [
906
+ {
907
+ "data": {
908
+ "text/plain": [
909
+ "array([[ 15, 23, 15],\n",
910
+ " [ 10, 337, 271],\n",
911
+ " [ 0, 433, 7656]])"
912
+ ]
913
+ },
914
+ "execution_count": 20,
915
+ "metadata": {},
916
+ "output_type": "execute_result"
917
+ }
918
+ ],
919
+ "source": [
920
+ "confusion_matrix(df_daejeon_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1))"
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "markdown",
925
+ "metadata": {},
926
+ "source": [
927
+ "## **광주**"
928
+ ]
929
+ },
930
+ {
931
+ "cell_type": "code",
932
+ "execution_count": 21,
933
+ "metadata": {},
934
+ "outputs": [
935
+ {
936
+ "name": "stdout",
937
+ "output_type": "stream",
938
+ "text": [
939
+ "CSI score of soft(test) : 0.4759725400457121\n"
940
+ ]
941
+ }
942
+ ],
943
+ "source": [
944
+ "from scipy.stats import mode # 최다 득표수 계산을 위한 mode 함수\n",
945
+ "\n",
946
+ "probas = []\n",
947
+ "\n",
948
+ "# 1 Fold\n",
949
+ "test_preds = get_proba('gwangju', 'deepgbm', 'pure', 1)\n",
950
+ "probas.append(test_preds)\n",
951
+ "test_preds = get_proba('gwangju', 'ft_transformer', 'pure', 1)\n",
952
+ "probas.append(test_preds)\n",
953
+ "\n",
954
+ "\n",
955
+ "# 2 Fold\n",
956
+ "test_preds = get_proba('gwangju', 'deepgbm', 'pure', 2)\n",
957
+ "probas.append(test_preds)\n",
958
+ "test_preds = get_proba('gwangju', 'ft_transformer', 'pure', 2)\n",
959
+ "probas.append(test_preds)\n",
960
+ "\n",
961
+ "\n",
962
+ "# 3 Fold\n",
963
+ "test_preds = get_proba('gwangju', 'deepgbm', 'pure', 3)\n",
964
+ "probas.append(test_preds)\n",
965
+ "test_preds = get_proba('gwangju', 'ft_transformer', 'pure', 3)\n",
966
+ "probas.append(test_preds)\n",
967
+ "\n",
968
+ "voting.append(calculate_csi(df_gwangju_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
969
+ "mcc.append(multiclass_mcc(df_gwangju_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
970
+ "accuracy.append(accuracy_score(df_gwangju_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))\n",
971
+ "\n",
972
+ "\n",
973
+ "print(\"CSI score of soft(test) :\", calculate_csi(df_gwangju_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1)))"
974
+ ]
975
+ },
976
+ {
977
+ "cell_type": "code",
978
+ "execution_count": 22,
979
+ "metadata": {},
980
+ "outputs": [
981
+ {
982
+ "data": {
983
+ "text/plain": [
984
+ "array([[ 10, 12, 8],\n",
985
+ " [ 2, 406, 235],\n",
986
+ " [ 0, 201, 7886]])"
987
+ ]
988
+ },
989
+ "execution_count": 22,
990
+ "metadata": {},
991
+ "output_type": "execute_result"
992
+ }
993
+ ],
994
+ "source": [
995
+ "confusion_matrix(df_gwangju_test.iloc[:,-1], np.argmax(np.mean(probas, axis=0), axis=1))"
996
+ ]
997
+ },
998
+ {
999
+ "cell_type": "code",
1000
+ "execution_count": 23,
1001
+ "metadata": {},
1002
+ "outputs": [
1003
+ {
1004
+ "data": {
1005
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAHDCAYAAAATEUquAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABDfUlEQVR4nO3deVwVdf///+cBZBfEDcgQUtw33EMvxQpDLcsrKysL5GvqpWJeF5VFlmiLWJbppaZZH9PcMs1WU0vUSrM0FZdCxd00cClFUKHg/fujH+fqCDhiynF53G+3c6t5z3tmXnPOeM55MjPvYzPGGAEAAAAASuXi7AIAAAAA4EpHcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAXFPWr1+vdu3aycfHRzabTWlpac4uSSNHjpTNZnN2GeVq1apVstlsWrVqVZmXnTFjhmw2m/bt23fJ6wKAi0VwAoDLaPfu3RowYIBq1aolT09P+fn5qX379powYYLOnDlj75efn68JEyaoefPm8vPzU6VKldSoUSP1799f27dvt/cr+kL5ww8/WG573759io+PV+3ateXp6amgoCB17NhRycnJl2VfrwS///677rvvPv366696/fXXNWvWLIWGhhbr99hjj8lms2nXrl2lrmv48OGy2WzasmXLBW379OnTGjly5EUFhcupT58+stls8vPzczjmimRkZMhms8lms+nVV191QoUAcHUgOAHAZbJ48WI1adJE77//vrp3766JEycqJSVFNWvW1JNPPqmhQ4fa+/bs2VOPP/64GjdurDFjxmjUqFHq2LGjlixZou+++67M2961a5eaN2+uZcuW6cEHH9SkSZM0ePBgValSRS+//PKl3M0ryu7du7V//3498cQT6t+/vx5++GEFBAQU69e7d29J0ty5c0td17x589SkSRM1bdr0grZ9+vRpjRo1qsTg9Oyzz5YYWsqLm5ubTp8+rU8//bTYvDlz5sjT09MJVQHA1cXN2QUAwLVo7969euCBBxQaGqoVK1YoODjYPm/w4MHatWuXFi9eLOnPS8s+++wzvfTSS3rmmWcc1jNp0iSdOHGizNt//fXXlZOTo7S0tGJnXI4cOVL2HfobcnNz5ePjUy7bKtq3SpUqnbdf27ZtFR4ernnz5mnEiBHF5q9du1Z79+7VmDFjLkldbm5ucnNz3keuh4eH2rdvr3nz5un+++93mDd37lzdcccd+uCDD5xUHQBcHTjjBACXwSuvvKKcnBz93//9n0NoKhIeHm4/47R7925JUvv27Yv1c3V1VZUqVcq8/d27d+vGG28s8TK16tWrF2tbsmSJoqKiVLFiRfn5+al169bFzsYsWLBALVu2lJeXl6pWraqHH35Yhw4dcujTp08f+fr6avfu3erWrZsqVqxoP7tTWFio8ePHq1GjRvL09FRgYKAGDBig33777YL2acWKFerQoYN8fHxUqVIl3X333UpPT3fYdlRUlCTpvvvuk81mU6dOnUpdX+/evbV9+3Zt3Lix2Ly5c+fKZrPpwQcflPRnIOvbt68CAwPl6empZs2aaebMmfb++/btU7Vq1SRJo0aNsl/6NnLkSEkl3+Nks9mUkJCgjz76SI0bN5aHh4caNWqkpUuXFqtn1apVatWqlTw9PVW7dm29+eabZb5v6qGHHtKSJUscgvj69euVkZGhhx56qMRl9uzZo/vuu0+VK1eWt7e3br75Znvg/6uff/5ZPXr0kI+Pj6pXr67//Oc/ysvLK3Gd33//vbp06SJ/f395e3srKipKa9asueD9AABnITgBwGXw6aefqlatWmrXrp1l36JwM2fOHP3xxx+XZPuhoaE6ePCgVqxYYdl3xowZuuOOO/Trr78qKSlJY8aMUUREhMMX+BkzZuj++++Xq6urUlJS1K9fPy1atEj/+Mc/ip0R++OP
PxQTE6Pq1avr1VdfVc+ePSVJAwYM0JNPPmm/xys+Pl5z5sxRTEyMfv/99/PWuHz5csXExOjIkSMaOXKkEhMT9e2336p9+/b2AQQGDBhgP2P32GOPadasWRo+fHip6yztcr2CggK9//776tChg2rWrKkzZ86oU6dOmjVrlnr37q2xY8fK399fffr00YQJEyRJ1apV05QpUyRJ//znPzVr1izNmjVL99xzz3n3a/Xq1Ro0aJAeeOABvfLKKzp79qx69uyp48eP2/ts2rRJXbp00fHjxzVq1Cj17dtXzz//vD766KPzrvtc99xzj2w2mxYtWmRvmzt3rurXr68WLVoU65+VlaV27dpp2bJlGjRokF566SWdPXtWd911lz788EN7vzNnzui2227TsmXLlJCQoOHDh+ubb77RsGHDiq1zxYoV6tixo7Kzs5WcnKzRo0frxIkTuvXWW7Vu3boy7Q8AlDsDALikTp48aSSZu++++4L6FxYWmqioKCPJBAYGmgcffNBMnjzZ7N+/v1jfd955x0gy69evP+86t23bZry8vIwkExERYYYOHWo++ugjk5ub69DvxIkTpmLFiqZt27bmzJkzxeoyxpj8/HxTvXp107hxY4c+n332mZFkRowYYW+Li4szkszTTz/tsK5vvvnGSDJz5sxxaF+6dGmJ7eeKiIgw1atXN8ePH7e3bd682bi4uJjY2Fh728qVK40ks2DBgvOur0jr1q3NjTfeaAoKCorV9OabbxpjjBk/fryRZGbPnm3vk5+fbyIjI42vr6/Jzs42xhhz9OhRI8kkJycX205ycrI59yNXknF3dze7du1y2CdJZuLEifa27t27G29vb3Po0CF7W0ZGhnFzcyu2zpLExcUZHx8fY4wx9957r7ntttuMMcYUFBSYoKAgM2rUKLN3714jyYwdO9a+3L///W8jyXzzzTf2tlOnTpmbbrrJhIWF2Z+zoufn/ffft/fLzc014eHhRpJZuXKlMebP46lOnTomJibGfmwZY8zp06fNTTfdZDp37mxvKzrO9+7da7l/AFBeOOMEAJdYdna2JKlixYoX1N9ms2nZsmV68cUXFRAQoHnz5mnw4MEKDQ1Vr169Luoep0aNGiktLU0PP/yw9u3bpwkTJqhHjx4KDAzUW2+9Ze/35Zdf6tSpU3r66aeLDRBQdBnYDz/8oCNHjmjQoEEOfe644w7Vr1+/xEu3Bg4c6DC9YMEC+fv7q3Pnzjp27Jj90bJlS/n6+mrlypWl7ssvv/yitLQ09enTR5UrV7a3N23aVJ07d9bnn39etifnLx5++GH9/PPP+vrrr+1tc+fOlbu7u+677z5J0ueff66goCD7ZXuSVKFCBT322GPKycnRV199ddHbj46OVu3ate3TTZs2lZ+fn/bs2SPpz7Nfy5cvV48ePXTDDTfY+4WHh6tr165l3t5DDz2kVatWKTMzUytWrFBmZmapl+l9/vnnatOmjf7xj3/Y23x9fdW/f3/t27dPP/30k71fcHCw7r33Xns/b29v9e/f32F9aWlp9ssCjx8/bj8GcnNzddttt+nrr79WYWFhmfcJAMoLwQkALjE/Pz9J0qlTpy54GQ8PDw0fPlzp6ek6fPiw5s2bp5tvvlnvv/++EhISLqqOunXratasWTp27Ji2bNmi0aNHy83NTf3799fy5csl/e/+qsaNG5e6nv3790uS6tWrV2xe/fr17fOLuLm56cYbb3Roy8jI0MmTJ1W9enVVq1bN4ZGTk3PeASvOt/0GDRrYv3xfjAceeECurq72y/XOnj2rDz/8UF27drWPxrd//37VqVNHLi6OH5kNGjRwqO9i1KxZs1hbQECA/b6vI0eO6MyZMwoPDy/Wr6Q2K0X3nc2fP19z5sxR69atS13P/v37S33Oi+YX/Tc8PLzY/VbnLpuRkSFJiouLK3YMvP3228rLy9PJkyfLvE8AUF4YVQ8ALjE/Pz/dcMMN2rZt20UtHxwcrAceeEA9e/ZU
o0aN9P7772vGjBkXPSqbq6urmjRpoiZNmigyMlK33HKL5syZo+jo6ItanxUPD49iIaOwsFDVq1fXnDlzSlymaGCF8la9enV17txZH3zwgSZPnqxPP/1Up06dst//dLm5urqW2G6MuSzb8/Dw0D333KOZM2dqz5499sErykPR2aSxY8cqIiKixD6+vr7lVg8AlBXBCQAugzvvvFPTpk3T2rVrFRkZeVHrqFChgpo2baqMjAwdO3ZMQUFBf7uuVq1aSfrz8jdJ9svEtm3bVuqZh6LBK3bs2KFbb73VYd6OHTtKHLnvXLVr19by5cvVvn17eXl5lanmv27/XNu3b1fVqlX/1nDnvXv31tKlS7VkyRLNnTtXfn5+6t69u8P2t2zZosLCQodAWPTDxEX1lWWEuwtVvXp1eXp6lvhDvef78d7zeeihhzR9+nS5uLjogQceKLVfaGhoqc950fyi/27btk3GGIfn4Nxli441Pz+/yxbaAeBy4lI9ALgMhg0bJh8fHz366KPKysoqNn/37t32EdkyMjJ04MCBYn1OnDihtWvXKiAgoMxnZL755psSR6oruh+o6DKq22+/XRUrVlRKSorOnj3r0LforEerVq1UvXp1TZ061WGI6SVLlig9PV133HGHZT3333+/CgoK9MILLxSb98cff5z3Pq7g4GBFRERo5syZDv22bdumL774Qt26dbPc/vn06NFD3t7eeuONN7RkyRLdc889DvdydevWTZmZmZo/f75DzRMnTpSvr699CHRvb29Juqh70krj6uqq6OhoffTRRzp8+LC9fdeuXVqyZMlFrfOWW27RCy+8oEmTJp03jHfr1k3r1q3T2rVr7W25ubmaNm2awsLC1LBhQ3u/w4cPa+HChfZ+p0+f1rRp0xzW17JlS9WuXVuvvvqqcnJyim3v6NGjF7U/AFBeOOMEAJdB7dq1NXfuXPXq1UsNGjRQbGysGjdurPz8fH377bdasGCB+vTpI0navHmzHnroIXXt2lUdOnRQ5cqVdejQIc2cOVOHDx/W+PHjS72kqzQvv/yyNmzYoHvuuUdNmzaVJG3cuFHvvvuuKleurH//+9+S/vzr/+uvv65HH31UrVu31kMPPaSAgABt3rxZp0+f1syZM1WhQgW9/PLLio+PV1RUlB588EFlZWVpwoQJCgsL03/+8x/LeqKiojRgwAClpKQoLS1Nt99+uypUqKCMjAwtWLBAEyZMcBhc4Fxjx45V165dFRkZqb59++rMmTOaOHGi/P39//blZr6+vurRo4f9PqdzL9Pr37+/3nzzTfXp00cbNmxQWFiYFi5cqDVr1mj8+PH2QUC8vLzUsGFDzZ8/X3Xr1lXlypXVuHHj894/diFGjhypL774Qu3bt9fAgQNVUFCgSZMmqXHjxkpLSyvz+lxcXPTss89a9nv66ac1b948de3aVY899pgqV66smTNnau/evfrggw/sZ9/69eunSZMmKTY2Vhs2bFBwcLBmzZplD5J/3e7bb7+trl27qlGjRoqPj1eNGjV06NAhrVy5Un5+fvr000/LvD8AUG6cPKofAFzTdu7cafr162fCwsKMu7u7qVixomnfvr2ZOHGiOXv2rDHGmKysLDNmzBgTFRVlgoODjZubmwkICDC33nqrWbhwocP6LnQ48jVr1pjBgwebxo0bG39/f1OhQgVTs2ZN06dPH7N79+5i/T/55BPTrl074+XlZfz8/EybNm3MvHnzHPrMnz/fNG/e3Hh4eJjKlSub3r17m59//tmhz1+Hvi7JtGnTTMuWLY2Xl5epWLGiadKkiRk2bJg5fPjweffHGGOWL19u2rdvb6+xe/fu5qeffnLoU9bhyIssXrzYSDLBwcEOQ5MXycrKMvHx8aZq1arG3d3dNGnSxLzzzjvF+n377bemZcuWxt3d3WFo8tKGIx88eHCxdYSGhpq4uDiHttTUVNO8eXPj7u5uateubd5++23z+OOPG09PT8t9s3pNjDElDkdu
jDG7d+829957r6lUqZLx9PQ0bdq0MZ999lmx5ffv32/uuusu4+3tbapWrWqGDh1qH9a9aDjyIps2bTL33HOPqVKlivHw8DChoaHm/vvvN6mpqfY+DEcO4EpkM+Yy3YEKAAAumx49eujHH3+0j1YHALi8uMcJAIAr3JkzZxymMzIy9Pnnn6tTp07OKQgArkOccQIA4AoXHBysPn36qFatWtq/f7+mTJmivLw8bdq0SXXq1HF2eQBwXWBwCAAArnBdunTRvHnzlJmZKQ8PD0VGRmr06NGEJgAoR5xxAgAAAAAL3OMEAAAAABYITgAAAABgwen3OE2ePFljx45VZmammjVrpokTJ6pNmzal9j9x4oSGDx+uRYsW6ddff1VoaKjGjx9/wb8cX1hYqMOHD6tixYqy2WyXajcAAAAAXGWMMTp16pRuuOEG+w97l8apwWn+/PlKTEzU1KlT1bZtW40fP14xMTHasWOHqlevXqx/fn6+OnfurOrVq2vhwoWqUaOG9u/fr0qVKl3wNg8fPqyQkJBLuBcAAAAArmYHDx7UjTfeeN4+Th0com3btmrdurUmTZok6c+zQSEhIRoyZIiefvrpYv2nTp2qsWPHavv27apQocJFbfPkyZOqVKmSDh48KD8/v79VPwAAAICrV3Z2tkJCQnTixAn5+/uft6/Tzjjl5+drw4YNSkpKsre5uLgoOjpaa9euLXGZTz75RJGRkRo8eLA+/vhjVatWTQ899JCeeuopubq6XtB2iy7P8/PzIzgBAAAAuKBbeJwWnI4dO6aCggIFBgY6tAcGBmr79u0lLrNnzx6tWLFCvXv31ueff65du3Zp0KBB+v3335WcnFziMnl5ecrLy7NPZ2dnX7qdAAAAAHBduKpG1SssLFT16tU1bdo0tWzZUr169dLw4cM1derUUpdJSUmRv7+//cH9TQAAAADKymnBqWrVqnJ1dVVWVpZDe1ZWloKCgkpcJjg4WHXr1nW4LK9BgwbKzMxUfn5+icskJSXp5MmT9sfBgwcv3U4AAAAAuC44LTi5u7urZcuWSk1NtbcVFhYqNTVVkZGRJS7Tvn177dq1S4WFhfa2nTt3Kjg4WO7u7iUu4+HhYb+fifuaAAAAAFwMp16ql5iYqLfeekszZ85Uenq6Bg4cqNzcXMXHx0uSYmNjHQaPGDhwoH799VcNHTpUO3fu1OLFizV69GgNHjzYWbsAAAAA4Drg1N9x6tWrl44ePaoRI0YoMzNTERERWrp0qX3AiAMHDjj8EFVISIiWLVum//znP2ratKlq1KihoUOH6qmnnnLWLgAAAAC4Djj1d5ycITs7W/7+/jp58iSX7QEAAADXsbJkg6tqVD0AAAAAcAaCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAU3ZxcAAFeT9hPbO7sElGLNkDXOLgEAcA3jjBMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWLgigtPkyZMVFhYmT09PtW3bVuvWrSu174wZM2Sz2Rwenp6e5VgtAAAAgOuN04PT/PnzlZiYqOTkZG3cuFHNmjVTTEyM
jhw5Uuoyfn5++uWXX+yP/fv3l2PFAAAAAK43Tg9O48aNU79+/RQfH6+GDRtq6tSp8vb21vTp00tdxmazKSgoyP4IDAwsx4oBAAAAXG+cGpzy8/O1YcMGRUdH29tcXFwUHR2ttWvXlrpcTk6OQkNDFRISorvvvls//vhjqX3z8vKUnZ3t8AAAAACAsnBqcDp27JgKCgqKnTEKDAxUZmZmicvUq1dP06dP18cff6zZs2ersLBQ7dq1088//1xi/5SUFPn7+9sfISEhl3w/AAAAAFzbnH6pXllFRkYqNjZWERERioqK0qJFi1StWjW9+eabJfZPSkrSyZMn7Y+DBw+Wc8UAAAAArnZuztx41apV5erqqqysLIf2rKwsBQUFXdA6KlSooObNm2vXrl0lzvfw8JCHh8ffrhUAAADA9cupZ5zc3d3VsmVLpaam2tsKCwuVmpqqyMjIC1pHQUGBtm7dquDg4MtVJgAAAIDrnFPPOElSYmKi4uLi1KpVK7Vp00bjx49Xbm6u4uPjJUmxsbGqUaOGUlJSJEnPP/+8br75ZoWHh+vEiRMaO3as9u/fr0cffdSZuwEAAADgGub04NSrVy8dPXpUI0aMUGZmpiIiIrR06VL7gBEHDhyQi8v/Toz99ttv6tevnzIzMxUQEKCWLVvq22+/VcOGDZ21CwAAAACucTZjjHF2EeUpOztb/v7+OnnypPz8/JxdDoCrTPuJ7Z1dAkqxZsgaZ5cAALjKlCUbXHWj6gEAAABAeSM4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFN2cXAAAAAFxNXnr4XmeXgFIMn73wsq2bM04AAAAAYIHgBAAAAAAWCE4AAAAAYIHgBAAAAAAWCE4AAAAAYIHgBAAAAAAWGI4cuEAHnm/i7BJwHjVHbHV2CQAA4BrGGScAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALV0Rwmjx5ssLCwuTp6am2bdtq3bp1F7Tce++9J5vNph49elzeAgEAAABc15wenObPn6/ExEQlJydr48aNatasmWJiYnTkyJHzLrdv3z498cQT6tChQzlVCgAAAOB65fTgNG7cOPXr10/x8fFq2LChpk6dKm9vb02fPr3UZQoKCtS7d2+NGjVKtWrVKsdqAQAAAFyPnBqc8vPztWHDBkVHR9vbXFxcFB0drbVr15a63PPPP6/q1aurb9++5VEmAAAAgOucmzM3fuzYMRUUFCgwMNChPTAwUNu3by9xmdWrV+v//u//lJaWdkHbyMvLU15enn06Ozv7ousFAAAAcH1y+qV6ZXHq1Ck98sgjeuutt1S1atULWiYlJUX+/v72R0hIyGWuEgAAAMC1xqlnnKpWrSpXV1dlZWU5tGdlZSkoKKhY/927d2vfvn3q3r27va2wsFCS5Obmph07dqh27doOyyQlJSkxMdE+nZ2dTXgCAAAAUCZODU7u7u5q2bKlUlNT7UOKFxYWKjU1VQkJCcX6169fX1u3bnVoe/bZZ3Xq1ClNmDChxEDk4eEhDw+Py1I/AAAAgOuDU4OTJCUmJiouLk6tWrVSmzZtNH78eOXm5io+Pl6SFBsbqxo1aiglJUWenp5q3Lixw/KVKlWSpGLtAAAAAHCpOD049erVS0ePHtWI
ESOUmZmpiIgILV261D5gxIEDB+TiclXdigUAAADgGuP04CRJCQkJJV6aJ0mrVq0677IzZsy49AUBAAAAwF9wKgcAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMACwQkAAAAALBCcAAAAAMCCm7MLuFq0fPJdZ5eA89gwNtbZJQAAAOAaxhknAAAAALBAcAIAAAAAC1yqBwAAUAaTHv/U2SWgFAmvdXd2CbiGccYJAAAAACyU6YzTli1bLqhf06ZNL6oYAAAAALgSlSk4RUREyGazyRhTbF5Ru81mU0FBwSUrEAAAAACcrUzBae/evZerDgAAAAC4YpUpOIWGhl6uOgAAAADgilWmwSGOHTum/fv3O7T9+OOPio+P1/3336+5c+de0uIAAAAA4EpQpuA0ZMgQ/fe//7VPHzlyRB06dND69euVl5enPn36aNasWZe8SAAAAABwpjIFp++++0533XWXffrdd99V5cqVlZaWpo8//lijR4/W5MmTy1zE5MmTFRYWJk9PT7Vt21br1q0rte+iRYvUqlUrVapUST4+PoqIiCCsAQAAALisyhScMjMzFRYWZp9esWKF7rnnHrm5/Xmr1F133aWMjIwyFTB//nwlJiYqOTlZGzduVLNmzRQTE6MjR46U2L9y5coaPny41q5dqy1btig+Pl7x8fFatmxZmbYLAAAAABeqTMHJz89PJ06csE+vW7dObdu2tU/bbDbl5eWVqYBx48apX79+io+PV8OGDTV16lR5e3tr+vTpJfbv1KmT/vnPf6pBgwaqXbu2hg4dqqZNm2r16tVl2i4AAAAAXKgyBaebb75Z//3vf1VYWKiFCxfq1KlTuvXWW+3zd+7cqZCQkAteX35+vjZs2KDo6Oj/FeTioujoaK1du9ZyeWOMUlNTtWPHDnXs2LHEPnl5ecrOznZ4AAAAAEBZlCk4Pf/88/rkk0/k5eWlXr16adiwYQoICLDPf++99xQVFXXB6zt27JgKCgoUGBjo0B4YGKjMzMxSlzt58qR8fX3l7u6uO+64QxMnTlTnzp1L7JuSkiJ/f3/7oyzBDgAAAACkMv6OU7NmzZSenq41a9YoKCjI4TI9SXrwwQfVoEGDS1pgSSpWrKi0tDTl5OQoNTVViYmJqlWrljp16lSsb1JSkhITE+3T2dnZhCcAAAAAZVKm4LRixQolJCTou+++k5+fn8O8kydP6sknn9TUqVN10003XdD6qlatKldXV2VlZTm0Z2VlKSgoqNTlXFxcFB4eLkmKiIhQenq6UlJSSgxOHh4e8vDwuKB6AAAAAKAkZbpUb/z48erXr1+x0CRJ/v7+GjBggMaNG3fB63N3d1fLli2VmppqbyssLFRqaqoiIyMveD2FhYVlHpQCAAAAAC5UmYLT5s2b1aVLl1Ln33777dqwYUOZCkhMTNRbb72lmTNnKj09XQMHDlRubq7i4+MlSbGxsUpKSrL3T0lJ0Zdffqk9e/YoPT1dr732mmbNmqWHH364TNsFAAAAgAtVpkv1srKyVKFChdJX5uamo0ePlqmAXr166ejRoxoxYoQyMzMVERGhpUuX2geMOHDggFxc/pfvcnNzNWjQIP3888/y8vJS/fr1NXv2bPXq1atM2wUAAACAC1Wm4FSj
Rg1t27bNfn/RubZs2aLg4OAyF5GQkKCEhIQS561atcph+sUXX9SLL75Y5m0AAAAAwMUq06V63bp103PPPaezZ88Wm3fmzBklJyfrzjvvvGTFAQAAAMCVoExnnJ599lktWrRIdevWVUJCgurVqydJ2r59uyZPnqyCggINHz78shQKAAAAAM5SpuAUGBiob7/9VgMHDlRSUpKMMZIkm82mmJgYTZ48udiP2QIAAADA1a5MwUmSQkND9fnnn+u3337Trl27ZIxRnTp1FBAQcDnqAwDgivJVxyhnl4BSRH39lbNLAHANK3NwKhIQEKDWrVtfyloAAAAA4IpUpsEhAAAAAOB6RHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAsEJwAAAACwQHACAAAAAAtXRHCaPHmywsLC5OnpqbZt22rdunWl9n3rrbfUoUMHBQQEKCAgQNHR0eftDwAAAAB/l9OD0/z585WYmKjk5GRt3LhRzZo1U0xMjI4cOVJi/1WrVunBBx/UypUrtXbtWoWEhOj222/XoUOHyrlyAAAAANcLpwencePGqV+/foqPj1fDhg01depUeXt7a/r06SX2nzNnjgYNGqSIiAjVr19fb7/9tgoLC5WamlrOlQMAAAC4Xjg1OOXn52vDhg2Kjo62t7m4uCg6Olpr1669oHWcPn1av//+uypXrlzi/Ly8PGVnZzs8AAAAAKAsnBqcjh07poKCAgUGBjq0BwYGKjMz84LW8dRTT+mGG25wCF9/lZKSIn9/f/sjJCTkb9cNAAAA4Pri9Ev1/o4xY8bovffe04cffihPT88S+yQlJenkyZP2x8GDB8u5SgAAAABXOzdnbrxq1apydXVVVlaWQ3tWVpaCgoLOu+yrr76qMWPGaPny5WratGmp/Tw8POTh4XFJ6gUAAABwfXLqGSd3d3e1bNnSYWCHooEeIiMjS13ulVde0QsvvKClS5eqVatW5VEqAAAAgOuYU884SVJiYqLi4uLUqlUrtWnTRuPHj1dubq7i4+MlSbGxsapRo4ZSUlIkSS+//LJGjBihuXPnKiwszH4vlK+vr3x9fZ22HwAAAACuXU4PTr169dLRo0c1YsQIZWZmKiIiQkuXLrUPGHHgwAG5uPzvxNiUKVOUn5+ve++912E9ycnJGjlyZHmWDgAAAOA64fTgJEkJCQlKSEgocd6qVascpvft23f5CwIAAACAv7iqR9UDAAAAgPJAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAA
ALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAACwQnAAAAALBAcAIAAAAAC04PTpMnT1ZYWJg8PT3Vtm1brVu3rtS+P/74o3r27KmwsDDZbDaNHz++/AoFAAAAcN1yanCaP3++EhMTlZycrI0bN6pZs2aKiYnRkSNHSux/+vRp1apVS2PGjFFQUFA5VwsAAADgeuXU4DRu3Dj169dP8fHxatiwoaZOnSpvb29Nnz69xP6tW7fW2LFj9cADD8jDw6OcqwUAAABwvXJacMrPz9eGDRsUHR39v2JcXBQdHa21a9c6qywAAAAAKMbNWRs+duyYCgoKFBgY6NAeGBio7du3X7Lt5OXlKS8vzz6dnZ19ydYNAAAA4Prg9MEhLreUlBT5+/vbHyEhIc4uCQAAAMBVxmnBqWrVqnJ1dVVWVpZDe1ZW1iUd+CEpKUknT560Pw4ePHjJ1g0AAADg+uC04OTu7q6WLVsqNTXV3lZYWKjU1FRFRkZesu14eHjIz8/P4QEAAAAAZeG0e5wkKTExUXFxcWrVqpXatGmj8ePHKzc3V/Hx8ZKk2NhY1ahRQykpKZL+HFDip59+sv//oUOHlJaWJl9fX4WHhzttPwAAAABc25wanHr16qWjR49qxIgRyszMVEREhJYuXWofMOLAgQNycfnfSbHDhw+refPm9ulXX31Vr776qqKiorRq1aryLh8AAADAdcKpwUmSEhISlJCQUOK8c8NQWFiYjDHlUBUAAAAA/M81P6oeAAAAAPxdBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALBCcAAAAAsEBwAgAAAAALV0Rwmjx5ssLCwuTp6am2bdtq3bp15+2/YMEC1a9fX56enmrSpIk+//zzcqoUAAAAwPXI6cFp/vz5SkxMVHJysjZu3KhmzZopJiZGR44cKbH/t99+qwcffFB9+/bVpk2b1KNHD/Xo0UPbtm0r58oBAAAAXC+cHpzGjRunfv36KT4+Xg0bNtTUqVPl7e2t6dOnl9h/woQJ6tKli5588kk1aNBAL7zwglq0aKFJkyaVc+UAAAAArhduztx4fn6+NmzYoKSkJHubi4uLoqOjtXbt2hKXWbt2rRITEx3aYmJi9NFHH5XYPy8vT3l5efbpkydPSpKys7PLVGtB3pky9Uf5KuvreTFOnS247NvAxSuPY0CS/jjzR7lsB2VXXsdA7h8cA1eq8joGzuSdLpftoOzK6xg4+/vv5bIdlF1Zj4Gi/sYYy75ODU7Hjh1TQUGBAgMDHdoDAwO1ffv2EpfJzMwssX9mZmaJ/VNSUjRq1Khi7SEhIRdZNa5E/hP/5ewS4Gwp/s6uAE7m/xTHwHXPn2PgejdssrMrgLO9+P7FvQ+cOnVK/hbvIU4NTuUhKSnJ4QxVYWGhfv31V1WpUkU2m82JlTlPdna2QkJCdPDgQfn5+Tm7
HDgBxwA4BsAxAInjABwDxhidOnVKN9xwg2VfpwanqlWrytXVVVlZWQ7tWVlZCgoKKnGZoKCgMvX38PCQh4eHQ1ulSpUuvuhriJ+f33X5DwT/wzEAjgFwDEDiOMD1fQxYnWkq4tTBIdzd3dWyZUulpqba2woLC5WamqrIyMgSl4mMjHToL0lffvllqf0BAAAA4O9y+qV6iYmJiouLU6tWrdSmTRuNHz9eubm5io+PlyTFxsaqRo0aSklJkSQNHTpUUVFReu2113THHXfovffe0w8//KBp06Y5czcAAAAAXMOcHpx69eqlo0ePasSIEcrMzFRERISWLl1qHwDiwIEDcnH534mxdu3aae7cuXr22Wf1zDPPqE6dOvroo4/UuHFjZ+3CVcfDw0PJycnFLmHE9YNjABwD4BiAxHEAjoGysJkLGXsPAAAAAK5jTv8BXAAAAAC40hGcAAAAAMACwQkAAAAALBCccFFWrVolm82mEydOOLsUAOcxY8YMfrsOQKk6deqkf//7384uA9eIa/37IcHpKnT06FENHDhQNWvWlIeHh4KCghQTE6M1a9Y4uzRcRn369JHNZrM/qlSpoi5dumjLli3OLg2XWJ8+fdSjRw9nl4ErzF/fAypUqKDAwEB17txZ06dPV2FhobPLwyVUnq/1okWL9MILL1zSdeLvy8zM1NChQxUeHi5PT08FBgaqffv2mjJlik6fPu3s8krVrl07/fLLLxf8g7JXG4LTVahnz57atGmTZs6cqZ07d+qTTz5Rp06ddPz4cWeXhsusS5cu+uWXX/TLL78oNTVVbm5uuvPOO51dFoByUvQesG/fPi1ZskS33HKLhg4dqjvvvFN//PGHs8vDJVRer3XlypVVsWLFS7Y+/H179uxR8+bN9cUXX2j06NHatGmT1q5dq2HDhumzzz7T8uXLnV1iqdzd3RUUFCSbzebsUi4LgtNV5sSJE/rmm2/08ssv65ZbblFoaKjatGmjpKQk3XXXXfY+jz76qKpVqyY/Pz/deuut2rx5s8N6pkyZotq1a8vd3V316tXTrFmz7PP27dsnm82mtLQ0h+3abDatWrWqPHYTpSg6wxgUFKSIiAg9/fTTOnjwoI4ePVri6fG0tDTZbDbt27dPkrR//351795dAQEB8vHxUaNGjfT5559LkgoKCtS3b1/ddNNN8vLyUr169TRhwgSH7RedCXn11VcVHBysKlWqaPDgwfr999/L6ym47nTq1EmPPfaYhg0bpsqVKysoKEgjR4506HPixAkNGDBAgYGB8vT0VOPGjfXZZ5859Fm2bJkaNGggX19f+xeyv3r77bfVoEEDeXp6qn79+nrjjTcc5m/dulW33nqrvLy8VKVKFfXv3185OTn2+Rwb5aPoPaBGjRpq0aKFnnnmGX388cdasmSJZsyYIUkaN26cmjRpIh8fH4WEhGjQoEEOr5UkrV69Wh06dJCXl5dCQkL02GOPKTc31z7/l19+0R133CEvLy/ddNNNmjt3rsLCwjR+/HhJfE6Uh/J6rc+9VC8vL09PPPGEatSoIR8fH7Vt27bYa/rBBx+oUaNG8vDwUFhYmF577TWH+WFhYRo9erT+3//7f6pYsaJq1qypadOmXdLn51o2aNAgubm56YcfftD999+vBg0aqFatWrr77ru1ePFide/eXU888YTDH07Hjx8vm82mpUuX2tvCw8P19ttvS5LWr1+vzp07q2rVqvL391dUVJQ2btzosF2bzaa3335b//znP+Xt7a06derok08+cejzySefqE6dOvL09NQtt9yimTNnOnz3OPe7yMiRIxUREeGwjvHjxyssLOzSPFnljOB0lfH19ZWvr68++ugj5eXlldjnvvvu05EjR7RkyRJt2LBBLVq00G233aZff/1VkvThhx9q6NChevzxx7Vt2zYNGDBA8fHxWrlyZXnuCv6mnJwczZ49W+Hh4apSpcoFLTN48GDl5eXp66+/1tatW/Xyyy/L19dXklRY
WKgbb7xRCxYs0E8//aQRI0bomWee0fvvv++wjpUrV2r37t1auXKlZs6cqRkzZtg/xHF5zJw5Uz4+Pvr+++/1yiuv6Pnnn9eXX34p6c/XrWvXrlqzZo1mz56tn376SWPGjJGrq6t9+dOnT+vVV1/VrFmz9PXXX+vAgQN64okn7PPnzJmjESNG6KWXXlJ6erpGjx6t5557TjNnzpQk5ebmKiYmRgEBAVq/fr0WLFig5cuXKyEhwaFOjg3nuPXWW9WsWTMtWrRIkuTi4qL//ve/+vHHHzVz5kytWLFCw4YNs/ffvXu3unTpop49e2rLli2aP3++Vq9e7fB6xsbG6vDhw1q1apU++OADTZs2TUeOHCn3fYOjy/FanyshIUFr167Ve++9py1btui+++5Tly5dlJGRIUnasGGD7r//fj3wwAPaunWrRo4cqeeee67Yv/XXXntNrVq10qZNmzRo0CANHDhQO3bsuPRPyjXm+PHj+uKLLzR48GD5+PiU2MdmsykqKkqrV69WQUGBJOmrr75S1apV7SH30KFD2r17tzp16iRJOnXqlOLi4rR69Wp99913qlOnjrp166ZTp045rHvUqFG6//77tWXLFnXr1k29e/e2f3/cu3ev7r33XvXo0UObN2/WgAEDNHz48MvzRFypDK46CxcuNAEBAcbT09O0a9fOJCUlmc2bNxtjjPnmm2+Mn5+fOXv2rMMytWvXNm+++aYxxph27dqZfv36Ocy/7777TLdu3Ywxxuzdu9dIMps2bbLP/+2334wks3LlSmOMMStXrjSSzG+//XZ5dhLFxMXFGVdXV+Pj42N8fHyMJBMcHGw2bNhgjCn5Ndm0aZORZPbu3WuMMaZJkyZm5MiRF7zNwYMHm549ezrUEBoaav744w9723333Wd69er193YODuLi4szdd99tjDEmKirK/OMf/3CY37p1a/PUU08ZY4xZtmyZcXFxMTt27ChxXe+8846RZHbt2mVvmzx5sgkMDLRP165d28ydO9dhuRdeeMFERkYaY4yZNm2aCQgIMDk5Ofb5ixcvNi4uLiYzM9NeM8fG5fXX4+JcvXr1Mg0aNChx3oIFC0yVKlXs03379jX9+/d36PPNN98YFxcXc+bMGZOenm4kmfXr19vnZ2RkGEnm9ddfN8Zc2OcELl55vdbG/PkeM3ToUGOMMfv37zeurq7m0KFDDsvcdtttJikpyRhjzEMPPWQ6d+7sMP/JJ580DRs2tE+Hhoaahx9+2D5dWFhoqlevbqZMmXKevYYxxnz33XdGklm0aJFDe5UqVeyf/8OGDTO//fabcXFxMevXrzeFhYWmcuXKJiUlxbRt29YYY8zs2bNNjRo1St1OQUGBqVixovn000/tbZLMs88+a5/OyckxksySJUuMMcY89dRTpnHjxg7rGT58uMN3j3O/iyQnJ5tmzZo5LPP666+b0NDQsjwtVwzOOF2FevbsqcOHD+uTTz5Rly5dtGrVKrVo0UIzZszQ5s2blZOToypVqtjPTvn6+mrv3r3avXu3JCk9PV3t27d3WGf79u2Vnp7ujN1BGdxyyy1KS0tTWlqa1q1bp5iYGHXt2lX79++/oOUfe+wxvfjii2rfvr2Sk5OLDSwxefJktWzZUtWqVZOvr6+mTZumAwcOOPRp1KiRw9mM4OBg/hJ9mTVt2tRh+q/PeVpamm688UbVrVu31OW9vb1Vu3btEpfPzc3V7t271bdvX4f3jBdffNHhPaNZs2YOf/1s3769CgsLHf6CzLHhPMYY+z0Fy5cv12233aYaNWqoYsWKeuSRR3T8+HH7DeWbN2/WjBkzHF7vmJgYFRYWau/evdqxY4fc3NzUokUL+/rDw8MVEBDglH2Do0v5Wp9r69atKigoUN26dR2W+eqrryy/Q2RkZNjPfkiO71s2m01BQUG8H/wN69atU1pamho1aqS8vDxVqlRJzZo106pVq7R161a5u7urf//+2rRpk3JycvTVV18pKirKvnxW
Vpb69eunOnXqyN/fX35+fsrJySn2Gf/X183Hx0d+fn72123Hjh1q3bq1Q/82bdpcxr2+8rg5uwBcHE9PT3Xu3FmdO3fWc889p0cffVTJyckaNGiQgoODS7zG/EKHJHZx+TNPG2PsbdyncGXw8fFReHi4ffrtt9+Wv7+/3nrrLd1+++2Szv+6Pfroo4qJidHixYv1xRdfKCUlRa+99pqGDBmi9957T0888YRee+01RUZGqmLFiho7dqy+//57h3VUqFDBYdpmszGi12V2vufcy8vropYvOk6K7od466231LZtW4d+fw1Bf7dOXF7p6em66aabtG/fPt15550aOHCgXnrpJVWuXFmrV69W3759lZ+fL29vb+Xk5GjAgAF67LHHiq2nZs2a2rlzp+X2+Jxwnkv5Wp8rJydHrq6u2rBhQ7F//0WXdV8o3g8uTnh4uGw2W7HLGmvVqiXJ8T2/U6dOWrVqlTw8PBQVFaXKlSurQYMGWr16tb766is9/vjj9r5xcXE6fvy4JkyYoNDQUHl4eCgyMlL5+fkO27nUr5uLi4vD+4R0db9XEJyuEQ0bNtRHH32kFi1aKDMzU25ubqXeeNegQQOtWbNGcXFx9rY1a9aoYcOGkqRq1apJ+vPm4ObNm0uSww3AuHLYbDa5uLjozJkzDq9b0V+GS3rdQkJC9K9//Uv/+te/lJSUpLfeektDhgzRmjVr1K5dOw0aNMjet+gvjLhyNW3aVD///LN27tx53rNOpQkMDNQNN9ygPXv2qHfv3iX2adCggWbMmKHc3Fz7Wac1a9bIxcVF9erV+1v14+9bsWKFtm7dqv/85z/asGGDCgsL9dprr9nDzbn3KbZo0UI//fSTwx9h/qpevXr6448/tGnTJrVs2VKStGvXLv3222/2PnxOOMelfq3P1bx5cxUUFOjIkSPq0KFDiX2KvkP81Zo1a1S3bt0y/7EFxVWpUkWdO3fWpEmTNGTIkFLvc5KkqKgoTZ8+XW5uburSpYukP8PUvHnztHPnTvv9TdKfr9Ebb7yhbt26SZIOHjyoY8eOlam2evXq2QeUKrJ+/frzLlOtWjVlZmY6nCm9mt8ruFTvKnP8+HHdeuutmj17trZs2aK9e/dqwYIFeuWVV3T33XcrOjpakZGR6tGjh7744gvt27dP3377rYYPH64ffvhBkvTkk09qxowZmjJlijIyMjRu3DgtWrTIfrO4l5eXbr75Zo0ZM0bp6en66quv9Oyzzzpzt/H/y8vLU2ZmpjIzM5Wenq4hQ4YoJydH3bt3V3h4uEJCQjRy5EhlZGRo8eLFxUY6+ve//61ly5Zp79692rhxo1auXKkGDRpIkurUqaMffvhBy5Yt086dO/Xcc89ZviHC+aKiotSxY0f17NlTX375pfbu3aslS5Y4jKxkZdSoUUpJSdF///tf7dy5U1u3btU777yjcePGSZJ69+4tT09PxcXFadu2bVq5cqWGDBmiRx55RIGBgZdr11CCoveAQ4cOaePGjRo9erTuvvtu3XnnnYqNjVV4eLh+//13TZw4UXv27NGsWbM0depUh3U89dRT+vbbb5WQkKC0tDRlZGTo448/tg8YUL9+fUVHR6t///5at26dNm3apP79+8vLy8v+xYfPicuvPF7rc9WtW1e9e/dWbGysFi1apL1792rdunVKSUnR4sWLJUmPP/64UlNT9cILL2jnzp2aOXOmJk2a5DDgDP6eN954Q3/88YdatWql+fPnKz09XTt27NDs2bO1fft2e0Dt2LGjTp06pc8++8wekjp16qQ5c+YoODjY4Y9pderU0axZs5Senq7vv/9evXv3vqArFv5qwIAB2r59u5566int3LlT77//vn1QkNKGH+/UqZOOHj2qV155Rbt379bkyZO1ZMmSsj8pVwrn3V6Fi3H27Fnz9NNPmxYtWhh/f3/j7e1t6tWrZ5599llz+vRpY4wx2dnZZsiQIeaGG24wFSpUMCEhIaZ3797mwIED9vW8
8cYbplatWqZChQqmbt265t1333XYzk8//WQiIyONl5eXiYiIMF988QWDQzhZXFyckWR/VKxY0bRu3dosXLjQ3mf16tWmSZMmxtPT03To0MEsWLDAYXCIhIQEU7t2bePh4WGqVatmHnnkEXPs2DFjzJ/HVp8+fYy/v7+pVKmSGThwoHn66acdbuos6YbloUOHmqioqMu899eXcweHKLpxu8jdd99t4uLi7NPHjx838fHxpkqVKsbT09M0btzYfPbZZ8aYPweH8Pf3d1j+ww8/NOe+/c+ZM8dEREQYd3d3ExAQYDp27Ohwc/KWLVvMLbfcYjw9PU3lypVNv379zKlTp0qsuQjHxqX11/cANzc3U61aNRMdHW2mT59uCgoK7P3GjRtngoODjZeXl4mJiTHvvvtusffrdevWmc6dOxtfX1/j4+NjmjZtal566SX7/MOHD5uuXbsaDw8PExoaaubOnWuqV69upk6dau9j9TmBi1eer/W57zH5+flmxIgRJiwszFSoUMEEBwebf/7zn2bLli32PgsXLjQNGzY0FSpUMDVr1jRjx451qD80NNQ+kEiRZs2ameTk5Evy/FwPDh8+bBISEsxNN91kKlSoYHx9fU2bNm3M2LFjTW5urr1fs2bNTFBQkH36+PHjxmazmQceeMBhfRs3bjStWrUynp6epk6dOmbBggXFXidJ5sMPP3RYzt/f37zzzjv26Y8//tiEh4cbDw8P06lTJzNlyhQjyT7YSEnfD6dMmWJCQkKMj4+PiY2NNS+99NJVOziEzZhzLjwEAAD4i59//lkhISH2wQhw7YiMjNRtt92mF1980dml4Cr00ksvaerUqTp48KCkP38zsGvXrjp79qzc3d2dXN2lxz1OAADAwYoVK5STk6MmTZrol19+0bBhwxQWFqaOHTs6uzRcInl5edq6dat+/PHHEgePAEryxhtvqHXr1qpSpYrWrFmjsWPH2i/9zMrK0scff6w6depck6FJIjgBAIBz/P7773rmmWe0Z88eVaxYUe3atdOcOXOKjbiFq9eSJUsUGxuru+66S/fee6+zy8FVIiMjQy+++KJ+/fVX1axZU48//riSkpIkyf6Dum+88YaTq7x8uFQPAAAAACwwqh4AAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWCA4AQAAAIAFghMAAAAAWPj/AJ3jGjFFTxgpAAAAAElFTkSuQmCC",
1006
+ "text/plain": [
1007
+ "<Figure size 1000x500 with 1 Axes>"
1008
+ ]
1009
+ },
1010
+ "metadata": {},
1011
+ "output_type": "display_data"
1012
+ }
1013
+ ],
1014
+ "source": [
1015
+ "import matplotlib.pyplot as plt\n",
1016
+ "import seaborn as sns\n",
1017
+ "\n",
1018
+ "plt.figure(figsize=(10,5))\n",
1019
+ "sns.barplot(x=['Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejeon', 'Gwangju'], y=voting)\n",
1020
+ "plt.title('CSI Score of Voting Model')\n",
1021
+ "plt.ylabel('CSI')\n",
1022
+ "\n",
1023
+ "plt.show()"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "cell_type": "code",
1028
+ "execution_count": 24,
1029
+ "metadata": {},
1030
+ "outputs": [
1031
+ {
1032
+ "data": {
1033
+ "text/plain": [
1034
+ "0.4078715882283252"
1035
+ ]
1036
+ },
1037
+ "execution_count": 24,
1038
+ "metadata": {},
1039
+ "output_type": "execute_result"
1040
+ }
1041
+ ],
1042
+ "source": [
1043
+ "np.mean(voting)"
1044
+ ]
1045
+ },
1046
+ {
1047
+ "cell_type": "code",
1048
+ "execution_count": 25,
1049
+ "metadata": {},
1050
+ "outputs": [
1051
+ {
1052
+ "data": {
1053
+ "text/plain": [
1054
+ "[0.3248062015503624,\n",
1055
+ " 0.46608315098458075,\n",
1056
+ " 0.5763157894736463,\n",
1057
+ " 0.2852112676055334,\n",
1058
+ " 0.31884057971011603,\n",
1059
+ " 0.4759725400457121]"
1060
+ ]
1061
+ },
1062
+ "execution_count": 25,
1063
+ "metadata": {},
1064
+ "output_type": "execute_result"
1065
+ }
1066
+ ],
1067
+ "source": [
1068
+ "voting"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "cell_type": "code",
1073
+ "execution_count": 26,
1074
+ "metadata": {},
1075
+ "outputs": [
1076
+ {
1077
+ "data": {
1078
+ "text/plain": [
1079
+ "[0.5106142349456536,\n",
1080
+ " 0.640202275543952,\n",
1081
+ " 0.709448778435959,\n",
1082
+ " 0.45579515959653394,\n",
1083
+ " 0.453960121993875,\n",
1084
+ " 0.6218724605270242]"
1085
+ ]
1086
+ },
1087
+ "execution_count": 26,
1088
+ "metadata": {},
1089
+ "output_type": "execute_result"
1090
+ }
1091
+ ],
1092
+ "source": [
1093
+ "mcc"
1094
+ ]
1095
+ },
1096
+ {
1097
+ "cell_type": "code",
1098
+ "execution_count": 27,
1099
+ "metadata": {},
1100
+ "outputs": [
1101
+ {
1102
+ "data": {
1103
+ "text/plain": [
1104
+ "[0.9005707762557078,\n",
1105
+ " 0.9721461187214612,\n",
1106
+ " 0.9264840182648402,\n",
1107
+ " 0.9768264840182649,\n",
1108
+ " 0.9141552511415525,\n",
1109
+ " 0.9477168949771689]"
1110
+ ]
1111
+ },
1112
+ "execution_count": 27,
1113
+ "metadata": {},
1114
+ "output_type": "execute_result"
1115
+ }
1116
+ ],
1117
+ "source": [
1118
+ "accuracy"
1119
+ ]
1120
+ }
1121
+ ],
1122
+ "metadata": {
1123
+ "kernelspec": {
1124
+ "display_name": "Python 3",
1125
+ "language": "python",
1126
+ "name": "python3"
1127
+ },
1128
+ "language_info": {
1129
+ "codemirror_mode": {
1130
+ "name": "ipython",
1131
+ "version": 3
1132
+ },
1133
+ "file_extension": ".py",
1134
+ "mimetype": "text/x-python",
1135
+ "name": "python",
1136
+ "nbconvert_exporter": "python",
1137
+ "pygments_lexer": "ipython3",
1138
+ "version": "3.8.10"
1139
+ }
1140
+ },
1141
+ "nbformat": 4,
1142
+ "nbformat_minor": 2
1143
+ }
Analysis_code/find_reason/ busan_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/ daegu_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/ gwangju_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/ incheon_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/ seoul_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/busan.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/daegu.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/daejeon.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/daejeon_trend.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/gwangju.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/incheon.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/find_reason/seoul.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Analysis_code/ft_transformer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ # FT-Transformer Implementation
5
class FTTransformer(nn.Module):
    """Transformer-based classifier for tabular data.

    Each categorical column is embedded into its own ``d_token``-sized token.
    The numerical columns are projected together into a single ``d_token``
    vector that is added to every categorical token. The resulting token
    sequence goes through ``n_blocks`` transformer encoder layers, is
    mean-pooled over the token axis and mapped to class scores.

    Args:
        num_features: Number of numerical input columns.
        cat_cardinalities: Iterable with the number of categories of each
            categorical column (one embedding table per entry). May be empty.
        num_classes: Number of target classes; must be >= 2.
        d_token: Token / model width. Must be divisible by the 8 attention heads.
        n_blocks: Number of stacked encoder layers.
        attention_dropout: Dropout rate passed to every encoder layer.
        ffn_dropout: Dropout rate applied to the pooled representation.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.
    """

    def __init__(self, num_features, cat_cardinalities, num_classes, d_token=192, n_blocks=6, attention_dropout=0.2, ffn_dropout=0.2):
        super(FTTransformer, self).__init__()

        # Fail early: previously num_classes < 2 left `output_layer` undefined
        # and only surfaced as an AttributeError inside forward().
        if num_classes < 2:
            raise ValueError(f"num_classes must be >= 2, got {num_classes}")

        self.num_classes = num_classes  # stored number of classes

        # Embedding layers for categorical features (one table per column)
        self.cat_embeddings = nn.ModuleList([
            nn.Embedding(num_categories, d_token) for num_categories in cat_cardinalities
        ])

        # Linear layer projecting all numerical features to one token
        self.num_linear = nn.Linear(num_features, d_token)

        # Transformer blocks.
        # batch_first=True is required because forward() builds tensors shaped
        # (batch, tokens, d_token). With the default (batch_first=False) the
        # layer would attend across the samples of a batch instead of across
        # the feature tokens, making predictions depend on batch composition.
        # NOTE(review): parameter names/shapes are unchanged, so old
        # checkpoints still load, but their outputs will differ from before.
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_token,
                nhead=8,
                dim_feedforward=4 * d_token,
                dropout=attention_dropout,
                activation='gelu',
                batch_first=True
            ) for _ in range(n_blocks)
        ])

        self.ffn_dropout = nn.Dropout(ffn_dropout)
        if num_classes == 2:
            self.output_layer = nn.Linear(d_token, 1)  # Binary classification: single score
        else:
            self.output_layer = nn.Linear(d_token, num_classes)  # Multi-class: one score per class

    def forward(self, x_num, x_cat):
        """Compute raw (un-activated) class scores.

        Args:
            x_num: Float tensor of shape ``(batch, num_features)``.
            x_cat: Integer index tensor of shape ``(batch, n_categorical)``;
                column ``i`` indexes the ``i``-th embedding table.

        Returns:
            Tensor of shape ``(batch, 1)`` when ``num_classes == 2``,
            otherwise ``(batch, num_classes)``.
        """
        # Numerical feature embedding -> (batch, d_token)
        x_num = self.num_linear(x_num)

        if len(self.cat_embeddings) > 0:
            # Categorical feature embedding -> (batch, n_categorical, d_token)
            cat_tokens = torch.stack(
                [embed(x_cat[:, i]) for i, embed in enumerate(self.cat_embeddings)],
                dim=1,
            )
            # Combine: broadcast the numerical token onto every categorical token
            x = x_num.unsqueeze(1) + cat_tokens
        else:
            # No categorical columns: torch.stack([]) would raise, so use the
            # numerical token alone as a length-1 sequence.
            x = x_num.unsqueeze(1)

        # Pass through transformer blocks
        for block in self.transformer_blocks:
            x = block(x)

        # Mean-pool over the token axis, then classify
        x = x.mean(dim=1)
        x = self.ffn_dropout(x)
        x = self.output_layer(x)

        return x
Analysis_code/make_oversample_data/gan_sample_10000_1.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from imblearn.over_sampling import SMOTENC
5
+ import optuna
6
+ from ctgan import CTGAN
7
+ import torch
8
+ import warnings
9
+
10
import os  # needed below to create the output directory before saving

# Per-region input (training CSV) and output (oversampled CSV) paths
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan10000/ctgan10000_1_{region}.csv' for region in regions]

# Report which device is available.
# NOTE(review): `device` is only printed; it is never passed to CTGAN below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence Optuna's UserWarnings about the tuple-valued categorical choices
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def _build_ctgan(params):
    """Create a CTGAN synthesizer from a hyper-parameter dict."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"]
    )


def _tune_ctgan(class_data, discrete_columns, embedding_range, layer_dims,
                batch_sizes, max_discriminator_steps, n_trials):
    """Search CTGAN hyper-parameters with Optuna for one class and return the best ones.

    Each trial fits a CTGAN on `class_data`, samples twice as many rows as
    `class_data` has, and is scored by the negated sum of squared differences
    between the real and generated mean and standard deviation of `visi`.
    """
    def objective(trial):
        # Suggest order is kept identical to the original script.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", embedding_range[0], embedding_range[1]),
            "generator_dim": trial.suggest_categorical("generator_dim", layer_dims),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", layer_dims),
            "pac": trial.suggest_categorical("pac", [4, 8]),
            "batch_size": trial.suggest_categorical("batch_size", batch_sizes),
            "discriminator_steps": trial.suggest_int("discriminator_steps", 1, max_discriminator_steps),
        }

        ctgan = _build_ctgan(params)
        ctgan.fit(class_data, discrete_columns=discrete_columns)
        generated_data = ctgan.sample(len(class_data) * 2)

        # Compare the distribution of the continuous target `visi`
        real_visi = class_data['visi']
        generated_visi = generated_data['visi']
        mse = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -mse  # the study maximizes, so negate the error

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params


# Process every region
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data and keep only the selected years.
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the frame returned by read_csv.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin([2018,2019]),:].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop derived columns; they are recomputed after oversampling
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Column positions treated as categorical by SMOTENC (every non-float64 column)
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Target class sizes for SMOTENC
    count_class_0 = (y == 0).sum()
    count_class_1 = (y == 1).sum()
    count_class_2 = (y == 2).sum()
    class_1_target = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        0: 500 if count_class_0 <= 500 else 1000,
        1: class_1_target,
        2: count_class_2
    }

    # Apply SMOTENC
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Assemble the resampled frame
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Column names treated as discrete by CTGAN (every non-float64 column)
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune, refit with the best hyper-parameters, then sample
    data_0 = lerp_data[lerp_data['multi_class'] == 0]
    best_params = _tune_ctgan(
        data_0, categorical_features,
        embedding_range=(64, 128),
        layer_dims=[(64, 64), (128, 128)],
        batch_sizes=[64, 128, 256],
        max_discriminator_steps=3,
        n_trials=50,
    )
    ctgan = _build_ctgan(best_params)
    ctgan.fit(data_0, discrete_columns=categorical_features)
    generated_0 = ctgan.sample(10000)

    # Class 1: tune over a larger search space, refit, then sample
    data_1 = lerp_data[lerp_data['multi_class'] == 1]
    best_params_class1 = _tune_ctgan(
        data_1, categorical_features,
        embedding_range=(128, 512),
        layer_dims=[(128, 128), (256, 256)],
        batch_sizes=[256, 512, 1024],
        max_discriminator_steps=5,
        n_trials=30,
    )
    ctgan = _build_ctgan(best_params_class1)
    ctgan.fit(data_1, discrete_columns=categorical_features)
    generated_1 = ctgan.sample(10000 - class_1_target)

    # Keep only generated rows whose `visi` lies in the range filtered for
    # each class here, then merge them with the SMOTENC output
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before oversampling
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Classes 0/1 come from the oversampled data; class 2 rows are the untouched originals
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result; create the target directory first so a missing folder
    # cannot discard the result of the long tuning run
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_10000_2.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Oversample the minority classes of each region's training set with SMOTENC + CTGAN.

For every region the script:
  1. loads the training CSV and keeps only the years in TRAIN_YEARS,
  2. lightly oversamples classes 0 and 1 with SMOTENC,
  3. tunes one CTGAN per minority class with Optuna and samples synthetic rows,
  4. keeps synthetic rows whose 'visi' lies in the class range used below
     ([0, 100) for class 0, [100, 500) for class 1),
  5. re-derives the helper columns and writes the result next to the untouched
     class-2 rows of the filtered input.
"""
import pandas as pd
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# Per-region input / output file paths.
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan10000/ctgan10000_2_{region}.csv' for region in regions]

# Years kept from the training file, and the number of rows requested per class.
TRAIN_YEARS = [2018, 2020]
TARGET_SAMPLES = 10000

# Optuna search spaces and trial budgets for the two minority classes.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "max_discriminator_steps": 3,
}
CLASS0_TRIALS = 50
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "max_discriminator_steps": 5,
}
CLASS1_TRIALS = 30

# Report the visible device (informational only: `device` is not passed to CTGAN).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings from optuna.distributions
# (presumably triggered by the tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def _build_ctgan(params):
    """Create a CTGAN from a dict holding the six tuned hyperparameters."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )


def _tune_and_sample(real, discrete_columns, space, n_trials, n_samples):
    """Tune a CTGAN on `real` with Optuna, refit with the best parameters and sample.

    Args:
        real: DataFrame with the rows of a single class (must contain 'visi').
        discrete_columns: column names CTGAN should treat as discrete.
        space: search-space dict (see CLASS0_SPACE / CLASS1_SPACE).
        n_trials: number of Optuna trials.
        n_samples: number of synthetic rows to draw.

    Returns:
        DataFrame of synthetic rows; an empty frame with the columns of `real`
        when `n_samples` is not positive.
    """
    # Nothing to generate: skip the (expensive) tuning and avoid handing a
    # zero/negative row count to CTGAN.sample.
    if n_samples <= 0:
        return real.iloc[0:0]

    real_visi = real['visi']

    def objective(trial):
        # Suggest order is kept identical to the original script.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
            "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
            "pac": trial.suggest_categorical("pac", [4, 8]),
            "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
            "discriminator_steps": trial.suggest_int(
                "discriminator_steps", 1, space["max_discriminator_steps"]),
        }
        model = _build_ctgan(params)
        model.fit(real, discrete_columns=discrete_columns)
        fake_visi = model.sample(len(real) * 2)['visi']

        # Score: squared gap between the mean and the standard deviation of the
        # real and generated 'visi'; negated because the study maximizes.
        gap = ((real_visi.mean() - fake_visi.mean())**2
               + (real_visi.std() - fake_visi.std())**2)
        return -gap

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Final fit with the best hyperparameters found.
    model = _build_ctgan(study.best_params)
    model.fit(real, discrete_columns=discrete_columns)
    return model.sample(n_samples)


# Process each region.
for file_path, output_path in zip(file_paths, output_paths):
    # Create the output directory up front so a missing folder fails fast
    # instead of after the whole tuning run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the data; copy the filtered slice so the casts below modify our own frame.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    # Drop the targets and the derived columns (the latter are rebuilt after sampling).
    X = data.drop(columns=['multi_class', 'binary_class',
                           'ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'])
    y = data['multi_class']

    # Positions of the columns SMOTENC treats as categorical (every non-float64 column).
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class target counts for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    target_class_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # Never request fewer rows than already exist: an over-sampler cannot
        # shrink a class, so regions with more than 1000 class-0 rows keep their count.
        0: max(count_class_0, 500 if count_class_0 <= 500 else 1000),
        1: target_class_1,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled frame including the target.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the columns CTGAN treats as discrete (every non-float64 column).
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune, refit and generate.
    generated_0 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 0], categorical_features,
        CLASS0_SPACE, CLASS0_TRIALS, TARGET_SAMPLES)

    # Class 1: tune, refit and generate only the rows missing to reach TARGET_SAMPLES.
    generated_1 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 1], categorical_features,
        CLASS1_SPACE, CLASS1_TRIALS, TARGET_SAMPLES - target_class_1)

    # Keep only synthetic rows whose 'visi' falls inside the class range, then merge.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before resampling.
    smote_gan_data['binary_class'] = (smote_gan_data['multi_class'] != 2).astype(int)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Minority classes come from the augmented frame, class 2 from the filtered input.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the final per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_10000_3.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Oversample the minority classes of each region's training set with SMOTENC + CTGAN.

For every region the script:
  1. loads the training CSV and keeps only the years in TRAIN_YEARS,
  2. lightly oversamples classes 0 and 1 with SMOTENC,
  3. tunes one CTGAN per minority class with Optuna and samples synthetic rows,
  4. keeps synthetic rows whose 'visi' lies in the class range used below
     ([0, 100) for class 0, [100, 500) for class 1),
  5. re-derives the helper columns and writes the result next to the untouched
     class-2 rows of the filtered input.
"""
import pandas as pd
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# Per-region input / output file paths.
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan10000/ctgan10000_3_{region}.csv' for region in regions]

# Years kept from the training file, and the number of rows requested per class.
TRAIN_YEARS = [2019, 2020]
TARGET_SAMPLES = 10000

# Optuna search spaces and trial budgets for the two minority classes.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "max_discriminator_steps": 3,
}
CLASS0_TRIALS = 50
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "max_discriminator_steps": 5,
}
CLASS1_TRIALS = 30

# Report the visible device (informational only: `device` is not passed to CTGAN).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings from optuna.distributions
# (presumably triggered by the tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def _build_ctgan(params):
    """Create a CTGAN from a dict holding the six tuned hyperparameters."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )


def _tune_and_sample(real, discrete_columns, space, n_trials, n_samples):
    """Tune a CTGAN on `real` with Optuna, refit with the best parameters and sample.

    Args:
        real: DataFrame with the rows of a single class (must contain 'visi').
        discrete_columns: column names CTGAN should treat as discrete.
        space: search-space dict (see CLASS0_SPACE / CLASS1_SPACE).
        n_trials: number of Optuna trials.
        n_samples: number of synthetic rows to draw.

    Returns:
        DataFrame of synthetic rows; an empty frame with the columns of `real`
        when `n_samples` is not positive.
    """
    # Nothing to generate: skip the (expensive) tuning and avoid handing a
    # zero/negative row count to CTGAN.sample.
    if n_samples <= 0:
        return real.iloc[0:0]

    real_visi = real['visi']

    def objective(trial):
        # Suggest order is kept identical to the original script.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
            "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
            "pac": trial.suggest_categorical("pac", [4, 8]),
            "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
            "discriminator_steps": trial.suggest_int(
                "discriminator_steps", 1, space["max_discriminator_steps"]),
        }
        model = _build_ctgan(params)
        model.fit(real, discrete_columns=discrete_columns)
        fake_visi = model.sample(len(real) * 2)['visi']

        # Score: squared gap between the mean and the standard deviation of the
        # real and generated 'visi'; negated because the study maximizes.
        gap = ((real_visi.mean() - fake_visi.mean())**2
               + (real_visi.std() - fake_visi.std())**2)
        return -gap

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Final fit with the best hyperparameters found.
    model = _build_ctgan(study.best_params)
    model.fit(real, discrete_columns=discrete_columns)
    return model.sample(n_samples)


# Process each region.
for file_path, output_path in zip(file_paths, output_paths):
    # Create the output directory up front so a missing folder fails fast
    # instead of after the whole tuning run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the data; copy the filtered slice so the casts below modify our own frame.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    # Drop the targets and the derived columns (the latter are rebuilt after sampling).
    X = data.drop(columns=['multi_class', 'binary_class',
                           'ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'])
    y = data['multi_class']

    # Positions of the columns SMOTENC treats as categorical (every non-float64 column).
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class target counts for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    target_class_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # Never request fewer rows than already exist: an over-sampler cannot
        # shrink a class, so regions with more than 1000 class-0 rows keep their count.
        0: max(count_class_0, 500 if count_class_0 <= 500 else 1000),
        1: target_class_1,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled frame including the target.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the columns CTGAN treats as discrete (every non-float64 column).
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune, refit and generate.
    generated_0 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 0], categorical_features,
        CLASS0_SPACE, CLASS0_TRIALS, TARGET_SAMPLES)

    # Class 1: tune, refit and generate only the rows missing to reach TARGET_SAMPLES.
    generated_1 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 1], categorical_features,
        CLASS1_SPACE, CLASS1_TRIALS, TARGET_SAMPLES - target_class_1)

    # Keep only synthetic rows whose 'visi' falls inside the class range, then merge.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before resampling.
    smote_gan_data['binary_class'] = (smote_gan_data['multi_class'] != 2).astype(int)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Minority classes come from the augmented frame, class 2 from the filtered input.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the final per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_20000_1.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Oversample the minority classes of each region's training set with SMOTENC + CTGAN.

For every region the script:
  1. loads the training CSV and keeps only the years in TRAIN_YEARS,
  2. lightly oversamples classes 0 and 1 with SMOTENC,
  3. tunes one CTGAN per minority class with Optuna and samples synthetic rows,
  4. keeps synthetic rows whose 'visi' lies in the class range used below
     ([0, 100) for class 0, [100, 500) for class 1),
  5. re-derives the helper columns and writes the result next to the untouched
     class-2 rows of the filtered input.
"""
import pandas as pd
import numpy as np
import os  # needed for creating the output directory; GPU pinning below stays disabled
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# Per-region input / output file paths.
# regions = ['busan', 'daegu', 'daejeon', 'gwangju', 'incheon', 'seoul']
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan20000/ctgan20000_1_{region}.csv' for region in regions]

# Years kept from the training file, and the number of rows requested per class.
TRAIN_YEARS = [2018, 2019]
TARGET_SAMPLES = 20000

# Optuna search spaces and trial budgets for the two minority classes.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "max_discriminator_steps": 3,
}
CLASS0_TRIALS = 50
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "max_discriminator_steps": 5,
}
CLASS1_TRIALS = 30

# Report the visible device (informational only: `device` is not passed to CTGAN).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings from optuna.distributions
# (presumably triggered by the tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def _build_ctgan(params):
    """Create a CTGAN from a dict holding the six tuned hyperparameters."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )


def _tune_and_sample(real, discrete_columns, space, n_trials, n_samples):
    """Tune a CTGAN on `real` with Optuna, refit with the best parameters and sample.

    Args:
        real: DataFrame with the rows of a single class (must contain 'visi').
        discrete_columns: column names CTGAN should treat as discrete.
        space: search-space dict (see CLASS0_SPACE / CLASS1_SPACE).
        n_trials: number of Optuna trials.
        n_samples: number of synthetic rows to draw.

    Returns:
        DataFrame of synthetic rows; an empty frame with the columns of `real`
        when `n_samples` is not positive.
    """
    # Nothing to generate: skip the (expensive) tuning and avoid handing a
    # zero/negative row count to CTGAN.sample.
    if n_samples <= 0:
        return real.iloc[0:0]

    real_visi = real['visi']

    def objective(trial):
        # Suggest order is kept identical to the original script.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
            "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
            "pac": trial.suggest_categorical("pac", [4, 8]),
            "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
            "discriminator_steps": trial.suggest_int(
                "discriminator_steps", 1, space["max_discriminator_steps"]),
        }
        model = _build_ctgan(params)
        model.fit(real, discrete_columns=discrete_columns)
        fake_visi = model.sample(len(real) * 2)['visi']

        # Score: squared gap between the mean and the standard deviation of the
        # real and generated 'visi'; negated because the study maximizes.
        gap = ((real_visi.mean() - fake_visi.mean())**2
               + (real_visi.std() - fake_visi.std())**2)
        return -gap

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Final fit with the best hyperparameters found.
    model = _build_ctgan(study.best_params)
    model.fit(real, discrete_columns=discrete_columns)
    return model.sample(n_samples)


# Process each region.
for file_path, output_path in zip(file_paths, output_paths):
    # Create the output directory up front so a missing folder fails fast
    # instead of after the whole tuning run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the data; copy the filtered slice so the casts below modify our own frame.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    # Drop the targets and the derived columns (the latter are rebuilt after sampling).
    X = data.drop(columns=['multi_class', 'binary_class',
                           'ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'])
    y = data['multi_class']

    # Positions of the columns SMOTENC treats as categorical (every non-float64 column).
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class target counts for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    target_class_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # Never request fewer rows than already exist: an over-sampler cannot
        # shrink a class, so regions with more than 1000 class-0 rows keep their count.
        0: max(count_class_0, 500 if count_class_0 <= 500 else 1000),
        1: target_class_1,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled frame including the target.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the columns CTGAN treats as discrete (every non-float64 column).
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune, refit and generate.
    generated_0 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 0], categorical_features,
        CLASS0_SPACE, CLASS0_TRIALS, TARGET_SAMPLES)

    # Class 1: tune, refit and generate only the rows missing to reach TARGET_SAMPLES.
    generated_1 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 1], categorical_features,
        CLASS1_SPACE, CLASS1_TRIALS, TARGET_SAMPLES - target_class_1)

    # Keep only synthetic rows whose 'visi' falls inside the class range, then merge.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before resampling.
    smote_gan_data['binary_class'] = (smote_gan_data['multi_class'] != 2).astype(int)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Minority classes come from the augmented frame, class 2 from the filtered input.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the final per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_20000_2.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Oversample the minority classes of each region's training set with SMOTENC + CTGAN.

For every region the script:
  1. loads the training CSV and keeps only the years in TRAIN_YEARS,
  2. lightly oversamples classes 0 and 1 with SMOTENC,
  3. tunes one CTGAN per minority class with Optuna and samples synthetic rows,
  4. keeps synthetic rows whose 'visi' lies in the class range used below
     ([0, 100) for class 0, [100, 500) for class 1),
  5. re-derives the helper columns and writes the result next to the untouched
     class-2 rows of the filtered input.
"""
import pandas as pd
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from imblearn.over_sampling import SMOTENC
import optuna
from ctgan import CTGAN
import torch
import warnings

# Per-region input / output file paths.
# regions = ['busan', 'daegu', 'daejeon', 'gwangju', 'incheon', 'seoul']
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan20000/ctgan20000_2_{region}.csv' for region in regions]

# Years kept from the training file, and the number of rows requested per class.
TRAIN_YEARS = [2018, 2020]
TARGET_SAMPLES = 20000

# Optuna search spaces and trial budgets for the two minority classes.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "max_discriminator_steps": 3,
}
CLASS0_TRIALS = 50
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "max_discriminator_steps": 5,
}
CLASS1_TRIALS = 30

# Report the visible device (informational only: `device` is not passed to CTGAN).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings from optuna.distributions
# (presumably triggered by the tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def _build_ctgan(params):
    """Create a CTGAN from a dict holding the six tuned hyperparameters."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )


def _tune_and_sample(real, discrete_columns, space, n_trials, n_samples):
    """Tune a CTGAN on `real` with Optuna, refit with the best parameters and sample.

    Args:
        real: DataFrame with the rows of a single class (must contain 'visi').
        discrete_columns: column names CTGAN should treat as discrete.
        space: search-space dict (see CLASS0_SPACE / CLASS1_SPACE).
        n_trials: number of Optuna trials.
        n_samples: number of synthetic rows to draw.

    Returns:
        DataFrame of synthetic rows; an empty frame with the columns of `real`
        when `n_samples` is not positive.
    """
    # Nothing to generate: skip the (expensive) tuning and avoid handing a
    # zero/negative row count to CTGAN.sample.
    if n_samples <= 0:
        return real.iloc[0:0]

    real_visi = real['visi']

    def objective(trial):
        # Suggest order is kept identical to the original script.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
            "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
            "pac": trial.suggest_categorical("pac", [4, 8]),
            "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
            "discriminator_steps": trial.suggest_int(
                "discriminator_steps", 1, space["max_discriminator_steps"]),
        }
        model = _build_ctgan(params)
        model.fit(real, discrete_columns=discrete_columns)
        fake_visi = model.sample(len(real) * 2)['visi']

        # Score: squared gap between the mean and the standard deviation of the
        # real and generated 'visi'; negated because the study maximizes.
        gap = ((real_visi.mean() - fake_visi.mean())**2
               + (real_visi.std() - fake_visi.std())**2)
        return -gap

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Final fit with the best hyperparameters found.
    model = _build_ctgan(study.best_params)
    model.fit(real, discrete_columns=discrete_columns)
    return model.sample(n_samples)


# Process each region.
for file_path, output_path in zip(file_paths, output_paths):
    # Create the output directory up front so a missing folder fails fast
    # instead of after the whole tuning run.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the data; copy the filtered slice so the casts below modify our own frame.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')

    # Drop the targets and the derived columns (the latter are rebuilt after sampling).
    X = data.drop(columns=['multi_class', 'binary_class',
                           'ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'])
    y = data['multi_class']

    # Positions of the columns SMOTENC treats as categorical (every non-float64 column).
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class target counts for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    target_class_1 = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # Never request fewer rows than already exist: an over-sampler cannot
        # shrink a class, so regions with more than 1000 class-0 rows keep their count.
        0: max(count_class_0, 500 if count_class_0 <= 500 else 1000),
        1: target_class_1,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled frame including the target.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the columns CTGAN treats as discrete (every non-float64 column).
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune, refit and generate.
    generated_0 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 0], categorical_features,
        CLASS0_SPACE, CLASS0_TRIALS, TARGET_SAMPLES)

    # Class 1: tune, refit and generate only the rows missing to reach TARGET_SAMPLES.
    generated_1 = _tune_and_sample(
        lerp_data[lerp_data['multi_class'] == 1], categorical_features,
        CLASS1_SPACE, CLASS1_TRIALS, TARGET_SAMPLES - target_class_1)

    # Keep only synthetic rows whose 'visi' falls inside the class range, then merge.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before resampling.
    smote_gan_data['binary_class'] = (smote_gan_data['multi_class'] != 2).astype(int)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Minority classes come from the augmented frame, class 2 from the filtered input.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the final per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_20000_3.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
# Per-region input and output CSV locations.
# regions = ['busan', 'daegu', 'daejeon', 'gwangju', 'incheon', 'seoul']
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan20000/ctgan20000_3_{region}.csv' for region in regions]

# Report whether torch can see a GPU (informational only; `device` is not passed on).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings raised from optuna.distributions
# (presumably the warning about tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

# Years of the training CSV used by this script variant.
TRAIN_YEARS = [2019, 2020]
# Class 0 requests this many CTGAN rows; class 1 requests this minus its post-SMOTE size.
N_SAMPLES = 20000

# Optuna search spaces: (low, high) for integer ranges, lists for categorical choices.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "discriminator_steps": (1, 3),
}
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "discriminator_steps": (1, 5),
}


def _suggest_params(trial, space):
    """Draw one CTGAN hyper-parameter set for `trial` from `space`.

    The suggest calls are issued in a fixed order (dict literals evaluate
    top to bottom), matching the order used by the original objectives.
    """
    return {
        "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
        "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
        "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
        "pac": trial.suggest_categorical("pac", [4, 8]),
        "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
        "discriminator_steps": trial.suggest_int("discriminator_steps", *space["discriminator_steps"]),
    }


def _fit_ctgan(train_df, params, discrete_columns):
    """Build a CTGAN from `params`, fit it on `train_df` and return the fitted model."""
    model = CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )
    model.fit(train_df, discrete_columns=discrete_columns)
    return model


def _make_objective(train_df, discrete_columns, space):
    """Return an Optuna objective that scores CTGAN settings on `train_df`.

    The score is the negated sum of the squared differences between the real
    and generated `visi` mean and standard deviation, so larger is better.
    """
    def _objective(trial):
        model = _fit_ctgan(train_df, _suggest_params(trial, space), discrete_columns)
        generated = model.sample(len(train_df) * 2)
        real_visi = train_df['visi']
        generated_visi = generated['visi']
        gap = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -gap
    return _objective


# Process every region independently.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data; .copy() makes the year-filtered frame independent so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop the derived columns; they are recomputed after oversampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, treated as categorical by SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class targets for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    class1_target = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # An over-sampling target below the current count makes imblearn raise,
        # so a class that already exceeds 1000 rows is left at its own size.
        0: 500 if count_class_0 <= 500 else max(1000, count_class_0),
        1: class1_target,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Re-attach the label to the resampled features.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the non-float64 columns, passed to CTGAN as discrete columns.
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # ----- class 0: tune, refit with the best parameters, sample -----
    class0_train = lerp_data[lerp_data['multi_class'] == 0]
    objective = _make_objective(class0_train, categorical_features, CLASS0_SPACE)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    ctgan = _fit_ctgan(class0_train, best_params, categorical_features)
    generated_0 = ctgan.sample(N_SAMPLES)

    # ----- class 1: same procedure with a larger search space -----
    class1_train = lerp_data[lerp_data['multi_class'] == 1]
    n_class1_to_generate = N_SAMPLES - class1_target
    if n_class1_to_generate > 0:
        objective_class1 = _make_objective(class1_train, categorical_features, CLASS1_SPACE)
        study_class1 = optuna.create_study(direction="maximize")
        study_class1.optimize(objective_class1, n_trials=30)
        best_params_class1 = study_class1.best_params

        ctgan = _fit_ctgan(class1_train, best_params_class1, categorical_features)
        generated_1 = ctgan.sample(n_class1_to_generate)
    else:
        # Class 1 already reaches the target after SMOTENC; requesting a
        # non-positive number of rows from CTGAN is not meaningful, so skip it.
        generated_1 = class1_train.iloc[0:0]

    # Keep only synthetic rows whose `visi` falls in the range used for their class:
    # [0, 100) for the class-0 generator and [100, 500) for the class-1 generator.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before oversampling.
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Classes 0 and 1 come from the oversampled frame; class 2 is taken from the original rows.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_7000_1.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from imblearn.over_sampling import SMOTENC
4
+ import optuna
5
+ from ctgan import CTGAN
6
+ import torch
7
+ import warnings
8
+
# Per-region input and output CSV locations.
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan7000/ctgan7000_1_{region}.csv' for region in regions]

# Report whether torch can see a GPU (informational only; `device` is not passed on).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings raised from optuna.distributions
# (presumably the warning about tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

# Years of the training CSV used by this script variant.
TRAIN_YEARS = [2018, 2019]
# Class 0 requests this many CTGAN rows; class 1 requests this minus its post-SMOTE size.
N_SAMPLES = 7000

# Optuna search spaces: (low, high) for integer ranges, lists for categorical choices.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "discriminator_steps": (1, 3),
}
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "discriminator_steps": (1, 5),
}


def _suggest_params(trial, space):
    """Draw one CTGAN hyper-parameter set for `trial` from `space`.

    The suggest calls are issued in a fixed order (dict literals evaluate
    top to bottom), matching the order used by the original objectives.
    """
    return {
        "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
        "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
        "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
        "pac": trial.suggest_categorical("pac", [4, 8]),
        "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
        "discriminator_steps": trial.suggest_int("discriminator_steps", *space["discriminator_steps"]),
    }


def _fit_ctgan(train_df, params, discrete_columns):
    """Build a CTGAN from `params`, fit it on `train_df` and return the fitted model."""
    model = CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )
    model.fit(train_df, discrete_columns=discrete_columns)
    return model


def _make_objective(train_df, discrete_columns, space):
    """Return an Optuna objective that scores CTGAN settings on `train_df`.

    The score is the negated sum of the squared differences between the real
    and generated `visi` mean and standard deviation, so larger is better.
    """
    def _objective(trial):
        model = _fit_ctgan(train_df, _suggest_params(trial, space), discrete_columns)
        generated = model.sample(len(train_df) * 2)
        real_visi = train_df['visi']
        generated_visi = generated['visi']
        gap = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -gap
    return _objective


# Process every region independently.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data; .copy() makes the year-filtered frame independent so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop the derived columns; they are recomputed after oversampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, treated as categorical by SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class targets for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    class1_target = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # An over-sampling target below the current count makes imblearn raise,
        # so a class that already exceeds 1000 rows is left at its own size.
        0: 500 if count_class_0 <= 500 else max(1000, count_class_0),
        1: class1_target,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Re-attach the label to the resampled features.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the non-float64 columns, passed to CTGAN as discrete columns.
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # ----- class 0: tune, refit with the best parameters, sample -----
    class0_train = lerp_data[lerp_data['multi_class'] == 0]
    objective = _make_objective(class0_train, categorical_features, CLASS0_SPACE)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    ctgan = _fit_ctgan(class0_train, best_params, categorical_features)
    generated_0 = ctgan.sample(N_SAMPLES)

    # ----- class 1: same procedure with a larger search space -----
    class1_train = lerp_data[lerp_data['multi_class'] == 1]
    n_class1_to_generate = N_SAMPLES - class1_target
    if n_class1_to_generate > 0:
        objective_class1 = _make_objective(class1_train, categorical_features, CLASS1_SPACE)
        study_class1 = optuna.create_study(direction="maximize")
        study_class1.optimize(objective_class1, n_trials=30)
        best_params_class1 = study_class1.best_params

        ctgan = _fit_ctgan(class1_train, best_params_class1, categorical_features)
        generated_1 = ctgan.sample(n_class1_to_generate)
    else:
        # Class 1 already reaches the target after SMOTENC; requesting a
        # non-positive number of rows from CTGAN is not meaningful, so skip it.
        generated_1 = class1_train.iloc[0:0]

    # Keep only synthetic rows whose `visi` falls in the range used for their class:
    # [0, 100) for the class-0 generator and [100, 500) for the class-1 generator.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before oversampling.
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Classes 0 and 1 come from the oversampled frame; class 2 is taken from the original rows.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_7000_2.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
# Per-region input and output CSV locations.
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan7000/ctgan7000_2_{region}.csv' for region in regions]

# Report whether torch can see a GPU (informational only; `device` is not passed on).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings raised from optuna.distributions
# (presumably the warning about tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

# Years of the training CSV used by this script variant.
TRAIN_YEARS = [2018, 2020]
# Class 0 requests this many CTGAN rows; class 1 requests this minus its post-SMOTE size.
N_SAMPLES = 7000

# Optuna search spaces: (low, high) for integer ranges, lists for categorical choices.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "discriminator_steps": (1, 3),
}
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "discriminator_steps": (1, 5),
}


def _suggest_params(trial, space):
    """Draw one CTGAN hyper-parameter set for `trial` from `space`.

    The suggest calls are issued in a fixed order (dict literals evaluate
    top to bottom), matching the order used by the original objectives.
    """
    return {
        "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
        "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
        "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
        "pac": trial.suggest_categorical("pac", [4, 8]),
        "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
        "discriminator_steps": trial.suggest_int("discriminator_steps", *space["discriminator_steps"]),
    }


def _fit_ctgan(train_df, params, discrete_columns):
    """Build a CTGAN from `params`, fit it on `train_df` and return the fitted model."""
    model = CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )
    model.fit(train_df, discrete_columns=discrete_columns)
    return model


def _make_objective(train_df, discrete_columns, space):
    """Return an Optuna objective that scores CTGAN settings on `train_df`.

    The score is the negated sum of the squared differences between the real
    and generated `visi` mean and standard deviation, so larger is better.
    """
    def _objective(trial):
        model = _fit_ctgan(train_df, _suggest_params(trial, space), discrete_columns)
        generated = model.sample(len(train_df) * 2)
        real_visi = train_df['visi']
        generated_visi = generated['visi']
        gap = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -gap
    return _objective


# Process every region independently.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data; .copy() makes the year-filtered frame independent so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop the derived columns; they are recomputed after oversampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, treated as categorical by SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class targets for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    class1_target = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # An over-sampling target below the current count makes imblearn raise,
        # so a class that already exceeds 1000 rows is left at its own size.
        0: 500 if count_class_0 <= 500 else max(1000, count_class_0),
        1: class1_target,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Re-attach the label to the resampled features.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the non-float64 columns, passed to CTGAN as discrete columns.
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # ----- class 0: tune, refit with the best parameters, sample -----
    class0_train = lerp_data[lerp_data['multi_class'] == 0]
    objective = _make_objective(class0_train, categorical_features, CLASS0_SPACE)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    ctgan = _fit_ctgan(class0_train, best_params, categorical_features)
    generated_0 = ctgan.sample(N_SAMPLES)

    # ----- class 1: same procedure with a larger search space -----
    class1_train = lerp_data[lerp_data['multi_class'] == 1]
    n_class1_to_generate = N_SAMPLES - class1_target
    if n_class1_to_generate > 0:
        objective_class1 = _make_objective(class1_train, categorical_features, CLASS1_SPACE)
        study_class1 = optuna.create_study(direction="maximize")
        study_class1.optimize(objective_class1, n_trials=30)
        best_params_class1 = study_class1.best_params

        ctgan = _fit_ctgan(class1_train, best_params_class1, categorical_features)
        generated_1 = ctgan.sample(n_class1_to_generate)
    else:
        # Class 1 already reaches the target after SMOTENC; requesting a
        # non-positive number of rows from CTGAN is not meaningful, so skip it.
        generated_1 = class1_train.iloc[0:0]

    # Keep only synthetic rows whose `visi` falls in the range used for their class:
    # [0, 100) for the class-0 generator and [100, 500) for the class-1 generator.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before oversampling.
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Classes 0 and 1 come from the oversampled frame; class 2 is taken from the original rows.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/gan_sample_7000_3.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
5
+ from imblearn.over_sampling import SMOTENC
6
+ import optuna
7
+ from ctgan import CTGAN
8
+ import torch
9
+ import warnings
10
+
# Per-region input and output CSV locations.
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/ctgan7000/ctgan7000_3_{region}.csv' for region in regions]

# Report whether torch can see a GPU (informational only; `device` is not passed on).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings raised from optuna.distributions
# (presumably the warning about tuple-valued categorical choices -- TODO confirm).
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

# Years of the training CSV used by this script variant.
TRAIN_YEARS = [2019, 2020]
# Class 0 requests this many CTGAN rows; class 1 requests this minus its post-SMOTE size.
N_SAMPLES = 7000

# Optuna search spaces: (low, high) for integer ranges, lists for categorical choices.
CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "layer_dims": [(64, 64), (128, 128)],
    "batch_size": [64, 128, 256],
    "discriminator_steps": (1, 3),
}
CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "layer_dims": [(128, 128), (256, 256)],
    "batch_size": [256, 512, 1024],
    "discriminator_steps": (1, 5),
}


def _suggest_params(trial, space):
    """Draw one CTGAN hyper-parameter set for `trial` from `space`.

    The suggest calls are issued in a fixed order (dict literals evaluate
    top to bottom), matching the order used by the original objectives.
    """
    return {
        "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
        "generator_dim": trial.suggest_categorical("generator_dim", space["layer_dims"]),
        "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["layer_dims"]),
        "pac": trial.suggest_categorical("pac", [4, 8]),
        "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
        "discriminator_steps": trial.suggest_int("discriminator_steps", *space["discriminator_steps"]),
    }


def _fit_ctgan(train_df, params, discrete_columns):
    """Build a CTGAN from `params`, fit it on `train_df` and return the fitted model."""
    model = CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )
    model.fit(train_df, discrete_columns=discrete_columns)
    return model


def _make_objective(train_df, discrete_columns, space):
    """Return an Optuna objective that scores CTGAN settings on `train_df`.

    The score is the negated sum of the squared differences between the real
    and generated `visi` mean and standard deviation, so larger is better.
    """
    def _objective(trial):
        model = _fit_ctgan(train_df, _suggest_params(trial, space), discrete_columns)
        generated = model.sample(len(train_df) * 2)
        real_visi = train_df['visi']
        generated_visi = generated['visi']
        gap = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -gap
    return _objective


# Process every region independently.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data; .copy() makes the year-filtered frame independent so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin(TRAIN_YEARS), :].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Drop the derived columns; they are recomputed after oversampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, treated as categorical by SMOTENC.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Per-class targets for SMOTENC.
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    class1_target = int(np.ceil(count_class_1 / 100) * 100)  # round up to the next hundred
    sampling_strategy = {
        # An over-sampling target below the current count makes imblearn raise,
        # so a class that already exceeds 1000 rows is left at its own size.
        0: 500 if count_class_0 <= 500 else max(1000, count_class_0),
        1: class1_target,
        2: count_class_2,
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Re-attach the label to the resampled features.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the non-float64 columns, passed to CTGAN as discrete columns.
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # ----- class 0: tune, refit with the best parameters, sample -----
    class0_train = lerp_data[lerp_data['multi_class'] == 0]
    objective = _make_objective(class0_train, categorical_features, CLASS0_SPACE)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    ctgan = _fit_ctgan(class0_train, best_params, categorical_features)
    generated_0 = ctgan.sample(N_SAMPLES)

    # ----- class 1: same procedure with a larger search space -----
    class1_train = lerp_data[lerp_data['multi_class'] == 1]
    n_class1_to_generate = N_SAMPLES - class1_target
    if n_class1_to_generate > 0:
        objective_class1 = _make_objective(class1_train, categorical_features, CLASS1_SPACE)
        study_class1 = optuna.create_study(direction="maximize")
        study_class1.optimize(objective_class1, n_trials=30)
        best_params_class1 = study_class1.best_params

        ctgan = _fit_ctgan(class1_train, best_params_class1, categorical_features)
        generated_1 = ctgan.sample(n_class1_to_generate)
    else:
        # Class 1 already reaches the target after SMOTENC; requesting a
        # non-positive number of rows from CTGAN is not meaningful, so skip it.
        generated_1 = class1_train.iloc[0:0]

    # Keep only synthetic rows whose `visi` falls in the range used for their class:
    # [0, 100) for the class-0 generator and [100, 500) for the class-1 generator.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were dropped before oversampling.
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Classes 0 and 1 come from the oversampled frame; class 2 is taken from the original rows.
    filtered_data = smote_gan_data[smote_gan_data['multi_class'] != 2]
    original_class2 = data[data['multi_class'] == 2]
    final_data = pd.concat([filtered_data, original_class2], axis=0)
    final_data.reset_index(drop=True, inplace=True)

    # Save the result and print the per-class row counts.
    final_data.to_csv(output_path, index = False)
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
Analysis_code/make_oversample_data/oversampling_code.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from imblearn.over_sampling import SMOTENC
4
+
5
+
6
+ # smote와 ctgan을 이용한 oversampling 진행
7
+
8
# Regions to inspect, paired with the training CSV of each region.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
input_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]

# Print the class balance and the number of columns of every training set.
for region, input_path in zip(regions, input_paths):
    # The first CSV column is used as the index; the 'Unnamed: 0' column is dropped.
    data = pd.read_csv(input_path, index_col=0)
    data.drop(columns=['Unnamed: 0'], inplace=True)

    print("\n######", region, "#######")
    # Row counts for multi_class 0, 1 and 2, printed as "n0 | n1 | n2".
    class_sizes = [len(data[data['multi_class'] == label]) for label in (0, 1, 2)]
    print(class_sizes[0], '|', class_sizes[1], '|', class_sizes[2])
    print(len(data.columns))
23
+
24
+
25
+
26
+
27
+
28
+ import pandas as pd
29
+ import numpy as np
30
+ from imblearn.over_sampling import SMOTENC
31
+
32
# Region names with the matching training CSV (input) and SMOTENC-oversampled CSV (output).
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
input_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../data/data_oversampled/smote_{region}.csv' for region in regions]

# Oversample classes 0 and 1 of every region with SMOTENC and save the result.
for region, input_path, output_path in zip(regions, input_paths, output_paths):
    # Load the training data; the first CSV column becomes the index.
    data = pd.read_csv(input_path, index_col=0)
    data.drop(columns=['Unnamed: 0'], inplace=True)

    # Target is multi_class. Both label columns are excluded from the features.
    y = data['multi_class']
    X = data.drop(columns=['multi_class', 'binary_class'])

    # Derived columns are removed before resampling and recomputed afterwards.
    X.drop(
        columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'],
        inplace=True,
    )

    # Every column whose dtype is not float64 is treated as categorical (by position).
    categorical_features = [
        position for position, column_dtype in enumerate(X.dtypes) if column_dtype != 'float64'
    ]

    # Class 2 keeps its current size; classes 0 and 1 are raised to 10000 rows each.
    count_class_2 = (y == 2).sum()
    target_sizes = {0: 10000, 1: 10000, 2: count_class_2}

    smotenc = SMOTENC(
        categorical_features=categorical_features,
        sampling_strategy=target_sizes,
        random_state=42,
    )
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Re-attach the labels and rebuild the derived columns from the resampled values.
    X_resampled['multi_class'] = y_resampled
    X_resampled['binary_class'] = X_resampled['multi_class'].map(
        lambda label: 0 if label == 2 else 1
    )
    hour_angle = 2 * np.pi * X_resampled['hour'] / 24
    X_resampled['hour_sin'] = np.sin(hour_angle)
    X_resampled['hour_cos'] = np.cos(hour_angle)
    month_angle = 2 * np.pi * X_resampled['month'] / 12
    X_resampled['month_sin'] = np.sin(month_angle)
    X_resampled['month_cos'] = np.cos(month_angle)
    X_resampled['ground_temp - temp_C'] = X_resampled['groundtemp'] - X_resampled['temp_C']

    # Save (the DataFrame index is written as the first CSV column).
    X_resampled.to_csv(output_path)
    print(f"Processed and saved: {region} -> {output_path}")
76
+
77
+
78
# Summary statistics of 'visi' for classes 0 and 1 in the oversampled Seoul data.
smote_seoul = pd.read_csv('../data/data_oversampled/smote_seoul.csv')
for label in (0, 1):
    print(smote_seoul.loc[smote_seoul['multi_class'] == label, 'visi'].describe())
81
+
82
+
83
+ import pandas as pd
84
+ import numpy as np
85
+ from imblearn.over_sampling import SMOTENC
86
+
87
# Region names and the SMOTENC-oversampled CSV of each region.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
input_paths = [f'../data/data_oversampled/smote_{region}.csv' for region in regions]

# Print the class balance and the number of columns of every oversampled file.
for region, input_path in zip(regions, input_paths):
    # The first CSV column (the index written by to_csv) is consumed as the index.
    data = pd.read_csv(input_path, index_col=0)
    # After index_col=0 there may be no 'Unnamed: 0' column left, so dropping it
    # must not fail when it is absent.
    data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

    print("\n######",region,"#######")
    # Row counts for multi_class 0, 1 and 2.
    print(len(data[data['multi_class']==0]),'|',len(data[data['multi_class']==1]),'|',len(data[data['multi_class']==2]))
    print(len(data.columns))
99
+
100
+
101
+
102
+ import pandas as pd
103
+ import numpy as np
104
+ from imblearn.over_sampling import SMOTENC
105
+
106
# Regions to inspect and the training CSV belonging to each of them.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
input_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]

# For each region: show how many rows fall into each multi_class value and
# how many columns the frame has.
for region, input_path in zip(regions, input_paths):
    data = pd.read_csv(input_path, index_col=0)
    data.drop(columns=['Unnamed: 0'], inplace=True)

    print("\n######", region, "#######")
    labels = data['multi_class']
    print(len(data[labels == 0]), '|', len(data[labels == 1]), '|', len(data[labels == 2]))
    print(len(data.columns))
118
+
119
+
120
+ import pandas as pd
121
+ import numpy as np
122
+ from imblearn.over_sampling import SMOTENC
123
+ import optuna
124
+ from ctgan import CTGAN
125
+ import torch
126
+ import warnings
127
+
128
# Per-region input (feather) and output (CSV) locations.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
file_paths = [f'../data/data_for_modeling/df_{region}.feather' for region in regions]
output_paths = [f'../data/data_oversampled/ctgan_{region}.csv' for region in regions]

# Report which device torch sees. `device` is only printed here; it is not
# passed to the CTGAN constructor below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence UserWarnings raised from optuna.distributions.
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

# Hyperparameter search spaces. Integer ranges are (low, high); the other
# entries are lists of categorical choices.
_CLASS0_SPACE = {
    "embedding_dim": (64, 128),
    "generator_dim": [(64, 64), (128, 128)],
    "discriminator_dim": [(64, 64), (128, 128)],
    "pac": [4, 8],
    "batch_size": [64, 128, 256],
    "discriminator_steps": (1, 3),
}
_CLASS1_SPACE = {
    "embedding_dim": (128, 512),
    "generator_dim": [(128, 128), (256, 256)],
    "discriminator_dim": [(128, 128), (256, 256)],
    "pac": [4, 8],
    "batch_size": [256, 512, 1024],
    "discriminator_steps": (1, 5),
}


def _build_ctgan(params):
    """Construct a CTGAN from a dict holding the six tuned hyperparameters."""
    return CTGAN(
        embedding_dim=params["embedding_dim"],
        generator_dim=params["generator_dim"],
        discriminator_dim=params["discriminator_dim"],
        batch_size=params["batch_size"],
        discriminator_steps=params["discriminator_steps"],
        pac=params["pac"],
    )


def _tune_and_fit_ctgan(class_frame, discrete_columns, space, n_trials):
    """Tune CTGAN hyperparameters with Optuna on `class_frame`, then refit.

    Each trial fits a CTGAN on `class_frame`, samples twice as many rows as
    the frame has, and is scored by the negated sum of the squared differences
    between the real and generated mean and standard deviation of 'visi'.
    The study maximizes that score. A fresh CTGAN built from the best
    parameters is fitted on `class_frame` and returned.
    """
    def objective(trial):
        # Suggestion order matches the dict order below.
        params = {
            "embedding_dim": trial.suggest_int("embedding_dim", *space["embedding_dim"]),
            "generator_dim": trial.suggest_categorical("generator_dim", space["generator_dim"]),
            "discriminator_dim": trial.suggest_categorical("discriminator_dim", space["discriminator_dim"]),
            "pac": trial.suggest_categorical("pac", space["pac"]),
            "batch_size": trial.suggest_categorical("batch_size", space["batch_size"]),
            "discriminator_steps": trial.suggest_int("discriminator_steps", *space["discriminator_steps"]),
        }
        candidate = _build_ctgan(params)
        candidate.fit(class_frame, discrete_columns=discrete_columns)
        generated_data = candidate.sample(len(class_frame) * 2)

        real_visi = class_frame['visi']
        generated_visi = generated_data['visi']
        mse = ((real_visi.mean() - generated_visi.mean())**2 + (real_visi.std() - generated_visi.std())**2)
        return -mse

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_model = _build_ctgan(study.best_params)
    best_model.fit(class_frame, discrete_columns=discrete_columns)
    return best_model


# Process every region. `region` is bound by the loop so that the final log
# line names the region that was actually processed.
for region, file_path, output_path in zip(regions, file_paths, output_paths):
    # Load the data and separate features from the target.
    data = pd.read_feather(file_path)
    data.drop(['Unnamed: 0'], axis=1, inplace=True)
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and recomputed after generation.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, passed to SMOTENC as categorical.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # SMOTENC targets: class 0 -> 500 or 1000 rows, class 1 -> rounded up to
    # the next multiple of 100, class 2 -> unchanged.
    count_class_0 = (y == 0).sum()
    count_class_1 = (y == 1).sum()
    count_class_2 = (y == 2).sum()
    rounded_class_1 = int(np.ceil(count_class_1 / 100) * 100)
    sampling_strategy = {
        0: 500 if count_class_0 <= 500 else 1000,
        1: rounded_class_1,
        2: count_class_2
    }

    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # SMOTENC output with the label column re-attached.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Names of the non-float64 columns, passed to CTGAN as discrete columns.
    categorical_features = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    # Class 0: tune (50 trials), refit, and sample 19500 or 19000 rows.
    ctgan = _tune_and_fit_ctgan(
        lerp_data[lerp_data['multi_class'] == 0], categorical_features, _CLASS0_SPACE, 50
    )
    generated_0 = ctgan.sample(19500 if count_class_0 <= 500 else 19000)

    # Class 1: tune (30 trials), refit, and sample 20000 minus the SMOTENC target.
    ctgan = _tune_and_fit_ctgan(
        lerp_data[lerp_data['multi_class'] == 1], categorical_features, _CLASS1_SPACE, 30
    )
    generated_1 = ctgan.sample(20000 - rounded_class_1)

    # Keep only generated rows whose 'visi' lies in [0, 100) for class 0 and
    # [100, 500) for class 1, then merge with the SMOTENC data.
    well_generated0 = generated_0[(generated_0['visi'] >= 0) & (generated_0['visi'] < 100)]
    well_generated1 = generated_1[(generated_1['visi'] >= 100) & (generated_1['visi'] < 500)]
    smote_gan_data = pd.concat([lerp_data, well_generated0, well_generated1], axis=0)

    # Restore the columns that were removed before resampling.
    smote_gan_data['binary_class'] = smote_gan_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    smote_gan_data['hour_sin'] = np.sin(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['hour_cos'] = np.cos(2 * np.pi * smote_gan_data['hour'] / 24)
    smote_gan_data['month_sin'] = np.sin(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['month_cos'] = np.cos(2 * np.pi * smote_gan_data['month'] / 12)
    smote_gan_data['ground_temp - temp_C'] = smote_gan_data['groundtemp'] - smote_gan_data['temp_C']

    # Save the result.
    smote_gan_data.to_csv(output_path, index = False)
    print(f"Processed and saved: {region} -> {output_path}")
293
+
294
+
295
+
296
+ import pandas as pd
297
+ import numpy as np
298
+ from imblearn.over_sampling import SMOTENC
299
+
300
# Region names and the CTGAN-augmented CSV of each region.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
input_paths = [f'../data/data_oversampled/ctgan_{region}.csv' for region in regions]

# Print the class balance and the number of columns of every augmented file.
for region, input_path in zip(regions, input_paths):
    data = pd.read_csv(input_path)

    print("\n######", region, "#######")
    # Row counts for multi_class 0, 1 and 2, printed as "n0 | n1 | n2".
    labels = data['multi_class']
    print(len(data[labels == 0]), '|', len(data[labels == 1]), '|', len(data[labels == 2]))
    print(len(data.columns))
311
+
312
+
313
+
314
+
315
# Summary statistics of 'visi' for each class in the CTGAN-augmented Busan data.
busan_check = pd.read_csv('../data/data_oversampled/ctgan_busan.csv')
for label in (0, 1, 2):
    print(busan_check.loc[busan_check['multi_class'] == label, 'visi'].describe())
319
+
320
+
321
+
322
+
323
+ import pandas as pd
324
+ import numpy as np
325
+ from imblearn.over_sampling import SMOTENC
326
+
327
# Region names with the original training CSV and the CTGAN-augmented CSV of each region.
regions = ['busan', 'daegu', 'daejeon', 'incheon', 'seoul','gwangju']
origin_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]
augment_paths = [f'../data/data_oversampled/ctgan_{region}.csv' for region in regions]

# For every region, replace the class-2 rows of the augmented file with the
# class-2 rows of the original training file and overwrite the augmented file.
for region, origin_path, augment_path in zip(regions, origin_paths, augment_paths):
    # Read both data sets.
    origin = pd.read_csv(origin_path, index_col=0)
    augment = pd.read_csv(augment_path)

    # Drop class-2 rows from the augmented data.
    filtered_data = augment[augment['multi_class'] != 2]

    # Take class-2 rows from the original data.
    original_class2 = origin[origin['multi_class'] == 2]

    # Append the original class-2 rows to the remaining augmented rows.
    final_data = pd.concat([filtered_data, original_class2], axis=0)

    # Renumber the index from 0.
    final_data.reset_index(drop=True, inplace=True)

    # Overwrite the augmented file with the merged result.
    final_data.to_csv(augment_path, index = False)

    print("\n######",region,"#######")
    print(len(final_data[final_data['multi_class']==0]),'|',len(final_data[final_data['multi_class']==1]),'|',len(final_data[final_data['multi_class']==2]))
    # Report the column count of the frame that was just written.
    print(len(final_data.columns))
Analysis_code/make_oversample_data/smote_sample_1.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from imblearn.over_sampling import SMOTENC
5
+
6
# Regions to process with their training CSV (input) and oversampled CSV (output).
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/smote/smote_1_{region}.csv' for region in regions]

# Process every region.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data and keep only the 2018 and 2019 rows. The explicit copy
    # makes the column assignments below act on an independent frame instead
    # of a slice of the loaded one.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin([2018,2019]),:].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and recomputed after resampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, passed to SMOTENC as categorical.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Classes 0 and 1 are both raised to ceil(count_class_2 / 1000) * 500 rows;
    # class 2 keeps its current size.
    count_class_2 = (y == 2).sum()
    minority_target = int(np.ceil(count_class_2 / 1000) * 500)
    sampling_strategy = {
        0: minority_target,
        1: minority_target,
        2: count_class_2
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled features with the label column re-attached.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Restore the columns that were removed before resampling.
    lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']

    # Save the result and print the row count of each class.
    lerp_data.to_csv(output_path, index = False)
    print(len(lerp_data[lerp_data['multi_class']==0]),'|',len(lerp_data[lerp_data['multi_class']==1]),'|',len(lerp_data[lerp_data['multi_class']==2]))
Analysis_code/make_oversample_data/smote_sample_2.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from imblearn.over_sampling import SMOTENC
5
+
6
# Regions to process with their training CSV (input) and oversampled CSV (output).
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/smote/smote_2_{region}.csv' for region in regions]

# Process every region.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data and keep only the 2018 and 2020 rows. The explicit copy
    # makes the column assignments below act on an independent frame instead
    # of a slice of the loaded one.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin([2018,2020]),:].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and recomputed after resampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, passed to SMOTENC as categorical.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Classes 0 and 1 are both raised to ceil(count_class_2 / 1000) * 500 rows;
    # class 2 keeps its current size.
    count_class_2 = (y == 2).sum()
    minority_target = int(np.ceil(count_class_2 / 1000) * 500)
    sampling_strategy = {
        0: minority_target,
        1: minority_target,
        2: count_class_2
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled features with the label column re-attached.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Restore the columns that were removed before resampling.
    lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']

    # Save the result and print the row count of each class.
    lerp_data.to_csv(output_path, index = False)
    print(len(lerp_data[lerp_data['multi_class']==0]),'|',len(lerp_data[lerp_data['multi_class']==1]),'|',len(lerp_data[lerp_data['multi_class']==2]))
Analysis_code/make_oversample_data/smote_sample_3.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from imblearn.over_sampling import SMOTENC
5
+
6
# Regions to process with their training CSV (input) and oversampled CSV (output).
regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']
file_paths = [f'../../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../../data/data_oversampled/smote/smote_3_{region}.csv' for region in regions]

# Process every region.
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data and keep only the 2019 and 2020 rows. The explicit copy
    # makes the column assignments below act on an independent frame instead
    # of a slice of the loaded one.
    data = pd.read_csv(file_path, index_col=0)
    data = data.loc[data['year'].isin([2019,2020]),:].copy()
    data['cloudcover'] = data['cloudcover'].astype('int')
    data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')
    X = data.drop(columns=['multi_class', 'binary_class'])
    y = data['multi_class']

    # Derived columns are removed here and recomputed after resampling.
    X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)

    # Positional indices of the non-float64 columns, passed to SMOTENC as categorical.
    categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']

    # Classes 0 and 1 are both raised to ceil(count_class_2 / 1000) * 500 rows;
    # class 2 keeps its current size.
    count_class_2 = (y == 2).sum()
    minority_target = int(np.ceil(count_class_2 / 1000) * 500)
    sampling_strategy = {
        0: minority_target,
        1: minority_target,
        2: count_class_2
    }

    # Apply SMOTENC.
    smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    # Resampled features with the label column re-attached.
    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Restore the columns that were removed before resampling.
    lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)
    lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)
    lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)
    lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']

    # Save the result and print the row count of each class.
    lerp_data.to_csv(output_path, index = False)
    print(len(lerp_data[lerp_data['multi_class']==0]),'|',len(lerp_data[lerp_data['multi_class']==1]),'|',len(lerp_data[lerp_data['multi_class']==2]))
Analysis_code/make_train_test.ipynb ADDED
@@ -0,0 +1,1099 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "import matplotlib.pyplot as plt\n",
12
+ "import seaborn as sns\n",
13
+ "from sklearn.model_selection import train_test_split\n",
14
+ "from collections import Counter"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "df_seoul = pd.read_feather(\"../data/data_for_modeling/df_seoul.feather\")\n",
24
+ "df_busan = pd.read_feather(\"../data/data_for_modeling/df_busan.feather\")\n",
25
+ "df_incheon = pd.read_feather(\"../data/data_for_modeling/df_incheon.feather\")\n",
26
+ "df_daegu = pd.read_feather(\"../data/data_for_modeling/df_daegu.feather\")\n",
27
+ "df_daejeon = pd.read_feather(\"../data/data_for_modeling/df_daejeon.feather\")\n",
28
+ "df_gwangju = pd.read_feather(\"../data/data_for_modeling/df_gwangju.feather\")"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "data": {
38
+ "text/plain": [
39
+ "Counter({2: 48534, 1: 3941, 0: 109})"
40
+ ]
41
+ },
42
+ "execution_count": 3,
43
+ "metadata": {},
44
+ "output_type": "execute_result"
45
+ }
46
+ ],
47
+ "source": [
48
+ "Counter(df_seoul['multi_class'])"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 4,
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/plain": [
59
+ "Counter({2: 50069, 1: 2350, 0: 165})"
60
+ ]
61
+ },
62
+ "execution_count": 4,
63
+ "metadata": {},
64
+ "output_type": "execute_result"
65
+ }
66
+ ],
67
+ "source": [
68
+ "Counter(df_busan['multi_class'])"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 5,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "data": {
78
+ "text/plain": [
79
+ "Counter({2: 44944, 1: 6658, 0: 982})"
80
+ ]
81
+ },
82
+ "execution_count": 5,
83
+ "metadata": {},
84
+ "output_type": "execute_result"
85
+ }
86
+ ],
87
+ "source": [
88
+ "Counter(df_incheon['multi_class'])"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "data": {
98
+ "text/plain": [
99
+ "Counter({2: 50919, 1: 1610, 0: 55})"
100
+ ]
101
+ },
102
+ "execution_count": 6,
103
+ "metadata": {},
104
+ "output_type": "execute_result"
105
+ }
106
+ ],
107
+ "source": [
108
+ "Counter(df_daegu['multi_class'])"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 7,
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "data": {
118
+ "text/plain": [
119
+ "Counter({2: 48047, 1: 4227, 0: 310})"
120
+ ]
121
+ },
122
+ "execution_count": 7,
123
+ "metadata": {},
124
+ "output_type": "execute_result"
125
+ }
126
+ ],
127
+ "source": [
128
+ "Counter(df_daejeon['multi_class'])"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 8,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "data": {
138
+ "text/plain": [
139
+ "Counter({2: 48405, 1: 4015, 0: 164})"
140
+ ]
141
+ },
142
+ "execution_count": 8,
143
+ "metadata": {},
144
+ "output_type": "execute_result"
145
+ }
146
+ ],
147
+ "source": [
148
+ "Counter(df_gwangju['multi_class'])"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 9,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "data": {
158
+ "text/plain": [
159
+ "(52584, 30)"
160
+ ]
161
+ },
162
+ "execution_count": 9,
163
+ "metadata": {},
164
+ "output_type": "execute_result"
165
+ }
166
+ ],
167
+ "source": [
168
+ "df_seoul.shape"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 10,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "df_seoul = df_seoul.loc[df_seoul['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
178
+ "df_busan = df_busan.loc[df_busan['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
179
+ "df_incheon = df_incheon.loc[df_incheon['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
180
+ "df_daegu = df_daegu.loc[df_daegu['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
181
+ "df_daejeon = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019, 2020, 2021]),:].copy()\n",
182
+ "df_gwangju = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019, 2020, 2021]),:].copy()"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 11,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "cols = [col for col in df_seoul.columns if col != \"multi_class\"] + [\"multi_class\"]"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 12,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "df_seoul = df_seoul[cols]\n",
201
+ "df_busan = df_busan[cols]\n",
202
+ "df_incheon = df_incheon[cols]\n",
203
+ "df_daegu = df_daegu[cols]\n",
204
+ "df_daejeon = df_daejeon[cols]\n",
205
+ "df_gwangju = df_gwangju[cols]"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 13,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "df_seoul_train = df_seoul.loc[df_seoul['year'].isin([2018, 2019, 2020]),:].copy()\n",
215
+ "df_seoul_test = df_seoul.loc[df_seoul['year'].isin([2021]),:].copy()\n",
216
+ "\n",
217
+ "df_busan_train = df_busan.loc[df_busan['year'].isin([2018, 2019, 2020]),:].copy()\n",
218
+ "df_busan_test = df_busan.loc[df_busan['year'].isin([2021]),:].copy()\n",
219
+ "\n",
220
+ "df_incheon_train = df_incheon.loc[df_incheon['year'].isin([2018, 2019, 2020]),:].copy()\n",
221
+ "df_incheon_test = df_incheon.loc[df_incheon['year'].isin([2021]),:].copy()\n",
222
+ "\n",
223
+ "df_daegu_train = df_daegu.loc[df_daegu['year'].isin([2018, 2019, 2020]),:].copy()\n",
224
+ "df_daegu_test = df_daegu.loc[df_daegu['year'].isin([2021]),:].copy()\n",
225
+ "\n",
226
+ "df_daejeon_train = df_daejeon.loc[df_daejeon['year'].isin([2018, 2019, 2020]),:].copy()\n",
227
+ "df_daejeon_test = df_daejeon.loc[df_daejeon['year'].isin([2021]),:].copy()\n",
228
+ "\n",
229
+ "df_gwangju_train = df_gwangju.loc[df_gwangju['year'].isin([2018, 2019, 2020]),:].copy()\n",
230
+ "df_gwangju_test = df_gwangju.loc[df_gwangju['year'].isin([2021]),:].copy()"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 14,
236
+ "metadata": {},
237
+ "outputs": [
238
+ {
239
+ "data": {
240
+ "text/html": [
241
+ "<div>\n",
242
+ "<style scoped>\n",
243
+ " .dataframe tbody tr th:only-of-type {\n",
244
+ " vertical-align: middle;\n",
245
+ " }\n",
246
+ "\n",
247
+ " .dataframe tbody tr th {\n",
248
+ " vertical-align: top;\n",
249
+ " }\n",
250
+ "\n",
251
+ " .dataframe thead th {\n",
252
+ " text-align: right;\n",
253
+ " }\n",
254
+ "</style>\n",
255
+ "<table border=\"1\" class=\"dataframe\">\n",
256
+ " <thead>\n",
257
+ " <tr style=\"text-align: right;\">\n",
258
+ " <th></th>\n",
259
+ " <th>temp_C</th>\n",
260
+ " <th>precip_mm</th>\n",
261
+ " <th>wind_speed</th>\n",
262
+ " <th>wind_dir</th>\n",
263
+ " <th>hm</th>\n",
264
+ " <th>vap_pressure</th>\n",
265
+ " <th>dewpoint_C</th>\n",
266
+ " <th>loc_pressure</th>\n",
267
+ " <th>sea_pressure</th>\n",
268
+ " <th>solarRad</th>\n",
269
+ " <th>...</th>\n",
270
+ " <th>year</th>\n",
271
+ " <th>month</th>\n",
272
+ " <th>hour</th>\n",
273
+ " <th>ground_temp - temp_C</th>\n",
274
+ " <th>hour_sin</th>\n",
275
+ " <th>hour_cos</th>\n",
276
+ " <th>month_sin</th>\n",
277
+ " <th>month_cos</th>\n",
278
+ " <th>visi</th>\n",
279
+ " <th>multi_class</th>\n",
280
+ " </tr>\n",
281
+ " </thead>\n",
282
+ " <tbody>\n",
283
+ " <tr>\n",
284
+ " <th>0</th>\n",
285
+ " <td>1.2</td>\n",
286
+ " <td>0.0</td>\n",
287
+ " <td>1.6</td>\n",
288
+ " <td>360</td>\n",
289
+ " <td>35.0</td>\n",
290
+ " <td>2.3</td>\n",
291
+ " <td>-12.6</td>\n",
292
+ " <td>1015.8</td>\n",
293
+ " <td>1024.6</td>\n",
294
+ " <td>0.00</td>\n",
295
+ " <td>...</td>\n",
296
+ " <td>2018</td>\n",
297
+ " <td>1</td>\n",
298
+ " <td>0</td>\n",
299
+ " <td>-5.4</td>\n",
300
+ " <td>0.000000</td>\n",
301
+ " <td>1.000000e+00</td>\n",
302
+ " <td>0.5</td>\n",
303
+ " <td>0.866025</td>\n",
304
+ " <td>2000.0</td>\n",
305
+ " <td>2</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>1</th>\n",
309
+ " <td>0.5</td>\n",
310
+ " <td>0.0</td>\n",
311
+ " <td>1.3</td>\n",
312
+ " <td>360</td>\n",
313
+ " <td>33.0</td>\n",
314
+ " <td>2.1</td>\n",
315
+ " <td>-13.9</td>\n",
316
+ " <td>1015.5</td>\n",
317
+ " <td>1024.3</td>\n",
318
+ " <td>0.00</td>\n",
319
+ " <td>...</td>\n",
320
+ " <td>2018</td>\n",
321
+ " <td>1</td>\n",
322
+ " <td>1</td>\n",
323
+ " <td>-5.4</td>\n",
324
+ " <td>0.258819</td>\n",
325
+ " <td>9.659258e-01</td>\n",
326
+ " <td>0.5</td>\n",
327
+ " <td>0.866025</td>\n",
328
+ " <td>2000.0</td>\n",
329
+ " <td>2</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>2</th>\n",
333
+ " <td>0.1</td>\n",
334
+ " <td>0.0</td>\n",
335
+ " <td>1.5</td>\n",
336
+ " <td>20</td>\n",
337
+ " <td>34.0</td>\n",
338
+ " <td>2.1</td>\n",
339
+ " <td>-13.9</td>\n",
340
+ " <td>1015.7</td>\n",
341
+ " <td>1024.5</td>\n",
342
+ " <td>0.00</td>\n",
343
+ " <td>...</td>\n",
344
+ " <td>2018</td>\n",
345
+ " <td>1</td>\n",
346
+ " <td>2</td>\n",
347
+ " <td>-5.4</td>\n",
348
+ " <td>0.500000</td>\n",
349
+ " <td>8.660254e-01</td>\n",
350
+ " <td>0.5</td>\n",
351
+ " <td>0.866025</td>\n",
352
+ " <td>2000.0</td>\n",
353
+ " <td>2</td>\n",
354
+ " </tr>\n",
355
+ " <tr>\n",
356
+ " <th>3</th>\n",
357
+ " <td>0.0</td>\n",
358
+ " <td>0.0</td>\n",
359
+ " <td>2.1</td>\n",
360
+ " <td>320</td>\n",
361
+ " <td>37.0</td>\n",
362
+ " <td>2.3</td>\n",
363
+ " <td>-12.9</td>\n",
364
+ " <td>1015.9</td>\n",
365
+ " <td>1024.7</td>\n",
366
+ " <td>0.00</td>\n",
367
+ " <td>...</td>\n",
368
+ " <td>2018</td>\n",
369
+ " <td>1</td>\n",
370
+ " <td>3</td>\n",
371
+ " <td>-5.0</td>\n",
372
+ " <td>0.707107</td>\n",
373
+ " <td>7.071068e-01</td>\n",
374
+ " <td>0.5</td>\n",
375
+ " <td>0.866025</td>\n",
376
+ " <td>2000.0</td>\n",
377
+ " <td>2</td>\n",
378
+ " </tr>\n",
379
+ " <tr>\n",
380
+ " <th>4</th>\n",
381
+ " <td>-0.1</td>\n",
382
+ " <td>0.0</td>\n",
383
+ " <td>2.3</td>\n",
384
+ " <td>340</td>\n",
385
+ " <td>42.0</td>\n",
386
+ " <td>2.5</td>\n",
387
+ " <td>-11.5</td>\n",
388
+ " <td>1016.0</td>\n",
389
+ " <td>1024.9</td>\n",
390
+ " <td>0.00</td>\n",
391
+ " <td>...</td>\n",
392
+ " <td>2018</td>\n",
393
+ " <td>1</td>\n",
394
+ " <td>4</td>\n",
395
+ " <td>-4.3</td>\n",
396
+ " <td>0.866025</td>\n",
397
+ " <td>5.000000e-01</td>\n",
398
+ " <td>0.5</td>\n",
399
+ " <td>0.866025</td>\n",
400
+ " <td>2000.0</td>\n",
401
+ " <td>2</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>5</th>\n",
405
+ " <td>-0.1</td>\n",
406
+ " <td>0.0</td>\n",
407
+ " <td>2.8</td>\n",
408
+ " <td>50</td>\n",
409
+ " <td>43.0</td>\n",
410
+ " <td>2.6</td>\n",
411
+ " <td>-11.2</td>\n",
412
+ " <td>1016.0</td>\n",
413
+ " <td>1024.9</td>\n",
414
+ " <td>0.00</td>\n",
415
+ " <td>...</td>\n",
416
+ " <td>2018</td>\n",
417
+ " <td>1</td>\n",
418
+ " <td>5</td>\n",
419
+ " <td>-4.0</td>\n",
420
+ " <td>0.965926</td>\n",
421
+ " <td>2.588190e-01</td>\n",
422
+ " <td>0.5</td>\n",
423
+ " <td>0.866025</td>\n",
424
+ " <td>2000.0</td>\n",
425
+ " <td>2</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>6</th>\n",
429
+ " <td>-0.5</td>\n",
430
+ " <td>0.0</td>\n",
431
+ " <td>2.1</td>\n",
432
+ " <td>20</td>\n",
433
+ " <td>45.0</td>\n",
434
+ " <td>2.6</td>\n",
435
+ " <td>-11.0</td>\n",
436
+ " <td>1016.5</td>\n",
437
+ " <td>1025.4</td>\n",
438
+ " <td>0.00</td>\n",
439
+ " <td>...</td>\n",
440
+ " <td>2018</td>\n",
441
+ " <td>1</td>\n",
442
+ " <td>6</td>\n",
443
+ " <td>-4.1</td>\n",
444
+ " <td>1.000000</td>\n",
445
+ " <td>6.123234e-17</td>\n",
446
+ " <td>0.5</td>\n",
447
+ " <td>0.866025</td>\n",
448
+ " <td>2000.0</td>\n",
449
+ " <td>2</td>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>7</th>\n",
453
+ " <td>-0.8</td>\n",
454
+ " <td>0.0</td>\n",
455
+ " <td>2.5</td>\n",
456
+ " <td>340</td>\n",
457
+ " <td>45.0</td>\n",
458
+ " <td>2.6</td>\n",
459
+ " <td>-11.2</td>\n",
460
+ " <td>1017.1</td>\n",
461
+ " <td>1026.0</td>\n",
462
+ " <td>0.00</td>\n",
463
+ " <td>...</td>\n",
464
+ " <td>2018</td>\n",
465
+ " <td>1</td>\n",
466
+ " <td>7</td>\n",
467
+ " <td>-4.5</td>\n",
468
+ " <td>0.965926</td>\n",
469
+ " <td>-2.588190e-01</td>\n",
470
+ " <td>0.5</td>\n",
471
+ " <td>0.866025</td>\n",
472
+ " <td>2000.0</td>\n",
473
+ " <td>2</td>\n",
474
+ " </tr>\n",
475
+ " <tr>\n",
476
+ " <th>8</th>\n",
477
+ " <td>-0.5</td>\n",
478
+ " <td>0.0</td>\n",
479
+ " <td>1.2</td>\n",
480
+ " <td>360</td>\n",
481
+ " <td>43.0</td>\n",
482
+ " <td>2.5</td>\n",
483
+ " <td>-11.5</td>\n",
484
+ " <td>1017.4</td>\n",
485
+ " <td>1026.3</td>\n",
486
+ " <td>0.03</td>\n",
487
+ " <td>...</td>\n",
488
+ " <td>2018</td>\n",
489
+ " <td>1</td>\n",
490
+ " <td>8</td>\n",
491
+ " <td>-4.0</td>\n",
492
+ " <td>0.866025</td>\n",
493
+ " <td>-5.000000e-01</td>\n",
494
+ " <td>0.5</td>\n",
495
+ " <td>0.866025</td>\n",
496
+ " <td>2000.0</td>\n",
497
+ " <td>2</td>\n",
498
+ " </tr>\n",
499
+ " <tr>\n",
500
+ " <th>9</th>\n",
501
+ " <td>1.7</td>\n",
502
+ " <td>0.0</td>\n",
503
+ " <td>2.1</td>\n",
504
+ " <td>20</td>\n",
505
+ " <td>39.0</td>\n",
506
+ " <td>2.7</td>\n",
507
+ " <td>-10.8</td>\n",
508
+ " <td>1018.1</td>\n",
509
+ " <td>1026.9</td>\n",
510
+ " <td>0.46</td>\n",
511
+ " <td>...</td>\n",
512
+ " <td>2018</td>\n",
513
+ " <td>1</td>\n",
514
+ " <td>9</td>\n",
515
+ " <td>2.8</td>\n",
516
+ " <td>0.707107</td>\n",
517
+ " <td>-7.071068e-01</td>\n",
518
+ " <td>0.5</td>\n",
519
+ " <td>0.866025</td>\n",
520
+ " <td>1953.0</td>\n",
521
+ " <td>2</td>\n",
522
+ " </tr>\n",
523
+ " </tbody>\n",
524
+ "</table>\n",
525
+ "<p>10 rows × 30 columns</p>\n",
526
+ "</div>"
527
+ ],
528
+ "text/plain": [
529
+ " temp_C precip_mm wind_speed wind_dir hm vap_pressure dewpoint_C \\\n",
530
+ "0 1.2 0.0 1.6 360 35.0 2.3 -12.6 \n",
531
+ "1 0.5 0.0 1.3 360 33.0 2.1 -13.9 \n",
532
+ "2 0.1 0.0 1.5 20 34.0 2.1 -13.9 \n",
533
+ "3 0.0 0.0 2.1 320 37.0 2.3 -12.9 \n",
534
+ "4 -0.1 0.0 2.3 340 42.0 2.5 -11.5 \n",
535
+ "5 -0.1 0.0 2.8 50 43.0 2.6 -11.2 \n",
536
+ "6 -0.5 0.0 2.1 20 45.0 2.6 -11.0 \n",
537
+ "7 -0.8 0.0 2.5 340 45.0 2.6 -11.2 \n",
538
+ "8 -0.5 0.0 1.2 360 43.0 2.5 -11.5 \n",
539
+ "9 1.7 0.0 2.1 20 39.0 2.7 -10.8 \n",
540
+ "\n",
541
+ " loc_pressure sea_pressure solarRad ... year month hour \\\n",
542
+ "0 1015.8 1024.6 0.00 ... 2018 1 0 \n",
543
+ "1 1015.5 1024.3 0.00 ... 2018 1 1 \n",
544
+ "2 1015.7 1024.5 0.00 ... 2018 1 2 \n",
545
+ "3 1015.9 1024.7 0.00 ... 2018 1 3 \n",
546
+ "4 1016.0 1024.9 0.00 ... 2018 1 4 \n",
547
+ "5 1016.0 1024.9 0.00 ... 2018 1 5 \n",
548
+ "6 1016.5 1025.4 0.00 ... 2018 1 6 \n",
549
+ "7 1017.1 1026.0 0.00 ... 2018 1 7 \n",
550
+ "8 1017.4 1026.3 0.03 ... 2018 1 8 \n",
551
+ "9 1018.1 1026.9 0.46 ... 2018 1 9 \n",
552
+ "\n",
553
+ " ground_temp - temp_C hour_sin hour_cos month_sin month_cos visi \\\n",
554
+ "0 -5.4 0.000000 1.000000e+00 0.5 0.866025 2000.0 \n",
555
+ "1 -5.4 0.258819 9.659258e-01 0.5 0.866025 2000.0 \n",
556
+ "2 -5.4 0.500000 8.660254e-01 0.5 0.866025 2000.0 \n",
557
+ "3 -5.0 0.707107 7.071068e-01 0.5 0.866025 2000.0 \n",
558
+ "4 -4.3 0.866025 5.000000e-01 0.5 0.866025 2000.0 \n",
559
+ "5 -4.0 0.965926 2.588190e-01 0.5 0.866025 2000.0 \n",
560
+ "6 -4.1 1.000000 6.123234e-17 0.5 0.866025 2000.0 \n",
561
+ "7 -4.5 0.965926 -2.588190e-01 0.5 0.866025 2000.0 \n",
562
+ "8 -4.0 0.866025 -5.000000e-01 0.5 0.866025 2000.0 \n",
563
+ "9 2.8 0.707107 -7.071068e-01 0.5 0.866025 1953.0 \n",
564
+ "\n",
565
+ " multi_class \n",
566
+ "0 2 \n",
567
+ "1 2 \n",
568
+ "2 2 \n",
569
+ "3 2 \n",
570
+ "4 2 \n",
571
+ "5 2 \n",
572
+ "6 2 \n",
573
+ "7 2 \n",
574
+ "8 2 \n",
575
+ "9 2 \n",
576
+ "\n",
577
+ "[10 rows x 30 columns]"
578
+ ]
579
+ },
580
+ "execution_count": 14,
581
+ "metadata": {},
582
+ "output_type": "execute_result"
583
+ }
584
+ ],
585
+ "source": [
586
+ "df_busan_train.head(10)"
587
+ ]
588
+ },
589
+ {
590
+ "cell_type": "code",
591
+ "execution_count": 15,
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "data": {
596
+ "text/html": [
597
+ "<div>\n",
598
+ "<style scoped>\n",
599
+ " .dataframe tbody tr th:only-of-type {\n",
600
+ " vertical-align: middle;\n",
601
+ " }\n",
602
+ "\n",
603
+ " .dataframe tbody tr th {\n",
604
+ " vertical-align: top;\n",
605
+ " }\n",
606
+ "\n",
607
+ " .dataframe thead th {\n",
608
+ " text-align: right;\n",
609
+ " }\n",
610
+ "</style>\n",
611
+ "<table border=\"1\" class=\"dataframe\">\n",
612
+ " <thead>\n",
613
+ " <tr style=\"text-align: right;\">\n",
614
+ " <th></th>\n",
615
+ " <th>temp_C</th>\n",
616
+ " <th>precip_mm</th>\n",
617
+ " <th>wind_speed</th>\n",
618
+ " <th>wind_dir</th>\n",
619
+ " <th>hm</th>\n",
620
+ " <th>vap_pressure</th>\n",
621
+ " <th>dewpoint_C</th>\n",
622
+ " <th>loc_pressure</th>\n",
623
+ " <th>sea_pressure</th>\n",
624
+ " <th>solarRad</th>\n",
625
+ " <th>...</th>\n",
626
+ " <th>year</th>\n",
627
+ " <th>month</th>\n",
628
+ " <th>hour</th>\n",
629
+ " <th>ground_temp - temp_C</th>\n",
630
+ " <th>hour_sin</th>\n",
631
+ " <th>hour_cos</th>\n",
632
+ " <th>month_sin</th>\n",
633
+ " <th>month_cos</th>\n",
634
+ " <th>visi</th>\n",
635
+ " <th>multi_class</th>\n",
636
+ " </tr>\n",
637
+ " </thead>\n",
638
+ " <tbody>\n",
639
+ " <tr>\n",
640
+ " <th>26294</th>\n",
641
+ " <td>0.1</td>\n",
642
+ " <td>0.0</td>\n",
643
+ " <td>6.3</td>\n",
644
+ " <td>270</td>\n",
645
+ " <td>37.0</td>\n",
646
+ " <td>2.3</td>\n",
647
+ " <td>-12.9</td>\n",
648
+ " <td>1013.3</td>\n",
649
+ " <td>1022.1</td>\n",
650
+ " <td>2.07</td>\n",
651
+ " <td>...</td>\n",
652
+ " <td>2020</td>\n",
653
+ " <td>12</td>\n",
654
+ " <td>14</td>\n",
655
+ " <td>5.8</td>\n",
656
+ " <td>-0.500000</td>\n",
657
+ " <td>-8.660254e-01</td>\n",
658
+ " <td>-2.449294e-16</td>\n",
659
+ " <td>1.0</td>\n",
660
+ " <td>5000.0</td>\n",
661
+ " <td>2</td>\n",
662
+ " </tr>\n",
663
+ " <tr>\n",
664
+ " <th>26295</th>\n",
665
+ " <td>1.2</td>\n",
666
+ " <td>0.0</td>\n",
667
+ " <td>5.9</td>\n",
668
+ " <td>270</td>\n",
669
+ " <td>35.0</td>\n",
670
+ " <td>2.3</td>\n",
671
+ " <td>-12.6</td>\n",
672
+ " <td>1013.2</td>\n",
673
+ " <td>1022.0</td>\n",
674
+ " <td>1.71</td>\n",
675
+ " <td>...</td>\n",
676
+ " <td>2020</td>\n",
677
+ " <td>12</td>\n",
678
+ " <td>15</td>\n",
679
+ " <td>5.6</td>\n",
680
+ " <td>-0.707107</td>\n",
681
+ " <td>-7.071068e-01</td>\n",
682
+ " <td>-2.449294e-16</td>\n",
683
+ " <td>1.0</td>\n",
684
+ " <td>5000.0</td>\n",
685
+ " <td>2</td>\n",
686
+ " </tr>\n",
687
+ " <tr>\n",
688
+ " <th>26296</th>\n",
689
+ " <td>1.6</td>\n",
690
+ " <td>0.0</td>\n",
691
+ " <td>3.6</td>\n",
692
+ " <td>290</td>\n",
693
+ " <td>34.0</td>\n",
694
+ " <td>2.3</td>\n",
695
+ " <td>-12.6</td>\n",
696
+ " <td>1012.8</td>\n",
697
+ " <td>1021.6</td>\n",
698
+ " <td>1.14</td>\n",
699
+ " <td>...</td>\n",
700
+ " <td>2020</td>\n",
701
+ " <td>12</td>\n",
702
+ " <td>16</td>\n",
703
+ " <td>1.4</td>\n",
704
+ " <td>-0.866025</td>\n",
705
+ " <td>-5.000000e-01</td>\n",
706
+ " <td>-2.449294e-16</td>\n",
707
+ " <td>1.0</td>\n",
708
+ " <td>5000.0</td>\n",
709
+ " <td>2</td>\n",
710
+ " </tr>\n",
711
+ " <tr>\n",
712
+ " <th>26297</th>\n",
713
+ " <td>1.2</td>\n",
714
+ " <td>0.0</td>\n",
715
+ " <td>3.8</td>\n",
716
+ " <td>250</td>\n",
717
+ " <td>38.0</td>\n",
718
+ " <td>2.5</td>\n",
719
+ " <td>-11.5</td>\n",
720
+ " <td>1012.8</td>\n",
721
+ " <td>1021.6</td>\n",
722
+ " <td>0.48</td>\n",
723
+ " <td>...</td>\n",
724
+ " <td>2020</td>\n",
725
+ " <td>12</td>\n",
726
+ " <td>17</td>\n",
727
+ " <td>-0.4</td>\n",
728
+ " <td>-0.965926</td>\n",
729
+ " <td>-2.588190e-01</td>\n",
730
+ " <td>-2.449294e-16</td>\n",
731
+ " <td>1.0</td>\n",
732
+ " <td>5000.0</td>\n",
733
+ " <td>2</td>\n",
734
+ " </tr>\n",
735
+ " <tr>\n",
736
+ " <th>26298</th>\n",
737
+ " <td>0.9</td>\n",
738
+ " <td>0.0</td>\n",
739
+ " <td>3.8</td>\n",
740
+ " <td>270</td>\n",
741
+ " <td>40.0</td>\n",
742
+ " <td>2.6</td>\n",
743
+ " <td>-11.2</td>\n",
744
+ " <td>1013.1</td>\n",
745
+ " <td>1021.9</td>\n",
746
+ " <td>0.02</td>\n",
747
+ " <td>...</td>\n",
748
+ " <td>2020</td>\n",
749
+ " <td>12</td>\n",
750
+ " <td>18</td>\n",
751
+ " <td>-0.8</td>\n",
752
+ " <td>-1.000000</td>\n",
753
+ " <td>-1.836970e-16</td>\n",
754
+ " <td>-2.449294e-16</td>\n",
755
+ " <td>1.0</td>\n",
756
+ " <td>5000.0</td>\n",
757
+ " <td>2</td>\n",
758
+ " </tr>\n",
759
+ " <tr>\n",
760
+ " <th>26299</th>\n",
761
+ " <td>0.6</td>\n",
762
+ " <td>0.0</td>\n",
763
+ " <td>6.2</td>\n",
764
+ " <td>270</td>\n",
765
+ " <td>41.0</td>\n",
766
+ " <td>2.6</td>\n",
767
+ " <td>-11.1</td>\n",
768
+ " <td>1014.0</td>\n",
769
+ " <td>1022.8</td>\n",
770
+ " <td>0.00</td>\n",
771
+ " <td>...</td>\n",
772
+ " <td>2020</td>\n",
773
+ " <td>12</td>\n",
774
+ " <td>19</td>\n",
775
+ " <td>-1.1</td>\n",
776
+ " <td>-0.965926</td>\n",
777
+ " <td>2.588190e-01</td>\n",
778
+ " <td>-2.449294e-16</td>\n",
779
+ " <td>1.0</td>\n",
780
+ " <td>5000.0</td>\n",
781
+ " <td>2</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>26300</th>\n",
785
+ " <td>0.1</td>\n",
786
+ " <td>0.0</td>\n",
787
+ " <td>6.0</td>\n",
788
+ " <td>270</td>\n",
789
+ " <td>44.0</td>\n",
790
+ " <td>2.7</td>\n",
791
+ " <td>-10.7</td>\n",
792
+ " <td>1014.8</td>\n",
793
+ " <td>1023.6</td>\n",
794
+ " <td>0.00</td>\n",
795
+ " <td>...</td>\n",
796
+ " <td>2020</td>\n",
797
+ " <td>12</td>\n",
798
+ " <td>20</td>\n",
799
+ " <td>-0.9</td>\n",
800
+ " <td>-0.866025</td>\n",
801
+ " <td>5.000000e-01</td>\n",
802
+ " <td>-2.449294e-16</td>\n",
803
+ " <td>1.0</td>\n",
804
+ " <td>5000.0</td>\n",
805
+ " <td>2</td>\n",
806
+ " </tr>\n",
807
+ " <tr>\n",
808
+ " <th>26301</th>\n",
809
+ " <td>-0.2</td>\n",
810
+ " <td>0.0</td>\n",
811
+ " <td>5.0</td>\n",
812
+ " <td>290</td>\n",
813
+ " <td>48.0</td>\n",
814
+ " <td>2.9</td>\n",
815
+ " <td>-9.9</td>\n",
816
+ " <td>1014.6</td>\n",
817
+ " <td>1023.4</td>\n",
818
+ " <td>0.00</td>\n",
819
+ " <td>...</td>\n",
820
+ " <td>2020</td>\n",
821
+ " <td>12</td>\n",
822
+ " <td>21</td>\n",
823
+ " <td>-0.8</td>\n",
824
+ " <td>-0.707107</td>\n",
825
+ " <td>7.071068e-01</td>\n",
826
+ " <td>-2.449294e-16</td>\n",
827
+ " <td>1.0</td>\n",
828
+ " <td>5000.0</td>\n",
829
+ " <td>2</td>\n",
830
+ " </tr>\n",
831
+ " <tr>\n",
832
+ " <th>26302</th>\n",
833
+ " <td>-0.7</td>\n",
834
+ " <td>0.0</td>\n",
835
+ " <td>2.7</td>\n",
836
+ " <td>270</td>\n",
837
+ " <td>51.0</td>\n",
838
+ " <td>3.0</td>\n",
839
+ " <td>-9.6</td>\n",
840
+ " <td>1014.8</td>\n",
841
+ " <td>1023.6</td>\n",
842
+ " <td>0.00</td>\n",
843
+ " <td>...</td>\n",
844
+ " <td>2020</td>\n",
845
+ " <td>12</td>\n",
846
+ " <td>22</td>\n",
847
+ " <td>-0.6</td>\n",
848
+ " <td>-0.500000</td>\n",
849
+ " <td>8.660254e-01</td>\n",
850
+ " <td>-2.449294e-16</td>\n",
851
+ " <td>1.0</td>\n",
852
+ " <td>5000.0</td>\n",
853
+ " <td>2</td>\n",
854
+ " </tr>\n",
855
+ " <tr>\n",
856
+ " <th>26303</th>\n",
857
+ " <td>-0.7</td>\n",
858
+ " <td>0.0</td>\n",
859
+ " <td>3.8</td>\n",
860
+ " <td>250</td>\n",
861
+ " <td>55.0</td>\n",
862
+ " <td>3.2</td>\n",
863
+ " <td>-8.6</td>\n",
864
+ " <td>1015.1</td>\n",
865
+ " <td>1024.0</td>\n",
866
+ " <td>0.00</td>\n",
867
+ " <td>...</td>\n",
868
+ " <td>2020</td>\n",
869
+ " <td>12</td>\n",
870
+ " <td>23</td>\n",
871
+ " <td>-0.6</td>\n",
872
+ " <td>-0.258819</td>\n",
873
+ " <td>9.659258e-01</td>\n",
874
+ " <td>-2.449294e-16</td>\n",
875
+ " <td>1.0</td>\n",
876
+ " <td>5000.0</td>\n",
877
+ " <td>2</td>\n",
878
+ " </tr>\n",
879
+ " </tbody>\n",
880
+ "</table>\n",
881
+ "<p>10 rows × 30 columns</p>\n",
882
+ "</div>"
883
+ ],
884
+ "text/plain": [
885
+ " temp_C precip_mm wind_speed wind_dir hm vap_pressure dewpoint_C \\\n",
886
+ "26294 0.1 0.0 6.3 270 37.0 2.3 -12.9 \n",
887
+ "26295 1.2 0.0 5.9 270 35.0 2.3 -12.6 \n",
888
+ "26296 1.6 0.0 3.6 290 34.0 2.3 -12.6 \n",
889
+ "26297 1.2 0.0 3.8 250 38.0 2.5 -11.5 \n",
890
+ "26298 0.9 0.0 3.8 270 40.0 2.6 -11.2 \n",
891
+ "26299 0.6 0.0 6.2 270 41.0 2.6 -11.1 \n",
892
+ "26300 0.1 0.0 6.0 270 44.0 2.7 -10.7 \n",
893
+ "26301 -0.2 0.0 5.0 290 48.0 2.9 -9.9 \n",
894
+ "26302 -0.7 0.0 2.7 270 51.0 3.0 -9.6 \n",
895
+ "26303 -0.7 0.0 3.8 250 55.0 3.2 -8.6 \n",
896
+ "\n",
897
+ " loc_pressure sea_pressure solarRad ... year month hour \\\n",
898
+ "26294 1013.3 1022.1 2.07 ... 2020 12 14 \n",
899
+ "26295 1013.2 1022.0 1.71 ... 2020 12 15 \n",
900
+ "26296 1012.8 1021.6 1.14 ... 2020 12 16 \n",
901
+ "26297 1012.8 1021.6 0.48 ... 2020 12 17 \n",
902
+ "26298 1013.1 1021.9 0.02 ... 2020 12 18 \n",
903
+ "26299 1014.0 1022.8 0.00 ... 2020 12 19 \n",
904
+ "26300 1014.8 1023.6 0.00 ... 2020 12 20 \n",
905
+ "26301 1014.6 1023.4 0.00 ... 2020 12 21 \n",
906
+ "26302 1014.8 1023.6 0.00 ... 2020 12 22 \n",
907
+ "26303 1015.1 1024.0 0.00 ... 2020 12 23 \n",
908
+ "\n",
909
+ " ground_temp - temp_C hour_sin hour_cos month_sin month_cos \\\n",
910
+ "26294 5.8 -0.500000 -8.660254e-01 -2.449294e-16 1.0 \n",
911
+ "26295 5.6 -0.707107 -7.071068e-01 -2.449294e-16 1.0 \n",
912
+ "26296 1.4 -0.866025 -5.000000e-01 -2.449294e-16 1.0 \n",
913
+ "26297 -0.4 -0.965926 -2.588190e-01 -2.449294e-16 1.0 \n",
914
+ "26298 -0.8 -1.000000 -1.836970e-16 -2.449294e-16 1.0 \n",
915
+ "26299 -1.1 -0.965926 2.588190e-01 -2.449294e-16 1.0 \n",
916
+ "26300 -0.9 -0.866025 5.000000e-01 -2.449294e-16 1.0 \n",
917
+ "26301 -0.8 -0.707107 7.071068e-01 -2.449294e-16 1.0 \n",
918
+ "26302 -0.6 -0.500000 8.660254e-01 -2.449294e-16 1.0 \n",
919
+ "26303 -0.6 -0.258819 9.659258e-01 -2.449294e-16 1.0 \n",
920
+ "\n",
921
+ " visi multi_class \n",
922
+ "26294 5000.0 2 \n",
923
+ "26295 5000.0 2 \n",
924
+ "26296 5000.0 2 \n",
925
+ "26297 5000.0 2 \n",
926
+ "26298 5000.0 2 \n",
927
+ "26299 5000.0 2 \n",
928
+ "26300 5000.0 2 \n",
929
+ "26301 5000.0 2 \n",
930
+ "26302 5000.0 2 \n",
931
+ "26303 5000.0 2 \n",
932
+ "\n",
933
+ "[10 rows x 30 columns]"
934
+ ]
935
+ },
936
+ "execution_count": 15,
937
+ "metadata": {},
938
+ "output_type": "execute_result"
939
+ }
940
+ ],
941
+ "source": [
942
+ "df_busan_train.tail(10)"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "code",
947
+ "execution_count": 16,
948
+ "metadata": {},
949
+ "outputs": [
950
+ {
951
+ "name": "stdout",
952
+ "output_type": "stream",
953
+ "text": [
954
+ "<class 'pandas.core.frame.DataFrame'>\n",
955
+ "Index: 26304 entries, 0 to 26303\n",
956
+ "Data columns (total 30 columns):\n",
957
+ " # Column Non-Null Count Dtype \n",
958
+ "--- ------ -------------- ----- \n",
959
+ " 0 temp_C 26304 non-null float64 \n",
960
+ " 1 precip_mm 26304 non-null float64 \n",
961
+ " 2 wind_speed 26304 non-null float64 \n",
962
+ " 3 wind_dir 26304 non-null category\n",
963
+ " 4 hm 26304 non-null float64 \n",
964
+ " 5 vap_pressure 26304 non-null float64 \n",
965
+ " 6 dewpoint_C 26304 non-null float64 \n",
966
+ " 7 loc_pressure 26304 non-null float64 \n",
967
+ " 8 sea_pressure 26304 non-null float64 \n",
968
+ " 9 solarRad 26304 non-null float64 \n",
969
+ " 10 snow_cm 26304 non-null float64 \n",
970
+ " 11 cloudcover 26304 non-null category\n",
971
+ " 12 lm_cloudcover 26304 non-null category\n",
972
+ " 13 low_cloudbase 26304 non-null float64 \n",
973
+ " 14 groundtemp 26304 non-null float64 \n",
974
+ " 15 O3 26304 non-null float64 \n",
975
+ " 16 NO2 26304 non-null float64 \n",
976
+ " 17 PM10 26304 non-null float64 \n",
977
+ " 18 PM25 26304 non-null float64 \n",
978
+ " 19 binary_class 26304 non-null int64 \n",
979
+ " 20 year 26304 non-null int64 \n",
980
+ " 21 month 26304 non-null int64 \n",
981
+ " 22 hour 26304 non-null int64 \n",
982
+ " 23 ground_temp - temp_C 26304 non-null float64 \n",
983
+ " 24 hour_sin 26304 non-null float64 \n",
984
+ " 25 hour_cos 26304 non-null float64 \n",
985
+ " 26 month_sin 26304 non-null float64 \n",
986
+ " 27 month_cos 26304 non-null float64 \n",
987
+ " 28 visi 26304 non-null float64 \n",
988
+ " 29 multi_class 26304 non-null int64 \n",
989
+ "dtypes: category(3), float64(22), int64(5)\n",
990
+ "memory usage: 5.7 MB\n"
991
+ ]
992
+ }
993
+ ],
994
+ "source": [
995
+ "df_busan_train.info()"
996
+ ]
997
+ },
998
+ {
999
+ "cell_type": "code",
1000
+ "execution_count": 17,
1001
+ "metadata": {},
1002
+ "outputs": [],
1003
+ "source": [
1004
+ "df_seoul_train.to_csv(\"../data/data_for_modeling/seoul_train.csv\")\n",
1005
+ "df_seoul_test.to_csv(\"../data/data_for_modeling/seoul_test.csv\")\n",
1006
+ "\n",
1007
+ "df_busan_train.to_csv(\"../data/data_for_modeling/busan_train.csv\")\n",
1008
+ "df_busan_test.to_csv(\"../data/data_for_modeling/busan_test.csv\")\n",
1009
+ "\n",
1010
+ "df_incheon_train.to_csv(\"../data/data_for_modeling/incheon_train.csv\")\n",
1011
+ "df_incheon_test.to_csv(\"../data/data_for_modeling/incheon_test.csv\")\n",
1012
+ "\n",
1013
+ "df_daegu_train.to_csv(\"../data/data_for_modeling/daegu_train.csv\")\n",
1014
+ "df_daegu_test.to_csv(\"../data/data_for_modeling/daegu_test.csv\")\n",
1015
+ "\n",
1016
+ "df_daejeon_train.to_csv(\"../data/data_for_modeling/daejeon_train.csv\")\n",
1017
+ "df_daejeon_test.to_csv(\"../data/data_for_modeling/daejeon_test.csv\")\n",
1018
+ "\n",
1019
+ "df_gwangju_train.to_csv(\"../data/data_for_modeling/gwangju_train.csv\")\n",
1020
+ "df_gwangju_test.to_csv(\"../data/data_for_modeling/gwangju_test.csv\")\n",
1021
+ "\n",
1022
+ "df_seoul_train = pd.read_csv(\"../data/data_for_modeling/seoul_train.csv\")\n",
1023
+ "df_seoul_test = pd.read_csv(\"../data/data_for_modeling/seoul_test.csv\")\n"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "cell_type": "code",
1028
+ "execution_count": 18,
1029
+ "metadata": {},
1030
+ "outputs": [
1031
+ {
1032
+ "name": "stdout",
1033
+ "output_type": "stream",
1034
+ "text": [
1035
+ "Counter({2: 8266, 1: 481, 0: 13})\n",
1036
+ "Counter({2: 23686, 1: 2579, 0: 39})\n",
1037
+ "Counter({2: 8455, 1: 281, 0: 24})\n",
1038
+ "Counter({2: 24694, 1: 1516, 0: 94})\n",
1039
+ "Counter({2: 7373, 1: 1205, 0: 182})\n",
1040
+ "Counter({2: 21893, 1: 3892, 0: 519})\n",
1041
+ "Counter({2: 8631, 1: 128, 0: 1})\n",
1042
+ "Counter({2: 25149, 1: 1107, 0: 48})\n",
1043
+ "Counter({2: 8089, 1: 618, 0: 53})\n",
1044
+ "Counter({2: 23471, 1: 2660, 0: 173})\n",
1045
+ "Counter({2: 8087, 1: 643, 0: 30})\n",
1046
+ "Counter({2: 23798, 1: 2411, 0: 95})\n"
1047
+ ]
1048
+ }
1049
+ ],
1050
+ "source": [
1051
+ "print(Counter(df_seoul_test['multi_class']))\n",
1052
+ "print(Counter(df_seoul_train['multi_class']))\n",
1053
+ "\n",
1054
+ "print(Counter(df_busan_test['multi_class']))\n",
1055
+ "print(Counter(df_busan_train['multi_class']))\n",
1056
+ "\n",
1057
+ "print(Counter(df_incheon_test['multi_class']))\n",
1058
+ "print(Counter(df_incheon_train['multi_class']))\n",
1059
+ "\n",
1060
+ "print(Counter(df_daegu_test['multi_class']))\n",
1061
+ "print(Counter(df_daegu_train['multi_class']))\n",
1062
+ "\n",
1063
+ "print(Counter(df_daejeon_test['multi_class']))\n",
1064
+ "print(Counter(df_daejeon_train['multi_class']))\n",
1065
+ "\n",
1066
+ "print(Counter(df_gwangju_test['multi_class']))\n",
1067
+ "print(Counter(df_gwangju_train['multi_class']))"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "cell_type": "code",
1072
+ "execution_count": null,
1073
+ "metadata": {},
1074
+ "outputs": [],
1075
+ "source": []
1076
+ }
1077
+ ],
1078
+ "metadata": {
1079
+ "kernelspec": {
1080
+ "display_name": "Python 3",
1081
+ "language": "python",
1082
+ "name": "python3"
1083
+ },
1084
+ "language_info": {
1085
+ "codemirror_mode": {
1086
+ "name": "ipython",
1087
+ "version": 3
1088
+ },
1089
+ "file_extension": ".py",
1090
+ "mimetype": "text/x-python",
1091
+ "name": "python",
1092
+ "nbconvert_exporter": "python",
1093
+ "pygments_lexer": "ipython3",
1094
+ "version": "3.8.10"
1095
+ }
1096
+ },
1097
+ "nbformat": 4,
1098
+ "nbformat_minor": 2
1099
+ }
Analysis_code/model_result/best_sample/ensemble_best_sample.csv ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,CSI,MCC,Accuracy,region,data_sample
2
+ deepgbm+ft_transformer,0.6992424754332419,0.8049778134563997,0.9617050719032529,seoul,best
3
+ deepgbm+resnet_like,0.7090721222546449,0.8134996041947694,0.9639575067994112,seoul,best
4
+ deepgbm+XGBoost,0.6692268948642789,0.7828377071996622,0.9579950262411524,seoul,best
5
+ deepgbm+LightGBM,0.6935149238495999,0.801572479004747,0.9620935948299524,seoul,best
6
+ ft_transformer+resnet_like,0.6252731559795573,0.7509360530096033,0.9488043848924154,seoul,best
7
+ ft_transformer+XGBoost,0.6142839547044981,0.7403116223696319,0.9470938235563208,seoul,best
8
+ ft_transformer+LightGBM,0.6085686211212852,0.7359331482423171,0.9466367866856302,seoul,best
9
+ resnet_like+XGBoost,0.6194270544060083,0.7446567256109252,0.9478222130731675,seoul,best
10
+ resnet_like+LightGBM,0.6136847718241535,0.741095701007121,0.9457727208457053,seoul,best
11
+ XGBoost+LightGBM,0.5861106806341969,0.7149062718037028,0.9421973118413721,seoul,best
12
+ deepgbm+ft_transformer+resnet_like,0.6821685259093041,0.7946632359980877,0.9596256373148356,seoul,best
13
+ deepgbm+ft_transformer+XGBoost,0.6730333196442752,0.7867573605932577,0.9584481123836616,seoul,best
14
+ deepgbm+ft_transformer+LightGBM,0.6764764169794865,0.7902017631994056,0.9590548610591277,seoul,best
15
+ deepgbm+resnet_like+XGBoost,0.6784375221650579,0.7915054245115618,0.9591382422170655,seoul,best
16
+ deepgbm+resnet_like+LightGBM,0.6775625748710835,0.7913206958685782,0.9590235671332685,seoul,best
17
+ deepgbm+XGBoost+LightGBM,0.6325378315490732,0.7547274504003308,0.9511999817018905,seoul,best
18
+ ft_transformer+resnet_like+XGBoost,0.6332314687362993,0.7561343704178952,0.9500612362868145,seoul,best
19
+ ft_transformer+resnet_like+LightGBM,0.6306548804840734,0.7545886604473321,0.9496810306826026,seoul,best
20
+ ft_transformer+XGBoost+LightGBM,0.6052879958628536,0.7319675532436248,0.9455772637672482,seoul,best
21
+ resnet_like+XGBoost+LightGBM,0.6103490502864374,0.7369455626993492,0.9460374196338716,seoul,best
22
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.6729967545904317,0.7875845379790564,0.9581108449567916,seoul,best
23
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.6719561859986568,0.7873723840805852,0.9580337017907196,seoul,best
24
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.6531335492225346,0.7716405213564462,0.9549966314843926,seoul,best
25
+ deepgbm+resnet_like+XGBoost+LightGBM,0.6495283344540792,0.769203226760809,0.9539740166845488,seoul,best
26
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.6233313312481723,0.7475113353931637,0.9482768587136429,seoul,best
27
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.6569118107203898,0.7752226196801869,0.955377564854322,seoul,best
28
+ deepgbm+ft_transformer,0.6231286091598959,0.762319461740938,0.9692817368232819,busan,best
29
+ deepgbm+resnet_like,0.693965736363066,0.8126142282014271,0.9774987316083207,busan,best
30
+ deepgbm+XGBoost,0.5949465703841524,0.7372757652859034,0.9695935324500337,busan,best
31
+ deepgbm+LightGBM,0.6134672160989406,0.7540785415035627,0.969397659505452,busan,best
32
+ ft_transformer+resnet_like,0.5993289393140034,0.7415869283802814,0.9687921584283586,busan,best
33
+ ft_transformer+XGBoost,0.5275703861594696,0.6836131837976293,0.9627103242924039,busan,best
34
+ ft_transformer+LightGBM,0.5340117437566735,0.690570910687501,0.9611092397135513,busan,best
35
+ resnet_like+XGBoost,0.5476348496496576,0.6975679320922632,0.9653371426670326,busan,best
36
+ resnet_like+LightGBM,0.5541563454462936,0.7052881357398278,0.9637749415708096,busan,best
37
+ XGBoost+LightGBM,0.4789290487253062,0.6395338303931094,0.9572717310843294,busan,best
38
+ deepgbm+ft_transformer+resnet_like,0.6446887106206897,0.777619884223958,0.9728593291247681,busan,best
39
+ deepgbm+ft_transformer+XGBoost,0.6102029002552042,0.7511249566662123,0.9699326712744633,busan,best
40
+ deepgbm+ft_transformer+LightGBM,0.6162985577163776,0.7570233197218496,0.9692066729878318,busan,best
41
+ deepgbm+resnet_like+XGBoost,0.6261344637257955,0.7612451856135231,0.9726364248821019,busan,best
42
+ deepgbm+resnet_like+LightGBM,0.6357139958459742,0.770523881133966,0.9720613859986192,busan,best
43
+ deepgbm+XGBoost+LightGBM,0.550954042564896,0.7024853673988001,0.9645722608977718,busan,best
44
+ ft_transformer+resnet_like+XGBoost,0.5708235070191732,0.7183051671189992,0.9673502466086118,busan,best
45
+ ft_transformer+resnet_like+LightGBM,0.5747801644059322,0.723481589964096,0.9662444585838926,busan,best
46
+ ft_transformer+XGBoost+LightGBM,0.5188547215134367,0.6758426217880089,0.9612265139606259,busan,best
47
+ resnet_like+XGBoost+LightGBM,0.5283032797443571,0.6828500944597963,0.9625962730077933,busan,best
48
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.62252465197324,0.7595480882291313,0.9716073641573305,busan,best
49
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.6273921955959187,0.7650978772099609,0.9711861957398673,busan,best
50
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.5803171388664561,0.7273427892946641,0.9671579085260872,busan,best
51
+ deepgbm+resnet_like+XGBoost+LightGBM,0.5883015315947713,0.7322310098800981,0.9685279794728481,busan,best
52
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.5527408442450511,0.7040063830266932,0.9648394548826841,busan,best
53
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.5949050119452819,0.7385090163673939,0.9689076652444045,busan,best
54
+ deepgbm+ft_transformer,0.5873884001633557,0.7086005084355499,0.9163964576523526,incheon,best
55
+ deepgbm+resnet_like,0.5938343436639008,0.7129218344958757,0.9139223661119011,incheon,best
56
+ deepgbm+XGBoost,0.5919031180535835,0.7111840783628853,0.9141871688665986,incheon,best
57
+ deepgbm+LightGBM,0.5936054700869063,0.71280494431202,0.9151763064434298,incheon,best
58
+ ft_transformer+resnet_like,0.5967079690105518,0.7167701525416347,0.9161293676339713,incheon,best
59
+ ft_transformer+XGBoost,0.5958609493419124,0.7170391565071776,0.9169653625105005,incheon,best
60
+ ft_transformer+LightGBM,0.5970916463252486,0.7186247751024871,0.9177277490830152,incheon,best
61
+ resnet_like+XGBoost,0.6048691059122082,0.7230658352305586,0.9150990593108267,incheon,best
62
+ resnet_like+LightGBM,0.6006642978391444,0.7204725929547785,0.913995974415916,incheon,best
63
+ XGBoost+LightGBM,0.5923037998052801,0.7130252479770401,0.9127417221847942,incheon,best
64
+ deepgbm+ft_transformer+resnet_like,0.5991788818331828,0.7177110572369007,0.9171559331619964,incheon,best
65
+ deepgbm+ft_transformer+XGBoost,0.5957796816002722,0.7161817507572641,0.9171555172958721,incheon,best
66
+ deepgbm+ft_transformer+LightGBM,0.5956266359998414,0.7160886468480842,0.9173455681147126,incheon,best
67
+ deepgbm+resnet_like+XGBoost,0.606730149793507,0.7239728449518917,0.9167729204614451,incheon,best
68
+ deepgbm+resnet_like+LightGBM,0.6063493089884,0.7235778913430906,0.9167717768296031,incheon,best
69
+ deepgbm+XGBoost+LightGBM,0.6018765631426876,0.7202419676609231,0.9160501451372774,incheon,best
70
+ ft_transformer+resnet_like+XGBoost,0.6044191487414033,0.7234603140558615,0.9175339554690555,incheon,best
71
+ ft_transformer+resnet_like+LightGBM,0.6061367487233379,0.7248883683993704,0.9179906804401528,incheon,best
72
+ ft_transformer+XGBoost+LightGBM,0.6013258447351273,0.721556226775587,0.9174587876670742,incheon,best
73
+ resnet_like+XGBoost+LightGBM,0.6028836532105156,0.7216380012607432,0.915023475642721,incheon,best
74
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.6058688939038325,0.7235044499077539,0.9180666799743826,incheon,best
75
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.6053266964324343,0.7232391120300768,0.9180279004582844,incheon,best
76
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.6030917448851759,0.7220222485555309,0.9181051475908876,incheon,best
77
+ deepgbm+resnet_like+XGBoost+LightGBM,0.6054312862809185,0.7231150429381622,0.9164303507414893,incheon,best
78
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.6046920079336132,0.7233826726564859,0.9171152822483387,incheon,best
79
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.6109420059115421,0.72786127635734,0.9189396869359815,incheon,best
80
+ deepgbm+ft_transformer,0.6446005403603131,0.7684243711794304,0.9801833553742378,daegu,best
81
+ deepgbm+resnet_like,0.6137498386585009,0.7496378336201855,0.9772962048057489,daegu,best
82
+ deepgbm+XGBoost,0.5974990513708366,0.7398559419115336,0.9765007568763463,daegu,best
83
+ deepgbm+LightGBM,0.6115123902876293,0.7497436182930696,0.9776771381756785,daegu,best
84
+ ft_transformer+resnet_like,0.5469112144188149,0.6944690161411611,0.9743305595062838,daegu,best
85
+ ft_transformer+XGBoost,0.5161941376039343,0.6722802482596641,0.9719755096439354,daegu,best
86
+ ft_transformer+LightGBM,0.5073005220098658,0.6635248688295593,0.9720125217290049,daegu,best
87
+ resnet_like+XGBoost,0.5029516970497304,0.6645749569024574,0.9683687027472115,daegu,best
88
+ resnet_like+LightGBM,0.4951131290074195,0.6589965524803283,0.968139976378804,daegu,best
89
+ XGBoost+LightGBM,0.4549973810817563,0.6238297325596118,0.9647562816578087,daegu,best
90
+ deepgbm+ft_transformer+resnet_like,0.6183060943020874,0.7512947555661068,0.978549833237684,daegu,best
91
+ deepgbm+ft_transformer+XGBoost,0.6017176745611856,0.7392748101693137,0.9775999950096065,daegu,best
92
+ deepgbm+ft_transformer+LightGBM,0.6004996462619208,0.7388281525299276,0.9778661493292079,daegu,best
93
+ deepgbm+resnet_like+XGBoost,0.5739664154870325,0.7226992953385828,0.9747144039390839,daegu,best
94
+ deepgbm+resnet_like+LightGBM,0.5760590538958189,0.7240729842389874,0.975132349394083,daegu,best
95
+ deepgbm+XGBoost+LightGBM,0.5273980786589871,0.6866104535247874,0.9714085801498781,daegu,best
96
+ ft_transformer+resnet_like+XGBoost,0.5287822236272242,0.6819314770838893,0.9723570668130517,daegu,best
97
+ ft_transformer+resnet_like+LightGBM,0.520853018041326,0.675931210971238,0.972014705026158,daegu,best
98
+ ft_transformer+XGBoost+LightGBM,0.48437190164347993,0.6467498445011408,0.9690512430238457,daegu,best
99
+ resnet_like+XGBoost+LightGBM,0.4803327637605557,0.6466009408894583,0.9668476723973018,daegu,best
100
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.5787636584539054,0.7225361676622583,0.9760432001730003,daegu,best
101
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.5827514275296815,0.7252407488954487,0.9763842103949898,daegu,best
102
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.5563231124331217,0.7066219754268251,0.9745609493391888,daegu,best
103
+ deepgbm+resnet_like+XGBoost+LightGBM,0.5497033929377105,0.7036672133164642,0.9729662067187331,daegu,best
104
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.5057939271151667,0.6644447331953721,0.9703813908226664,daegu,best
105
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.5538086053234044,0.7036101925279703,0.9742201470502616,daegu,best
106
+ deepgbm+ft_transformer,0.6798477338109503,0.7890627213525137,0.9581849730934616,daejeon,best
107
+ deepgbm+resnet_like,0.665680366173557,0.7799054500657331,0.9548432808510284,daejeon,best
108
+ deepgbm+XGBoost,0.6574065703012586,0.7732790182043394,0.9541580374445858,daejeon,best
109
+ deepgbm+LightGBM,0.6615432493953055,0.7763462848232718,0.9551088113714433,daejeon,best
110
+ ft_transformer+resnet_like,0.5874777029243061,0.7124933721407519,0.9448812078415716,daejeon,best
111
+ ft_transformer+XGBoost,0.5819133878684225,0.7079795420327318,0.9433591378263508,daejeon,best
112
+ ft_transformer+LightGBM,0.5805334443959108,0.7070987268766625,0.9437394473970939,daejeon,best
113
+ resnet_like+XGBoost,0.5621088257073251,0.6925993749297947,0.9389891126248638,daejeon,best
114
+ resnet_like+LightGBM,0.5552708987061287,0.6862528661139199,0.9386090109871829,daejeon,best
115
+ XGBoost+LightGBM,0.5288440675809284,0.6634738950206458,0.9327136928080112,daejeon,best
116
+ deepgbm+ft_transformer+resnet_like,0.6605998838783642,0.7743701608802702,0.9553335870116691,daejeon,best
117
+ deepgbm+ft_transformer+XGBoost,0.6495239084733958,0.7651993867637442,0.9536631567565769,daejeon,best
118
+ deepgbm+ft_transformer+LightGBM,0.6556854096497969,0.7699591127202252,0.9550316682053713,daejeon,best
119
+ deepgbm+resnet_like+XGBoost,0.646377396487214,0.7644362437422076,0.9526370070946761,daejeon,best
120
+ deepgbm+resnet_like+LightGBM,0.6441717942778256,0.7623630196443353,0.9527517861450042,daejeon,best
121
+ deepgbm+XGBoost+LightGBM,0.6104936580055047,0.7355161749128437,0.9470099225657277,daejeon,best
122
+ ft_transformer+resnet_like+XGBoost,0.5926724219238438,0.7169506517948077,0.9453380367792,daejeon,best
123
+ ft_transformer+resnet_like+LightGBM,0.5868504053718506,0.7120674745305696,0.9448050003742795,daejeon,best
124
+ ft_transformer+XGBoost+LightGBM,0.5697749480416849,0.6978540261423789,0.9408133093794446,daejeon,best
125
+ resnet_like+XGBoost+LightGBM,0.5568421888456356,0.6879639430352676,0.9383421289018639,daejeon,best
126
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.6464484806055241,0.7628617034260374,0.953282847185834,daejeon,best
127
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.6432761028976647,0.7603027833589299,0.9530166928662326,daejeon,best
128
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.6268590959555621,0.7470252829879999,0.9501275669336527,daejeon,best
129
+ deepgbm+resnet_like+XGBoost+LightGBM,0.617990326312608,0.7412687534166494,0.9484927971987257,daejeon,best
130
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.5778772266115083,0.7046854387067157,0.9426776372150277,daejeon,best
131
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.6251709503819354,0.7454103687111592,0.9500901389824588,daejeon,best
132
+ deepgbm+ft_transformer,0.6798477338109503,0.7672365450318365,0.9541375560379602,gwangju,best
133
+ deepgbm+resnet_like,0.665680366173557,0.7714595133764687,0.9543727976423164,gwangju,best
134
+ deepgbm+XGBoost,0.6574065703012586,0.7719143895834364,0.9543191075928837,gwangju,best
135
+ deepgbm+LightGBM,0.6615432493953055,0.7728007686314036,0.9544770483485955,gwangju,best
136
+ ft_transformer+resnet_like,0.5874777029243061,0.7627495358829616,0.9528777415974249,gwangju,best
137
+ ft_transformer+XGBoost,0.5819133878684225,0.7549252510472145,0.9515179410587,gwangju,best
138
+ ft_transformer+LightGBM,0.5805334443959108,0.7489469355258954,0.9505456293509993,gwangju,best
139
+ resnet_like+XGBoost,0.5621088257073251,0.742686095459662,0.9492615719369842,gwangju,best
140
+ resnet_like+LightGBM,0.5552708987061287,0.7370427725250878,0.9481963158420041,gwangju,best
141
+ XGBoost+LightGBM,0.5288440675809284,0.7303546927519567,0.9467888046570956,gwangju,best
142
+ deepgbm+ft_transformer+resnet_like,0.6605998838783642,0.7743701608802702,0.9553335870116691,gwangju,best
143
+ deepgbm+ft_transformer+XGBoost,0.6495239084733958,0.7651993867637442,0.9536631567565769,gwangju,best
144
+ deepgbm+ft_transformer+LightGBM,0.6556854096497969,0.7699591127202252,0.9550316682053713,gwangju,best
145
+ deepgbm+resnet_like+XGBoost,0.646377396487214,0.7644362437422076,0.9526370070946761,gwangju,best
146
+ deepgbm+resnet_like+LightGBM,0.6441717942778256,0.7623630196443353,0.9527517861450042,gwangju,best
147
+ deepgbm+XGBoost+LightGBM,0.6104936580055047,0.7355161749128437,0.9470099225657277,gwangju,best
148
+ ft_transformer+resnet_like+XGBoost,0.5926724219238438,0.7169506517948077,0.9453380367792,gwangju,best
149
+ ft_transformer+resnet_like+LightGBM,0.5868504053718506,0.7120674745305696,0.9448050003742795,gwangju,best
150
+ ft_transformer+XGBoost+LightGBM,0.5697749480416849,0.6978540261423789,0.9408133093794446,gwangju,best
151
+ resnet_like+XGBoost+LightGBM,0.5568421888456356,0.6879639430352676,0.9383421289018639,gwangju,best
152
+ deepgbm+ft_transformer+resnet_like+XGBoost,0.6464484806055241,0.7628617034260374,0.953282847185834,gwangju,best
153
+ deepgbm+ft_transformer+resnet_like+LightGBM,0.6432761028976647,0.7603027833589299,0.9530166928662326,gwangju,best
154
+ deepgbm+ft_transformer+XGBoost+LightGBM,0.6268590959555621,0.7470252829879999,0.9501275669336527,gwangju,best
155
+ deepgbm+resnet_like+XGBoost+LightGBM,0.617990326312608,0.7412687534166494,0.9484927971987257,gwangju,best
156
+ ft_transformer+resnet_like+XGBoost+LightGBM,0.5778772266115083,0.7046854387067157,0.9426776372150277,gwangju,best
157
+ deepgbm+ft_transformer+resnet_like+XGBoost+LightGBM,0.6251709503819354,0.7454103687111592,0.9500901389824588,gwangju,best
Analysis_code/model_result/deepgbm_sampled_data_test.csv ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ region,model,data_sample,CSI,MCC,Accuracy
2
+ seoul,deepgbm,pure,0.6442509203188513,0.7660970126461858,0.9575257213197927
3
+ busan,deepgbm,pure,0.6818642050717673,0.8049988566475562,0.9749078856534504
4
+ incheon,deepgbm,pure,0.5671286321477068,0.6887072986816571,0.9090567324566875
5
+ daegu,deepgbm,pure,0.5768403304029271,0.7158561147888857,0.9752356921259908
6
+ daejeon,deepgbm,pure,0.6879666308365847,0.797628286801526,0.9573174763580109
7
+ gwangju,deepgbm,pure,0.6169039999207967,0.7404995782864335,0.9513605060259002
8
+ seoul,deepgbm,smote,0.6178386591753761,0.7437507767516012,0.9473890885046287
9
+ busan,deepgbm,smote,0.5771081499427251,0.7328290895553372,0.9590890660478578
10
+ incheon,deepgbm,smote,0.559840116480077,0.6829120896700739,0.9007683126647871
11
+ daegu,deepgbm,smote,0.6425770288404345,0.7686582033950807,0.9787012085069575
12
+ daejeon,deepgbm,smote,0.5569398315371836,0.6906810572279379,0.9299289492726501
13
+ gwangju,deepgbm,smote,0.5526168770566259,0.6910361569617187,0.9341257662333341
14
+ seoul,deepgbm,ctgan20000,0.7095252362606882,0.8058537207489471,0.9619258968152972
15
+ busan,deepgbm,ctgan20000,0.5799669524449778,0.7334430424969706,0.9640024203408438
16
+ incheon,deepgbm,ctgan20000,0.5289538894298621,0.6580659191964314,0.8935004283421081
17
+ daegu,deepgbm,ctgan20000,0.44609227542135127,0.616365605274888,0.9682904159492977
18
+ daejeon,deepgbm,ctgan20000,0.5716353901847903,0.7095774395951602,0.9385713751029269
19
+ gwangju,deepgbm,ctgan20000,0.4571493095973662,0.620850435638757,0.9200951709625637
20
+ seoul,deepgbm,ctgan10000,0.5616336216829804,0.700657840332931,0.9383062604486363
21
+ busan,deepgbm,ctgan10000,0.5879352056252677,0.7351657312694786,0.9672767422711281
22
+ incheon,deepgbm,ctgan10000,0.5290269463089048,0.6582877751557777,0.8928201753291581
23
+ daegu,deepgbm,ctgan10000,0.5557972102197738,0.7032214449405951,0.9747075421480318
24
+ daejeon,deepgbm,ctgan10000,0.5413249801262358,0.6761149432485588,0.9286541116683718
25
+ gwangju,deepgbm,ctgan10000,0.41816087724732665,0.5735307327535618,0.9039835816054095
26
+ seoul,deepgbm,ctgan7000,0.5813133937275518,0.7156872321691327,0.9403897497317663
27
+ busan,deepgbm,ctgan7000,0.597435747844303,0.7454746315945583,0.9670328367891807
28
+ incheon,deepgbm,ctgan7000,0.5123361043389418,0.646967905975133,0.8907633014779882
29
+ daegu,deepgbm,ctgan7000,0.6146700981464019,0.7399778687018386,0.9766398640949504
30
+ daejeon,deepgbm,ctgan7000,0.5581763081057168,0.6896808823936648,0.9308861691244354
31
+ gwangju,deepgbm,ctgan7000,0.5798647223742114,0.7012366964380711,0.9421613394216134