asteroidddd committed on
Commit
6b960c3
·
1 Parent(s): 167050b

Add KDE ensemble models (overall, major, minor) + training script

Browse files
models/major_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e7bb0661f907adf9e0455875811ec5629e51b6fd7361796260d612eb5745a8
3
+ size 2503542
models/minor_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f78559db04aeaeb876a4e909b903b48819d5ce64eeb1bfc7b31c69f597cf979
3
+ size 6767078
models/overall_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:657b13bff9e0ffd51d21dee18f885354ea5537fc7d6cb4bd30b33a9cb9da5736
3
+ size 1819026
onbid-map-prob-train.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # onbid-map-prob-train.py
2
+
3
+ import os
4
+ import shutil
5
+ import stat
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.neighbors import KernelDensity
9
+ import joblib
10
+ from huggingface_hub import HfApi, Repository
11
+
12
# Hub target repository and the access token, read from the environment.
HF_REPO_NAME = "asteroidddd/onbid-map-prob"

# Fail fast at import time when no token is configured.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
18
+ # KDE ํ•™์Šต์šฉ ํ•จ์ˆ˜ ์ •์˜
19
def _kde_density_entry(values, bandwidth, num_grid, margin):
    """Fit a Gaussian KDE on 1-D samples and precompute its grid CDF.

    Parameters
    ----------
    values : ndarray of shape (n, 1)
        Column vector of observations (NaNs already dropped).
    bandwidth : float
        Gaussian kernel bandwidth.
    num_grid : int
        Number of evaluation points in the density grid.
    margin : float
        Padding added below/above the data range for the grid.

    Returns
    -------
    dict with keys 'kde' (fitted estimator), 'x_range' (1-D grid),
    'cdf' (Riemann-sum cumulative density over the grid; note it is
    not renormalized to end at exactly 1), 'x_min', 'x_max'.
    """
    x_min = values.min() - margin
    x_max = values.max() + margin
    grid = np.linspace(x_min, x_max, num_grid).reshape(-1, 1)

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(values)

    # score_samples returns log-density; integrate exp of it on the grid.
    pdf = np.exp(kde.score_samples(grid))
    dx = grid[1, 0] - grid[0, 0]
    cdf = np.cumsum(pdf) * dx

    return {
        'kde': kde,
        'x_range': grid.flatten(),
        'cdf': cdf,
        'x_min': x_min,
        'x_max': x_max,
    }


def train_kde_models(df,
                     car_df,
                     value_col='๋‚™์ฐฐ๊ฐ€์œจ_์ตœ์ดˆ์ตœ์ €๊ฐ€๊ธฐ์ค€',
                     major_col='๋Œ€๋ถ„๋ฅ˜',
                     minor_col='์ค‘๋ถ„๋ฅ˜',
                     bandwidth=2.0,
                     num_grid=1000,
                     margin=10):
    """Train KDE models at three granularities: overall, per major
    category, and per minor category.

    The '์ž๋™์ฐจ' (car) major category is trained on the dedicated
    `car_df` instead of its rows in `df`; minor categories always use
    `df`. Categories with fewer than 2 samples fall back to the overall
    model (they share the same dict object).

    Returns
    -------
    (overall_dict, major_dict, minor_dict)
        overall_dict is a single KDE entry; the other two map category
        label -> KDE entry (see `_kde_density_entry`).
    """
    # Overall model across all rows.
    values_all = df[value_col].dropna().values.reshape(-1, 1)
    overall_dict = _kde_density_entry(values_all, bandwidth, num_grid, margin)

    # Per-major-category models; car data comes from car_df.
    major_dict = {}
    for major_cat, group in df.groupby(major_col):
        if str(major_cat) == '์ž๋™์ฐจ':
            vals = car_df[value_col].dropna().values.reshape(-1, 1)
        else:
            vals = group[value_col].dropna().values.reshape(-1, 1)

        if len(vals) < 2:
            # Too few samples to fit a KDE — reuse the overall model.
            major_dict[major_cat] = overall_dict
            continue

        major_dict[major_cat] = _kde_density_entry(vals, bandwidth, num_grid, margin)

    # Per-minor-category models (always from df).
    minor_dict = {}
    for minor_cat, group in df.groupby(minor_col):
        vals = group[value_col].dropna().values.reshape(-1, 1)
        if len(vals) < 2:
            minor_dict[minor_cat] = overall_dict
            continue

        minor_dict[minor_cat] = _kde_density_entry(vals, bandwidth, num_grid, margin)

    return overall_dict, major_dict, minor_dict
113
+
114
def rm_readonly(func, path, exc_info):
    """`shutil.rmtree` onerror hook: clear the read-only bit and retry.

    Git object files are created read-only (notably on Windows), which
    makes a plain rmtree fail; make the path writable, then re-run the
    operation (`func`) that originally failed on it.
    """
    os.chmod(path, stat.S_IWRITE)
    func(path)
117
+
118
+ # ๋ฉ”์ธ
119
+ def main():
120
+ # ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
121
+ df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
122
+ car_df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\car_data.pkl')
123
+
124
+ # KDE ๋ชจ๋ธ ํ•™์Šต
125
+ overall_model, major_models, minor_models = train_kde_models(
126
+ df=df,
127
+ car_df=car_df,
128
+ value_col='๋‚™์ฐฐ๊ฐ€์œจ_์ตœ์ดˆ์ตœ์ €๊ฐ€๊ธฐ์ค€',
129
+ major_col='๋Œ€๋ถ„๋ฅ˜',
130
+ minor_col='์ค‘๋ถ„๋ฅ˜',
131
+ bandwidth=2.0,
132
+ num_grid=1000,
133
+ margin=10
134
+ )
135
+
136
+ # KDE ๋ชจ๋ธ ์ €์žฅ
137
+ os.makedirs("output/kde_models", exist_ok=True)
138
+ joblib.dump(overall_model, "output/kde_models/overall_dict.pkl")
139
+ joblib.dump(major_models, "output/kde_models/major_dict.pkl")
140
+ joblib.dump(minor_models, "output/kde_models/minor_dict.pkl")
141
+ print("KDE ๋ชจ๋ธ ํŒŒ์ผ ์ €์žฅ ์™„๋ฃŒ: output/kde_models/overall_dict.pkl, major_dict.pkl, minor_dict.pkl")
142
+
143
+ # requirements.txt ์ž‘์„ฑ
144
+ deps = ["numpy", "pandas", "scikit-learn", "joblib", "huggingface_hub"]
145
+ with open("requirements.txt", "w", encoding="utf-8") as f:
146
+ f.write("\n".join(deps))
147
+
148
+ # Hugging Face ๋ ˆํฌ ์ƒ์„ฑ ๋ฐ ํด๋ก 
149
+ api = HfApi()
150
+ try:
151
+ api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
152
+ except Exception:
153
+ pass # ์ด๋ฏธ ๋ ˆํฌ๊ฐ€ ์กด์žฌํ•˜๋ฉด ๋ฌด์‹œ
154
+
155
+ local_dir = "hf_repo"
156
+ if os.path.isdir(local_dir):
157
+ shutil.rmtree(local_dir, onerror=rm_readonly)
158
+ repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
159
+
160
+ # ๋ชจ๋ธ ํŒŒ์ผ ๋ฐ ์Šคํฌ๋ฆฝํŠธ ๋ณต์‚ฌ
161
+ dst_models_dir = os.path.join(local_dir, "models")
162
+ os.makedirs(dst_models_dir, exist_ok=True)
163
+
164
+ for fname in ["overall_dict.pkl", "major_dict.pkl", "minor_dict.pkl"]:
165
+ src = os.path.join("output/kde_models", fname)
166
+ if os.path.isfile(src):
167
+ shutil.copy(src, os.path.join(dst_models_dir, fname))
168
+
169
+ # ์Šคํฌ๋ฆฝํŠธ ํŒŒ์ผ ๋ฐ requirements.txt ๋ณต์‚ฌ
170
+ script_name = os.path.basename(__file__)
171
+ shutil.copy(__file__, os.path.join(local_dir, script_name))
172
+ shutil.copy("requirements.txt", os.path.join(local_dir, "requirements.txt"))
173
+
174
+ # ์ปค๋ฐ‹ ๋ฐ ํ‘ธ์‹œ
175
+ repo.git_add(auto_lfs_track=True)
176
+ repo.git_commit("Add KDE ensemble models (overall, major, minor) + training script")
177
+ repo.git_push()
178
+ print("Hugging Face Hub์— KDE ๋ชจ๋ธ ์—…๋กœ๋“œ ์™„๋ฃŒ")
179
+
180
+ if __name__ == "__main__":
181
+ main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ joblib
5
+ huggingface_hub