# Repository: onbid-map-prob / onbid-map-prob-train.py
# Uploaded by: asteroidddd
# Commit: "Add KDE ensemble models (overall, major, minor) + training script" (6b960c3)
# onbid-map-prob-train.py
import os
import shutil
import stat
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity
import joblib
from huggingface_hub import HfApi, Repository
# Read the Hugging Face token from the environment and fail fast when it is
# missing, so the script stops before any training work is done.
HF_REPO_NAME = "asteroidddd/onbid-map-prob"
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
# KDE training helpers and entry point
def _column_values(frame, value_col):
    """Return the non-null entries of ``frame[value_col]`` as an (n, 1) array."""
    return frame[value_col].dropna().values.reshape(-1, 1)


def _fit_kde_entry(values, bandwidth, num_grid, margin):
    """Fit a Gaussian KDE to ``values`` and tabulate an approximate CDF.

    Args:
        values: Array of shape (n_samples, 1) without NaNs (n_samples >= 1).
        bandwidth: Bandwidth passed to ``KernelDensity``.
        num_grid: Number of evenly spaced grid points used for the CDF table.
        margin: Amount added below the minimum and above the maximum sample
            to define the grid range.

    Returns:
        A dict with keys ``'kde'`` (the fitted estimator), ``'x_range'``
        (1-D grid), ``'cdf'`` (cumulative sum of the density times the grid
        step), ``'x_min'`` and ``'x_max'`` (grid bounds).
    """
    x_min = values.min() - margin
    x_max = values.max() + margin
    grid = np.linspace(x_min, x_max, num_grid).reshape(-1, 1)

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(values)

    # score_samples returns log-densities; exponentiate to get the pdf.
    pdf = np.exp(kde.score_samples(grid))
    # Rectangle-rule integration: the grid is uniform, so one step suffices.
    step = grid[1, 0] - grid[0, 0]
    cdf = np.cumsum(pdf) * step

    return {
        'kde': kde,
        'x_range': grid.flatten(),
        'cdf': cdf,
        'x_min': x_min,
        'x_max': x_max,
    }


def train_kde_models(df,
                     car_df,
                     value_col='๋‚™์ฐฐ๊ฐ€์œจ_์ตœ์ดˆ์ตœ์ €๊ฐ€๊ธฐ์ค€',
                     major_col='๋Œ€๋ถ„๋ฅ˜',
                     minor_col='์ค‘๋ถ„๋ฅ˜',
                     bandwidth=2.0,
                     num_grid=1000,
                     margin=10):
    """Train KDE models overall, per major category and per minor category.

    Args:
        df: DataFrame holding ``value_col``, ``major_col`` and ``minor_col``.
        car_df: DataFrame whose ``value_col`` replaces the group's own values
            for the major category whose string form is ``'์ž๋™์ฐจ'``.
        value_col: Name of the numeric column to model.
        major_col: Column used to group rows into major categories.
        minor_col: Column used to group rows into minor categories.
        bandwidth: Gaussian kernel bandwidth.
        num_grid: Number of grid points for each tabulated CDF.
        margin: Padding added on both sides of the observed value range.

    Returns:
        A tuple ``(overall_dict, major_dict, minor_dict)``. ``overall_dict``
        is a single model entry (see ``_fit_kde_entry``); the other two map
        each category key to a model entry. Categories with fewer than two
        non-null values reuse ``overall_dict`` (the same object, not a copy).

    Raises:
        ValueError: If ``df[value_col]`` contains no non-null values.
    """
    # Overall model over every non-null value.
    values_all = _column_values(df, value_col)
    if values_all.size == 0:
        # Without this guard NumPy fails with an opaque "zero-size array"
        # ValueError; keep the exception type but say what went wrong.
        raise ValueError(
            f"column {value_col!r} has no non-null values; "
            "cannot fit the overall KDE"
        )
    overall_dict = _fit_kde_entry(values_all, bandwidth, num_grid, margin)

    # Per-major-category models; the car category is trained on car_df.
    major_dict = {}
    for major_cat, group in df.groupby(major_col):
        source = car_df if str(major_cat) == '์ž๋™์ฐจ' else group
        vals = _column_values(source, value_col)
        if len(vals) < 2:
            # Too few samples for a meaningful density: fall back to overall.
            major_dict[major_cat] = overall_dict
            continue
        major_dict[major_cat] = _fit_kde_entry(vals, bandwidth, num_grid, margin)

    # Per-minor-category models, always trained on the group's own rows.
    # NOTE(review): the original comment here mentioned using car data for
    # minor categories too, but the code never did; confirm which is intended.
    minor_dict = {}
    for minor_cat, group in df.groupby(minor_col):
        vals = _column_values(group, value_col)
        if len(vals) < 2:
            minor_dict[minor_cat] = overall_dict
            continue
        minor_dict[minor_cat] = _fit_kde_entry(vals, bandwidth, num_grid, margin)

    return overall_dict, major_dict, minor_dict
def rm_readonly(func, path, exc_info):
    """Make ``path`` writable, then retry the failed filesystem call.

    Used as the ``onerror`` handler of ``shutil.rmtree``.

    Args:
        func: The function that failed; it is called again with ``path``.
        path: Filesystem path the failed call was operating on.
        exc_info: Exception information supplied by the caller; unused.
    """
    writable_mode = stat.S_IWRITE
    os.chmod(path, writable_mode)
    func(path)
# ๋ฉ”์ธ
def main():
# ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
car_df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\car_data.pkl')
# KDE ๋ชจ๋ธ ํ•™์Šต
overall_model, major_models, minor_models = train_kde_models(
df=df,
car_df=car_df,
value_col='๋‚™์ฐฐ๊ฐ€์œจ_์ตœ์ดˆ์ตœ์ €๊ฐ€๊ธฐ์ค€',
major_col='๋Œ€๋ถ„๋ฅ˜',
minor_col='์ค‘๋ถ„๋ฅ˜',
bandwidth=2.0,
num_grid=1000,
margin=10
)
# KDE ๋ชจ๋ธ ์ €์žฅ
os.makedirs("output/kde_models", exist_ok=True)
joblib.dump(overall_model, "output/kde_models/overall_dict.pkl")
joblib.dump(major_models, "output/kde_models/major_dict.pkl")
joblib.dump(minor_models, "output/kde_models/minor_dict.pkl")
print("KDE ๋ชจ๋ธ ํŒŒ์ผ ์ €์žฅ ์™„๋ฃŒ: output/kde_models/overall_dict.pkl, major_dict.pkl, minor_dict.pkl")
# requirements.txt ์ž‘์„ฑ
deps = ["numpy", "pandas", "scikit-learn", "joblib", "huggingface_hub"]
with open("requirements.txt", "w", encoding="utf-8") as f:
f.write("\n".join(deps))
# Hugging Face ๋ ˆํฌ ์ƒ์„ฑ ๋ฐ ํด๋ก 
api = HfApi()
try:
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
except Exception:
pass # ์ด๋ฏธ ๋ ˆํฌ๊ฐ€ ์กด์žฌํ•˜๋ฉด ๋ฌด์‹œ
local_dir = "hf_repo"
if os.path.isdir(local_dir):
shutil.rmtree(local_dir, onerror=rm_readonly)
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
# ๋ชจ๋ธ ํŒŒ์ผ ๋ฐ ์Šคํฌ๋ฆฝํŠธ ๋ณต์‚ฌ
dst_models_dir = os.path.join(local_dir, "models")
os.makedirs(dst_models_dir, exist_ok=True)
for fname in ["overall_dict.pkl", "major_dict.pkl", "minor_dict.pkl"]:
src = os.path.join("output/kde_models", fname)
if os.path.isfile(src):
shutil.copy(src, os.path.join(dst_models_dir, fname))
# ์Šคํฌ๋ฆฝํŠธ ํŒŒ์ผ ๋ฐ requirements.txt ๋ณต์‚ฌ
script_name = os.path.basename(__file__)
shutil.copy(__file__, os.path.join(local_dir, script_name))
shutil.copy("requirements.txt", os.path.join(local_dir, "requirements.txt"))
# ์ปค๋ฐ‹ ๋ฐ ํ‘ธ์‹œ
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add KDE ensemble models (overall, major, minor) + training script")
repo.git_push()
print("Hugging Face Hub์— KDE ๋ชจ๋ธ ์—…๋กœ๋“œ ์™„๋ฃŒ")
# Run the training/upload pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()