|
|
|
|
|
|
|
|
import os |
|
|
import shutil |
|
|
import stat |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from sklearn.neighbors import KernelDensity |
|
|
import joblib |
|
|
from huggingface_hub import HfApi, Repository |
|
|
|
|
|
|
|
|
HF_REPO_NAME = "asteroidddd/onbid-map-prob" |
|
|
HF_TOKEN = os.getenv("HF_TOKEN") |
|
|
if HF_TOKEN is None: |
|
|
raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.") |
|
|
|
|
|
|
|
|
def _column_values(frame, value_col):
    """Return the non-null entries of ``frame[value_col]`` as an ``(n, 1)`` array."""
    return frame[value_col].dropna().values.reshape(-1, 1)


def _fit_kde_entry(vals, bandwidth, num_grid, margin):
    """Fit a Gaussian KDE to ``vals`` and tabulate its CDF on an even grid."""
    x_min = vals.min() - margin
    x_max = vals.max() + margin
    x_range = np.linspace(x_min, x_max, num_grid).reshape(-1, 1)

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(vals)

    # score_samples yields log-density; exponentiate to obtain the PDF.
    pdf = np.exp(kde.score_samples(x_range))
    dx = x_range[1, 0] - x_range[0, 0]
    # Running Riemann sum over the grid. The result is not renormalised, so
    # the last entry is only approximately 1.
    cdf = np.cumsum(pdf) * dx

    return {
        'kde': kde,
        'x_range': x_range.flatten(),
        'cdf': cdf,
        'x_min': x_min,
        'x_max': x_max,
    }


def _entry_or_fallback(vals, fallback, bandwidth, num_grid, margin):
    """Return a fitted entry for ``vals``, or ``fallback`` when fewer than two samples exist."""
    if len(vals) < 2:
        return fallback
    return _fit_kde_entry(vals, bandwidth, num_grid, margin)


def train_kde_models(df,
                     car_df,
                     value_col='낙찰가율_최초최저가기준',
                     major_col='대분류',
                     minor_col='중분류',
                     bandwidth=2.0,
                     num_grid=1000,
                     margin=10):
    """Fit Gaussian KDE models overall, per major category and per minor category.

    Each fitted entry is a dict with the keys ``'kde'`` (the fitted
    ``KernelDensity``), ``'x_range'`` (1-D evaluation grid), ``'cdf'``
    (cumulative sum of the density times the grid spacing), ``'x_min'`` and
    ``'x_max'`` (grid bounds: sample min/max widened by ``margin``).

    Args:
        df: DataFrame holding ``value_col``, ``major_col`` and ``minor_col``.
        car_df: DataFrame whose ``value_col`` supplies the samples for the
            major category whose string form is '자동차', in place of that
            category's rows in ``df``.
        value_col: Name of the numeric column to model.
        major_col: Name of the major-category column.
        minor_col: Name of the minor-category column.
        bandwidth: Kernel bandwidth passed to ``KernelDensity``.
        num_grid: Number of grid points per model; must be at least 2.
        margin: Amount subtracted from the minimum and added to the maximum
            sample value to form the grid bounds.

    Returns:
        Tuple ``(overall_dict, major_dict, minor_dict)``. ``overall_dict`` is
        one entry fitted on all of ``df``; the other two map category value to
        entry. A category with fewer than two non-null samples maps to the
        very same ``overall_dict`` object (shared, not copied).

    Raises:
        ValueError: If ``num_grid`` is below 2, or ``df[value_col]`` has no
            non-null values.
    """
    if num_grid < 2:
        # The grid spacing is taken from the first two grid points.
        raise ValueError(
            f"num_grid must be at least 2 to define a grid spacing, got {num_grid}"
        )

    values_all = _column_values(df, value_col)
    if values_all.size == 0:
        raise ValueError(
            f"column {value_col!r} contains no non-null values to fit a KDE on"
        )

    overall_dict = _fit_kde_entry(values_all, bandwidth, num_grid, margin)

    major_dict = {}
    for major_cat, group in df.groupby(major_col):
        # The '자동차' category is modelled from the dedicated car_df.
        source = car_df if str(major_cat) == '자동차' else group
        major_dict[major_cat] = _entry_or_fallback(
            _column_values(source, value_col),
            overall_dict, bandwidth, num_grid, margin,
        )

    minor_dict = {
        minor_cat: _entry_or_fallback(
            _column_values(group, value_col),
            overall_dict, bandwidth, num_grid, margin,
        )
        for minor_cat, group in df.groupby(minor_col)
    }

    return overall_dict, major_dict, minor_dict
|
|
|
|
|
def rm_readonly(func, path, exc_info):
    """Make ``path`` writable, then call ``func`` on it.

    Passed as the ``onerror`` handler to ``shutil.rmtree`` in this file.

    Args:
        func: Callable invoked as ``func(path)`` after the permission change.
        path: Filesystem path whose mode is set to ``stat.S_IWRITE``.
        exc_info: Unused; accepted to match the three-argument handler
            signature.
    """
    writable_mode = stat.S_IWRITE
    os.chmod(path, writable_mode)
    func(path)
|
|
|
|
|
|
|
|
def main(): |
|
|
|
|
|
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl') |
|
|
car_df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\car_data.pkl') |
|
|
|
|
|
|
|
|
overall_model, major_models, minor_models = train_kde_models( |
|
|
df=df, |
|
|
car_df=car_df, |
|
|
value_col='๋์ฐฐ๊ฐ์จ_์ต์ด์ต์ ๊ฐ๊ธฐ์ค', |
|
|
major_col='๋๋ถ๋ฅ', |
|
|
minor_col='์ค๋ถ๋ฅ', |
|
|
bandwidth=2.0, |
|
|
num_grid=1000, |
|
|
margin=10 |
|
|
) |
|
|
|
|
|
|
|
|
os.makedirs("output/kde_models", exist_ok=True) |
|
|
joblib.dump(overall_model, "output/kde_models/overall_dict.pkl") |
|
|
joblib.dump(major_models, "output/kde_models/major_dict.pkl") |
|
|
joblib.dump(minor_models, "output/kde_models/minor_dict.pkl") |
|
|
print("KDE ๋ชจ๋ธ ํ์ผ ์ ์ฅ ์๋ฃ: output/kde_models/overall_dict.pkl, major_dict.pkl, minor_dict.pkl") |
|
|
|
|
|
|
|
|
deps = ["numpy", "pandas", "scikit-learn", "joblib", "huggingface_hub"] |
|
|
with open("requirements.txt", "w", encoding="utf-8") as f: |
|
|
f.write("\n".join(deps)) |
|
|
|
|
|
|
|
|
api = HfApi() |
|
|
try: |
|
|
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN) |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
local_dir = "hf_repo" |
|
|
if os.path.isdir(local_dir): |
|
|
shutil.rmtree(local_dir, onerror=rm_readonly) |
|
|
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN) |
|
|
|
|
|
|
|
|
dst_models_dir = os.path.join(local_dir, "models") |
|
|
os.makedirs(dst_models_dir, exist_ok=True) |
|
|
|
|
|
for fname in ["overall_dict.pkl", "major_dict.pkl", "minor_dict.pkl"]: |
|
|
src = os.path.join("output/kde_models", fname) |
|
|
if os.path.isfile(src): |
|
|
shutil.copy(src, os.path.join(dst_models_dir, fname)) |
|
|
|
|
|
|
|
|
script_name = os.path.basename(__file__) |
|
|
shutil.copy(__file__, os.path.join(local_dir, script_name)) |
|
|
shutil.copy("requirements.txt", os.path.join(local_dir, "requirements.txt")) |
|
|
|
|
|
|
|
|
repo.git_add(auto_lfs_track=True) |
|
|
repo.git_commit("Add KDE ensemble models (overall, major, minor) + training script") |
|
|
repo.git_push() |
|
|
print("Hugging Face Hub์ KDE ๋ชจ๋ธ ์
๋ก๋ ์๋ฃ") |
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|