# onbid-map-carp-train.py
import os
import shutil
import stat
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from huggingface_hub import HfApi, Repository

# Hyperparameter settings per bid order
ORDER_PARAMS = {
    1: {
        "objective": "reg:squarederror",
        "max_depth": 4,
        "learning_rate": 0.10,
        "n_estimators": 100,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
    },
    2: {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "n_estimators": 120,
        "subsample": 0.85,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    3: {
        "objective": "reg:squarederror",
        "max_depth": 6,
        "learning_rate": 0.03,
        "n_estimators": 150,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    4: {
        "objective": "reg:squarederror",
        "max_depth": 7,
        "learning_rate": 0.02,
        "n_estimators": 180,
        "subsample": 0.9,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
    5: {
        "objective": "reg:squarederror",
        "max_depth": 8,
        "learning_rate": 0.01,
        "n_estimators": 200,
        "subsample": 0.95,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
}
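# Note: the schedule moves monotonically with the order — deeper trees, more
# estimators, and a lower learning rate (max_depth 4 -> 8, n_estimators
# 100 -> 200, learning_rate 0.10 -> 0.01), i.e. slower but more expressive
# fits for the later, presumably harder-to-predict rounds.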
# ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ Hugging Face ํ† ํฐ ์ฝ๊ธฐ
HF_REPO_NAME = "asteroidddd/onbid-map-carp"
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
# ์ด ์Šคํฌ๋ฆฝํŠธ์˜ ๊ฒฝ๋กœ
SCRIPT_PATH = os.path.abspath(__file__)
def rm_readonly(func, path, exc_info):
os.chmod(path, stat.S_IWRITE)
func(path)

def main():
    # Load the data
    df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\car_data.pkl')

    # Keep automobiles only (already filtered upstream)
    if "๋Œ€๋ถ„๋ฅ˜" in df.columns:
        df = df[df["๋Œ€๋ถ„๋ฅ˜"] == "์ž๋™์ฐจ"].reset_index(drop=True)

    # Cast the "๋‚™์ฐฐ์ฐจ์ˆ˜" (bid order) column to int and cap values at 5
    df["๋‚™์ฐฐ์ฐจ์ˆ˜"] = df["๋‚™์ฐฐ์ฐจ์ˆ˜"].astype(int).clip(upper=5)
# ์ฐจ์ˆ˜๋ณ„ ๋ชจ๋ธ ํ•™์Šต & ์ €์žฅ
for order in [1, 2, 3, 4, 5]:
# ํ•ด๋‹น ์ฐจ์ˆ˜ ๋ฐ์ดํ„ฐ๋งŒ ํ•„ํ„ฐ๋ง
subset = df[df["๋‚™์ฐฐ์ฐจ์ˆ˜"] == order].copy().reset_index(drop=True)
if subset.empty:
print(f"์ฐจ์ˆ˜ {order} ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
continue
# ๋‚ ์งœ ์ปฌ๋Ÿผ(datetime) ํŒŒ์ƒ๋ณ€์ˆ˜ ์ƒ์„ฑ
if "์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ" in subset.columns:
subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"] = pd.to_datetime(subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"])
subset["์ตœ์ดˆ์ž…์ฐฐ_์—ฐ๋„"] = subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.year
subset["์ตœ์ดˆ์ž…์ฐฐ_์›”"] = subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.month
subset["์ตœ์ดˆ์ž…์ฐฐ_์ผ"] = subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.day
subset["์ตœ์ดˆ์ž…์ฐฐ_์š”์ผ"] = subset["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.weekday
subset = subset.drop(columns=["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"])
else:
raise KeyError("์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ ์ปฌ๋Ÿผ์ด ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— ์—†์Šต๋‹ˆ๋‹ค.")

        # Choose the feature columns: categoricals, date parts, and the
        # cumulative minimum-bid-price columns for rounds 1..order
        base_cols = ["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "์†Œ๋ถ„๋ฅ˜", "์ œ์กฐ์‚ฌ", "์ฐจ์ข…", "๊ธฐ๊ด€"]
        date_cols = ["์ตœ์ดˆ์ž…์ฐฐ_์—ฐ๋„", "์ตœ์ดˆ์ž…์ฐฐ_์›”", "์ตœ์ดˆ์ž…์ฐฐ_์ผ", "์ตœ์ดˆ์ž…์ฐฐ_์š”์ผ"]
        bid_cols = [f"{i}์ฐจ์ตœ์ €์ž…์ฐฐ๊ฐ€" for i in range(1, order + 1)]

        feature_cols = base_cols + date_cols + bid_cols
        X = subset[feature_cols].copy()
        y = subset["๋‚™์ฐฐ๊ฐ€์œจ_์ตœ์ดˆ์ตœ์ €๊ฐ€๊ธฐ์ค€"].copy()

        # Preprocessing + model pipeline: one-hot encode the categoricals
        # (unseen categories encode as all zeros), pass the rest through
        preprocessor = ColumnTransformer(
            transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), base_cols)],
            remainder="passthrough",
        )

        # Build the XGBRegressor from this order's parameters
        params = ORDER_PARAMS[order]
        model = XGBRegressor(**params)
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", model),
        ])

        # Fit on all data for this order
        pipeline.fit(X, y)
        print(f"Order {order} model trained (params: {params})")

        # Save the fitted pipeline
        output_dir = f"output/order{order}"
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
        print(f"  → saved: {output_dir}/pipeline.pkl")

    # Write requirements.txt
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))
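    # NOTE: joblib pickles are sensitive to library versions; pinning exact
    # versions here (e.g. scikit-learn==<training version>) would make the
    # saved pipelines safer to reload elsewhere.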
# ๋ ˆํฌ ์ƒ์„ฑ ์‹œ๋„
api = HfApi()
try:
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
except Exception:
pass

    # Clone the repo locally (removing any stale working copy first)
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
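    # NOTE: Repository is huggingface_hub's older git-based workflow; recent
    # releases favor HfApi.upload_folder, but the git flow is kept as-is here.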

    # Copy output/order{n} files into hf_repo/models_by_order/order{n}
    for order in [1, 2, 3, 4, 5]:
        src_dir = f"output/order{order}"
        if not os.path.isdir(src_dir):
            continue
        dst_dir = os.path.join(local_dir, "models_by_order", f"order{order}")
        os.makedirs(dst_dir, exist_ok=True)
        src_file = os.path.join(src_dir, "pipeline.pkl")
        if os.path.isfile(src_file):
            shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))
# ์Šคํฌ๋ฆฝํŠธ ํŒŒ์ผ ๋ฐ requirements.txt๋„ ํ•จ๊ป˜ ๋ณต์‚ฌ
for src in [SCRIPT_PATH, "requirements.txt"]:
dst = os.path.join(local_dir, os.path.basename(src))
shutil.copy(src, dst)
# ์ปค๋ฐ‹ ๋ฐ ํ‘ธ์‹œ
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add trained pipelines for orders 1~5 (๊ฐœ๋ณ„ ํŒŒ๋ผ๋ฏธํ„ฐ) + training script")
repo.git_push()
print("Hugging Face Hub์— ๋ชจ๋ธ ์—…๋กœ๋“œ ์™„๋ฃŒ")

if __name__ == "__main__":
    main()