| |
|
|
| import os |
| import shutil |
| import stat |
| import pandas as pd |
| import joblib |
| from sklearn.preprocessing import OneHotEncoder |
| from sklearn.compose import ColumnTransformer |
| from sklearn.pipeline import Pipeline |
| from xgboost import XGBRegressor |
| from huggingface_hub import HfApi, Repository |
|
|
| |
# Hyperparameters shared by every auction round's regressor.
_COMMON_PARAMS = {"objective": "reg:squarederror", "random_state": 42}

# Per-round tuning as (max_depth, learning_rate, n_estimators,
# subsample, colsample_bytree): depth/estimators grow and the learning
# rate shrinks as the round number increases.
_ROUND_TUNING = {
    1: (4, 0.10, 100, 0.80, 0.80),
    2: (5, 0.05, 120, 0.85, 0.90),
    3: (6, 0.03, 150, 0.90, 0.90),
    4: (7, 0.02, 180, 0.90, 0.95),
    5: (8, 0.01, 200, 0.95, 0.95),
}

# Full keyword-argument dict for XGBRegressor, keyed by auction round 1-5.
ORDER_PARAMS = {
    order: {
        **_COMMON_PARAMS,
        "max_depth": depth,
        "learning_rate": lr,
        "n_estimators": n_trees,
        "subsample": subsample,
        "colsample_bytree": colsample,
    }
    for order, (depth, lr, n_trees, subsample, colsample) in _ROUND_TUNING.items()
}
|
|
| |
# Target Hugging Face Hub repository the trained pipelines are pushed to.
HF_REPO_NAME = "asteroidddd/onbid-map-carp"

# Auth token must come from the environment; fail fast at import time so
# training never starts without push credentials.
# NOTE(review): the error message below appears mojibake-encoded in this
# source — it is kept byte-identical here; verify the file's encoding.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")

# Absolute path of this script; it is copied into the Hub repo alongside
# the models so the training code ships with its artifacts.
SCRIPT_PATH = os.path.abspath(__file__)
|
|
def rm_readonly(func, path, exc_info):
    """``shutil.rmtree`` onerror hook: clear the read-only bit and retry.

    Git checkouts on Windows contain read-only files under ``.git`` that make
    ``rmtree`` fail; granting write permission and re-invoking the operation
    that failed lets the deletion proceed.

    Args:
        func: The os-level function that failed (e.g. ``os.remove``).
        path: Path that could not be removed.
        exc_info: ``sys.exc_info()`` triple from the failure (unused).
    """
    # OR the write bit into the existing mode instead of replacing the mode
    # outright: plain `chmod(path, stat.S_IWRITE)` would set POSIX permissions
    # to write-only (0o200), dropping read/execute bits.
    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
    func(path)
|
|
def _add_date_features(subset):
    """Expand the first-bid date column into year/month/day/weekday features.

    Derives four numeric columns from "최초입찰시기" and drops the original
    datetime column afterwards.

    Raises:
        KeyError: if the "최초입찰시기" column is missing from *subset*.
    """
    if "최초입찰시기" not in subset.columns:
        raise KeyError("최초입찰시기 컬럼이 데이터프레임에 없습니다.")
    first_bid = pd.to_datetime(subset["최초입찰시기"])
    subset["최초입찰_연도"] = first_bid.dt.year
    subset["최초입찰_월"] = first_bid.dt.month
    subset["최초입찰_일"] = first_bid.dt.day
    subset["최초입찰_요일"] = first_bid.dt.weekday
    return subset.drop(columns=["최초입찰시기"])


def _build_pipeline(order, categorical_cols):
    """Build the OneHot + XGBRegressor pipeline for one auction round.

    Categorical columns are one-hot encoded (unknown categories ignored at
    predict time); all remaining columns pass through unchanged.
    """
    preprocessor = ColumnTransformer(
        transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
        remainder="passthrough",
    )
    # Direct indexing (vs the original .get) turns an unknown round number
    # into a clear KeyError instead of XGBRegressor(**None) -> TypeError.
    return Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(**ORDER_PARAMS[order])),
    ])


def _push_to_hub():
    """Write requirements.txt, clone the Hub repo, copy artifacts, and push."""
    # Pin the runtime dependencies alongside the models.
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))

    # Best-effort repo creation: creating an already-existing repo raises,
    # which is expected on re-runs, so the failure is deliberately ignored.
    # NOTE(review): create_repo(..., exist_ok=True) would be cleaner here.
    try:
        HfApi().create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
    except Exception:
        pass

    # Re-clone from scratch; rm_readonly clears the read-only flags git sets
    # on Windows so rmtree can delete the previous checkout.
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)

    # Copy each trained pipeline into models_by_order/order{N}/.
    for order in sorted(ORDER_PARAMS):
        src_file = os.path.join(f"output/order{order}", "pipeline.pkl")
        if not os.path.isfile(src_file):
            continue
        dst_dir = os.path.join(local_dir, "models_by_order", f"order{order}")
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))

    # Ship the training script and requirements next to the models.
    for src in [SCRIPT_PATH, "requirements.txt"]:
        shutil.copy(src, os.path.join(local_dir, os.path.basename(src)))

    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipelines for orders 1~5 (개별 파라미터) + training script")
    repo.git_push()
    print("Hugging Face Hub에 모델 업로드 완료")


def main(data_path=r"C:\Users\hwang\Desktop\OSSP\car_data.pkl"):
    """Train one XGBoost pipeline per auction round (1-5) and push to the Hub.

    For each round, rows whose (capped) winning-bid round equals the round
    number are selected, date and bid-price features are built, a pipeline
    is fitted with that round's ORDER_PARAMS, and the result is dumped to
    ``output/order{N}/pipeline.pkl``. All artifacts are then committed and
    pushed to the Hugging Face repository.

    Args:
        data_path: Path to the pickled auction DataFrame. Defaults to the
            original hard-coded developer path for backward compatibility;
            pass an explicit path on other machines.
    """
    # NOTE(review): the Korean column names below were reconstructed from a
    # mis-encoded source file — verify them against car_data.pkl.
    df = pd.read_pickle(data_path)

    # Keep only car listings when the major-category column is present.
    if "대분류" in df.columns:
        df = df[df["대분류"] == "자동차"].reset_index(drop=True)

    # Cap the winning-bid round at 5 so every row maps to one of the 5
    # models (clip replaces the original per-element lambda apply).
    df["낙찰차수"] = df["낙찰차수"].astype(int).clip(upper=5)

    base_cols = ["대분류", "중분류", "소분류", "제조사", "차종", "기관"]
    date_cols = ["최초입찰_연도", "최초입찰_월", "최초입찰_일", "최초입찰_요일"]

    for order in sorted(ORDER_PARAMS):
        subset = df[df["낙찰차수"] == order].copy().reset_index(drop=True)
        if subset.empty:
            print(f"차수 {order} 데이터가 없습니다. 건너뜁니다.")
            continue

        subset = _add_date_features(subset)

        # Rounds 1..order each contribute their minimum-bid-price column;
        # this replaces the original five-branch if/elif chain.
        bid_cols = [f"{i}차최저입찰가" for i in range(1, order + 1)]

        X = subset[base_cols + date_cols + bid_cols].copy()
        y = subset["낙찰가율_최초최저가기준"].copy()

        pipeline = _build_pipeline(order, base_cols)
        pipeline.fit(X, y)
        print(f"차수 {order} 모델 학습 완료 (params: {ORDER_PARAMS[order]})")

        output_dir = f"output/order{order}"
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
        print(f"  → pipeline.pkl 저장: {output_dir}/pipeline.pkl")

    _push_to_hub()
|
|
# Script entry point: run the full train-and-upload flow when executed
# directly (importing the module only performs the HF_TOKEN check above).
if __name__ == "__main__":
    main()
|
|