# onbid_map_round_train.py
import os
import shutil
import stat
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from huggingface_hub import HfApi, Repository
# Read the Hugging Face access token from the environment (never hard-code it).
HF_REPO_NAME = "asteroidddd/onbid-map-round"
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")
# Absolute path of this script; it is copied into the Hub repo on publish.
SCRIPT_PATH = os.path.abspath(__file__)
def rm_readonly(func, path, exc_info):
    """Error handler for ``shutil.rmtree``: clear the read-only bit and retry.

    Git object files are often marked read-only (notably on Windows), which
    makes a plain ``rmtree`` fail.  This handler makes the offending path
    writable and re-invokes the operation that failed.
    """
    # Make the path writable so the retried operation can succeed.
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    # Retry whatever call originally failed (e.g. os.remove / os.rmdir).
    func(path)
def main():
    """Train the auction-round classifier and publish the artifacts to the Hub.

    Steps: load the prepared DataFrame, drop rare target classes, label-encode
    the target, derive date-part features, fit a OneHot+XGBoost pipeline, save
    the pipeline and label encoder, then commit everything to the HF repo.
    """
    # Load data. The original hard-coded Windows path is kept as the default,
    # but it can now be overridden via the DATA_PATH environment variable.
    data_path = os.getenv("DATA_PATH", r'C:\Users\hwang\Desktop\OSSP\data.pkl')
    df = pd.read_pickle(data_path)

    # Drop classes with frequency <= 10 FIRST, then fit the label encoder.
    # (Encoding before filtering leaves gaps in the label range; XGBoost
    # requires contiguous 0..n-1 class labels, so fit() would fail.)
    counts = df["낙찰차수"].value_counts()
    rare = counts[counts <= 10].index.tolist()
    df = df[~df["낙찰차수"].isin(rare)].reset_index(drop=True)
    le_label = LabelEncoder()
    df["낙찰차수_LE"] = le_label.fit_transform(df["낙찰차수"])

    # Derive date-part features from the first-bid timestamp, then drop it.
    df["최초입찰_연도"] = df["최초입찰시기"].dt.year
    df["최초입찰_월"] = df["최초입찰시기"].dt.month
    df["최초입찰_일"] = df["최초입찰시기"].dt.day
    df["최초입찰_요일"] = df["최초입찰시기"].dt.weekday
    df = df.drop(columns=["최초입찰시기"])

    # Feature / target split.
    X = df[["대분류", "중분류", "기관",
            "최초입찰_연도", "최초입찰_월", "최초입찰_일", "최초입찰_요일",
            "1차최저입찰가"]]
    y = df["낙찰차수_LE"]

    # Preprocessing + model pipeline: one-hot encode the categorical columns
    # (unknown categories at inference time are ignored), pass the rest through.
    cat_cols = ["대분류", "중분류", "기관"]
    preprocessor = ColumnTransformer(
        transformers=[
            ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ],
        remainder="passthrough"
    )
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
    ])

    # Train.
    pipeline.fit(X, y)

    # Persist the fitted pipeline and the label encoder.
    os.makedirs("output", exist_ok=True)
    pipeline_path = "output/auction_pipeline.pkl"
    label_path = "output/label_encoder.pkl"
    joblib.dump(pipeline, pipeline_path)
    joblib.dump(le_label, label_path)

    # Write requirements.txt for the published repo.
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))

    # Create the Hub repo if it does not exist yet. exist_ok=True replaces the
    # previous bare `except: pass`, which also hid auth/network failures.
    api = HfApi()
    api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN, exist_ok=True)

    # Clone the repo locally, removing any stale clone first (clearing the
    # read-only bit git sets on object files so rmtree can succeed).
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)

    # Copy the artifacts into the clone.
    for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
        dst = os.path.join(local_dir, os.path.basename(src))
        shutil.copy(src, dst)

    # Commit and push (large binaries tracked via LFS automatically).
    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipeline + preprocessing code")
    repo.git_push()
# Script entry point: run training + publish only when executed directly.
if __name__ == "__main__":
    main()