# onbid-map-round / onbid-map-round-train.py
# Author: asteroidddd
# Commit: Add trained pipeline + preprocessing code (ba69daf)
# onbid_map_round_train.py
import os
import shutil
import stat
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from huggingface_hub import HfApi, Repository
# ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ Hugging Face ํ† ํฐ ์ฝ๊ธฐ
HF_REPO_NAME = "asteroidddd/onbid-map-round"
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
# ์ด ์Šคํฌ๋ฆฝํŠธ์˜ ๊ฒฝ๋กœ
SCRIPT_PATH = os.path.abspath(__file__)
def rm_readonly(func, path, exc_info):
    """``shutil.rmtree`` error hook: clear the read-only bit and retry.

    Git checkouts on Windows contain read-only files that make the
    first delete attempt fail; marking the path writable and re-running
    the failed operation lets the removal proceed.
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    func(path)
def _load_and_prepare(data_path):
    """Load the pickled auction dataset and build the training matrix.

    Parameters
    ----------
    data_path : str
        Path to the pickled DataFrame. Columns used below: "낙찰차수"
        (target), "대분류", "중분류", "기관", "최초입찰시기" (datetime)
        and "1차최저입찰가".

    Returns
    -------
    tuple
        (X, y, label_encoder): feature DataFrame, encoded target
        Series, and the fitted LabelEncoder for decoding predictions.
    """
    df = pd.read_pickle(data_path)

    # Drop classes with <= 10 samples BEFORE fitting the label encoder.
    # The original filtered after encoding, which leaves gaps in the
    # label ids; XGBClassifier requires contiguous labels 0..n-1 and
    # rejects a gapped target vector.
    counts = df["낙찰차수"].value_counts()
    rare = counts[counts <= 10].index
    df = df[~df["낙찰차수"].isin(rare)].reset_index(drop=True)

    le_label = LabelEncoder()
    df["낙찰차수_LE"] = le_label.fit_transform(df["낙찰차수"])

    # Date-derived features from the first-bid timestamp.
    df["최초입찰_연도"] = df["최초입찰시기"].dt.year
    df["최초입찰_월"] = df["최초입찰시기"].dt.month
    df["최초입찰_일"] = df["최초입찰시기"].dt.day
    df["최초입찰_요일"] = df["최초입찰시기"].dt.weekday
    df = df.drop(columns=["최초입찰시기"])

    X = df[["대분류", "중분류", "기관",
            "최초입찰_연도", "최초입찰_월", "최초입찰_일", "최초입찰_요일",
            "1차최저입찰가"]]
    y = df["낙찰차수_LE"]
    return X, y, le_label


def _build_pipeline():
    """One-hot-encode categorical columns, pass the rest through, and
    feed everything into a multiclass XGBoost classifier."""
    cat_cols = ["대분류", "중분류", "기관"]
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": categories unseen at training
            # time encode to all-zeros instead of raising at inference.
            ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ],
        remainder="passthrough",
    )
    return Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42)),
    ])


def _save_artifacts(pipeline, le_label):
    """Persist the fitted pipeline and label encoder; return their paths."""
    os.makedirs("output", exist_ok=True)
    pipeline_path = "output/auction_pipeline.pkl"
    label_path = "output/label_encoder.pkl"
    joblib.dump(pipeline, pipeline_path)
    joblib.dump(le_label, label_path)
    return pipeline_path, label_path


def _write_requirements():
    """Write the runtime dependency list for the published repo."""
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))


def _push_to_hub(files):
    """Clone the Hub repo, copy *files* into it, then commit and push."""
    api = HfApi()
    try:
        api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
    except Exception:
        # Best effort: the repo usually already exists. A real auth or
        # network problem resurfaces on the clone below. (Was a bare
        # except:, which also swallowed KeyboardInterrupt/SystemExit.)
        pass

    # Re-clone from scratch; rm_readonly clears read-only bits so the
    # previous checkout's git files can actually be deleted on Windows.
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME,
                      use_auth_token=HF_TOKEN)

    for src in files:
        shutil.copy(src, os.path.join(local_dir, os.path.basename(src)))

    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipeline + preprocessing code")
    repo.git_push()


def main():
    """Train the auction-round classifier and publish it to the Hub."""
    # Data location is overridable via DATA_PATH; the default preserves
    # the original hard-coded path for backward compatibility.
    data_path = os.getenv("DATA_PATH", r"C:\Users\hwang\Desktop\OSSP\data.pkl")
    X, y, le_label = _load_and_prepare(data_path)

    pipeline = _build_pipeline()
    pipeline.fit(X, y)

    pipeline_path, label_path = _save_artifacts(pipeline, le_label)
    _write_requirements()
    _push_to_hub([SCRIPT_PATH, "requirements.txt", pipeline_path, label_path])


if __name__ == "__main__":
    main()