|
|
|
|
|
|
|
|
import os |
|
|
import shutil |
|
|
import stat |
|
|
import pandas as pd |
|
|
import joblib |
|
|
from sklearn.preprocessing import OneHotEncoder, LabelEncoder |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from xgboost import XGBClassifier |
|
|
from huggingface_hub import HfApi, Repository |
|
|
|
|
|
|
|
|
# Hugging Face Hub repository ("namespace/name") that receives the trained artifacts.
HF_REPO_NAME = "asteroidddd/onbid-map-round"

# Write-access token, supplied via the environment so it is never hard-coded.
# Fail fast here so a long training run is not wasted before the final push.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    # Message (Korean, encoding-damaged in this copy of the file):
    # "environment variable HF_TOKEN is not set".
    raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")

# Absolute path of this script, so it can be copied into the model repo later.
SCRIPT_PATH = os.path.abspath(__file__)
|
|
|
|
|
def rm_readonly(func, path, exc_info):
    """Error handler for ``shutil.rmtree``.

    Git clones on Windows contain read-only object files that ``rmtree``
    cannot delete directly.  This callback clears the read-only attribute
    on *path* and retries the operation that just failed (*func*).  The
    *exc_info* argument is required by the ``onerror`` callback signature
    but is not used.
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)  # drop the read-only flag
    func(path)                # retry the failed os.remove / os.rmdir
|
|
|
|
|
def main():
    """Train an XGBoost auction-round classifier and publish it to the HF Hub.

    Steps: load the pickled auction DataFrame, label-encode the target and
    drop rare classes, expand the first-bid timestamp into calendar features,
    fit a OneHotEncoder + XGBClassifier pipeline, persist the artifacts, and
    push everything (this script, requirements.txt, both .pkl files) to
    ``HF_REPO_NAME``.

    NOTE(review): the non-ASCII (Korean) column names below are
    encoding-damaged in this copy of the file; verify they match the column
    names actually stored in the pickle before relying on this script.
    """
    # --- load ----------------------------------------------------------------
    # The original hard-coded Windows path is kept as the default, but it can
    # now be overridden via the DATA_PATH environment variable.
    data_path = os.getenv("DATA_PATH", r'C:\Users\hwang\Desktop\OSSP\data.pkl')
    df = pd.read_pickle(data_path)

    # --- target: label-encode, then drop classes with <= 10 samples ----------
    le_label = LabelEncoder()
    df["๋์ฐฐ์ฐจ์_LE"] = le_label.fit_transform(df["๋์ฐฐ์ฐจ์"])
    counts = df["๋์ฐฐ์ฐจ์_LE"].value_counts()
    rare = counts[counts <= 10].index.tolist()
    df = df[~df["๋์ฐฐ์ฐจ์_LE"].isin(rare)].reset_index(drop=True)

    # --- feature engineering: split the first-bid datetime into parts --------
    df["์ต์ด์์ฐฐ_์ฐ๋"] = df["์ต์ด์์ฐฐ์๊ธฐ"].dt.year
    df["์ต์ด์์ฐฐ_์"] = df["์ต์ด์์ฐฐ์๊ธฐ"].dt.month
    df["์ต์ด์์ฐฐ_์ผ"] = df["์ต์ด์์ฐฐ์๊ธฐ"].dt.day
    df["์ต์ด์์ฐฐ_์์ผ"] = df["์ต์ด์์ฐฐ์๊ธฐ"].dt.weekday
    df = df.drop(columns=["์ต์ด์์ฐฐ์๊ธฐ"])

    # --- model matrix --------------------------------------------------------
    X = df[["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด",
            "์ต์ด์์ฐฐ_์ฐ๋", "์ต์ด์์ฐฐ_์", "์ต์ด์์ฐฐ_์ผ", "์ต์ด์์ฐฐ_์์ผ",
            "1์ฐจ์ต์ ์์ฐฐ๊ฐ"]]
    y = df["๋์ฐฐ์ฐจ์_LE"]

    # --- pipeline: one-hot the categoricals, pass the rest through -----------
    cat_cols = ["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด"]
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": unseen categories at predict time
            # encode as all-zeros instead of raising.
            ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ],
        remainder="passthrough",
    )
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42)),
    ])
    pipeline.fit(X, y)

    # --- persist artifacts ---------------------------------------------------
    os.makedirs("output", exist_ok=True)
    pipeline_path = "output/auction_pipeline.pkl"
    label_path = "output/label_encoder.pkl"
    joblib.dump(pipeline, pipeline_path)
    joblib.dump(le_label, label_path)

    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))

    # --- publish to the Hub --------------------------------------------------
    # exist_ok=True replaces the former bare `except: pass`, which silently
    # swallowed *every* error (auth failures, network errors, typos in the
    # repo id) instead of only the expected "repo already exists" case.
    api = HfApi()
    api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN, exist_ok=True)

    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        # Stale clone from a previous run; git object files may be read-only
        # on Windows, hence the rm_readonly error handler.
        shutil.rmtree(local_dir, onerror=rm_readonly)
    # `token=` replaces the deprecated `use_auth_token=` keyword.
    # NOTE(review): Repository itself is deprecated in recent huggingface_hub
    # releases in favor of HfApi.upload_file / upload_folder — consider
    # migrating.
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, token=HF_TOKEN)

    # Copy this script, the requirements file, and both model artifacts into
    # the clone, then commit and push.
    for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
        dst = os.path.join(local_dir, os.path.basename(src))
        shutil.copy(src, dst)

    repo.git_add(auto_lfs_track=True)  # track the large .pkl files via LFS
    repo.git_commit("Add trained pipeline + preprocessing code")
    repo.git_push()
|
|
|
|
|
if __name__ == "__main__":
    # Run the full train-and-upload flow only when executed as a script,
    # not when this module is imported.
    main()
|
|
|