Commit ยท
ba69daf
1
Parent(s): b6795f3
Add trained pipeline + preprocessing code
Browse files- auction_pipeline.pkl +2 -2
- onbid-map-round-train.py +21 -26
auction_pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c1acd90a22aaf1e5520ebfc531247896e5d33df2f3fb66e0e9e74020f59ed71
|
| 3 |
+
size 3575369
|
onbid-map-round-train.py
CHANGED
|
@@ -5,34 +5,22 @@ import shutil
|
|
| 5 |
import stat
|
| 6 |
import pandas as pd
|
| 7 |
import joblib
|
| 8 |
-
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
|
| 9 |
from sklearn.compose import ColumnTransformer
|
| 10 |
from sklearn.pipeline import Pipeline
|
| 11 |
from xgboost import XGBClassifier
|
| 12 |
from huggingface_hub import HfApi, Repository
|
| 13 |
|
| 14 |
-
# ํ๊ฒฝ ๋ณ์์์ ํ ํฐ ์ฝ
|
| 15 |
HF_REPO_NAME = "asteroidddd/onbid-map-round"
|
| 16 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 17 |
if HF_TOKEN is None:
|
| 18 |
raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")
|
| 19 |
|
| 20 |
-
# ์ด ์คํฌ๋ฆฝํธ์ ๊ฒฝ๋ก
|
| 21 |
SCRIPT_PATH = os.path.abspath(__file__)
|
| 22 |
-
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
|
| 23 |
-
|
| 24 |
-
def extract_date_features(df):
|
| 25 |
-
"""์ต์ด์
์ฐฐ์๊ธฐ์์ ์ฐ๋/์/์ผ/์์ผ์ ์ถ์ถํ๊ณ ์๋ณธ ์ปฌ๋ผ ์ ๊ฑฐ."""
|
| 26 |
-
X = df.copy()
|
| 27 |
-
dt = pd.to_datetime(X["์ต์ด์
์ฐฐ์๊ธฐ"])
|
| 28 |
-
X["์ต์ด์
์ฐฐ_์ฐ๋"] = dt.dt.year
|
| 29 |
-
X["์ต์ด์
์ฐฐ_์"] = dt.dt.month
|
| 30 |
-
X["์ต์ด์
์ฐฐ_์ผ"] = dt.dt.day
|
| 31 |
-
X["์ต์ด์
์ฐฐ_์์ผ"] = dt.dt.weekday
|
| 32 |
-
return X.drop(columns=["์ต์ด์
์ฐฐ์๊ธฐ"])
|
| 33 |
|
| 34 |
def rm_readonly(func, path, exc_info):
|
| 35 |
-
"""์ฝ๊ธฐ ์ ์ฉ ํ์ผ ์ญ์ ์ ๊ถํ ๋ณ๊ฒฝ ํ ์ฌ์๋."""
|
| 36 |
os.chmod(path, stat.S_IWRITE)
|
| 37 |
func(path)
|
| 38 |
|
|
@@ -40,22 +28,30 @@ def main():
|
|
| 40 |
# ๋ฐ์ดํฐ ๋ก๋
|
| 41 |
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
|
| 42 |
|
| 43 |
-
#
|
| 44 |
le_label = LabelEncoder()
|
| 45 |
df["๋์ฐฐ์ฐจ์_LE"] = le_label.fit_transform(df["๋์ฐฐ์ฐจ์"])
|
| 46 |
counts = df["๋์ฐฐ์ฐจ์_LE"].value_counts()
|
| 47 |
rare = counts[counts <= 10].index.tolist()
|
| 48 |
df = df[~df["๋์ฐฐ์ฐจ์_LE"].isin(rare)].reset_index(drop=True)
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
y = df["๋์ฐฐ์ฐจ์_LE"]
|
| 53 |
|
| 54 |
-
# ์ ์ฒ๋ฆฌ
|
| 55 |
cat_cols = ["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด"]
|
| 56 |
preprocessor = ColumnTransformer(
|
| 57 |
transformers=[
|
| 58 |
-
("datefeat", FunctionTransformer(extract_date_features, validate=False), ["์ต์ด์
์ฐฐ์๊ธฐ"]),
|
| 59 |
("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
|
| 60 |
],
|
| 61 |
remainder="passthrough"
|
|
@@ -65,10 +61,10 @@ def main():
|
|
| 65 |
("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
|
| 66 |
])
|
| 67 |
|
| 68 |
-
#
|
| 69 |
pipeline.fit(X, y)
|
| 70 |
|
| 71 |
-
#
|
| 72 |
os.makedirs("output", exist_ok=True)
|
| 73 |
pipeline_path = "output/auction_pipeline.pkl"
|
| 74 |
label_path = "output/label_encoder.pkl"
|
|
@@ -80,20 +76,20 @@ def main():
|
|
| 80 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 81 |
f.write("\n".join(deps))
|
| 82 |
|
| 83 |
-
# Hugging Face ๋ ํฌ
|
| 84 |
api = HfApi()
|
| 85 |
try:
|
| 86 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 87 |
except:
|
| 88 |
pass
|
| 89 |
|
| 90 |
-
# ๋ก์ปฌ์ ๋ ํฌ ํด๋ก (๊ธฐ์กด ์ญ์ ์ read-only
|
| 91 |
local_dir = "hf_repo"
|
| 92 |
if os.path.isdir(local_dir):
|
| 93 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 94 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 95 |
|
| 96 |
-
#
|
| 97 |
for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
|
| 98 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 99 |
shutil.copy(src, dst)
|
|
@@ -103,6 +99,5 @@ def main():
|
|
| 103 |
repo.git_commit("Add trained pipeline + preprocessing code")
|
| 104 |
repo.git_push()
|
| 105 |
|
| 106 |
-
|
| 107 |
if __name__ == "__main__":
|
| 108 |
main()
|
|
|
|
| 5 |
import stat
|
| 6 |
import pandas as pd
|
| 7 |
import joblib
|
| 8 |
+
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
|
| 9 |
from sklearn.compose import ColumnTransformer
|
| 10 |
from sklearn.pipeline import Pipeline
|
| 11 |
from xgboost import XGBClassifier
|
| 12 |
from huggingface_hub import HfApi, Repository
|
| 13 |
|
| 14 |
+
# ํ๊ฒฝ ๋ณ์์์ Hugging Face ํ ํฐ ์ฝ๊ธฐ
|
| 15 |
HF_REPO_NAME = "asteroidddd/onbid-map-round"
|
| 16 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 17 |
if HF_TOKEN is None:
|
| 18 |
raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")
|
| 19 |
|
| 20 |
+
# ์ด ์คํฌ๋ฆฝํธ์ ๊ฒฝ๋ก
|
| 21 |
SCRIPT_PATH = os.path.abspath(__file__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def rm_readonly(func, path, exc_info):
|
|
|
|
| 24 |
os.chmod(path, stat.S_IWRITE)
|
| 25 |
func(path)
|
| 26 |
|
|
|
|
| 28 |
# ๋ฐ์ดํฐ ๋ก๋
|
| 29 |
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
|
| 30 |
|
| 31 |
+
# ๋ผ๋ฒจ ์ธ์ฝ๋ฉ & ๋น๋ โค 10์ธ ํด๋์ค ์ ๊ฑฐ
|
| 32 |
le_label = LabelEncoder()
|
| 33 |
df["๋์ฐฐ์ฐจ์_LE"] = le_label.fit_transform(df["๋์ฐฐ์ฐจ์"])
|
| 34 |
counts = df["๋์ฐฐ์ฐจ์_LE"].value_counts()
|
| 35 |
rare = counts[counts <= 10].index.tolist()
|
| 36 |
df = df[~df["๋์ฐฐ์ฐจ์_LE"].isin(rare)].reset_index(drop=True)
|
| 37 |
|
| 38 |
+
# ๋ ์ง ํ์ ๋ณ์ ์์ฑ
|
| 39 |
+
df["์ต์ด์
์ฐฐ_์ฐ๋"] = df["์ต์ด์
์ฐฐ์๊ธฐ"].dt.year
|
| 40 |
+
df["์ต์ด์
์ฐฐ_์"] = df["์ต์ด์
์ฐฐ์๊ธฐ"].dt.month
|
| 41 |
+
df["์ต์ด์
์ฐฐ_์ผ"] = df["์ต์ด์
์ฐฐ์๊ธฐ"].dt.day
|
| 42 |
+
df["์ต์ด์
์ฐฐ_์์ผ"] = df["์ต์ด์
์ฐฐ์๊ธฐ"].dt.weekday
|
| 43 |
+
df = df.drop(columns=["์ต์ด์
์ฐฐ์๊ธฐ"])
|
| 44 |
+
|
| 45 |
+
# ํผ์ฒ/ํ๊น ๋ถ๋ฆฌ
|
| 46 |
+
X = df[["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด",
|
| 47 |
+
"์ต์ด์
์ฐฐ_์ฐ๋", "์ต์ด์
์ฐฐ_์", "์ต์ด์
์ฐฐ_์ผ", "์ต์ด์
์ฐฐ_์์ผ",
|
| 48 |
+
"1์ฐจ์ต์ ์
์ฐฐ๊ฐ"]]
|
| 49 |
y = df["๋์ฐฐ์ฐจ์_LE"]
|
| 50 |
|
| 51 |
+
# ์ ์ฒ๋ฆฌ + ๋ชจ๋ธ ํ์ดํ๋ผ์ธ
|
| 52 |
cat_cols = ["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด"]
|
| 53 |
preprocessor = ColumnTransformer(
|
| 54 |
transformers=[
|
|
|
|
| 55 |
("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
|
| 56 |
],
|
| 57 |
remainder="passthrough"
|
|
|
|
| 61 |
("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
|
| 62 |
])
|
| 63 |
|
| 64 |
+
# ํ์ต
|
| 65 |
pipeline.fit(X, y)
|
| 66 |
|
| 67 |
+
# ํ์ดํ๋ผ์ธ & ๋ผ๋ฒจ ์ธ์ฝ๋ ์ ์ฅ
|
| 68 |
os.makedirs("output", exist_ok=True)
|
| 69 |
pipeline_path = "output/auction_pipeline.pkl"
|
| 70 |
label_path = "output/label_encoder.pkl"
|
|
|
|
| 76 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 77 |
f.write("\n".join(deps))
|
| 78 |
|
| 79 |
+
# Hugging Face ๋ ํฌ ์์ฑ ์๋
|
| 80 |
api = HfApi()
|
| 81 |
try:
|
| 82 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 83 |
except:
|
| 84 |
pass
|
| 85 |
|
| 86 |
+
# ๋ก์ปฌ์ ๋ ํฌ ํด๋ก (๊ธฐ์กด ์ญ์ ์ read-only ์ฒ๋ฆฌ)
|
| 87 |
local_dir = "hf_repo"
|
| 88 |
if os.path.isdir(local_dir):
|
| 89 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 90 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 91 |
|
| 92 |
+
# ํ์ผ ๋ณต์ฌ
|
| 93 |
for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
|
| 94 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 95 |
shutil.copy(src, dst)
|
|
|
|
| 99 |
repo.git_commit("Add trained pipeline + preprocessing code")
|
| 100 |
repo.git_push()
|
| 101 |
|
|
|
|
| 102 |
if __name__ == "__main__":
|
| 103 |
main()
|