Commit ยท
5a2a0c6
1
Parent(s): 4b4d652
Add trained pipeline + preprocessing code
Browse files- onbid-map-round-train.py +18 -15
onbid-map-round-train.py
CHANGED
|
@@ -12,18 +12,19 @@ from sklearn.pipeline import Pipeline
|
|
| 12 |
from xgboost import XGBClassifier
|
| 13 |
from huggingface_hub import HfApi, Repository
|
| 14 |
|
| 15 |
-
# ํ๊ฒฝ ๋ณ์์์ ํ ํฐ ์ฝ์ด์ค๊ธฐ
|
| 16 |
HF_REPO_NAME = "asteroidddd/onbid-map-round"
|
| 17 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 18 |
if HF_TOKEN is None:
|
| 19 |
raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")
|
| 20 |
|
| 21 |
-
# ์คํฌ๋ฆฝํธ ๊ฒฝ๋ก
|
| 22 |
SCRIPT_PATH = os.path.abspath(__file__)
|
| 23 |
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
|
| 24 |
|
| 25 |
-
|
| 26 |
class DateFeatures(BaseEstimator, TransformerMixin):
|
|
|
|
| 27 |
def __init__(self, date_column="์ต์ด์
์ฐฐ์๊ธฐ"):
|
| 28 |
self.date_column = date_column
|
| 29 |
|
|
@@ -40,27 +41,29 @@ class DateFeatures(BaseEstimator, TransformerMixin):
|
|
| 40 |
return X.drop(columns=[self.date_column])
|
| 41 |
|
| 42 |
|
|
|
|
| 43 |
def rm_readonly(func, path, exc_info):
|
| 44 |
os.chmod(path, stat.S_IWRITE)
|
| 45 |
func(path)
|
| 46 |
|
| 47 |
|
| 48 |
def main():
|
| 49 |
-
|
|
|
|
| 50 |
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
|
| 51 |
|
| 52 |
-
#
|
| 53 |
le_label = LabelEncoder()
|
| 54 |
df["๋์ฐฐ์ฐจ์_LE"] = le_label.fit_transform(df["๋์ฐฐ์ฐจ์"])
|
| 55 |
counts = df["๋์ฐฐ์ฐจ์_LE"].value_counts()
|
| 56 |
rare = counts[counts <= 10].index.tolist()
|
| 57 |
df = df[~df["๋์ฐฐ์ฐจ์_LE"].isin(rare)].reset_index(drop=True)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
X = df[["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด", "์ต์ด์
์ฐฐ์๊ธฐ", "1์ฐจ์ต์ ์
์ฐฐ๊ฐ"]]
|
| 61 |
y = df["๋์ฐฐ์ฐจ์_LE"]
|
| 62 |
|
| 63 |
-
#
|
| 64 |
cat_cols = ["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด"]
|
| 65 |
preprocessor = ColumnTransformer(
|
| 66 |
transformers=[
|
|
@@ -74,40 +77,40 @@ def main():
|
|
| 74 |
("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
|
| 75 |
])
|
| 76 |
|
| 77 |
-
#
|
| 78 |
pipeline.fit(X, y)
|
| 79 |
|
| 80 |
-
#
|
| 81 |
os.makedirs("output", exist_ok=True)
|
| 82 |
pipeline_path = "output/auction_pipeline.pkl"
|
| 83 |
label_path = "output/label_encoder.pkl"
|
| 84 |
joblib.dump(pipeline, pipeline_path)
|
| 85 |
joblib.dump(le_label, label_path)
|
| 86 |
|
| 87 |
-
#
|
| 88 |
deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
|
| 89 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 90 |
f.write("\n".join(deps))
|
| 91 |
|
| 92 |
-
#
|
| 93 |
api = HfApi()
|
| 94 |
try:
|
| 95 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 96 |
except:
|
| 97 |
-
pass
|
| 98 |
|
| 99 |
-
#
|
| 100 |
local_dir = "hf_repo"
|
| 101 |
if os.path.isdir(local_dir):
|
| 102 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 103 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 104 |
|
| 105 |
-
#
|
| 106 |
for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
|
| 107 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 108 |
shutil.copy(src, dst)
|
| 109 |
|
| 110 |
-
#
|
| 111 |
repo.git_add(auto_lfs_track=True)
|
| 112 |
repo.git_commit("Add trained pipeline + preprocessing code")
|
| 113 |
repo.git_push()
|
|
|
|
| 12 |
from xgboost import XGBClassifier
|
| 13 |
from huggingface_hub import HfApi, Repository
|
| 14 |
|
| 15 |
+
# ํ๊ฒฝ ๋ณ์์์ Hugging Face ํ ํฐ ์ฝ์ด์ค๊ธฐ
|
| 16 |
HF_REPO_NAME = "asteroidddd/onbid-map-round"
|
| 17 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 18 |
if HF_TOKEN is None:
|
| 19 |
raise ValueError("ํ๊ฒฝ ๋ณ์ HF_TOKEN์ด ์ค์ ๋์ด ์์ง ์์ต๋๋ค.")
|
| 20 |
|
| 21 |
+
# ์ด ์คํฌ๋ฆฝํธ์ ๊ฒฝ๋ก์ ํ์ผ๋ช
|
| 22 |
SCRIPT_PATH = os.path.abspath(__file__)
|
| 23 |
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
|
| 24 |
|
| 25 |
+
# ์ต์ด์
์ฐฐ์๊ธฐ ์ฐ๋/์/์ผ/์์ผ ์ถ์ถ ๋ณํ๊ธฐ
|
| 26 |
class DateFeatures(BaseEstimator, TransformerMixin):
|
| 27 |
+
|
| 28 |
def __init__(self, date_column="์ต์ด์
์ฐฐ์๊ธฐ"):
|
| 29 |
self.date_column = date_column
|
| 30 |
|
|
|
|
| 41 |
return X.drop(columns=[self.date_column])
|
| 42 |
|
| 43 |
|
| 44 |
+
# ์ฝ๊ธฐ ์ ์ฉ ํ์ผ ์ญ์ ์ ๊ถํ ๋ณ๊ฒฝ ํ ์ฌ์๋
|
| 45 |
def rm_readonly(func, path, exc_info):
|
| 46 |
os.chmod(path, stat.S_IWRITE)
|
| 47 |
func(path)
|
| 48 |
|
| 49 |
|
| 50 |
def main():
|
| 51 |
+
|
| 52 |
+
# ๋ฐ์ดํฐ ๋ก๋
|
| 53 |
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
|
| 54 |
|
| 55 |
+
# ๋์ฐฐ์ฐจ์ ๋ ์ด๋ธ ์ธ์ฝ๋ฉ ํ, ๋น๋ โค 10์ธ ํด๋์ค ์ ๊ฑฐ
|
| 56 |
le_label = LabelEncoder()
|
| 57 |
df["๋์ฐฐ์ฐจ์_LE"] = le_label.fit_transform(df["๋์ฐฐ์ฐจ์"])
|
| 58 |
counts = df["๋์ฐฐ์ฐจ์_LE"].value_counts()
|
| 59 |
rare = counts[counts <= 10].index.tolist()
|
| 60 |
df = df[~df["๋์ฐฐ์ฐจ์_LE"].isin(rare)].reset_index(drop=True)
|
| 61 |
|
| 62 |
+
# ์
๋ ฅ(X)๊ณผ ํ๊น(y) ๋ถ๋ฆฌ
|
| 63 |
X = df[["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด", "์ต์ด์
์ฐฐ์๊ธฐ", "1์ฐจ์ต์ ์
์ฐฐ๊ฐ"]]
|
| 64 |
y = df["๋์ฐฐ์ฐจ์_LE"]
|
| 65 |
|
| 66 |
+
# ์ ์ฒ๋ฆฌ ๋ฐ ๋ชจ๋ธ ํ์ดํ๋ผ์ธ ์ ์
|
| 67 |
cat_cols = ["๋๋ถ๋ฅ", "์ค๋ถ๋ฅ", "๊ธฐ๊ด"]
|
| 68 |
preprocessor = ColumnTransformer(
|
| 69 |
transformers=[
|
|
|
|
| 77 |
("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
|
| 78 |
])
|
| 79 |
|
| 80 |
+
# ํ์ดํ๋ผ์ธ ํ์ต
|
| 81 |
pipeline.fit(X, y)
|
| 82 |
|
| 83 |
+
# ํ์ต๋ ํ์ดํ๋ผ์ธ๊ณผ ๋ ์ด๋ธ ์ธ์ฝ๋ ์ ์ฅ
|
| 84 |
os.makedirs("output", exist_ok=True)
|
| 85 |
pipeline_path = "output/auction_pipeline.pkl"
|
| 86 |
label_path = "output/label_encoder.pkl"
|
| 87 |
joblib.dump(pipeline, pipeline_path)
|
| 88 |
joblib.dump(le_label, label_path)
|
| 89 |
|
| 90 |
+
# requirements.txt ์์ฑ
|
| 91 |
deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
|
| 92 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 93 |
f.write("\n".join(deps))
|
| 94 |
|
| 95 |
+
# Hugging Face ๋ ํฌ์งํ ๋ฆฌ ์์ฑ ์๋
|
| 96 |
api = HfApi()
|
| 97 |
try:
|
| 98 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 99 |
except:
|
| 100 |
+
pass # ์ด๋ฏธ ์กด์ฌํ๋ฉด ๋ฌด์
|
| 101 |
|
| 102 |
+
# ๋ก์ปฌ์ ๋ ํฌ ํด๋ก (๊ธฐ์กด ์ญ์ ์ read-only ์ค๋ฅ ์ฒ๋ฆฌ)
|
| 103 |
local_dir = "hf_repo"
|
| 104 |
if os.path.isdir(local_dir):
|
| 105 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 106 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 107 |
|
| 108 |
+
# ํ์ํ ํ์ผ ๋ณต์ฌ
|
| 109 |
for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
|
| 110 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 111 |
shutil.copy(src, dst)
|
| 112 |
|
| 113 |
+
# ์ปค๋ฐ ๋ฐ ํธ์
|
| 114 |
repo.git_add(auto_lfs_track=True)
|
| 115 |
repo.git_commit("Add trained pipeline + preprocessing code")
|
| 116 |
repo.git_push()
|