Commit ·
7eb54e6
1
Parent(s): fa9b18c
Add trained pipelines for orders 1~5 (개별 파라미터) + training script
Browse files
models_by_order/order1/pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:041e9b5169234d467c3498975c5ba3448ade260c439ce76d32ac516c78b4a677
|
| 3 |
+
size 180342
|
models_by_order/order2/pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0c69aac43a89ac4f3fae7ef4dcde3828073fdca21ff27483d937f23b23cf6d6
|
| 3 |
+
size 274597
|
models_by_order/order3/pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85bc231dd68e1371c39185517de6eb65d13e42fc9e72636b9b541f0ee9d8b8be
|
| 3 |
+
size 382023
|
models_by_order/order4/pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df27cff94b39d0a360c32b1ba14ced6fc12ac833f7655db3195341b9d8659ead
|
| 3 |
+
size 609739
|
models_by_order/order5/pipeline.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c97e664735d7c448609cd943837432879bae5a1afcb9ee977416b023e9a77c88
|
| 3 |
+
size 1316200
|
onbid-map-etcp-train.py
CHANGED
|
@@ -5,18 +5,13 @@ import shutil
|
|
| 5 |
import stat
|
| 6 |
import pandas as pd
|
| 7 |
import joblib
|
| 8 |
-
|
| 9 |
from sklearn.preprocessing import OneHotEncoder
|
| 10 |
from sklearn.compose import ColumnTransformer
|
| 11 |
from sklearn.pipeline import Pipeline
|
| 12 |
from xgboost import XGBRegressor
|
| 13 |
-
|
| 14 |
from huggingface_hub import HfApi, Repository
|
| 15 |
|
| 16 |
-
# -----------------------------
|
| 17 |
# 차수별 하이퍼파라미터 설정
|
| 18 |
-
# -----------------------------
|
| 19 |
-
# ORDER_PARAMS[차수] = 해당 차수 XGBRegressor에 넘길 파라미터 딕셔너리
|
| 20 |
ORDER_PARAMS = {
|
| 21 |
1: {
|
| 22 |
"objective": "reg:squarederror",
|
|
@@ -65,9 +60,7 @@ ORDER_PARAMS = {
|
|
| 65 |
},
|
| 66 |
}
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
# Hugging Face 환경 변수
|
| 70 |
-
# -----------------------------
|
| 71 |
HF_REPO_NAME = "asteroidddd/onbid-map-etcp"
|
| 72 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 73 |
if HF_TOKEN is None:
|
|
@@ -81,13 +74,9 @@ def rm_readonly(func, path, exc_info):
|
|
| 81 |
func(path)
|
| 82 |
|
| 83 |
def main():
|
| 84 |
-
|
| 85 |
-
# 데이터
|
| 86 |
-
|
| 87 |
-
# TODO: 실제 데이터 경로로 변경하고, '최초입찰시기' 컬럼을 datetime 타입으로 로드해주세요.
|
| 88 |
-
# 예시:
|
| 89 |
-
# df = pd.read_csv("data/onbid_data.csv", parse_dates=["최초입찰시기"])
|
| 90 |
-
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl') # 실제 DataFrame으로 교체하세요.
|
| 91 |
|
| 92 |
# '자동차' 대분류 행 제거
|
| 93 |
if "대분류" in df.columns:
|
|
@@ -96,19 +85,29 @@ def main():
|
|
| 96 |
# '낙찰차수' 컬럼을 정수형으로 변환하고, 5 이상은 5로 통일
|
| 97 |
df["낙찰차수"] = df["낙찰차수"].astype(int).apply(lambda x: x if x < 5 else 5)
|
| 98 |
|
| 99 |
-
# -----------------------------
|
| 100 |
# 차수별 모델 학습 & 저장
|
| 101 |
-
# -----------------------------
|
| 102 |
for order in [1, 2, 3, 4, 5]:
|
| 103 |
-
|
|
|
|
| 104 |
subset = df[df["낙찰차수"] == order].copy().reset_index(drop=True)
|
| 105 |
if subset.empty:
|
| 106 |
print(f"차수 {order} 데이터가 없습니다. 건너뜁니다.")
|
| 107 |
continue
|
| 108 |
|
| 109 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
base_cols = ["대분류", "중분류", "기관"]
|
| 111 |
-
|
| 112 |
if order == 1:
|
| 113 |
bid_cols = ["1차최저입찰가"]
|
| 114 |
elif order == 2:
|
|
@@ -120,20 +119,17 @@ def main():
|
|
| 120 |
else: # order == 5
|
| 121 |
bid_cols = ["1차최저입찰가", "2차최저입찰가", "3차최저입찰가", "4차최저입찰가", "5차최저입찰가"]
|
| 122 |
|
| 123 |
-
|
|
|
|
| 124 |
y = subset["낙찰가율_최초최저가기준"].copy()
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
if "최초입찰시기" in X.columns:
|
| 128 |
-
X["최초입찰시기"] = X["최초입찰시기"].astype("int64")
|
| 129 |
-
|
| 130 |
-
# 4) 전처리 + 모델 파이프라인 정의
|
| 131 |
preprocessor = ColumnTransformer(
|
| 132 |
transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), base_cols)],
|
| 133 |
remainder="passthrough"
|
| 134 |
)
|
| 135 |
|
| 136 |
-
#
|
| 137 |
params = ORDER_PARAMS.get(order)
|
| 138 |
model = XGBRegressor(**params)
|
| 139 |
pipeline = Pipeline([
|
|
@@ -141,40 +137,35 @@ def main():
|
|
| 141 |
("regressor", model)
|
| 142 |
])
|
| 143 |
|
| 144 |
-
#
|
| 145 |
pipeline.fit(X, y)
|
| 146 |
print(f"차수 {order} 모델 학습 완료 (params: {params})")
|
| 147 |
|
| 148 |
-
#
|
| 149 |
output_dir = f"output/order{order}"
|
| 150 |
os.makedirs(output_dir, exist_ok=True)
|
| 151 |
joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
|
| 152 |
print(f" → pipeline.pkl 저장: {output_dir}/pipeline.pkl")
|
| 153 |
|
| 154 |
-
# -----------------------------
|
| 155 |
# requirements.txt 작성
|
| 156 |
-
# -----------------------------
|
| 157 |
deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
|
| 158 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 159 |
f.write("\n".join(deps))
|
| 160 |
|
| 161 |
-
#
|
| 162 |
-
# Hugging Face 업로드 파트
|
| 163 |
-
# -----------------------------
|
| 164 |
-
# 1) 레포 생성 시도
|
| 165 |
api = HfApi()
|
| 166 |
try:
|
| 167 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 168 |
except Exception:
|
| 169 |
pass
|
| 170 |
|
| 171 |
-
#
|
| 172 |
local_dir = "hf_repo"
|
| 173 |
if os.path.isdir(local_dir):
|
| 174 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 175 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 176 |
|
| 177 |
-
#
|
| 178 |
for order in [1, 2, 3, 4, 5]:
|
| 179 |
src_dir = f"output/order{order}"
|
| 180 |
if not os.path.isdir(src_dir):
|
|
@@ -187,12 +178,12 @@ def main():
|
|
| 187 |
if os.path.isfile(src_file):
|
| 188 |
shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))
|
| 189 |
|
| 190 |
-
#
|
| 191 |
for src in [SCRIPT_PATH, "requirements.txt"]:
|
| 192 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 193 |
shutil.copy(src, dst)
|
| 194 |
|
| 195 |
-
#
|
| 196 |
repo.git_add(auto_lfs_track=True)
|
| 197 |
repo.git_commit("Add trained pipelines for orders 1~5 (개별 파라미터) + training script")
|
| 198 |
repo.git_push()
|
|
|
|
| 5 |
import stat
|
| 6 |
import pandas as pd
|
| 7 |
import joblib
|
|
|
|
| 8 |
from sklearn.preprocessing import OneHotEncoder
|
| 9 |
from sklearn.compose import ColumnTransformer
|
| 10 |
from sklearn.pipeline import Pipeline
|
| 11 |
from xgboost import XGBRegressor
|
|
|
|
| 12 |
from huggingface_hub import HfApi, Repository
|
| 13 |
|
|
|
|
| 14 |
# 차수별 하이퍼파라미터 설정
|
|
|
|
|
|
|
| 15 |
ORDER_PARAMS = {
|
| 16 |
1: {
|
| 17 |
"objective": "reg:squarederror",
|
|
|
|
| 60 |
},
|
| 61 |
}
|
| 62 |
|
| 63 |
+
# 환경 변수에서 Hugging Face 토큰 읽기
|
|
|
|
|
|
|
| 64 |
HF_REPO_NAME = "asteroidddd/onbid-map-etcp"
|
| 65 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 66 |
if HF_TOKEN is None:
|
|
|
|
| 74 |
func(path)
|
| 75 |
|
| 76 |
def main():
|
| 77 |
+
|
| 78 |
+
# 데이터 불러오기
|
| 79 |
+
df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
# '자동차' 대분류 행 제거
|
| 82 |
if "대분류" in df.columns:
|
|
|
|
| 85 |
# '낙찰차수' 컬럼을 정수형으로 변환하고, 5 이상은 5로 통일
|
| 86 |
df["낙찰차수"] = df["낙찰차수"].astype(int).apply(lambda x: x if x < 5 else 5)
|
| 87 |
|
|
|
|
| 88 |
# 차수별 모델 학습 & 저장
|
|
|
|
| 89 |
for order in [1, 2, 3, 4, 5]:
|
| 90 |
+
|
| 91 |
+
# 해당 차수 데이터만 필터링
|
| 92 |
subset = df[df["낙찰차수"] == order].copy().reset_index(drop=True)
|
| 93 |
if subset.empty:
|
| 94 |
print(f"차수 {order} 데이터가 없습니다. 건너뜁니다.")
|
| 95 |
continue
|
| 96 |
|
| 97 |
+
# 날짜 컬럼(datetime) 파생변수 생성
|
| 98 |
+
if "최초입찰시기" in subset.columns:
|
| 99 |
+
subset["최초입찰시기"] = pd.to_datetime(subset["최초입찰시기"])
|
| 100 |
+
subset["최초입찰_연도"] = subset["최초입찰시기"].dt.year
|
| 101 |
+
subset["최초입찰_월"] = subset["최초입찰시기"].dt.month
|
| 102 |
+
subset["최초입찰_일"] = subset["최초입찰시기"].dt.day
|
| 103 |
+
subset["최초입찰_요일"] = subset["최초입찰시기"].dt.weekday
|
| 104 |
+
subset = subset.drop(columns=["최초입찰시기"])
|
| 105 |
+
else:
|
| 106 |
+
raise KeyError("최초입찰시기 컬럼이 데이터프레임에 없습니다.")
|
| 107 |
+
|
| 108 |
+
# 사용할 피처 컬럼 결정
|
| 109 |
base_cols = ["대분류", "중분류", "기관"]
|
| 110 |
+
date_cols = ["최초입찰_연도", "최초입찰_월", "최초입찰_일", "최초입찰_요일"]
|
| 111 |
if order == 1:
|
| 112 |
bid_cols = ["1차최저입찰가"]
|
| 113 |
elif order == 2:
|
|
|
|
| 119 |
else: # order == 5
|
| 120 |
bid_cols = ["1차최저입찰가", "2차최저입찰가", "3차최저입찰가", "4차최저입찰가", "5차최저입찰가"]
|
| 121 |
|
| 122 |
+
feature_cols = base_cols + date_cols + bid_cols
|
| 123 |
+
X = subset[feature_cols].copy()
|
| 124 |
y = subset["낙찰가율_최초최저가기준"].copy()
|
| 125 |
|
| 126 |
+
# 전처리 + 모델 파이프라인 정의
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
preprocessor = ColumnTransformer(
|
| 128 |
transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), base_cols)],
|
| 129 |
remainder="passthrough"
|
| 130 |
)
|
| 131 |
|
| 132 |
+
# 차수별 파라미터를 꺼내서 XGBRegressor 생성
|
| 133 |
params = ORDER_PARAMS.get(order)
|
| 134 |
model = XGBRegressor(**params)
|
| 135 |
pipeline = Pipeline([
|
|
|
|
| 137 |
("regressor", model)
|
| 138 |
])
|
| 139 |
|
| 140 |
+
# 전체 데이터로 학습
|
| 141 |
pipeline.fit(X, y)
|
| 142 |
print(f"차수 {order} 모델 학습 완료 (params: {params})")
|
| 143 |
|
| 144 |
+
# 모델 저장
|
| 145 |
output_dir = f"output/order{order}"
|
| 146 |
os.makedirs(output_dir, exist_ok=True)
|
| 147 |
joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
|
| 148 |
print(f" → pipeline.pkl 저장: {output_dir}/pipeline.pkl")
|
| 149 |
|
|
|
|
| 150 |
# requirements.txt 작성
|
|
|
|
| 151 |
deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
|
| 152 |
with open("requirements.txt", "w", encoding="utf-8") as f:
|
| 153 |
f.write("\n".join(deps))
|
| 154 |
|
| 155 |
+
# 레포 생성 시도
|
|
|
|
|
|
|
|
|
|
| 156 |
api = HfApi()
|
| 157 |
try:
|
| 158 |
api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
|
| 159 |
except Exception:
|
| 160 |
pass
|
| 161 |
|
| 162 |
+
# 로컬에 레포 클론 (기존 디렉토리 삭제 포함)
|
| 163 |
local_dir = "hf_repo"
|
| 164 |
if os.path.isdir(local_dir):
|
| 165 |
shutil.rmtree(local_dir, onerror=rm_readonly)
|
| 166 |
repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
|
| 167 |
|
| 168 |
+
# output/order{차수} 내 파일을 hf_repo/models_by_order/order{차수} 폴더로 복사
|
| 169 |
for order in [1, 2, 3, 4, 5]:
|
| 170 |
src_dir = f"output/order{order}"
|
| 171 |
if not os.path.isdir(src_dir):
|
|
|
|
| 178 |
if os.path.isfile(src_file):
|
| 179 |
shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))
|
| 180 |
|
| 181 |
+
# 스크립트 파일 및 requirements.txt도 함께 복사
|
| 182 |
for src in [SCRIPT_PATH, "requirements.txt"]:
|
| 183 |
dst = os.path.join(local_dir, os.path.basename(src))
|
| 184 |
shutil.copy(src, dst)
|
| 185 |
|
| 186 |
+
# 커밋 및 푸시
|
| 187 |
repo.git_add(auto_lfs_track=True)
|
| 188 |
repo.git_commit("Add trained pipelines for orders 1~5 (개별 파라미터) + training script")
|
| 189 |
repo.git_push()
|