asteroidddd committed on
Commit
fa9b18c
·
1 Parent(s): f42f2c0

Add trained pipelines for orders 1~5 (개별 파라미터) + training script

Browse files
models_by_order/order1/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab13277dd54bccfc5d65419da333aa8999d70275fa8a2963e9dc892226aafa1
3
+ size 178082
models_by_order/order2/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9245c5f2c2452e9568f551c42b6da759ec0dc59a8ca1f948ab1425624d08ae67
3
+ size 271793
models_by_order/order3/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04c66492081b1032b210c18849065f221af61495706f18cdcc3313aaad24fef8
3
+ size 371671
models_by_order/order4/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b113a083518042de617cf76452f4e46368a64b91da2cdd230d4708e61458bbb
3
+ size 580403
models_by_order/order5/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f590a9679bc247b716bc0e2dfb57799c6253c10e00fa7cebdc627ee9967d212
3
+ size 1267904
onbid-map-etcp-train.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# onbid_map_round_train.py

import os
import shutil
import stat
import pandas as pd
import joblib

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from huggingface_hub import HfApi, Repository

# -----------------------------
# Per-round hyperparameter settings
# -----------------------------
# ORDER_PARAMS[round] = dict of keyword arguments passed to that round's
# XGBRegressor. Tree depth and estimator count grow while the learning rate
# shrinks as the round number increases — presumably tuned per round offline;
# TODO confirm where these values came from.
ORDER_PARAMS = {
    1: {
        "objective": "reg:squarederror",
        "max_depth": 4,
        "learning_rate": 0.10,
        "n_estimators": 100,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
    },
    2: {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "n_estimators": 120,
        "subsample": 0.85,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    3: {
        "objective": "reg:squarederror",
        "max_depth": 6,
        "learning_rate": 0.03,
        "n_estimators": 150,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    4: {
        "objective": "reg:squarederror",
        "max_depth": 7,
        "learning_rate": 0.02,
        "n_estimators": 180,
        "subsample": 0.9,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
    5: {
        "objective": "reg:squarederror",
        "max_depth": 8,
        "learning_rate": 0.01,
        "n_estimators": 200,
        "subsample": 0.95,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
}

# -----------------------------
# Hugging Face environment variables
# -----------------------------
HF_REPO_NAME = "asteroidddd/onbid-map-etcp"
# Token must be provided via the HF_TOKEN environment variable; the script
# refuses to run without it (it cannot push to the Hub).
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("환경 변수 HF_TOKEN이 설정되어 있지 않습니다.")

# Absolute path of this script — copied into the Hub repo on upload.
SCRIPT_PATH = os.path.abspath(__file__)
def rm_readonly(func, path, exc_info):
    """``shutil.rmtree`` onerror hook: clear the read-only bit and retry.

    On Windows, git object files are created read-only, which makes a plain
    ``shutil.rmtree`` fail with a permission error.  Making the path writable
    and re-invoking the failed operation (``func``) lets deletion proceed.

    Args:
        func: The os function that failed (e.g. ``os.remove``).
        path: The path it failed on.
        exc_info: The ``sys.exc_info()`` triple (unused here).
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    func(path)
def main():
    """Train one XGBoost pipeline per auction round (1–5) and push them to the Hub.

    Steps: load the DataFrame, drop the '자동차' category, cap the round
    column at 5, fit a OneHotEncoder+XGBRegressor pipeline per round with
    that round's hyperparameters, save each pipeline to ``output/order{N}``,
    then clone the Hub repo and commit the pickles, this script, and a
    generated ``requirements.txt``.

    Raises:
        KeyError: if a round has no entry in ``ORDER_PARAMS``.
    """
    # -----------------------------
    # Data loading
    # -----------------------------
    # TODO: point this at the real data source; the '최초입찰시기' column must
    # be loaded as a datetime dtype, e.g.:
    #   df = pd.read_csv("data/onbid_data.csv", parse_dates=["최초입찰시기"])
    df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')  # replace with the actual DataFrame

    # Drop rows whose major category is '자동차' (cars are out of scope).
    if "대분류" in df.columns:
        df = df[~(df["대분류"] == "자동차")].reset_index(drop=True)

    # Normalize the round column to int and cap everything at round 5.
    # clip() is the vectorized equivalent of the former per-row lambda.
    df["낙찰차수"] = df["낙찰차수"].astype(int).clip(upper=5)

    # -----------------------------
    # Per-round training & saving
    # -----------------------------
    for order in range(1, 6):
        # 1) Keep only this round's rows.
        subset = df[df["낙찰차수"] == order].copy().reset_index(drop=True)
        if subset.empty:
            print(f"차수 {order} 데이터가 없습니다. 건너뜁니다.")
            continue

        # 2) Feature columns for this round.
        base_cols = ["대분류", "중분류", "기관"]
        date_col = ["최초입찰시기"]
        # Round N uses the minimum-bid columns of rounds 1..N; this replaces
        # the previous hard-coded if/elif chain and generalizes to any N.
        bid_cols = [f"{i}차최저입찰가" for i in range(1, order + 1)]

        X = subset[base_cols + date_col + bid_cols].copy()
        y = subset["낙찰가율_최초최저가기준"].copy()

        # 3) Datetime -> int64 (nanosecond epoch) so XGBoost can consume it.
        if "최초입찰시기" in X.columns:
            X["최초입찰시기"] = X["최초입찰시기"].astype("int64")

        # 4) Preprocessing + model pipeline.
        preprocessor = ColumnTransformer(
            transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), base_cols)],
            remainder="passthrough",
        )

        # 5) Direct indexing (not .get) so a missing round fails loudly with
        #    KeyError instead of a confusing TypeError from XGBRegressor(**None).
        params = ORDER_PARAMS[order]
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", XGBRegressor(**params)),
        ])

        # 6) Fit on the full round subset (no hold-out split by design here).
        pipeline.fit(X, y)
        print(f"차수 {order} 모델 학습 완료 (params: {params})")

        # 7) Persist the fitted pipeline.
        output_dir = f"output/order{order}"
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
        print(f" → pipeline.pkl 저장: {output_dir}/pipeline.pkl")

    # -----------------------------
    # Write requirements.txt
    # -----------------------------
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))

    # -----------------------------
    # Hugging Face upload
    # -----------------------------
    # 1) Create the repo; best-effort since it may already exist.
    api = HfApi()
    try:
        api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
    except Exception:
        pass

    # 2) Fresh local clone (remove any stale checkout; rm_readonly handles
    #    Windows read-only git objects).
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)

    # 3) Copy each trained pipeline into hf_repo/models_by_order/order{N}.
    #    The destination dir is only created when there is actually a pickle
    #    to copy (previously empty dirs could be created for nothing).
    for order in range(1, 6):
        src_file = os.path.join(f"output/order{order}", "pipeline.pkl")
        if not os.path.isfile(src_file):
            continue
        dst_dir = os.path.join(local_dir, "models_by_order", f"order{order}")
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))

    # 4) Also ship this script and the requirements file.
    for src in [SCRIPT_PATH, "requirements.txt"]:
        shutil.copy(src, os.path.join(local_dir, os.path.basename(src)))

    # 5) Commit and push.
    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipelines for orders 1~5 (개별 파라미터) + training script")
    repo.git_push()
    print("Hugging Face Hub에 모델 업로드 완료")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas
2
+ scikit-learn
3
+ xgboost
4
+ joblib
5
+ huggingface_hub