asteroidddd committed on
Commit
2647f27
·
1 Parent(s): e1f9b76

Add trained pipelines for orders 1~5 (개별 파라미터) + training script

Browse files
models_by_order/order1/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34f774ecce18b5ee2983c3ae76d42096a726ea447ea9f12883960925db19d933
3
+ size 178070
models_by_order/order2/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017e7eb292066baca4286b2a17cd6e02a799ea7bc5c68ec0165e3ba2232e99ee
3
+ size 266967
models_by_order/order3/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9717526ffcf56a25a03a7a8e16bd022e55d3bf50d552b755c23ad61ab65129
3
+ size 353015
models_by_order/order4/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed8400a0d7dce5c7ae690f59b87e92c424afa7e8f8fd49b2b0714d182838508
3
+ size 490677
models_by_order/order5/pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf1a8a1fc6ba59d599ddab9615ae343f9a9ed224451257633b4c1303123b3c66
3
+ size 921883
onbid-map-carp-train.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# onbid-map-carp-train.py
#
# Trains one XGBoost regression pipeline per auction round (1-5) on the
# Onbid car-auction dataset and pushes the trained artifacts to the
# Hugging Face Hub.

import os
import shutil
import stat
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from huggingface_hub import HfApi, Repository

# Per-round hyperparameter settings.
# Later rounds use deeper trees, lower learning rates and more estimators.
ORDER_PARAMS = {
    1: {
        "objective": "reg:squarederror",
        "max_depth": 4,
        "learning_rate": 0.10,
        "n_estimators": 100,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
    },
    2: {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "n_estimators": 120,
        "subsample": 0.85,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    3: {
        "objective": "reg:squarederror",
        "max_depth": 6,
        "learning_rate": 0.03,
        "n_estimators": 150,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "random_state": 42,
    },
    4: {
        "objective": "reg:squarederror",
        "max_depth": 7,
        "learning_rate": 0.02,
        "n_estimators": 180,
        "subsample": 0.9,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
    5: {
        "objective": "reg:squarederror",
        "max_depth": 8,
        "learning_rate": 0.01,
        "n_estimators": 200,
        "subsample": 0.95,
        "colsample_bytree": 0.95,
        "random_state": 42,
    },
}

# Target Hub repo; the access token is read from the environment.
HF_REPO_NAME = "asteroidddd/onbid-map-carp"
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    # Fail fast at import time: pushing to the Hub is this script's purpose.
    raise ValueError("환경 변수 HF_TOKEN이 설정되어 있지 않습니다.")

# Absolute path of this script (it is copied into the Hub repo later on).
SCRIPT_PATH = os.path.abspath(__file__)
def rm_readonly(func, path, exc_info):
    """`shutil.rmtree` onerror hook: clear the read-only bit and retry.

    Windows marks files under ``.git`` as read-only, which makes the
    plain delete fail; make the path writable, then re-run the failed
    operation (``os.remove`` / ``os.rmdir``). ``exc_info`` is unused.
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    func(path)
# Source of the pickled training data; overridable via CAR_DATA_PATH so the
# script is no longer tied to one developer's Windows home directory.
DATA_PATH = os.getenv("CAR_DATA_PATH", r"C:\Users\hwang\Desktop\OSSP\car_data.pkl")


def _prepare_dataframe(path):
    """Load the pickled auction dataset and normalize the round column.

    Returns a DataFrame filtered to cars (when the category column exists),
    with '낙찰차수' coerced to int and rounds >= 5 pooled into 5.
    """
    df = pd.read_pickle(path)

    # Keep cars only (the dump may already be pre-filtered).
    if "대분류" in df.columns:
        df = df[(df["대분류"] == "자동차")].reset_index(drop=True)

    # clip(upper=5) replaces the per-element lambda: same result, vectorized.
    df["낙찰차수"] = df["낙찰차수"].astype(int).clip(upper=5)
    return df


def _build_features(subset, order):
    """Derive date features and select X / y for the given round.

    Returns ``(base_cols, X, y)`` where *base_cols* are the categorical
    columns to one-hot encode. Raises KeyError when the required
    '최초입찰시기' datetime column is missing.
    """
    if "최초입찰시기" not in subset.columns:
        raise KeyError("최초입찰시기 컬럼이 데이터프레임에 없습니다.")

    # Expand the first-bid timestamp into year/month/day/weekday features.
    ts = pd.to_datetime(subset["최초입찰시기"])
    subset["최초입찰_연도"] = ts.dt.year
    subset["최초입찰_월"] = ts.dt.month
    subset["최초입찰_일"] = ts.dt.day
    subset["최초입찰_요일"] = ts.dt.weekday
    subset = subset.drop(columns=["최초입찰시기"])

    base_cols = ["대분류", "중분류", "소분류", "제조사", "차종", "기관"]
    date_cols = ["최초입찰_연도", "최초입찰_월", "최초입찰_일", "최초입찰_요일"]
    # Rounds 1..order each contribute their minimum-bid-price column;
    # generating the list replaces the five-branch if/elif chain.
    bid_cols = [f"{i}차최저입찰가" for i in range(1, order + 1)]

    X = subset[base_cols + date_cols + bid_cols].copy()
    y = subset["낙찰가율_최초최저가기준"].copy()
    return base_cols, X, y


def _write_requirements():
    """Write the runtime dependency list to requirements.txt."""
    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))


def _upload_to_hub():
    """Clone the Hub repo, copy the artifacts in, then commit and push."""
    api = HfApi()
    try:
        api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
    except Exception:
        # Best-effort: the repo usually already exists; keep going.
        pass

    # Fresh local clone (read-only git files need the rm_readonly hook).
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    # NOTE(review): Repository is deprecated in recent huggingface_hub —
    # consider HfApi.upload_folder; kept here to match the pinned deps.
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)

    # Copy each trained pipeline into hf_repo/models_by_order/order{n}/.
    for order in range(1, 6):
        src_file = os.path.join(f"output/order{order}", "pipeline.pkl")
        if not os.path.isfile(src_file):
            continue
        dst_dir = os.path.join(local_dir, "models_by_order", f"order{order}")
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copy(src_file, os.path.join(dst_dir, "pipeline.pkl"))

    # Ship the training script and requirements.txt alongside the models.
    for src in [SCRIPT_PATH, "requirements.txt"]:
        shutil.copy(src, os.path.join(local_dir, os.path.basename(src)))

    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipelines for orders 1~5 (개별 파라미터) + training script")
    repo.git_push()
    print("Hugging Face Hub에 모델 업로드 완료")


def main():
    """Train one pipeline per auction round (1-5) and upload all artifacts."""
    df = _prepare_dataframe(DATA_PATH)

    for order in range(1, 6):
        # Rows for this round only.
        subset = df[df["낙찰차수"] == order].copy().reset_index(drop=True)
        if subset.empty:
            print(f"차수 {order} 데이터가 없습니다. 건너뜁니다.")
            continue

        base_cols, X, y = _build_features(subset, order)

        # One-hot encode the categoricals; numeric features pass through.
        preprocessor = ColumnTransformer(
            transformers=[("ohe", OneHotEncoder(handle_unknown="ignore"), base_cols)],
            remainder="passthrough",
        )
        # Direct indexing: a missing round raises KeyError loudly instead of
        # the original .get(...) returning None and crashing on **None.
        params = ORDER_PARAMS[order]
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", XGBRegressor(**params)),
        ])

        # Fit on all rows for this round (no hold-out split by design).
        pipeline.fit(X, y)
        print(f"차수 {order} 모델 학습 완료 (params: {params})")

        # Persist the fitted pipeline under output/order{n}/.
        output_dir = f"output/order{order}"
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, os.path.join(output_dir, "pipeline.pkl"))
        print(f" → pipeline.pkl 저장: {output_dir}/pipeline.pkl")

    _write_requirements()
    _upload_to_hub()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas
2
+ scikit-learn
3
+ xgboost
4
+ joblib
5
+ huggingface_hub