asteroidddd committed on
Commit
5a2a0c6
·
1 Parent(s): 4b4d652

Add trained pipeline + preprocessing code

Browse files
Files changed (1) hide show
  1. onbid-map-round-train.py +18 -15
onbid-map-round-train.py CHANGED
@@ -12,18 +12,19 @@ from sklearn.pipeline import Pipeline
12
  from xgboost import XGBClassifier
13
  from huggingface_hub import HfApi, Repository
14
 
15
- # ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ ํ† ํฐ ์ฝ์–ด์˜ค๊ธฐ
16
  HF_REPO_NAME = "asteroidddd/onbid-map-round"
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
  if HF_TOKEN is None:
19
  raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
20
 
21
- # ์Šคํฌ๋ฆฝํŠธ ๊ฒฝ๋กœ/์ด๋ฆ„
22
  SCRIPT_PATH = os.path.abspath(__file__)
23
  SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
24
 
25
-
26
  class DateFeatures(BaseEstimator, TransformerMixin):
 
27
  def __init__(self, date_column="์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"):
28
  self.date_column = date_column
29
 
@@ -40,27 +41,29 @@ class DateFeatures(BaseEstimator, TransformerMixin):
40
  return X.drop(columns=[self.date_column])
41
 
42
 
 
43
def rm_readonly(func, path, exc_info):
    """Error handler for ``shutil.rmtree``: clear the read-only flag and retry.

    On Windows, files under ``.git`` are often marked read-only, which makes
    deletion raise ``PermissionError``.  ``rmtree`` then invokes this handler
    with the failing operation (*func*), the offending *path*, and the
    exception info; we grant owner-write permission and retry the operation.
    The same (func, path, exc_info) signature is what ``rmtree(onerror=...)``
    expects.
    """
    # Make the entry writable, then re-run the operation that just failed.
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    func(path)
46
 
47
 
48
  def main():
49
- # 1) ๋ฐ์ดํ„ฐ ๋กœ๋“œ
 
50
  df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
51
 
52
- # 2) ๋ผ๋ฒจ ์ธ์ฝ”๋”ฉ ๋ฐ ํฌ๊ท€ ๋ ˆ์ด๋ธ” ์ œ๊ฑฐ
53
  le_label = LabelEncoder()
54
  df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"] = le_label.fit_transform(df["๋‚™์ฐฐ์ฐจ์ˆ˜"])
55
  counts = df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"].value_counts()
56
  rare = counts[counts <= 10].index.tolist()
57
  df = df[~df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"].isin(rare)].reset_index(drop=True)
58
 
59
- # 3) ํ”ผ์ฒ˜/ํƒ€๊นƒ ๋ถ„๋ฆฌ
60
  X = df[["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€", "์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ", "1์ฐจ์ตœ์ €์ž…์ฐฐ๊ฐ€"]]
61
  y = df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"]
62
 
63
- # 4) ์ „์ฒ˜๋ฆฌ + ๋ชจ๋ธ ํŒŒ์ดํ”„๋ผ์ธ
64
  cat_cols = ["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€"]
65
  preprocessor = ColumnTransformer(
66
  transformers=[
@@ -74,40 +77,40 @@ def main():
74
  ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
75
  ])
76
 
77
- # 5) ํ•™์Šต
78
  pipeline.fit(X, y)
79
 
80
- # 6) ํŒŒ์ดํ”„๋ผ์ธ ๋ฐ ๋ผ๋ฒจ ์ธ์ฝ”๋” ์ €์žฅ
81
  os.makedirs("output", exist_ok=True)
82
  pipeline_path = "output/auction_pipeline.pkl"
83
  label_path = "output/label_encoder.pkl"
84
  joblib.dump(pipeline, pipeline_path)
85
  joblib.dump(le_label, label_path)
86
 
87
- # 7) requirements.txt ์ž‘์„ฑ
88
  deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
89
  with open("requirements.txt", "w", encoding="utf-8") as f:
90
  f.write("\n".join(deps))
91
 
92
- # 8) Hugging Face ๋ฆฌํฌ ์ƒ์„ฑ ์‹œ๋„
93
  api = HfApi()
94
  try:
95
  api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
96
  except:
97
- pass
98
 
99
- # 9) ๋กœ์ปฌ์— ๋ ˆํฌ ํด๋ก  (๊ธฐ์กด ์‚ญ์ œ ์‹œ read-only ๋ฌธ์ œ ํ•ด๊ฒฐ)
100
  local_dir = "hf_repo"
101
  if os.path.isdir(local_dir):
102
  shutil.rmtree(local_dir, onerror=rm_readonly)
103
  repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
104
 
105
- # 10) ํŒŒ์ผ ๋ณต์‚ฌ
106
  for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
107
  dst = os.path.join(local_dir, os.path.basename(src))
108
  shutil.copy(src, dst)
109
 
110
- # 11) ์ปค๋ฐ‹ ๋ฐ ํ‘ธ์‹œ
111
  repo.git_add(auto_lfs_track=True)
112
  repo.git_commit("Add trained pipeline + preprocessing code")
113
  repo.git_push()
 
12
  from xgboost import XGBClassifier
13
  from huggingface_hub import HfApi, Repository
14
 
15
+ # ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ Hugging Face ํ† ํฐ ์ฝ์–ด์˜ค๊ธฐ
16
  HF_REPO_NAME = "asteroidddd/onbid-map-round"
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
  if HF_TOKEN is None:
19
  raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
20
 
21
+ # ์ด ์Šคํฌ๋ฆฝํŠธ์˜ ๊ฒฝ๋กœ์™€ ํŒŒ์ผ๋ช…
22
  SCRIPT_PATH = os.path.abspath(__file__)
23
  SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
24
 
25
+ # ์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ ์—ฐ๋„/์›”/์ผ/์š”์ผ ์ถ”์ถœ ๋ณ€ํ™˜๊ธฐ
26
  class DateFeatures(BaseEstimator, TransformerMixin):
27
+
28
  def __init__(self, date_column="์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"):
29
  self.date_column = date_column
30
 
 
41
  return X.drop(columns=[self.date_column])
42
 
43
 
44
+ # ์ฝ๊ธฐ ์ „์šฉ ํŒŒ์ผ ์‚ญ์ œ ์‹œ ๊ถŒํ•œ ๋ณ€๊ฒฝ ํ›„ ์žฌ์‹œ๋„
45
  def rm_readonly(func, path, exc_info):
46
  os.chmod(path, stat.S_IWRITE)
47
  func(path)
48
 
49
 
50
  def main():
51
+
52
+ # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
53
  df = pd.read_pickle(r'C:\Users\hwang\Desktop\OSSP\data.pkl')
54
 
55
+ # ๋‚™์ฐฐ์ฐจ์ˆ˜ ๋ ˆ์ด๋ธ” ์ธ์ฝ”๋”ฉ ํ›„, ๋นˆ๋„ โ‰ค 10์ธ ํด๋ž˜์Šค ์ œ๊ฑฐ
56
  le_label = LabelEncoder()
57
  df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"] = le_label.fit_transform(df["๋‚™์ฐฐ์ฐจ์ˆ˜"])
58
  counts = df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"].value_counts()
59
  rare = counts[counts <= 10].index.tolist()
60
  df = df[~df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"].isin(rare)].reset_index(drop=True)
61
 
62
+ # ์ž…๋ ฅ(X)๊ณผ ํƒ€๊นƒ(y) ๋ถ„๋ฆฌ
63
  X = df[["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€", "์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ", "1์ฐจ์ตœ์ €์ž…์ฐฐ๊ฐ€"]]
64
  y = df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"]
65
 
66
+ # ์ „์ฒ˜๋ฆฌ ๋ฐ ๋ชจ๋ธ ํŒŒ์ดํ”„๋ผ์ธ ์ •์˜
67
  cat_cols = ["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€"]
68
  preprocessor = ColumnTransformer(
69
  transformers=[
 
77
  ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
78
  ])
79
 
80
+ # ํŒŒ์ดํ”„๋ผ์ธ ํ•™์Šต
81
  pipeline.fit(X, y)
82
 
83
+ # ํ•™์Šต๋œ ํŒŒ์ดํ”„๋ผ์ธ๊ณผ ๋ ˆ์ด๋ธ” ์ธ์ฝ”๋” ์ €์žฅ
84
  os.makedirs("output", exist_ok=True)
85
  pipeline_path = "output/auction_pipeline.pkl"
86
  label_path = "output/label_encoder.pkl"
87
  joblib.dump(pipeline, pipeline_path)
88
  joblib.dump(le_label, label_path)
89
 
90
+ # requirements.txt ์ž‘์„ฑ
91
  deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
92
  with open("requirements.txt", "w", encoding="utf-8") as f:
93
  f.write("\n".join(deps))
94
 
95
+ # Hugging Face ๋ ˆํฌ์ง€ํ† ๋ฆฌ ์ƒ์„ฑ ์‹œ๋„
96
  api = HfApi()
97
  try:
98
  api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN)
99
  except:
100
+ pass # ์ด๋ฏธ ์กด์žฌํ•˜๋ฉด ๋ฌด์‹œ
101
 
102
+ # ๋กœ์ปฌ์— ๋ ˆํฌ ํด๋ก  (๊ธฐ์กด ์‚ญ์ œ ์‹œ read-only ์˜ค๋ฅ˜ ์ฒ˜๋ฆฌ)
103
  local_dir = "hf_repo"
104
  if os.path.isdir(local_dir):
105
  shutil.rmtree(local_dir, onerror=rm_readonly)
106
  repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)
107
 
108
+ # ํ•„์š”ํ•œ ํŒŒ์ผ ๋ณต์‚ฌ
109
  for src in [SCRIPT_PATH, "requirements.txt", pipeline_path, label_path]:
110
  dst = os.path.join(local_dir, os.path.basename(src))
111
  shutil.copy(src, dst)
112
 
113
+ # ์ปค๋ฐ‹ ๋ฐ ํ‘ธ์‹œ
114
  repo.git_add(auto_lfs_track=True)
115
  repo.git_commit("Add trained pipeline + preprocessing code")
116
  repo.git_push()