File size: 3,473 Bytes
4b4d652
 
 
 
 
 
 
ba69daf
4b4d652
 
 
 
 
ba69daf
4b4d652
 
 
 
 
ba69daf
4b4d652
5a2a0c6
4b4d652
 
 
 
 
5a2a0c6
4b4d652
 
ba69daf
4b4d652
 
 
 
 
 
ba69daf
 
 
 
 
 
 
 
 
 
 
4b4d652
 
ba69daf
4b4d652
 
 
 
 
 
 
 
 
 
 
 
ba69daf
4b4d652
 
ba69daf
4b4d652
 
 
 
 
 
5a2a0c6
4b4d652
 
 
 
ba69daf
4b4d652
 
 
 
b6795f3
4b4d652
ba69daf
4b4d652
 
 
 
 
ba69daf
4b4d652
 
 
 
5a2a0c6
4b4d652
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# onbid_map_round_train.py

import os
import shutil
import stat
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from huggingface_hub import HfApi, Repository

# ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ Hugging Face ํ† ํฐ ์ฝ๊ธฐ
HF_REPO_NAME = "asteroidddd/onbid-map-round"
HF_TOKEN     = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")

# ์ด ์Šคํฌ๋ฆฝํŠธ์˜ ๊ฒฝ๋กœ
SCRIPT_PATH = os.path.abspath(__file__)

def rm_readonly(func, path, exc_info):
    """shutil.rmtree ``onerror`` hook: clear the read-only bit, then retry.

    Deleting a fresh git checkout can fail on files marked read-only
    (typically under ``.git/`` on Windows); making the path writable and
    re-invoking the failed operation lets the removal proceed.
    """
    os.chmod(path, stat.S_IWRITE)
    func(path)

def main(data_path=r'C:\Users\hwang\Desktop\OSSP\data.pkl'):
    """Train the auction winning-round classifier and publish it to the Hub.

    Loads the pickled training data, drops rare target classes, derives
    date features, fits a OneHotEncoder + XGBClassifier pipeline, then
    commits the pipeline, label encoder, this script, and a requirements
    file to the Hugging Face repository.

    Parameters
    ----------
    data_path : str
        Path of the pickled DataFrame. Defaults to the original
        hard-coded location so existing callers are unaffected.
    """
    df = pd.read_pickle(data_path)

    df, le_label = _prepare_data(df)

    # Feature / target split
    X = df[["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€",
            "์ตœ์ดˆ์ž…์ฐฐ_์—ฐ๋„", "์ตœ์ดˆ์ž…์ฐฐ_์›”", "์ตœ์ดˆ์ž…์ฐฐ_์ผ", "์ตœ์ดˆ์ž…์ฐฐ_์š”์ผ",
            "1์ฐจ์ตœ์ €์ž…์ฐฐ๊ฐ€"]]
    y = df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"]

    pipeline = _build_pipeline()
    pipeline.fit(X, y)

    pipeline_path, label_path = _save_artifacts(pipeline, le_label)
    _push_to_hub([SCRIPT_PATH, "requirements.txt", pipeline_path, label_path])


def _prepare_data(df):
    """Filter rare classes, label-encode the target, and add date features.

    Returns the transformed frame and the fitted LabelEncoder.
    """
    # Drop classes with <= 10 samples BEFORE fitting the encoder.
    # XGBClassifier requires labels to be the contiguous range
    # 0..n_classes-1; the previous encode-then-filter order left gaps in
    # the encoded labels whenever a rare class was removed, which makes
    # XGBClassifier.fit reject y.
    counts = df["๋‚™์ฐฐ์ฐจ์ˆ˜"].value_counts()
    rare = counts[counts <= 10].index.tolist()
    df = df[~df["๋‚™์ฐฐ์ฐจ์ˆ˜"].isin(rare)].reset_index(drop=True)

    le_label = LabelEncoder()
    df["๋‚™์ฐฐ์ฐจ์ˆ˜_LE"] = le_label.fit_transform(df["๋‚™์ฐฐ์ฐจ์ˆ˜"])

    # Date-derived features (assumes "์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ" is already a
    # datetime64 column — TODO confirm against the pickle's producer).
    df["์ตœ์ดˆ์ž…์ฐฐ_์—ฐ๋„"] = df["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.year
    df["์ตœ์ดˆ์ž…์ฐฐ_์›”"]   = df["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.month
    df["์ตœ์ดˆ์ž…์ฐฐ_์ผ"]   = df["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.day
    df["์ตœ์ดˆ์ž…์ฐฐ_์š”์ผ"] = df["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"].dt.weekday
    df = df.drop(columns=["์ตœ์ดˆ์ž…์ฐฐ์‹œ๊ธฐ"])
    return df, le_label


def _build_pipeline():
    """One-hot encode the categorical columns and classify with XGBoost."""
    cat_cols = ["๋Œ€๋ถ„๋ฅ˜", "์ค‘๋ถ„๋ฅ˜", "๊ธฐ๊ด€"]
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": unseen categories at inference time
            # map to the all-zeros vector instead of raising.
            ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ],
        remainder="passthrough"
    )
    return Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(eval_metric="mlogloss", random_state=42))
    ])


def _save_artifacts(pipeline, le_label):
    """Persist the fitted pipeline/encoder and write requirements.txt.

    Returns ``(pipeline_path, label_path)``.
    """
    os.makedirs("output", exist_ok=True)
    pipeline_path = "output/auction_pipeline.pkl"
    label_path    = "output/label_encoder.pkl"
    joblib.dump(pipeline, pipeline_path)
    joblib.dump(le_label, label_path)

    deps = ["pandas", "scikit-learn", "xgboost", "joblib", "huggingface_hub"]
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(deps))
    return pipeline_path, label_path


def _push_to_hub(files):
    """Clone the target repo, copy *files* into it, commit, and push."""
    api = HfApi()
    # Best effort: exist_ok=True makes "already created" a no-op instead of
    # relying on a bare except to swallow it; anything else (network, auth)
    # is logged but does not abort — clone/push below will surface real
    # failures.
    try:
        api.create_repo(repo_id=HF_REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as exc:
        print(f"create_repo failed (continuing; repo may already exist): {exc}")

    # Re-clone locally; clear the read-only bit on leftovers (e.g. .git
    # files on Windows) so the stale checkout can be removed.
    local_dir = "hf_repo"
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir, onerror=rm_readonly)
    # NOTE(review): Repository is deprecated in recent huggingface_hub
    # releases — consider api.upload_file/upload_folder when upgrading.
    repo = Repository(local_dir=local_dir, clone_from=HF_REPO_NAME, use_auth_token=HF_TOKEN)

    for src in files:
        shutil.copy(src, os.path.join(local_dir, os.path.basename(src)))

    repo.git_add(auto_lfs_track=True)
    repo.git_commit("Add trained pipeline + preprocessing code")
    repo.git_push()

# Run the full train-and-publish workflow only when executed as a script,
# not when imported.
if __name__ == "__main__":
    main()