fabagent / data /secom /preprocess.py
hee_!J
feat: Tier 1 이상 탐지 SECOM 기반 구현
77d44e1
Raw
History Blame Contribute Delete
1.56 kB
"""SECOM ์ „์ฒ˜๋ฆฌ
์ด์ƒ ํƒ์ง€ ๋ชจ๋ธ์— ๋„ฃ๊ธฐ ์ „ ๊ณตํ†ต ์ „์ฒ˜๋ฆฌ
- ์ „๋ถ€ ๊ฒฐ์ธก์ด๊ฑฐ๋‚˜ ๋ถ„์‚ฐ์ด 0(์ƒ์ˆ˜)์ธ ์ปฌ๋Ÿผ ์ œ๊ฑฐ
- ๋‚จ์€ ๊ฒฐ์ธก์น˜๋Š” ์ค‘์•™๊ฐ’์œผ๋กœ ์ž„ํ“จํ…Œ์ด์…˜
- StandardScaler๋กœ ์Šค์ผ€์ผ๋ง
agents/detection.py์™€ experiments/ ์–‘์ชฝ์—์„œ ๊ณต์šฉ์œผ๋กœ ์‚ฌ์šฉ
fit์€ train ๋ฐ์ดํ„ฐ์—๋งŒ, transform์€ train/test ๊ณตํ†ต์œผ๋กœ ์ ์šฉ
"""
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
class SecomPreprocessor:
    """Preprocess SECOM sensor data with an sklearn-style fit/transform API.

    Steps, in order:
      1. Drop columns that are entirely missing, then columns whose variance
         does not exceed ``var_threshold`` (constant or nearly constant).
      2. Impute the remaining missing values with the per-column median.
      3. Standardize with ``StandardScaler``.

    Call ``fit`` on the training data only; ``transform`` is then applied to
    both training and test data.
    """

    def __init__(self, var_threshold: float = 0.0):
        """Create an unfitted preprocessor.

        Args:
            var_threshold: Columns whose variance is less than or equal to
                this value are dropped during ``fit``.
        """
        self.var_threshold = var_threshold
        # Columns retained by ``fit``, in the order they appear in the input.
        self.keep_cols: list[str] = []
        self.imputer = SimpleImputer(strategy="median")
        self.scaler = StandardScaler()

    def fit(self, X: pd.DataFrame) -> "SecomPreprocessor":
        """Select the usable columns and fit the imputer and scaler on them.

        Variance is computed with ``DataFrame.var`` (pandas' default sample
        variance, missing values skipped).

        Args:
            X: Training data, one column per sensor.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: If no column survives the filtering. The previously
                stored ``keep_cols`` is left untouched in that case.
        """
        # Drop all-missing columns first, then the (nearly) constant ones.
        non_empty = X.columns[X.notna().any()]
        variances = X[non_empty].var()
        # A NaN variance (e.g. a single observed value) compares False here,
        # so such columns are dropped as well.
        selected = list(variances[variances > self.var_threshold].index)

        # Fail early with an actionable message instead of handing a
        # zero-column frame to scikit-learn.
        if not selected:
            raise ValueError(
                f"No usable columns: all {X.shape[1]} column(s) are entirely "
                f"missing or have variance <= {self.var_threshold}"
            )

        self.keep_cols = selected
        kept = X[self.keep_cols]
        self.imputer.fit(kept)
        # The scaler must see imputed values, so it is fitted on the
        # imputer's output rather than on the raw columns.
        self.scaler.fit(self.imputer.transform(kept))
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Apply the fitted column selection, imputation, and scaling.

        Args:
            X: Data containing at least the columns kept by ``fit``.

        Returns:
            The imputed and scaled values of the kept columns.
        """
        kept = X[self.keep_cols]
        return self.scaler.transform(self.imputer.transform(kept))

    def fit_transform(self, X: pd.DataFrame) -> np.ndarray:
        """Fit on ``X`` and return the transformed ``X``."""
        return self.fit(X).transform(X)