fabagent / data /phm2016 /loader.py
hee_!J
feat: 자가학습·FAISS RAG·정량실험 + 배포 준비
159b5df
Raw
History Blame Contribute Delete
3.43 kB
"""PHM 2016 Data Challenge CMP dataset loader.
CMP (Chemical Mechanical Planarization) process sensor data; 25 real sensor names are disclosed.
- 1981 wafers × 2 stages (A/B)
- target: average material removal rate (AVG_REMOVAL_RATE)
Source: https://phmsociety.org PHM Data Challenge 2016
raw/CMP-data/training/CMP-training-NNN.csv: trajectory time series (multiple wafers)
raw/CMP-training-removalrate.csv: (WAFER_ID, STAGE, AVG_REMOVAL_RATE) labels
Trajectories are aggregated by their mean to build a per-wafer feature vector.
The A3 alarm (CMP step anomaly) runs Tier 1 anomaly detection on this data.
"""
from functools import lru_cache
from pathlib import Path
import pandas as pd
# Raw dataset layout, resolved relative to this module's directory.
RAW_DIR = Path(__file__).parent.joinpath("raw")
TRAIN_TRAJ_DIR = RAW_DIR.joinpath("CMP-data", "training")
TRAIN_LABEL = RAW_DIR.joinpath("CMP-training-removalrate.csv")

# Pre-aggregated feature cache (for deployment, ~400KB) - lets the loader
# work without the raw dataset.
CACHED_FEATURES = Path(__file__).with_name("phm_cmp_features.csv")

# Sensor columns to aggregate; these are the real, meaningful sensor names.
# Kept as a list because it is concatenated with other lists when reading CSVs.
SENSOR_COLS = [
    # consumable usage counters
    "USAGE_OF_BACKING_FILM",
    "USAGE_OF_DRESSER",
    "USAGE_OF_POLISHING_TABLE",
    "USAGE_OF_DRESSER_TABLE",
    # pressures
    "PRESSURIZED_CHAMBER_PRESSURE",
    "MAIN_OUTER_AIR_BAG_PRESSURE",
    "CENTER_AIR_BAG_PRESSURE",
    "RETAINER_RING_PRESSURE",
    "RIPPLE_AIR_BAG_PRESSURE",
    # more usage counters
    "USAGE_OF_MEMBRANE",
    "USAGE_OF_PRESSURIZED_SHEET",
    # slurry flow
    "SLURRY_FLOW_LINE_A",
    "SLURRY_FLOW_LINE_B",
    "SLURRY_FLOW_LINE_C",
    # rotation
    "WAFER_ROTATION",
    "STAGE_ROTATION",
    "HEAD_ROTATION",
    # remaining signals
    "DRESSING_WATER_STATUS",
    "EDGE_AIR_BAG_PRESSURE",
]
@lru_cache(maxsize=1)
def load_phm_cmp() -> tuple[pd.DataFrame, pd.Series]:
    """Return the PHM 2016 CMP features and labels.

    Uses the cached CSV when it exists; otherwise aggregates the raw
    trajectories and saves the cache first.

    Returns:
        features: (N, 19) sensor means when built from raw data
            (one column per entry in ``SENSOR_COLS``).
        labels: (N,) ``AVG_REMOVAL_RATE``.
        Both are indexed by a MultiIndex ``(WAFER_ID, STAGE)``.

    The result is memoized, so every call returns the same objects.
    """
    # Pick the data source once, then invoke it.
    loader = _load_cached if CACHED_FEATURES.exists() else _build_and_cache
    return loader()
def _load_cached() -> tuple[pd.DataFrame, pd.Series]:
    """Read the cache CSV and split it into ``(features, labels)``.

    The CSV is indexed by ``(WAFER_ID, STAGE)``; the ``AVG_REMOVAL_RATE``
    column becomes the labels and every remaining column is a feature.
    """
    target_col = "AVG_REMOVAL_RATE"
    table = pd.read_csv(CACHED_FEATURES, index_col=["WAFER_ID", "STAGE"])
    return table.drop(columns=[target_col]), table[target_col]
def _build_and_cache() -> tuple[pd.DataFrame, pd.Series]:
    """Aggregate raw trajectories per wafer-stage and save the cache CSV.

    Every ``CMP-training-*.csv`` under ``TRAIN_TRAJ_DIR`` is read (key columns
    plus ``SENSOR_COLS`` only), concatenated, and averaged per
    ``(WAFER_ID, STAGE)``. The result is restricted to the keys that also
    appear in ``TRAIN_LABEL`` and written, together with the label column,
    to ``CACHED_FEATURES``.

    Returns:
        ``(features, labels)``, both indexed by ``(WAFER_ID, STAGE)``.

    Raises:
        FileNotFoundError: if the trajectory directory or the label file is
            missing, or if the directory contains no trajectory CSV.
    """
    if not TRAIN_TRAJ_DIR.exists() or not TRAIN_LABEL.exists():
        raise FileNotFoundError(
            f"PHM 2016 CMP 데이터가 없음, {RAW_DIR}에 데이터셋을 두거나 "
            f"{CACHED_FEATURES} 캐시 파일이 필요합니다 (data/phm2016/README.md 참고)"
        )

    traj_paths = sorted(TRAIN_TRAJ_DIR.glob("CMP-training-*.csv"))
    if not traj_paths:
        # pd.concat([]) would fail with an unhelpful
        # "No objects to concatenate" ValueError; report the real cause.
        raise FileNotFoundError(
            f"No CMP-training-*.csv trajectory files found in {TRAIN_TRAJ_DIR}"
        )

    wanted_cols = ["WAFER_ID", "STAGE"] + SENSOR_COLS
    frames = [pd.read_csv(path, usecols=wanted_cols) for path in traj_paths]
    all_traj = pd.concat(frames, ignore_index=True)

    # One row per (WAFER_ID, STAGE): the mean of each sensor over the trajectory.
    features = all_traj.groupby(["WAFER_ID", "STAGE"])[SENSOR_COLS].mean()

    labels_df = pd.read_csv(TRAIN_LABEL)
    labels_df = labels_df.set_index(["WAFER_ID", "STAGE"])["AVG_REMOVAL_RATE"]

    # Keep only wafer-stage keys that have both features and a label.
    common = features.index.intersection(labels_df.index)
    features = features.loc[common]
    labels = labels_df.loc[common]

    # Save the cache so deployments work without the raw dataset.
    combined = features.copy()
    combined["AVG_REMOVAL_RATE"] = labels
    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated cache that later loads would accept.
    tmp_path = CACHED_FEATURES.with_name(CACHED_FEATURES.name + ".tmp")
    combined.to_csv(tmp_path)
    tmp_path.replace(CACHED_FEATURES)
    return features, labels