"""PHM 2016 Data Challenge CMP dataset loader.

CMP (Chemical Mechanical Planarization) process sensor data; the sensor names
are disclosed (25 in total).
- 1981 wafers × 2 stages (A/B)
- target: average material removal rate (AVG_REMOVAL_RATE)
Source: https://phmsociety.org PHM Data Challenge 2016

raw/CMP-data/training/CMP-training-NNN.csv: trajectory time series (multiple wafers per file)
raw/CMP-training-removalrate.csv: (WAFER_ID, STAGE, AVG_REMOVAL_RATE) labels

Trajectories are aggregated by mean to build a per-wafer feature vector.
A3 (CMP step anomaly) uses this data for Tier 1 anomaly detection.
"""
| from functools import lru_cache |
| from pathlib import Path |
|
|
| import pandas as pd |
|
|
# Raw dataset layout, resolved relative to the directory containing this module.
RAW_DIR = Path(__file__).parent.joinpath("raw")
TRAIN_TRAJ_DIR = RAW_DIR.joinpath("CMP-data", "training")
TRAIN_LABEL = RAW_DIR.joinpath("CMP-training-removalrate.csv")

# Aggregated feature/label table, stored next to this module.
CACHED_FEATURES = Path(__file__).with_name("phm_cmp_features.csv")
|
|
| |
# Sensor columns read from every trajectory CSV and averaged per (WAFER_ID, STAGE).
# The list order is the column order of the aggregated feature table.
SENSOR_COLS = [
    # USAGE_OF_* columns
    "USAGE_OF_BACKING_FILM",
    "USAGE_OF_DRESSER",
    "USAGE_OF_POLISHING_TABLE",
    "USAGE_OF_DRESSER_TABLE",
    # *_PRESSURE columns
    "PRESSURIZED_CHAMBER_PRESSURE",
    "MAIN_OUTER_AIR_BAG_PRESSURE",
    "CENTER_AIR_BAG_PRESSURE",
    "RETAINER_RING_PRESSURE",
    "RIPPLE_AIR_BAG_PRESSURE",
    # remaining USAGE_OF_* columns
    "USAGE_OF_MEMBRANE",
    "USAGE_OF_PRESSURIZED_SHEET",
    # SLURRY_FLOW_LINE_* columns
    "SLURRY_FLOW_LINE_A",
    "SLURRY_FLOW_LINE_B",
    "SLURRY_FLOW_LINE_C",
    # *_ROTATION columns
    "WAFER_ROTATION",
    "STAGE_ROTATION",
    "HEAD_ROTATION",
    # remaining columns
    "DRESSING_WATER_STATUS",
    "EDGE_AIR_BAG_PRESSURE",
]
|
|
|
|
@lru_cache(maxsize=1)
def load_phm_cmp() -> tuple[pd.DataFrame, pd.Series]:
    """Load the PHM 2016 CMP features and labels, preferring the cached CSV.

    If ``CACHED_FEATURES`` exists it is read directly; otherwise the raw
    trajectories are aggregated and the cache file is written first.
    The result is memoized by ``lru_cache``, so repeated calls return the
    same objects.

    Returns:
        features: (N, 19) - per-sensor mean values
        labels: (N,) - AVG_REMOVAL_RATE
        index: MultiIndex (WAFER_ID, STAGE)
    """
    # Only the selected loader name is looked up and invoked.
    loader = _load_cached if CACHED_FEATURES.exists() else _build_and_cache
    return loader()
|
|
|
|
def _load_cached() -> tuple[pd.DataFrame, pd.Series]:
    """Read the cached CSV and split it into a feature table and a label series.

    Returns:
        (features, labels): both indexed by (WAFER_ID, STAGE). ``labels`` is
        the AVG_REMOVAL_RATE column; ``features`` is every other column.
    """
    target_col = "AVG_REMOVAL_RATE"
    table = pd.read_csv(CACHED_FEATURES, index_col=["WAFER_ID", "STAGE"])
    # Pull the label column first, then drop it from the feature table.
    target = table[target_col]
    return table.drop(columns=[target_col]), target
|
|
|
|
def _build_and_cache() -> tuple[pd.DataFrame, pd.Series]:
    """Aggregate raw trajectories into per wafer-stage means and cache them as CSV.

    Every ``CMP-training-*.csv`` under ``TRAIN_TRAJ_DIR`` is read, the
    ``SENSOR_COLS`` columns are averaged per (WAFER_ID, STAGE), and the result
    is restricted to the keys that also appear in ``TRAIN_LABEL``. Features
    and labels are then written together to ``CACHED_FEATURES``.

    Returns:
        (features, labels): both indexed by (WAFER_ID, STAGE), in the same
        row order.

    Raises:
        FileNotFoundError: if the trajectory directory or the label file is
            missing, or if the directory holds no trajectory CSV.
    """
    missing_data_msg = (
        f"PHM 2016 CMP 데이터가 없음, {RAW_DIR}에 데이터셋을 두거나 "
        f"{CACHED_FEATURES} 캐시 파일이 필요합니다 (data/phm2016/README.md 참고)"
    )
    if not TRAIN_TRAJ_DIR.exists() or not TRAIN_LABEL.exists():
        raise FileNotFoundError(missing_data_msg)

    traj_paths = sorted(TRAIN_TRAJ_DIR.glob("CMP-training-*.csv"))
    if not traj_paths:
        # Without this guard pd.concat([]) fails with an unrelated-looking
        # "No objects to concatenate" ValueError.
        raise FileNotFoundError(missing_data_msg)

    wanted_cols = ["WAFER_ID", "STAGE"] + SENSOR_COLS
    frames = [pd.read_csv(path, usecols=wanted_cols) for path in traj_paths]
    # Concatenate before grouping so a (WAFER_ID, STAGE) key whose rows are
    # spread over several files is still averaged over all of its rows.
    all_traj = pd.concat(frames, ignore_index=True)
    features = all_traj.groupby(["WAFER_ID", "STAGE"])[SENSOR_COLS].mean()

    labels_all = pd.read_csv(TRAIN_LABEL)
    labels_all = labels_all.set_index(["WAFER_ID", "STAGE"])["AVG_REMOVAL_RATE"]

    # Keep only the keys present in both the trajectories and the label file.
    common = features.index.intersection(labels_all.index)
    features = features.loc[common]
    labels = labels_all.loc[common]

    combined = features.copy()
    combined["AVG_REMOVAL_RATE"] = labels

    # Write to a sibling temp file and move it into place, so an interrupted
    # write never leaves a truncated cache that later loads would trust.
    tmp_path = CACHED_FEATURES.with_name(CACHED_FEATURES.name + ".tmp")
    try:
        combined.to_csv(tmp_path)
        tmp_path.replace(CACHED_FEATURES)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)

    return features, labels
|
|