File size: 3,432 Bytes
"""PHM 2016 Data Challenge CMP dataset loader.

CMP (Chemical Mechanical Planarization) process sensor data; 25 columns,
with the sensor names disclosed.
- 1981 wafers × 2 stages (A/B)
- target: average material removal rate (AVG_REMOVAL_RATE)

Source: https://phmsociety.org PHM Data Challenge 2016

raw/CMP-data/training/CMP-training-NNN.csv: trajectory time series (many wafers)
raw/CMP-training-removalrate.csv: (WAFER_ID, STAGE, AVG_REMOVAL_RATE) labels

Trajectories are aggregated by mean to build a per-wafer feature vector.
The A3 attempt (CMP step anomalies) uses this data for Tier 1 anomaly detection.
"""
from functools import lru_cache
from pathlib import Path
import pandas as pd
# Location of the raw PHM 2016 download, relative to this module.
RAW_DIR = Path(__file__).parent.joinpath("raw")
TRAIN_TRAJ_DIR = RAW_DIR.joinpath("CMP-data", "training")
TRAIN_LABEL = RAW_DIR.joinpath("CMP-training-removalrate.csv")

# Pre-aggregated feature cache (for distribution, ~400KB): lets the loader
# work even when the raw files are not present.
CACHED_FEATURES = Path(__file__).with_name("phm_cmp_features.csv")

# Sensor columns that are aggregated; these are the ones whose names carry
# real meaning. Order is significant: it is the column order of the features.
SENSOR_COLS = [
    "USAGE_OF_BACKING_FILM",
    "USAGE_OF_DRESSER",
    "USAGE_OF_POLISHING_TABLE",
    "USAGE_OF_DRESSER_TABLE",
    "PRESSURIZED_CHAMBER_PRESSURE",
    "MAIN_OUTER_AIR_BAG_PRESSURE",
    "CENTER_AIR_BAG_PRESSURE",
    "RETAINER_RING_PRESSURE",
    "RIPPLE_AIR_BAG_PRESSURE",
    "USAGE_OF_MEMBRANE",
    "USAGE_OF_PRESSURIZED_SHEET",
    "SLURRY_FLOW_LINE_A",
    "SLURRY_FLOW_LINE_B",
    "SLURRY_FLOW_LINE_C",
    "WAFER_ROTATION",
    "STAGE_ROTATION",
    "HEAD_ROTATION",
    "DRESSING_WATER_STATUS",
    "EDGE_AIR_BAG_PRESSURE",
]
@lru_cache(maxsize=1)
def load_phm_cmp() -> tuple[pd.DataFrame, pd.Series]:
    """Return the per-wafer CMP feature table together with its labels.

    When the cache CSV (``CACHED_FEATURES``) exists it is read directly;
    otherwise the raw trajectories are aggregated, which also writes the
    cache file for next time.

    Returns:
        ``(features, labels)`` where

        - features: (N, 19) sensor mean values
        - labels: (N,) AVG_REMOVAL_RATE
        - both are indexed by a MultiIndex (WAFER_ID, STAGE)

    Note:
        The result is memoized with ``lru_cache``, so every call returns the
        same objects. Copy them before mutating in place.
    """
    # Pick the data source first, then invoke it exactly once.
    loader = _load_cached if CACHED_FEATURES.exists() else _build_and_cache
    return loader()
def _load_cached() -> tuple[pd.DataFrame, pd.Series]:
    """Read the cache CSV and split it into ``(features, labels)``.

    The file is indexed by (WAFER_ID, STAGE). The AVG_REMOVAL_RATE column
    becomes the label Series and every remaining column is a feature.
    """
    target = "AVG_REMOVAL_RATE"
    cached = pd.read_csv(CACHED_FEATURES, index_col=["WAFER_ID", "STAGE"])
    y = cached[target]
    x = cached.drop(columns=[target])
    return x, y
def _build_and_cache() -> tuple[pd.DataFrame, pd.Series]:
    """Aggregate raw trajectories per (WAFER_ID, STAGE) and cache the result.

    Every ``CMP-training-*.csv`` under ``TRAIN_TRAJ_DIR`` is read, the
    ``SENSOR_COLS`` are averaged per (WAFER_ID, STAGE), and the rows are
    aligned with the labels in ``TRAIN_LABEL``. Only keys present in both
    are kept. The combined table is then written to ``CACHED_FEATURES``.

    Returns:
        ``(features, labels)``: the sensor means and the AVG_REMOVAL_RATE
        Series, both indexed by a MultiIndex (WAFER_ID, STAGE).

    Raises:
        FileNotFoundError: if the trajectory directory or the label file is
            missing, or if the directory holds no trajectory CSV.
    """
    # Korean user-facing message: "PHM 2016 CMP data is missing; place the
    # dataset under RAW_DIR or provide the CACHED_FEATURES cache file
    # (see data/phm2016/README.md)".
    missing_msg = (
        f"PHM 2016 CMP 데이터가 없음, {RAW_DIR}에 데이터셋을 두거나 "
        f"{CACHED_FEATURES} 캐시 파일이 필요합니다 (data/phm2016/README.md 참고)"
    )
    if not TRAIN_TRAJ_DIR.exists() or not TRAIN_LABEL.exists():
        raise FileNotFoundError(missing_msg)

    wanted_cols = ["WAFER_ID", "STAGE"] + SENSOR_COLS
    frames = [
        pd.read_csv(path, usecols=wanted_cols)
        for path in sorted(TRAIN_TRAJ_DIR.glob("CMP-training-*.csv"))
    ]
    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        # ValueError; an empty directory is just another form of missing data.
        raise FileNotFoundError(missing_msg)

    all_traj = pd.concat(frames, ignore_index=True)
    features = all_traj.groupby(["WAFER_ID", "STAGE"])[SENSOR_COLS].mean()

    labels_df = pd.read_csv(TRAIN_LABEL)
    labels = labels_df.set_index(["WAFER_ID", "STAGE"])["AVG_REMOVAL_RATE"]

    # Keep only the wafer/stage keys that have both sensor data and a label.
    common = features.index.intersection(labels.index)
    features = features.loc[common]
    labels = labels.loc[common]

    # Save the cache so a distribution works without the raw files. Write to
    # a temp file and rename, so an interrupted write never leaves a
    # truncated file that later passes the CACHED_FEATURES.exists() check.
    combined = features.copy()
    combined["AVG_REMOVAL_RATE"] = labels
    tmp_path = CACHED_FEATURES.with_name(CACHED_FEATURES.name + ".tmp")
    combined.to_csv(tmp_path)
    tmp_path.replace(CACHED_FEATURES)
    return features, labels
|