File size: 3,432 Bytes
f606871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159b5df
 
f606871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159b5df
f606871
 
 
 
 
159b5df
 
 
 
 
 
 
 
 
 
 
 
 
 
f606871
 
159b5df
 
f606871
 
 
 
 
 
 
 
 
 
 
 
 
159b5df
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""PHM 2016 Data Challenge CMP ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋”

CMP(Chemical Mechanical Planarization) ๊ณต์ • ์„ผ์„œ ๋ฐ์ดํ„ฐ, 25๊ฐœ ์‹ค ์„ผ์„œ ์ด๋ฆ„ ๊ณต๊ฐœ
- 1981 wafer ร— 2 stage(A/B)
- target: ํ‰๊ท  ์žฌ๋ฃŒ ์ œ๊ฑฐ์œจ(AVG_REMOVAL_RATE)
์ถœ์ฒ˜: https://phmsociety.org PHM Data Challenge 2016

raw/CMP-data/training/CMP-training-NNN.csv: trajectory ์‹œ๊ณ„์—ด (wafer ๋‹ค์ˆ˜)
raw/CMP-training-removalrate.csv: (WAFER_ID, STAGE, AVG_REMOVAL_RATE) ๋ผ๋ฒจ

per-wafer feature vector๋ฅผ ๋งŒ๋“ค๊ธฐ ์œ„ํ•ด trajectory๋ฅผ ํ‰๊ท ์œผ๋กœ ์ง‘๊ณ„
A3 ์•Œ๋žŒ(CMP step ์ด์ƒ)์ด ์ด ๋ฐ์ดํ„ฐ๋กœ Tier 1 ์ด์ƒ ํƒ์ง€ ์ˆ˜ํ–‰
"""
from functools import lru_cache
from pathlib import Path

import pandas as pd

# Directory holding this module; every dataset path below is resolved against it.
_MODULE_DIR = Path(__file__).parent

# Raw PHM 2016 download: trajectory CSVs plus the removal-rate label table.
RAW_DIR = _MODULE_DIR.joinpath("raw")
TRAIN_TRAJ_DIR = RAW_DIR.joinpath("CMP-data", "training")
TRAIN_LABEL = RAW_DIR.joinpath("CMP-training-removalrate.csv")

# Pre-aggregated feature cache (for distribution, ~400KB) - lets the loader
# work even when the raw data is not present.
CACHED_FEATURES = _MODULE_DIR.joinpath("phm_cmp_features.csv")

# Sensor columns to aggregate; these are the real, meaningful sensor names
# One column name per line, split on whitespace into a list of str. The order
# here is the column order of the aggregated feature frame.
SENSOR_COLS = """
    USAGE_OF_BACKING_FILM
    USAGE_OF_DRESSER
    USAGE_OF_POLISHING_TABLE
    USAGE_OF_DRESSER_TABLE
    PRESSURIZED_CHAMBER_PRESSURE
    MAIN_OUTER_AIR_BAG_PRESSURE
    CENTER_AIR_BAG_PRESSURE
    RETAINER_RING_PRESSURE
    RIPPLE_AIR_BAG_PRESSURE
    USAGE_OF_MEMBRANE
    USAGE_OF_PRESSURIZED_SHEET
    SLURRY_FLOW_LINE_A
    SLURRY_FLOW_LINE_B
    SLURRY_FLOW_LINE_C
    WAFER_ROTATION
    STAGE_ROTATION
    HEAD_ROTATION
    DRESSING_WATER_STATUS
    EDGE_AIR_BAG_PRESSURE
""".split()


@lru_cache(maxsize=1)
def load_phm_cmp() -> tuple[pd.DataFrame, pd.Series]:
    """Return the per-wafer CMP features and their removal-rate labels.

    Uses the cached feature CSV when it exists; otherwise aggregates the raw
    trajectories and saves the cache.

    The result is memoized by ``lru_cache``, so repeated calls return the
    very same objects. Copy them before mutating.

    Returns:
        A ``(features, labels)`` pair:
        features: (N, 19) - per-sensor mean values.
        labels: (N,) - AVG_REMOVAL_RATE.
        index: MultiIndex (WAFER_ID, STAGE).
    """
    # Pick the loader first, then call it: only one of the two ever runs.
    loader = _load_cached if CACHED_FEATURES.exists() else _build_and_cache
    return loader()


def _load_cached() -> tuple[pd.DataFrame, pd.Series]:
    """Read the cached feature CSV and split it into ``(features, labels)``.

    The CSV is indexed by (WAFER_ID, STAGE). The AVG_REMOVAL_RATE column
    becomes the label series and every remaining column is a feature.
    """
    table = pd.read_csv(CACHED_FEATURES, index_col=["WAFER_ID", "STAGE"])
    # pop() returns the label column and removes it from the frame in one step.
    target = table.pop("AVG_REMOVAL_RATE")
    return table, target


def _build_and_cache() -> tuple[pd.DataFrame, pd.Series]:
    """Aggregate raw trajectories per (wafer, stage) and save the cache CSV.

    Every ``CMP-training-*.csv`` trajectory file is read (keeping only the
    key columns and SENSOR_COLS), all rows are concatenated, and each sensor
    is averaged per (WAFER_ID, STAGE). Only the keys that also appear in the
    label file are kept. Features plus the AVG_REMOVAL_RATE column are then
    written to CACHED_FEATURES.

    Returns:
        ``(features, labels)`` sharing the same (WAFER_ID, STAGE) MultiIndex.

    Raises:
        FileNotFoundError: if the trajectory directory or the label file is
            missing, or if the directory holds no trajectory CSV.
    """
    if not TRAIN_TRAJ_DIR.exists() or not TRAIN_LABEL.exists():
        raise FileNotFoundError(
            f"PHM 2016 CMP 데이터가 없음, {RAW_DIR}에 데이터셋을 두거나 "
            f"{CACHED_FEATURES} 캐시 파일이 필요합니다 (data/phm2016/README.md 참고)"
        )

    # An existing but empty directory would otherwise surface as pandas'
    # opaque "No objects to concatenate" ValueError.
    traj_paths = sorted(TRAIN_TRAJ_DIR.glob("CMP-training-*.csv"))
    if not traj_paths:
        raise FileNotFoundError(
            f"No trajectory files matching 'CMP-training-*.csv' in {TRAIN_TRAJ_DIR}"
        )

    key_cols = ["WAFER_ID", "STAGE"]
    wanted_cols = key_cols + SENSOR_COLS
    frames = [pd.read_csv(path, usecols=wanted_cols) for path in traj_paths]
    # Concatenate before grouping so a wafer whose rows span several files
    # still gets a single mean over all of its rows.
    all_traj = pd.concat(frames, ignore_index=True)
    features = all_traj.groupby(key_cols)[SENSOR_COLS].mean()

    labels_all = pd.read_csv(TRAIN_LABEL).set_index(key_cols)["AVG_REMOVAL_RATE"]

    # Keep only the (wafer, stage) keys present on both sides.
    common = features.index.intersection(labels_all.index)
    features = features.loc[common]
    labels = labels_all.loc[common]

    # Save the cache so deployments work without the raw data. Write to a
    # sibling temp file and rename it over the target, so an interrupted
    # write never leaves a truncated cache for the next load to pick up.
    combined = features.copy()
    combined["AVG_REMOVAL_RATE"] = labels
    tmp_path = CACHED_FEATURES.with_name(CACHED_FEATURES.name + ".tmp")
    try:
        combined.to_csv(tmp_path)
        tmp_path.replace(CACHED_FEATURES)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise

    return features, labels