|
|
""" |
|
|
Data preparation script for the predictive maintenance project. |
|
|
|
|
|
Responsibilities: |
|
|
- Load the raw engine dataset from the Hugging Face dataset repo (preferred) |
|
|
or from the local data folder as a fallback. |
|
|
- Clean and preprocess the data (rename columns, handle missing values, |
|
|
drop duplicates, basic sanity checks). |
|
|
- Split the cleaned data into train and test sets. |
|
|
- Save train and test CSVs locally. |
|
|
- Upload the resulting train and test CSVs back to the Hugging Face dataset repo. |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from pathlib import Path |
|
|
from typing import Tuple |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
import config |
|
|
from hf_data_utils import download_dataset_file, upload_dataset_file |
|
|
|
|
|
|
|
|
def _load_raw_data_from_hf_or_local() -> pd.DataFrame:
    """
    Load the raw engine dataset as a DataFrame.

    Prefers the Hugging Face dataset repo (used only when both
    ``config.HF_TOKEN`` and ``config.HF_DATASET_REPO`` are set); falls back
    to the local CSV at ``config.RAW_DATA_FILE`` if the download fails or
    remote access is not configured.

    Returns:
        The raw dataset loaded with ``pd.read_csv``.

    Raises:
        FileNotFoundError: If remote loading is unavailable and the local
            raw CSV does not exist either.
    """
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            remote_path = download_dataset_file(
                filename="data/engine_data.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            return pd.read_csv(remote_path)
        except Exception as e:
            # Best-effort remote load: fall back to the local copy, but
            # report why the remote path failed instead of swallowing it
            # silently (matches the warning style used for uploads in main).
            print(
                f"Warning: could not load raw data from Hugging Face ({e}); "
                "falling back to the local file."
            )

    if not config.RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data file not found at {config.RAW_DATA_FILE}. "
            "Ensure engine_data.csv exists or upload it to the HF dataset repo."
        )

    return pd.read_csv(config.RAW_DATA_FILE)
|
|
|
|
|
|
|
|
def _clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Perform basic cleaning on the raw dataset.

    Steps:
    - Rename raw columns to canonical names via ``config.RAW_COLUMN_RENAME_MAP``.
    - Validate that every feature column and the target column are present.
    - Keep only the expected columns.
    - Drop duplicate rows.
    - Fill missing numeric values with the per-column median.
    - Cast the target column to ``int``.

    Args:
        df: Raw dataframe as loaded from CSV.

    Returns:
        A cleaned copy of the dataframe.

    Raises:
        ValueError: If any expected column is missing after renaming
            (fail fast here with a clear message instead of an opaque
            KeyError later in the train/test split).
    """
    df = df.rename(columns=config.RAW_COLUMN_RENAME_MAP)

    expected_cols = set(config.FEATURE_COLUMNS + [config.TARGET_COLUMN])
    missing = expected_cols - set(df.columns)
    if missing:
        raise ValueError(
            f"Expected columns missing after rename: {sorted(missing)}"
        )

    # Keep only the expected columns, preserving their incoming order.
    df = df[[col for col in df.columns if col in expected_cols]]

    df = df.drop_duplicates().reset_index(drop=True)

    # Median imputation for numeric columns only; non-numeric NaNs (if any
    # existed) would survive this step untouched.
    if df.isna().any().any():
        df = df.fillna(df.median(numeric_only=True))

    # Ensure the label is an integer class id, not float/object.
    df[config.TARGET_COLUMN] = df[config.TARGET_COLUMN].astype(int)

    return df
|
|
|
|
|
|
|
|
def _train_test_split(
    df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the cleaned dataframe into train and test sets.

    Uses a stratified split on the target column so both splits keep the
    same class balance; split size and seed come from ``config``.

    Args:
        df: Cleaned dataframe containing feature and target columns.

    Returns:
        A ``(train_df, test_df)`` pair, each including the target column.
    """
    features = df[config.FEATURE_COLUMNS]
    labels = df[config.TARGET_COLUMN]

    feat_train, feat_test, lbl_train, lbl_test = train_test_split(
        features,
        labels,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=labels,
    )

    # Re-attach the labels so each split is a single self-contained frame.
    train_df = feat_train.copy()
    train_df[config.TARGET_COLUMN] = lbl_train

    test_df = feat_test.copy()
    test_df[config.TARGET_COLUMN] = lbl_test

    return train_df, test_df
|
|
|
|
|
|
|
|
def main() -> None:
    """
    Execute the full data preparation pipeline.

    Pipeline: load raw data -> clean -> stratified train/test split ->
    save CSVs locally -> (best-effort) upload both splits to the Hugging
    Face dataset repo when credentials are configured.
    """
    print("Loading raw data...")
    raw_df = _load_raw_data_from_hf_or_local()
    print(f"Raw data shape: {raw_df.shape}")

    print("Cleaning data...")
    clean_df = _clean_data(raw_df)
    print(f"Clean data shape: {clean_df.shape}")

    print("Performing train/test split...")
    train_df, test_df = _train_test_split(clean_df)
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Persist the splits locally regardless of whether an upload happens.
    config.PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(config.TRAIN_FILE, index=False)
    test_df.to_csv(config.TEST_FILE, index=False)
    print(f"Saved train to {config.TRAIN_FILE}")
    print(f"Saved test to {config.TEST_FILE}")

    # Upload is best-effort: a failure must not invalidate the local
    # artifacts, so we warn instead of raising.
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            print("Uploading train and test splits to Hugging Face dataset repo...")
            # Both splits follow the same upload recipe; loop instead of
            # duplicating the call.
            for local_path, repo_path in (
                (config.TRAIN_FILE, "data/train.csv"),
                (config.TEST_FILE, "data/test.csv"),
            ):
                upload_dataset_file(
                    local_path=local_path,
                    repo_id=config.HF_DATASET_REPO,
                    repo_path=repo_path,
                    token=config.HF_TOKEN,
                )
            print("Upload to Hugging Face completed.")
        except Exception as e:
            print(
                f"Warning: Failed to upload train/test to Hugging Face dataset repo: {e}"
            )
|
|
|
|
|
|
|
|
# Script entry point: run the full data-preparation pipeline when executed
# directly (no side effects on import).
if __name__ == "__main__":


    main()
|
|
|
|
|
|