"""
Data preparation script for the predictive maintenance project.

Responsibilities:
- Load the raw engine dataset from the Hugging Face dataset repo (preferred)
  or from the local data folder as a fallback.
- Clean and preprocess the data (rename columns, handle missing values,
  drop duplicates, basic sanity checks).
- Split the cleaned data into train and test sets.
- Save train and test CSVs locally.
- Upload the resulting train and test CSVs back to the Hugging Face dataset
  repo (only when an HF token and dataset repo are configured).
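
Typical invocation (the filename below is an assumption; substitute the actual
script name):

    python prepare_data.py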
"""

from __future__ import annotations

from typing import Tuple

import pandas as pd
from sklearn.model_selection import train_test_split

import config
from hf_data_utils import download_dataset_file, upload_dataset_file


def _load_raw_data_from_hf_or_local() -> pd.DataFrame:
    """
    Try to load the raw dataset from the Hugging Face dataset repo.
    If that fails (e.g., no token or repo yet), fall back to the local CSV.
    """
    # Preferred: load from HF dataset repo if token and repo are configured
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            remote_path = download_dataset_file(
                filename="data/engine_data.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            return pd.read_csv(remote_path)
        except Exception as exc:
            # Fall back to the local file, but report why the remote load failed
            print(
                "Warning: failed to load raw data from the HF dataset repo "
                f"({exc}); falling back to the local file."
            )

    # Local fallback
    if not config.RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data file not found at {config.RAW_DATA_FILE}. "
            "Ensure engine_data.csv exists or upload it to the HF dataset repo."
        )

    return pd.read_csv(config.RAW_DATA_FILE)
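
# Note: `download_dataset_file` is imported from the project-local
# `hf_data_utils` module. A minimal sketch of what such a helper typically
# wraps, assuming it is a thin layer over `huggingface_hub` (an assumption,
# not the project's actual implementation):
#
#     from huggingface_hub import hf_hub_download
#
#     def download_dataset_file(filename, repo_id, token, local_dir):
#         # repo_type="dataset" targets a dataset repo rather than a model repo
#         return hf_hub_download(
#             repo_id=repo_id,
#             filename=filename,
#             repo_type="dataset",
#             token=token,
#             local_dir=local_dir,
#         )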


def _clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Perform basic cleaning: standardize column names, drop extra columns and
    duplicate rows, impute missing numeric values, and coerce the target dtype.
    """
    # Standardize column names
    df = df.rename(columns=config.RAW_COLUMN_RENAME_MAP)

    # Keep only the expected columns (drop any extras, if present)
    expected_cols = set(config.FEATURE_COLUMNS + [config.TARGET_COLUMN])
    df = df[[col for col in df.columns if col in expected_cols]]
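
    # Basic sanity check (the module docstring promises one): fail fast if any
    # expected column is missing after the rename.
    missing = expected_cols - set(df.columns)
    if missing:
        raise ValueError(
            f"Missing expected columns after renaming: {sorted(missing)}"
        )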

    # Drop duplicate rows
    df = df.drop_duplicates().reset_index(drop=True)

    # Handle missing values: for this numeric dataset, fill with median
    if df.isna().any().any():
        df = df.fillna(df.median(numeric_only=True))

    # Ensure target is integer/binary
    df[config.TARGET_COLUMN] = df[config.TARGET_COLUMN].astype(int)

    return df
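
# Note: `RAW_COLUMN_RENAME_MAP`, `FEATURE_COLUMNS`, and `TARGET_COLUMN` are
# defined in `config`. Purely for illustration (these exact column names are
# assumptions, not the project's actual values), the rename map might look
# like:
#
#     RAW_COLUMN_RENAME_MAP = {
#         "Engine rpm": "engine_rpm",
#         "Lub oil pressure": "lub_oil_pressure",
#         "Engine Condition": "engine_condition",
#     }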


def _train_test_split(
    df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the cleaned dataframe into train and test sets.
    """
    X = df[config.FEATURE_COLUMNS]
    y = df[config.TARGET_COLUMN]

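    # Stratify on the target so the train and test splits preserve its class
    # balance, which matters if the failure label is imbalanced.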
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=y,
    )

    train_df = X_train.copy()
    train_df[config.TARGET_COLUMN] = y_train

    test_df = X_test.copy()
    test_df[config.TARGET_COLUMN] = y_test

    return train_df, test_df


def main() -> None:
    """
    Execute the full data preparation pipeline.
    """
    print("Loading raw data...")
    raw_df = _load_raw_data_from_hf_or_local()
    print(f"Raw data shape: {raw_df.shape}")

    print("Cleaning data...")
    clean_df = _clean_data(raw_df)
    print(f"Clean data shape: {clean_df.shape}")

    print("Performing train/test split...")
    train_df, test_df = _train_test_split(clean_df)
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Save locally
    config.PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(config.TRAIN_FILE, index=False)
    test_df.to_csv(config.TEST_FILE, index=False)
    print(f"Saved train to {config.TRAIN_FILE}")
    print(f"Saved test to {config.TEST_FILE}")

    # Upload to HF dataset repo, if configured
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            print("Uploading train and test splits to Hugging Face dataset repo...")
            upload_dataset_file(
                local_path=config.TRAIN_FILE,
                repo_id=config.HF_DATASET_REPO,
                repo_path="data/train.csv",
                token=config.HF_TOKEN,
            )
            upload_dataset_file(
                local_path=config.TEST_FILE,
                repo_id=config.HF_DATASET_REPO,
                repo_path="data/test.csv",
                token=config.HF_TOKEN,
            )
            print("Upload to Hugging Face completed.")
        except Exception as e:
            print(
                f"Warning: Failed to upload train/test to Hugging Face dataset repo: {e}"
            )
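
# Note: `upload_dataset_file` likewise comes from `hf_data_utils`. A minimal
# sketch of what such a helper typically wraps, again assuming a thin layer
# over `huggingface_hub` (an assumption, not the project's actual
# implementation):
#
#     from huggingface_hub import HfApi
#
#     def upload_dataset_file(local_path, repo_id, repo_path, token):
#         HfApi().upload_file(
#             path_or_fileobj=str(local_path),
#             path_in_repo=repo_path,
#             repo_id=repo_id,
#             repo_type="dataset",
#             token=token,
#         )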


if __name__ == "__main__":
    main()