# Uploaded via huggingface_hub by ananttripathiak (commit 1aa7fae, verified)
"""
Data preparation script for the predictive maintenance project.
Responsibilities:
- Load the raw engine dataset from the Hugging Face dataset repo (preferred)
or from the local data folder as a fallback.
- Clean and preprocess the data (rename columns, handle missing values,
drop duplicates, basic sanity checks).
- Split the cleaned data into train and test sets.
- Save train and test CSVs locally.
- Upload the resulting train and test CSVs back to the Hugging Face dataset repo.
"""
from __future__ import annotations
from pathlib import Path
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import config
from hf_data_utils import download_dataset_file, upload_dataset_file
def _load_raw_data_from_hf_or_local() -> pd.DataFrame:
    """
    Load the raw engine dataset.

    Preference order:
    1. Download ``data/engine_data.csv`` from the Hugging Face dataset repo
       (requires both HF_TOKEN and HF_DATASET_REPO to be configured).
    2. Fall back to the local raw CSV at ``config.RAW_DATA_FILE``.

    Returns:
        The raw dataset as a pandas DataFrame.

    Raises:
        FileNotFoundError: if the HF download is unavailable and the local
            raw CSV does not exist either.
    """
    # Preferred: load from HF dataset repo if token and repo are configured
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            remote_path = download_dataset_file(
                filename="data/engine_data.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            return pd.read_csv(remote_path)
        except Exception as e:
            # Best-effort download: surface the reason instead of failing
            # silently, then fall back to the local copy below.
            print(f"Warning: could not load raw data from HF dataset repo: {e}")
    # Local fallback
    if not config.RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data file not found at {config.RAW_DATA_FILE}. "
            "Ensure engine_data.csv exists or upload it to the HF dataset repo."
        )
    return pd.read_csv(config.RAW_DATA_FILE)
def _clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Perform basic cleaning on the raw dataframe.

    Steps:
    - Rename raw columns to the standardized names in config.
    - Validate that every expected column is present after renaming
      (fail fast with a clear message instead of a later KeyError in
      the split step).
    - Keep only the expected columns, in a deterministic order
      (features first, target last).
    - Drop duplicate rows and fill missing numeric values with the
      column median.
    - Cast the target column to int.

    Raises:
        ValueError: if any expected column is missing after renaming.
    """
    # Standardize column names
    df = df.rename(columns=config.RAW_COLUMN_RENAME_MAP)
    # Fail fast if the rename map / raw file did not produce the schema
    # the rest of the pipeline relies on.
    expected_cols = config.FEATURE_COLUMNS + [config.TARGET_COLUMN]
    missing = [col for col in expected_cols if col not in df.columns]
    if missing:
        raise ValueError(
            f"Missing expected columns after renaming: {missing}. "
            f"Available columns: {list(df.columns)}"
        )
    # Keep only the expected columns, in a stable order (drops any extras)
    df = df[expected_cols]
    # Drop duplicate rows
    df = df.drop_duplicates().reset_index(drop=True)
    # Handle missing values: for this numeric dataset, fill with median
    if df.isna().any().any():
        df = df.fillna(df.median(numeric_only=True))
    # Ensure target is integer/binary
    df[config.TARGET_COLUMN] = df[config.TARGET_COLUMN].astype(int)
    return df
def _train_test_split(
    df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the cleaned dataframe into stratified train and test sets.

    Returns:
        A ``(train_df, test_df)`` pair, each containing the feature
        columns plus the target column.
    """
    features = df[config.FEATURE_COLUMNS]
    target = df[config.TARGET_COLUMN]
    feat_train, feat_test, tgt_train, tgt_test = train_test_split(
        features,
        target,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=target,
    )

    def _assemble(feats: pd.DataFrame, tgt: pd.Series) -> pd.DataFrame:
        # Recombine a feature frame with its target column.
        combined = feats.copy()
        combined[config.TARGET_COLUMN] = tgt
        return combined

    return _assemble(feat_train, tgt_train), _assemble(feat_test, tgt_test)
def main() -> None:
    """
    Execute the full data preparation pipeline: load, clean, split,
    save the splits locally, and (when credentials are configured)
    upload them to the Hugging Face dataset repo.
    """
    print("Loading raw data...")
    raw_df = _load_raw_data_from_hf_or_local()
    print(f"Raw data shape: {raw_df.shape}")

    print("Cleaning data...")
    clean_df = _clean_data(raw_df)
    print(f"Clean data shape: {clean_df.shape}")

    print("Performing train/test split...")
    train_df, test_df = _train_test_split(clean_df)
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Persist both splits locally.
    config.PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    for split_df, path in ((train_df, config.TRAIN_FILE), (test_df, config.TEST_FILE)):
        split_df.to_csv(path, index=False)
    print(f"Saved train to {config.TRAIN_FILE}")
    print(f"Saved test to {config.TEST_FILE}")

    # Best-effort upload to the HF dataset repo, if configured.
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            print("Uploading train and test splits to Hugging Face dataset repo...")
            for local_path, repo_path in (
                (config.TRAIN_FILE, "data/train.csv"),
                (config.TEST_FILE, "data/test.csv"),
            ):
                upload_dataset_file(
                    local_path=local_path,
                    repo_id=config.HF_DATASET_REPO,
                    repo_path=repo_path,
                    token=config.HF_TOKEN,
                )
            print("Upload to Hugging Face completed.")
        except Exception as e:
            print(
                f"Warning: Failed to upload train/test to Hugging Face dataset repo: {e}"
            )


if __name__ == "__main__":
    main()