#!/usr/bin/env python3
"""Clean and prepare data.

Downloads ``engine_data.csv`` from the Hugging Face dataset repository
``$HF_USERNAME/$DATASET_NAME``, standardizes the column names, removes
duplicate rows and rows with missing values, and writes the result to
``data/cleaned_data.csv``.

Run it as a script; importing the module only defines the helpers and
reads the environment, it does not download or write anything.
"""

import os
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Name of the label column *after* column standardization.
TARGET_COLUMN = "engine_condition"


def _normalize_column_name(name) -> str:
    """Return *name* stripped, lower-cased and with spaces turned into ``_``."""
    # str() keeps this safe for non-string labels; strip() stops a padded
    # header such as "Engine Condition " from becoming "engine_condition_".
    return str(name).strip().lower().replace(" ", "_")


def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame whose column names are lowercase with underscores.

    The input frame is left untouched.
    """
    return df.rename(columns=_normalize_column_name)


def clean_dataframe(
    df: pd.DataFrame, target_column: str = TARGET_COLUMN
) -> pd.DataFrame:
    """Standardize columns, then drop duplicate rows and rows containing NA.

    Args:
        df: Raw frame as read from the source CSV.
        target_column: Standardized name of the column that must be present.

    Returns:
        The cleaned frame (a new object; *df* is not modified).

    Raises:
        KeyError: If *target_column* is missing after standardization.
    """
    df = standardize_columns(df)
    logger.info(f"Standardized columns: {df.columns.tolist()}")

    # Fail early: everything downstream relies on the target column.
    if target_column not in df.columns:
        logger.error(
            f"Target column '{target_column}' not found after standardization!"
        )
        logger.error(f"Available columns: {df.columns.tolist()}")
        raise KeyError("Missing expected target column")

    # Remove duplicates
    rows_before = df.shape[0]
    df = df.drop_duplicates()
    logger.info(
        f"After removing duplicates: {df.shape} "
        f"(removed {rows_before - df.shape[0]} rows)"
    )

    # Handle missing values (if any)
    rows_before = df.shape[0]
    df = df.dropna()
    logger.info(
        f"After dropping NA: {df.shape} "
        f"(removed {rows_before - df.shape[0]} rows)"
    )

    if df.empty:
        # Not fatal (the file is still written), but worth flagging loudly.
        logger.warning("No rows left after cleaning")

    # Log target distribution
    logger.info(f"Target distribution: {df[target_column].value_counts()}")
    return df


def main() -> None:
    """Download the raw CSV, clean it and save ``data/cleaned_data.csv``."""
    # Imported here so the cleaning helpers above stay importable in
    # environments where the Hub client is not installed (e.g. unit tests).
    from huggingface_hub import hf_hub_download

    logger.info("Cleaning data...")

    # Download data
    file_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename="engine_data.csv",
        token=HF_TOKEN,
    )
    df = pd.read_csv(file_path)
    logger.info(f"Original shape: {df.shape}")
    logger.info(f"Original columns: {df.columns.tolist()}")

    df = clean_dataframe(df)

    # Save cleaned data
    os.makedirs("data", exist_ok=True)
    df.to_csv("data/cleaned_data.csv", index=False)

    logger.info("✓ Data cleaning completed!")
    logger.info(f"✓ Final columns: {df.columns.tolist()}")


if __name__ == "__main__":
    main()