Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""Clean and prepare data.

Downloads ``engine_data.csv`` from the Hugging Face dataset repository
``<HF_USERNAME>/<DATASET_NAME>``, standardizes the column names, removes
duplicate rows and rows with missing values, and writes the result to
``data/cleaned_data.csv``.

Environment variables read:
    HF_TOKEN      -- access token passed to ``hf_hub_download`` (may be unset).
    HF_USERNAME   -- repository owner, defaults to ``SharleyK``.
    DATASET_NAME  -- dataset repository name, defaults to
                     ``PredictiveMaintenance``.

Raises:
    KeyError: if the ``engine_condition`` column is absent after the column
        names have been standardized.
"""
import logging
import os

import pandas as pd
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Name of the label column *after* standardization (lowercase, underscores).
TARGET_COLUMN = "engine_condition"

logger.info("Cleaning data...")

# Download data
file_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN,
)
df = pd.read_csv(file_path)
logger.info("Original shape: %s", df.shape)
logger.info("Original columns: %s", df.columns.tolist())

# Standardize column names to lowercase with underscores.
# This ensures consistency regardless of how they're named in the source.
# Surrounding whitespace is stripped first so a header such as
# "Engine Condition " does not turn into "engine_condition_"; regex=False
# makes the space replacement a literal one on every pandas version.
df.columns = (
    df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
)
logger.info("Standardized columns: %s", df.columns.tolist())

# Verify the target column exists before doing any further work.
if TARGET_COLUMN not in df.columns:
    logger.error(
        "Target column '%s' not found after standardization!", TARGET_COLUMN
    )
    logger.error("Available columns: %s", df.columns.tolist())
    raise KeyError("Missing expected target column")

# Remove duplicates
initial_rows = df.shape[0]
df = df.drop_duplicates()
logger.info(
    "After removing duplicates: %s (removed %d rows)",
    df.shape,
    initial_rows - df.shape[0],
)

# Handle missing values (if any): rows with any NA value are dropped.
initial_rows = df.shape[0]
df = df.dropna()
logger.info(
    "After dropping NA: %s (removed %d rows)",
    df.shape,
    initial_rows - df.shape[0],
)

# Log target distribution (the counts are printed on the lines after the label).
logger.info("Target distribution:\n%s", df[TARGET_COLUMN].value_counts())

# Save cleaned data
os.makedirs("data", exist_ok=True)
df.to_csv("data/cleaned_data.csv", index=False)
logger.info("✓ Data cleaning completed!")
logger.info("✓ Final columns: %s", df.columns.tolist())