#!/usr/bin/env python3
"""Clean and prepare data"""
import os
import logging
import pandas as pd
from huggingface_hub import hf_hub_download
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
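# Note: HF_TOKEN is only required if the dataset repo is private;
# hf_hub_download accepts token=None for public repositories.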
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
logger.info("Cleaning data...")
# Download data
file_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN,
)
df = pd.read_csv(file_path)
logger.info(f"Original shape: {df.shape}")
logger.info(f"Original columns: {df.columns.tolist()}")
# Standardize column names to lowercase with underscores
# This ensures consistency regardless of how they're named in the source
df.columns = df.columns.str.lower().str.replace(' ', '_')
logger.info(f"Standardized columns: {df.columns.tolist()}")
# Verify the target column exists
if 'engine_condition' not in df.columns:
    logger.error("Target column 'engine_condition' not found after standardization!")
    logger.error(f"Available columns: {df.columns.tolist()}")
    raise KeyError("Missing expected target column 'engine_condition'")
# Remove duplicates
initial_rows = df.shape[0]
df = df.drop_duplicates()
logger.info(f"After removing duplicates: {df.shape} (removed {initial_rows - df.shape[0]} rows)")
# Handle missing values (if any)
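# NOTE: rows containing any NaN are dropped outright; imputation would be an
# alternative if missingness turns out to be substantial.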
initial_rows = df.shape[0]
df = df.dropna()
logger.info(f"After dropping NA: {df.shape} (removed {initial_rows - df.shape[0]} rows)")
# Log target distribution
logger.info(f"Target distribution:
{df['engine_condition'].value_counts()}")
# Save cleaned data
os.makedirs("data", exist_ok=True)
df.to_csv("data/cleaned_data.csv", index=False)
logger.info("✓ Data cleaning completed!")
logger.info(f"✓ Final columns: {df.columns.tolist()}")