Spaces:

harikrishna1985
/

predictive-maintenance-space

Sleeping

App Files Files Community

predictive-maintenance-space / src /01_data_prep.py

harikrishna1985

Upload src/01_data_prep.py with huggingface_hub

66cae1e verified 9 days ago

raw

history blame contribute delete

4.91 kB

	import os
	import json
	from pathlib import Path

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from huggingface_hub import hf_hub_download, HfApi


	# =========================
	# CONFIG
	# =========================
	# Hugging Face dataset repo that holds the raw CSV, and the CSV's path inside it.
	DATASET_REPO_ID = "harikrishna1985/Engine_data"
	RAW_FILENAME = "data/engine_data.csv"

	# Name of the label column.
	TARGET_COLUMN = "engine_condition"

	# Columns to discard during cleaning. Empty by default; add names here
	# (for example "unnamed: 0" or "id") when the raw file carries columns
	# that are not needed.
	DROP_COLUMNS: list[str] = []

	# Train/test split settings.
	TEST_SIZE = 0.2
	RANDOM_STATE = 42

	# Local output directory; created as a side effect when this module is loaded.
	LOCAL_DATA_DIR = Path("data")
	LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

	# Output artefacts written under LOCAL_DATA_DIR.
	TRAIN_FILE = LOCAL_DATA_DIR.joinpath("train.csv")
	TEST_FILE = LOCAL_DATA_DIR.joinpath("test.csv")
	CLEAN_FILE = LOCAL_DATA_DIR.joinpath("cleaned_data.csv")
	METADATA_FILE = LOCAL_DATA_DIR.joinpath("prep_metadata.json")


	# =========================
	# HELPERS
	# =========================
	def get_hf_api() -> HfApi:
	    """Return an ``HfApi`` client built with the ``HF_TOKEN`` environment variable.

	    If ``HF_TOKEN`` is not set, ``None`` is passed as the token.
	    """
	    return HfApi(token=os.environ.get("HF_TOKEN"))


	def load_raw_data_from_hf() -> pd.DataFrame:
	    """Download the raw CSV from the dataset repo and load it as a DataFrame.

	    The file ``RAW_FILENAME`` is fetched from ``DATASET_REPO_ID`` (a repo of
	    type ``"dataset"``) via ``hf_hub_download`` and parsed with
	    ``pandas.read_csv``. The repo id and resulting shape are printed.

	    Returns:
	        The raw, uncleaned data.
	    """
	    print(f"Downloading raw dataset from HF dataset repo: {DATASET_REPO_ID}")
	    raw_frame = pd.read_csv(
	        hf_hub_download(
	            repo_id=DATASET_REPO_ID,
	            filename=RAW_FILENAME,
	            repo_type="dataset",
	        )
	    )
	    print(f"Raw data shape: {raw_frame.shape}")
	    return raw_frame


	def _normalize_name(name: object) -> str:
	    """Return *name* as a trimmed, lower-case, underscore-separated string."""
	    # str() keeps non-string labels (e.g. integer headers) from crashing .strip().
	    return str(name).strip().lower().replace(" ", "_")


	def clean_data(
	    df: pd.DataFrame,
	    target_column: "str | None" = None,
	    drop_columns: "list[str] | None" = None,
	) -> pd.DataFrame:
	    """Return a cleaned copy of *df*; the input frame is not modified.

	    Steps, in order:
	      1. Normalise every column name (trim, lower-case, spaces -> ``_``).
	      2. Drop the requested columns that are present.
	      3. Verify the target column exists.
	      4. Remove duplicate rows, then rows whose target is missing.
	      5. Fill missing numeric values with the column median.
	      6. Fill missing non-numeric values with the column mode, or with
	         ``"unknown"`` when the column has no mode (all values missing).

	    Args:
	        df: Raw data.
	        target_column: Name of the label column. Defaults to the module-level
	            ``TARGET_COLUMN``. Normalised the same way as the column names.
	        drop_columns: Column names to discard. Defaults to the module-level
	            ``DROP_COLUMNS``. Names absent from the data are ignored.

	    Returns:
	        The cleaned DataFrame.

	    Raises:
	        ValueError: If the normalised target column is not in the data.
	    """
	    if target_column is None:
	        target_column = TARGET_COLUMN
	    if drop_columns is None:
	        drop_columns = DROP_COLUMNS

	    df = df.copy()

	    # standardize column names
	    df.columns = [_normalize_name(col) for col in df.columns]

	    # align target/drop names with cleaned columns
	    target_col_clean = _normalize_name(target_column)
	    drop_cols_clean = [_normalize_name(c) for c in drop_columns]

	    # drop unwanted columns if present
	    cols_to_drop = [c for c in drop_cols_clean if c in df.columns]
	    if cols_to_drop:
	        df = df.drop(columns=cols_to_drop)

	    # fail fast: no point de-duplicating a frame that has no target
	    if target_col_clean not in df.columns:
	        raise ValueError(f"Target column '{target_col_clean}' not found in dataset columns: {list(df.columns)}")

	    # remove duplicates, then rows with missing target
	    df = df.drop_duplicates()
	    df = df.dropna(subset=[target_col_clean])

	    # fill numeric missing values with median
	    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
	    if numeric_cols:
	        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

	    # fill non-numeric missing values with mode if possible
	    non_numeric_cols = [c for c in df.columns if c not in numeric_cols]
	    for col in non_numeric_cols:
	        if df[col].isna().any():
	            mode_vals = df[col].mode()
	            fill_value = mode_vals.iloc[0] if not mode_vals.empty else "unknown"
	            df[col] = df[col].fillna(fill_value)

	    print(f"Cleaned data shape: {df.shape}")
	    return df


	def split_and_save(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Split *df* into train/test sets and write all artefacts to disk.

	    The split uses ``TEST_SIZE`` and ``RANDOM_STATE``. It is stratified on the
	    target column when the target has at most 20 distinct values and every
	    one of them occurs at least twice; otherwise a plain random split is used.

	    Files written: ``CLEAN_FILE`` (the full input frame), ``TRAIN_FILE``,
	    ``TEST_FILE`` and ``METADATA_FILE`` (JSON summary of the run).

	    Args:
	        df: Cleaned data containing the normalised target column.

	    Returns:
	        ``(train_df, test_df)``.
	    """
	    target_col_clean = TARGET_COLUMN.strip().lower().replace(" ", "_")

	    # stratify if target is classification-friendly
	    labels = df[target_col_clean]
	    class_counts = labels.value_counts()
	    stratify_arg = None
	    if len(class_counts) <= 20:
	        # A stratified split needs at least two rows per class; with a
	        # singleton class train_test_split raises ValueError, so fall back
	        # to an unstratified split instead of aborting the pipeline.
	        if not class_counts.empty and class_counts.min() >= 2:
	            stratify_arg = labels
	        else:
	            print(
	                f"Skipping stratified split: a class in '{target_col_clean}' "
	                "has fewer than 2 rows."
	            )

	    train_df, test_df = train_test_split(
	        df,
	        test_size=TEST_SIZE,
	        random_state=RANDOM_STATE,
	        stratify=stratify_arg,
	    )

	    df.to_csv(CLEAN_FILE, index=False)
	    train_df.to_csv(TRAIN_FILE, index=False)
	    test_df.to_csv(TEST_FILE, index=False)

	    # Record how the split was produced, next to the data files.
	    metadata = {
	        "dataset_repo_id": DATASET_REPO_ID,
	        "raw_filename": RAW_FILENAME,
	        "target_column": target_col_clean,
	        "drop_columns": DROP_COLUMNS,
	        "cleaned_shape": list(df.shape),
	        "train_shape": list(train_df.shape),
	        "test_shape": list(test_df.shape),
	        "test_size": TEST_SIZE,
	        "random_state": RANDOM_STATE,
	    }

	    with open(METADATA_FILE, "w", encoding="utf-8") as f:
	        json.dump(metadata, f, indent=2)

	    print(f"Saved cleaned data to: {CLEAN_FILE}")
	    print(f"Saved train data to: {TRAIN_FILE}")
	    print(f"Saved test data to: {TEST_FILE}")

	    return train_df, test_df


	def upload_prepared_files_to_hf() -> None:
	    """Upload the prepared local files to the dataset repo under ``processed/``.

	    Each of ``CLEAN_FILE``, ``TRAIN_FILE``, ``TEST_FILE`` and ``METADATA_FILE``
	    is sent to ``DATASET_REPO_ID`` (repo type ``"dataset"``) with one
	    ``upload_file`` call, in that order, using the client from ``get_hf_api``.
	    """
	    # local path -> destination path inside the repo
	    destinations = {
	        CLEAN_FILE: "processed/cleaned_data.csv",
	        TRAIN_FILE: "processed/train.csv",
	        TEST_FILE: "processed/test.csv",
	        METADATA_FILE: "processed/prep_metadata.json",
	    }

	    client = get_hf_api()
	    for source_path, path_in_repo in destinations.items():
	        local_file = str(source_path)
	        print(f"Uploading {local_file} -> {path_in_repo}")
	        client.upload_file(
	            path_or_fileobj=local_file,
	            path_in_repo=path_in_repo,
	            repo_id=DATASET_REPO_ID,
	            repo_type="dataset",
	        )

	    print("Prepared dataset files uploaded successfully to HF dataset repo.")


	def main():
	    """Run the full preparation pipeline: download, clean, split/save, upload."""
	    prepared = clean_data(load_raw_data_from_hf())
	    split_and_save(prepared)
	    upload_prepared_files_to_hf()
	    print("Data preparation completed successfully.")


	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()