| import os |
| import json |
| from pathlib import Path |
|
|
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| from huggingface_hub import hf_hub_download, HfApi |
|
|
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Hugging Face dataset source
# ---------------------------------------------------------------------------
# Dataset repository that holds the raw CSV and receives the prepared files.
DATASET_REPO_ID = "harikrishna1985/Engine_data"
# Path of the raw CSV inside the dataset repository.
RAW_FILENAME = "data/engine_data.csv"

# Label column; it is lower-cased, trimmed and has spaces replaced by
# underscores before being looked up in the data.
TARGET_COLUMN = "engine_condition"

# Columns to discard during cleaning. Empty means nothing is dropped.
DROP_COLUMNS = []

# ---------------------------------------------------------------------------
# Train/test split settings
# ---------------------------------------------------------------------------
TEST_SIZE = 0.2
RANDOM_STATE = 42

# ---------------------------------------------------------------------------
# Local output locations
# ---------------------------------------------------------------------------
# NOTE: the output directory is created as an import-time side effect.
LOCAL_DATA_DIR = Path("data")
LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_FILE = LOCAL_DATA_DIR.joinpath("train.csv")
TEST_FILE = LOCAL_DATA_DIR.joinpath("test.csv")
CLEAN_FILE = LOCAL_DATA_DIR.joinpath("cleaned_data.csv")
METADATA_FILE = LOCAL_DATA_DIR.joinpath("prep_metadata.json")
|
|
|
|
| |
| |
| |
def get_hf_api() -> HfApi:
    """Build a Hugging Face Hub client using the token from the environment.

    The token is read from the ``HF_TOKEN`` environment variable. A missing,
    empty, or whitespace-only value is normalised to ``None`` so that an
    unset CI secret (which often arrives as an empty string) is not sent to
    the Hub as a literal, invalid token.

    Returns:
        An ``HfApi`` instance constructed with the resolved token (or
        ``None`` when no usable token is configured).
    """
    # Strip stray whitespace/newlines that secrets managers sometimes append.
    token = os.getenv("HF_TOKEN", "").strip() or None
    return HfApi(token=token)
|
|
|
|
def load_raw_data_from_hf() -> pd.DataFrame:
    """Download the raw CSV from the HF dataset repo and load it with pandas.

    The file identified by ``RAW_FILENAME`` is fetched from
    ``DATASET_REPO_ID`` (repo type ``dataset``) and parsed with
    ``pd.read_csv``.

    Returns:
        The raw data as a DataFrame.
    """
    print(f"Downloading raw dataset from HF dataset repo: {DATASET_REPO_ID}")

    download_kwargs = {
        "repo_id": DATASET_REPO_ID,
        "filename": RAW_FILENAME,
        "repo_type": "dataset",
    }
    csv_path = hf_hub_download(**download_kwargs)

    raw_df = pd.read_csv(csv_path)
    print(f"Raw data shape: {raw_df.shape}")
    return raw_df
|
|
|
|
def _normalize_column_name(name: object) -> str:
    """Return *name* trimmed, lower-cased, with spaces replaced by underscores."""
    # str() keeps non-string labels (e.g. integer headers) from crashing.
    return str(name).strip().lower().replace(" ", "_")


def clean_data(
    df: pd.DataFrame,
    *,
    target_column: "str | None" = None,
    drop_columns: "list | None" = None,
) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Normalise column names (trim, lower-case, spaces -> underscores).
      2. Drop the requested columns that are actually present.
      3. Drop exact duplicate rows.
      4. Drop rows whose target value is missing.
      5. Fill missing numeric values with the column median.
      6. Fill missing non-numeric values with the column mode, or with
         ``"unknown"`` when the column has no mode (all values missing).

    Args:
        df: Raw input data.
        target_column: Name of the label column. Defaults to the module-level
            ``TARGET_COLUMN`` when omitted.
        drop_columns: Column names to discard. Defaults to the module-level
            ``DROP_COLUMNS`` when omitted.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: If the normalised target column is absent after the
            drop step.
    """
    # Resolve defaults lazily so explicit arguments never touch the globals.
    if target_column is None:
        target_column = TARGET_COLUMN
    if drop_columns is None:
        drop_columns = DROP_COLUMNS

    cleaned = df.copy()
    cleaned.columns = [_normalize_column_name(col) for col in cleaned.columns]

    # Apply the same normalisation to configured names so they match.
    target = _normalize_column_name(target_column)
    normalized_drops = [_normalize_column_name(col) for col in drop_columns]

    # Only drop what exists; unknown names in the drop list are ignored.
    cols_to_drop = [col for col in normalized_drops if col in cleaned.columns]
    if cols_to_drop:
        cleaned = cleaned.drop(columns=cols_to_drop)

    cleaned = cleaned.drop_duplicates()

    if target not in cleaned.columns:
        raise ValueError(f"Target column '{target}' not found in dataset columns: {list(cleaned.columns)}")

    # Rows without a label cannot be used for supervised training.
    cleaned = cleaned.dropna(subset=[target])

    # NOTE(review): medians/modes are computed over the whole frame; if this
    # runs before a train/test split, test rows influence the fill values.
    numeric_cols = cleaned.select_dtypes(include=["number"]).columns.tolist()
    if numeric_cols:
        medians = cleaned[numeric_cols].median()
        cleaned[numeric_cols] = cleaned[numeric_cols].fillna(medians)

    for col in cleaned.columns:
        if col in numeric_cols:
            continue
        if cleaned[col].isna().sum() > 0:
            mode_vals = cleaned[col].mode()
            fill_value = mode_vals.iloc[0] if not mode_vals.empty else "unknown"
            cleaned[col] = cleaned[col].fillna(fill_value)

    print(f"Cleaned data shape: {cleaned.shape}")
    return cleaned
|
|
|
|
def split_and_save(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split *df* into train/test sets and write all artefacts to disk.

    Writes the full cleaned frame, the train split, the test split and a
    JSON metadata file to ``CLEAN_FILE``, ``TRAIN_FILE``, ``TEST_FILE`` and
    ``METADATA_FILE`` respectively. The split uses ``TEST_SIZE`` and
    ``RANDOM_STATE``.

    The split is stratified on the target only when the target has at most
    20 distinct values *and* every class has at least two rows. A class with
    a single row makes a stratified ``train_test_split`` raise
    ``ValueError``, so in that case the split falls back to unstratified.

    Args:
        df: Cleaned data containing the normalised target column.

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    target_col_clean = TARGET_COLUMN.strip().lower().replace(" ", "_")

    # Upper bound on distinct target values for treating it as categorical.
    max_classes_for_stratify = 20

    labels = df[target_col_clean]
    stratify_arg = None
    if labels.nunique() <= max_classes_for_stratify:
        class_counts = labels.value_counts()
        if not class_counts.empty and class_counts.min() >= 2:
            stratify_arg = labels
        else:
            print(
                "Skipping stratified split: at least one target class has "
                "fewer than 2 rows."
            )

    train_df, test_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=stratify_arg,
    )

    df.to_csv(CLEAN_FILE, index=False)
    train_df.to_csv(TRAIN_FILE, index=False)
    test_df.to_csv(TEST_FILE, index=False)

    # Record provenance and split parameters alongside the data files.
    metadata = {
        "dataset_repo_id": DATASET_REPO_ID,
        "raw_filename": RAW_FILENAME,
        "target_column": target_col_clean,
        "drop_columns": DROP_COLUMNS,
        "cleaned_shape": list(df.shape),
        "train_shape": list(train_df.shape),
        "test_shape": list(test_df.shape),
        "test_size": TEST_SIZE,
        "random_state": RANDOM_STATE,
    }

    with open(METADATA_FILE, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved cleaned data to: {CLEAN_FILE}")
    print(f"Saved train data to: {TRAIN_FILE}")
    print(f"Saved test data to: {TEST_FILE}")

    return train_df, test_df
|
|
|
|
def upload_prepared_files_to_hf() -> None:
    """Upload the prepared local files to the HF dataset repository.

    The cleaned data, train split, test split and metadata file are uploaded
    one at a time to ``DATASET_REPO_ID`` (repo type ``dataset``), each under
    the ``processed/`` folder.
    """
    # Local file -> destination path inside the dataset repo (upload order).
    destinations = {
        CLEAN_FILE: "processed/cleaned_data.csv",
        TRAIN_FILE: "processed/train.csv",
        TEST_FILE: "processed/test.csv",
        METADATA_FILE: "processed/prep_metadata.json",
    }

    client = get_hf_api()
    for source_path, path_in_repo in destinations.items():
        local_file = str(source_path)
        print(f"Uploading {local_file} -> {path_in_repo}")
        client.upload_file(
            path_or_fileobj=local_file,
            path_in_repo=path_in_repo,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
        )

    print("Prepared dataset files uploaded successfully to HF dataset repo.")
|
|
|
|
def main():
    """Run the preparation pipeline: download, clean, split/save, upload."""
    raw_df = load_raw_data_from_hf()
    cleaned_df = clean_data(raw_df)
    split_and_save(cleaned_df)
    upload_prepared_files_to_hf()
    print("Data preparation completed successfully.")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|