harikrishna1985 committed on
Commit
66cae1e
·
verified ·
1 Parent(s): 881d19a

Upload src/01_data_prep.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/01_data_prep.py +167 -0
src/01_data_prep.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from huggingface_hub import hf_hub_download, HfApi
8
+
9
+
10
+ # =========================
11
+ # CONFIG
12
+ # =========================
13
+ DATASET_REPO_ID = "harikrishna1985/Engine_data"
14
+ RAW_FILENAME = "data/engine_data.csv"
15
+
16
+ TARGET_COLUMN = "engine_condition"
17
+
18
+ # columns to drop if unnecessary
19
+ DROP_COLUMNS = [
20
+ # "unnamed: 0",
21
+ # "id",
22
+ ]
23
+
24
+ TEST_SIZE = 0.2
25
+ RANDOM_STATE = 42
26
+
27
+ LOCAL_DATA_DIR = Path("data")
28
+ LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
29
+
30
+ TRAIN_FILE = LOCAL_DATA_DIR / "train.csv"
31
+ TEST_FILE = LOCAL_DATA_DIR / "test.csv"
32
+ CLEAN_FILE = LOCAL_DATA_DIR / "cleaned_data.csv"
33
+ METADATA_FILE = LOCAL_DATA_DIR / "prep_metadata.json"
34
+
35
+
36
+ # =========================
37
+ # HELPERS
38
+ # =========================
39
def get_hf_api() -> HfApi:
    """Return an HfApi client, authenticated via the HF_TOKEN env var when set."""
    return HfApi(token=os.getenv("HF_TOKEN"))
42
+
43
+
44
def load_raw_data_from_hf() -> pd.DataFrame:
    """Download the raw CSV from the HF dataset repo and load it into a DataFrame."""
    print(f"Downloading raw dataset from HF dataset repo: {DATASET_REPO_ID}")
    csv_path = hf_hub_download(
        repo_id=DATASET_REPO_ID, filename=RAW_FILENAME, repo_type="dataset"
    )
    raw_df = pd.read_csv(csv_path)
    print(f"Raw data shape: {raw_df.shape}")
    return raw_df
54
+
55
+
56
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize columns, drop configured columns/duplicates, and impute missing values.

    Steps: snake_case all column names, drop any DROP_COLUMNS that exist,
    drop duplicate rows, drop rows whose target is missing, then fill numeric
    gaps with the column median and non-numeric gaps with the column mode
    (falling back to "unknown" when no mode exists).

    Raises:
        ValueError: if the normalized target column is not present.
    """

    def _normalize(name: str) -> str:
        # Same normalization applied to column names, drop list, and target.
        return name.strip().lower().replace(" ", "_")

    out = df.copy()
    out.columns = [_normalize(col) for col in out.columns]

    target = _normalize(TARGET_COLUMN)
    unwanted = [c for c in (_normalize(d) for d in DROP_COLUMNS) if c in out.columns]
    if unwanted:
        out = out.drop(columns=unwanted)

    out = out.drop_duplicates()

    if target not in out.columns:
        raise ValueError(f"Target column '{target}' not found in dataset columns: {list(out.columns)}")

    # Rows without a label are useless for supervised training.
    out = out.dropna(subset=[target])

    numeric_cols = out.select_dtypes(include=["number"]).columns.tolist()
    if numeric_cols:
        out[numeric_cols] = out[numeric_cols].fillna(out[numeric_cols].median())

    for col in (c for c in out.columns if c not in numeric_cols):
        if out[col].isna().any():
            modes = out[col].mode()
            fill = modes.iloc[0] if not modes.empty else "unknown"
            out[col] = out[col].fillna(fill)

    print(f"Cleaned data shape: {out.shape}")
    return out
95
+
96
+
97
def split_and_save(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split cleaned data into train/test sets and persist all artifacts locally.

    Writes the full cleaned frame, both splits, and a JSON metadata summary
    to the paths configured at module level, then returns (train_df, test_df).
    """
    target = TARGET_COLUMN.strip().lower().replace(" ", "_")

    # Only stratify when the target looks categorical (few distinct values).
    labels = df[target]
    stratify = labels if labels.nunique() <= 20 else None

    train_df, test_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=stratify,
    )

    for frame, path in ((df, CLEAN_FILE), (train_df, TRAIN_FILE), (test_df, TEST_FILE)):
        frame.to_csv(path, index=False)

    # Record how this prep run was configured alongside the resulting shapes.
    metadata = {
        "dataset_repo_id": DATASET_REPO_ID,
        "raw_filename": RAW_FILENAME,
        "target_column": target,
        "drop_columns": DROP_COLUMNS,
        "cleaned_shape": list(df.shape),
        "train_shape": list(train_df.shape),
        "test_shape": list(test_df.shape),
        "test_size": TEST_SIZE,
        "random_state": RANDOM_STATE,
    }
    with open(METADATA_FILE, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved cleaned data to: {CLEAN_FILE}")
    print(f"Saved train data to: {TRAIN_FILE}")
    print(f"Saved test data to: {TEST_FILE}")

    return train_df, test_df
134
+
135
+
136
def upload_prepared_files_to_hf() -> None:
    """Upload the prepared CSVs and metadata to the HF dataset repo under processed/."""
    api = get_hf_api()

    # local path -> destination path inside the dataset repo (insertion order kept)
    uploads = {
        str(CLEAN_FILE): "processed/cleaned_data.csv",
        str(TRAIN_FILE): "processed/train.csv",
        str(TEST_FILE): "processed/test.csv",
        str(METADATA_FILE): "processed/prep_metadata.json",
    }

    for local_file, path_in_repo in uploads.items():
        print(f"Uploading {local_file} -> {path_in_repo}")
        api.upload_file(
            path_or_fileobj=local_file,
            path_in_repo=path_in_repo,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
        )

    print("Prepared dataset files uploaded successfully to HF dataset repo.")
156
+
157
+
158
def main() -> None:
    """Run the full prep pipeline: download, clean, split/save, upload."""
    raw = load_raw_data_from_hf()
    cleaned = clean_data(raw)
    split_and_save(cleaned)
    upload_prepared_files_to_hf()
    print("Data preparation completed successfully.")


if __name__ == "__main__":
    main()